b1sheng committed
Commit f4b9c44 · 1 Parent(s): 618f59b

Update app.py

Files changed (1)
  1. app.py +4 -264
app.py CHANGED
@@ -18,86 +18,7 @@ from src.assets.css_html_js import custom_css, get_window_url_params
 from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
 from src.init import get_all_requested_models, load_all_info_from_hub
 
-# clone / pull the lmeh eval data
-H4_TOKEN = os.environ.get("H4_TOKEN", None)
 
-QUEUE_REPO = "open-llm-leaderboard/requests"
-RESULTS_REPO = "open-llm-leaderboard/results"
-
-PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
-PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
-
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-
-EVAL_REQUESTS_PATH = "eval-queue"
-EVAL_RESULTS_PATH = "eval-results"
-
-EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
-EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
-
-api = HfApi()
-
-def restart_space():
-    api.restart_space(
-        repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
-    )
-
-eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
-
-if not IS_PUBLIC:
-    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
-else:
-    eval_queue_private, eval_results_private = None, None
-
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-
-if not IS_PUBLIC:
-    COLS.insert(2, AutoEvalColumn.precision.name)
-    TYPES.insert(2, AutoEvalColumn.precision.type)
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
-
-
-def get_leaderboard_df_1():
-    if eval_results:
-        print("Pulling evaluation results for the leaderboard.")
-        eval_results.git_pull()
-    if eval_results_private:
-        print("Pulling evaluation results for the leaderboard.")
-        eval_results_private.git_pull()
-
-    all_data = get_eval_results_dicts(IS_PUBLIC)
-
-    if not IS_PUBLIC:
-        all_data.append(gpt4_values)
-        all_data.append(gpt35_values)
-
-    all_data.append(baseline)
-    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
-
-    df = pd.DataFrame.from_records(all_data)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[COLS]
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
-    print(df)
-    print(type(df))
-    return df
 
 def get_leaderboard_df():
 
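For context on what this first hunk drops: `get_leaderboard_df_1` filtered out any model row that was still missing a benchmark score, using the row-wise mask built by `has_no_nan_values`. A minimal sketch of that filter, with illustrative column names standing in for the real `AutoEvalColumn` fields:

import pandas as pd

# Illustrative benchmark columns; the real names come from AutoEvalColumn.
BENCHMARK_COLS = ["ARC", "HellaSwag", "MMLU", "TruthfulQA"]

df = pd.DataFrame({
    "model": ["model-a", "model-b"],
    "ARC": [61.0, None],  # model-b has no ARC result yet
    "HellaSwag": [84.2, 79.1],
    "MMLU": [58.7, 55.3],
    "TruthfulQA": [44.9, 41.2],
})

# Equivalent of has_no_nan_values(df, BENCHMARK_COLS): keep only rows
# where every benchmark has been produced.
df = df[df[BENCHMARK_COLS].notna().all(axis=1)]
print(df)  # only model-a remains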
@@ -116,190 +37,18 @@ def get_leaderboard_df():
     df = pd.DataFrame(data)
     return df
 
-def get_evaluation_queue_df():
-    if eval_queue:
-        print("Pulling changes for the evaluation queue.")
-        eval_queue.git_pull()
-    if eval_queue_private:
-        print("Pulling changes for the evaluation queue.")
-        eval_queue_private.git_pull()
-
-    entries = [
-        entry
-        for entry in os.listdir(EVAL_REQUESTS_PATH)
-        if not entry.startswith(".")
-    ]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data["# params"] = "unknown"
-            data["model"] = make_clickable_model(data["model"])
-            data["revision"] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [
-                e
-                for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
-                if not e.startswith(".")
-            ]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                # data["# params"] = get_n_params(data["model"])
-                data["model"] = make_clickable_model(data["model"])
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
-    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
-    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
-
-
 
 original_df = get_leaderboard_df()
 leaderboard_df = original_df.copy()
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df()
-
-def is_model_on_hub(model_name, revision) -> bool:
-    try:
-        AutoConfig.from_pretrained(model_name, revision=revision)
-        return True, None
-
-    except ValueError as e:
-        return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
-
-    except Exception as e:
-        print(f"Could not get the model config from the hub.: {e}")
-        return False, "was not found on hub!"
-
-
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    private: bool,
-    weight_type: str,
-    model_type: str,
-):
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    # check the model actually exists before adding the eval
-    if revision == "":
-        revision = "main"
-
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error = is_model_on_hub(base_model, revision)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-
-    if not weight_type == "Adapter":
-        model_on_hub, error = is_model_on_hub(model, revision)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    print("adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "private": private,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-    }
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-
-    # Check for duplicate submission
-    if out_path.split("eval-queue/")[1].lower() in requested_models:
-        return styled_warning("This model has been already submitted.")
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    api.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        token=H4_TOKEN,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-
-    # remove the local file
-    os.remove(out_path)
-
-    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
-
-
-def refresh():
-    leaderboard_df = get_leaderboard_df()
-    (
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    ) = get_evaluation_queue_df()
-    return (
-        leaderboard_df,
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    )
 
 
 def search_table(df, query):
-    if AutoEvalColumn.model_type.name in df.columns:
-        filtered_df = df[
-            (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
-            | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
-        ]
+    if query == "":
+        return df
     else:
-        filtered_df = df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
-    return filtered_df
+        return df[df.apply(lambda row: query.lower() in row.astype(str).str.lower().any(), axis=1)]
 
 
-def change_tab(query_param):
-    query_param = query_param.replace("'", '"')
-    query_param = json.loads(query_param)
-
-    if (
-        isinstance(query_param, dict)
-        and "tab" in query_param
-        and query_param["tab"] == "evaluation"
-    ):
-        return gr.Tabs.update(selected=1)
-    else:
-        return gr.Tabs.update(selected=0)
 
 
 demo = gr.Blocks(css=custom_css)
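The new `search_table` above switches from matching two known columns to a generic row-wise substring search. As committed, though, `row.astype(str).str.lower().any()` collapses the row to a single boolean before the `in` test runs, so the intended cell-by-cell match presumably looks more like this sketch (an assumption about the intent, not the committed code):

import pandas as pd

def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows in which any cell contains `query`, case-insensitively."""
    if query == "":
        return df
    mask = df.apply(
        # Test every cell of the row, then reduce with .any().
        lambda row: row.astype(str).str.lower().str.contains(query.lower(), regex=False).any(),
        axis=1,
    )
    return df[mask]

It is called the same way as before, e.g. search_table(leaderboard_df, "llama").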
@@ -351,15 +100,6 @@ with demo:
                 elem_id="citation-button",
             ).style(show_copy_button=True)
 
-    dummy = gr.Textbox(visible=False)
-    demo.load(
-        change_tab,
-        dummy,
-        tabs,
-        _js=get_window_url_params,
-    )
 
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=3600)
-scheduler.start()
+
 demo.queue(concurrency_count=40).launch()
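For reference, the scheduler block removed in the last hunk is the standard APScheduler pattern for restarting the Space on a fixed interval (hourly here). A self-contained version of that removed setup, with the token read from the environment exactly as in the deleted lines:

import os

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

H4_TOKEN = os.environ.get("H4_TOKEN", None)
api = HfApi()

def restart_space():
    # Restart the hosting Space through the Hub API.
    api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)  # once an hour
scheduler.start()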
 
18
  from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
19
  from src.init import get_all_requested_models, load_all_info_from_hub
20
 
 
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def get_leaderboard_df():
24
 
 
37
  df = pd.DataFrame(data)
38
  return df
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  original_df = get_leaderboard_df()
42
  leaderboard_df = original_df.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  def search_table(df, query):
46
+ if query == "":
47
+ return df
 
 
 
48
  else:
49
+ return df[df.apply(lambda row: query.lower() in row.astype(str).str.lower().any(), axis=1)]
 
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
  demo = gr.Blocks(css=custom_css)
 
100
  elem_id="citation-button",
101
  ).style(show_copy_button=True)
102
 
 
 
 
 
 
 
 
103
 
104
+
 
 
105
  demo.queue(concurrency_count=40).launch()