Update app.py
app.py CHANGED
@@ -18,86 +18,7 @@ from src.assets.css_html_js import custom_css, get_window_url_params
 from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
 from src.init import get_all_requested_models, load_all_info_from_hub
 
-# clone / pull the lmeh eval data
-H4_TOKEN = os.environ.get("H4_TOKEN", None)
 
-QUEUE_REPO = "open-llm-leaderboard/requests"
-RESULTS_REPO = "open-llm-leaderboard/results"
-
-PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
-PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
-
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-
-EVAL_REQUESTS_PATH = "eval-queue"
-EVAL_RESULTS_PATH = "eval-results"
-
-EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
-EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
-
-api = HfApi()
-
-def restart_space():
-    api.restart_space(
-        repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
-    )
-
-eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
-
-if not IS_PUBLIC:
-    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
-else:
-    eval_queue_private, eval_results_private = None, None
-
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-
-if not IS_PUBLIC:
-    COLS.insert(2, AutoEvalColumn.precision.name)
-    TYPES.insert(2, AutoEvalColumn.precision.type)
-
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
-
-
-def has_no_nan_values(df, columns):
-    return df[columns].notna().all(axis=1)
-
-
-def has_nan_values(df, columns):
-    return df[columns].isna().any(axis=1)
-
-
-def get_leaderboard_df_1():
-    if eval_results:
-        print("Pulling evaluation results for the leaderboard.")
-        eval_results.git_pull()
-    if eval_results_private:
-        print("Pulling evaluation results for the leaderboard.")
-        eval_results_private.git_pull()
-
-    all_data = get_eval_results_dicts(IS_PUBLIC)
-
-    if not IS_PUBLIC:
-        all_data.append(gpt4_values)
-        all_data.append(gpt35_values)
-
-    all_data.append(baseline)
-    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
-
-    df = pd.DataFrame.from_records(all_data)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[COLS]
-
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
-    print(df)
-    print(type(df))
-    return df
 
 def get_leaderboard_df():
 
@@ -116,190 +37,18 @@ def get_leaderboard_df():
     df = pd.DataFrame(data)
     return df
 
-def get_evaluation_queue_df():
-    if eval_queue:
-        print("Pulling changes for the evaluation queue.")
-        eval_queue.git_pull()
-    if eval_queue_private:
-        print("Pulling changes for the evaluation queue.")
-        eval_queue_private.git_pull()
-
-    entries = [
-        entry
-        for entry in os.listdir(EVAL_REQUESTS_PATH)
-        if not entry.startswith(".")
-    ]
-    all_evals = []
-
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(EVAL_REQUESTS_PATH, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data["# params"] = "unknown"
-            data["model"] = make_clickable_model(data["model"])
-            data["revision"] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [
-                e
-                for e in os.listdir(f"{EVAL_REQUESTS_PATH}/{entry}")
-                if not e.startswith(".")
-            ]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(EVAL_REQUESTS_PATH, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                # data["# params"] = get_n_params(data["model"])
-                data["model"] = make_clickable_model(data["model"])
-                all_evals.append(data)
-
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
-    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
-    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
-
-
 
 original_df = get_leaderboard_df()
 leaderboard_df = original_df.copy()
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df()
-
-def is_model_on_hub(model_name, revision) -> bool:
-    try:
-        AutoConfig.from_pretrained(model_name, revision=revision)
-        return True, None
-
-    except ValueError as e:
-        return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
-
-    except Exception as e:
-        print(f"Could not get the model config from the hub.: {e}")
-        return False, "was not found on hub!"
-
-
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    private: bool,
-    weight_type: str,
-    model_type: str,
-):
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    # check the model actually exists before adding the eval
-    if revision == "":
-        revision = "main"
-
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error = is_model_on_hub(base_model, revision)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-
-    if not weight_type == "Adapter":
-        model_on_hub, error = is_model_on_hub(model, revision)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    print("adding new eval")
-
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "private": private,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-    }
-
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-
-    # Check for duplicate submission
-    if out_path.split("eval-queue/")[1].lower() in requested_models:
-        return styled_warning("This model has been already submitted.")
-
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-
-    api.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        token=H4_TOKEN,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-
-    # remove the local file
-    os.remove(out_path)
-
-    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
-
-
-def refresh():
-    leaderboard_df = get_leaderboard_df()
-    (
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    ) = get_evaluation_queue_df()
-    return (
-        leaderboard_df,
-        finished_eval_queue_df,
-        running_eval_queue_df,
-        pending_eval_queue_df,
-    )
 
 
 def search_table(df, query):
-    if
-        filtered_df = df[
-            (df[AutoEvalColumn.dummy.name].str.contains(query, case=False))
-            | (df[AutoEvalColumn.model_type.name].str.contains(query, case=False))
-        ]
+    if query == "":
+        return df
     else:
-
-    return filtered_df
+        return df[df.apply(lambda row: query.lower() in row.astype(str).str.lower().any(), axis=1)]
 
 
-def change_tab(query_param):
-    query_param = query_param.replace("'", '"')
-    query_param = json.loads(query_param)
-
-    if (
-        isinstance(query_param, dict)
-        and "tab" in query_param
-        and query_param["tab"] == "evaluation"
-    ):
-        return gr.Tabs.update(selected=1)
-    else:
-        return gr.Tabs.update(selected=0)
 
 
 demo = gr.Blocks(css=custom_css)
@@ -351,15 +100,6 @@ with demo:
         elem_id="citation-button",
     ).style(show_copy_button=True)
 
-    dummy = gr.Textbox(visible=False)
-    demo.load(
-        change_tab,
-        dummy,
-        tabs,
-        _js=get_window_url_params,
-    )
 
-
-scheduler.add_job(restart_space, "interval", seconds=3600)
-scheduler.start()
+
 demo.queue(concurrency_count=40).launch()
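
The rewritten `search_table` is meant to keep rows whose cells contain the query as a case-insensitive substring. As committed, the expression `query.lower() in row.astype(str).str.lower().any()` applies `.any()` before the membership test, so the `in` check ends up operating on a boolean rather than on the row's text and fails at query time. A minimal sketch of the intended row-wise filter (the name `search_table_fixed` is illustrative, pandas assumed, column layout unchanged):

import pandas as pd

def search_table_fixed(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # Illustrative variant: keep rows where any cell contains `query`,
    # matched case-insensitively as a plain substring.
    if query == "":
        return df
    mask = df.apply(
        lambda row: row.astype(str)
        .str.lower()
        .str.contains(query.lower(), regex=False)  # per-cell substring match
        .any(),                                    # True if any cell matches
        axis=1,
    )
    return df[mask]

With this shape, an empty query returns the table unchanged and any other query is matched against every column, which appears to be what the `+` lines aim for.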
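
The commit also drops the hourly self-restart that the old module wired up through `HfApi` (`restart_space`, `scheduler.add_job(restart_space, "interval", seconds=3600)`, `scheduler.start()`). For reference, a standalone sketch of that wiring; the scheduler construction and imports were outside the removed hunks, so the use of APScheduler's `BackgroundScheduler` here is an assumption:

import os
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

H4_TOKEN = os.environ.get("H4_TOKEN", None)
api = HfApi()

def restart_space():
    # Restart the Space so it re-clones the eval queue/results repos.
    api.restart_space(repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN)

scheduler = BackgroundScheduler()  # assumed scheduler type; not shown in the diff
scheduler.add_job(restart_space, "interval", seconds=3600)  # hourly, as in the removed lines
scheduler.start()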