Quentin Gallouédec committed · Commit 0811d37
Parent(s): 6b9db30

works with cartpole!

Files changed:
- app.py +6 -4
- custom_tasks.py +0 -90
- main_backend_harness.py +37 -15
- main_backend_lighteval.py +0 -92
- requirements.txt +2 -11
- scripts/create_request_file.py +4 -36
- scripts/fix_harness_import.py +1 -1
- src/about.py +9 -6
- src/backend/manage_requests.py +18 -36
- src/backend/run_eval_suite_harness.py +58 -25
- src/backend/run_eval_suite_lighteval.py +0 -72
- src/backend/sort_queue.py +2 -7
- src/display/log_visualizer.py +5 -5
- src/envs.py +5 -6
- src/logging.py +4 -4
app.py CHANGED
@@ -1,5 +1,6 @@
 import logging
 from src.logging import configure_root_logger
+
 logging.getLogger("httpx").setLevel(logging.WARNING)
 logging.getLogger("numexpr").setLevel(logging.WARNING)
 logging.getLogger("absl").setLevel(logging.WARNING)
@@ -8,7 +9,7 @@ configure_root_logger()
 from functools import partial
 
 import gradio as gr
-from …
+from main_backend_harness import run_auto_eval
 from src.display.log_visualizer import log_file_to_html_string
 from src.display.css_html_js import dark_mode_gradio_js
 from src.envs import REFRESH_RATE, REPO_ID, QUEUE_REPO, RESULTS_REPO
@@ -32,6 +33,7 @@ links_md = f"""
 | Results Repo | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
 """
 
+
 def button_auto_eval():
     logger.info("Manually triggering Auto Eval")
     run_auto_eval()
@@ -45,7 +47,7 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     output_html = gr.HTML(partial(log_file_to_html_string, reverse=reverse_order_checkbox), every=1)
     with gr.Row():
         download_button = gr.DownloadButton("Download Log File", value=log_file)
-        with gr.Accordion(…
+        with gr.Accordion("Log View Configuration", open=False):
             reverse_order_checkbox.render()
     # Add a button that when pressed, triggers run_auto_eval
     button = gr.Button("Manually Run Evaluation")
@@ -56,5 +58,5 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
     button.click(fn=button_auto_eval, inputs=[], outputs=[])
 
 
-if __name__ == …
-    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
+if __name__ == "__main__":
+    demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0", show_error=True, server_port=7860)
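Note: the hunks above only touch the import and the manual-trigger button; the periodic scheduling of run_auto_eval is not shown in this diff. Given APScheduler in requirements.txt and REFRESH_RATE imported from src.envs, the unchanged part of app.py presumably wires it roughly like this sketch (an assumption, not the actual file contents):

# Sketch only - assumes app.py keeps an APScheduler job around the imported run_auto_eval.
from apscheduler.schedulers.background import BackgroundScheduler

from main_backend_harness import run_auto_eval
from src.envs import REFRESH_RATE  # 1 * 60 seconds in src/envs.py

scheduler = BackgroundScheduler()
scheduler.add_job(run_auto_eval, "interval", seconds=REFRESH_RATE)  # re-run the eval loop every minute
scheduler.start()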
custom_tasks.py DELETED
@@ -1,90 +0,0 @@
-# ruff: noqa: F405, F403, F401
-"""
-Custom evaluation tasks for lighteval. Copy this file and complete it with the info for your task.
-
-This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
-
-Author:
-"""
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import Doc
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-
-## EVAL WITH NO SUBSET ##
-# This is how you create a simple tasks (like hellaswag) which has one single subset
-# attached to it, and one evaluation possible.
-task = LightevalTaskConfig(
-    name="myothertask",
-    prompt_function="prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
-    suite=["community"],
-    hf_repo="",
-    hf_subset="default",
-    hf_avail_splits=[],
-    evaluation_splits=[],
-    few_shots_split="",
-    few_shots_select="",
-    metric=[""],
-)
-
-## EVALS WITH SUBSET
-# This is how you create a subset task (like MMLU), which has several subset
-# each being its own evaluation task.
-
-# fmt: off
-SAMPLE_SUBSETS = []  # list of all the subsets to use for this eval
-# fmt: on
-
-
-class CustomSubsetTask(LightevalTaskConfig):
-    def __init__(
-        self,
-        name,
-        hf_subset,
-    ):
-        super().__init__(
-            name=name,
-            hf_subset=hf_subset,
-            prompt_function="prompt_fn",  # must be defined in the file
-            hf_repo="",
-            metric=[""],
-            hf_avail_splits=[],
-            evaluation_splits=[],
-            few_shots_split="",
-            few_shots_select="",
-            suite=["community"],
-            generation_size=-1,
-            stop_sequence=None,
-            output_regex=None,
-            frozen=False,
-        )
-
-
-## DEFINE YOUR PROMPT FUNCTIONS
-# Define as many as you need for your different tasks
-def prompt_fn(line, task_name: str = None):
-    """Defines how to go from a dataset line to a doc object.
-    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
-    about what this function should do in the README.
-    """
-    return Doc(
-        task_name=task_name,
-        query="",
-        choices="",
-        gold_index=0,
-        instruction="",
-    )
-
-
-## STORE YOUR EVALS
-SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
-_TASKS = SUBSET_TASKS + [task]
-
-## MODULE LOGIC
-# You should not need to touch this
-# Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in _TASKS]
-
-if __name__ == "__main__":
-    print(t["name"] for t in TASKS_TABLE)
-    print(len(TASKS_TABLE))
main_backend_harness.py CHANGED
@@ -5,13 +5,23 @@ from huggingface_hub import snapshot_download
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 
-from backend.run_eval_suite_harness import run_evaluation
+from src.backend.run_eval_suite_harness import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 
-from src.envs import …
+from src.envs import (
+    QUEUE_REPO,
+    EVAL_REQUESTS_PATH_BACKEND,
+    RESULTS_REPO,
+    EVAL_RESULTS_PATH_BACKEND,
+    DEVICE,
+    API,
+    LIMIT,
+    TOKEN,
+)
 from src.about import Tasks, NUM_FEWSHOT
 from src.logging import setup_logger
+
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 
 # logging.basicConfig(level=logging.ERROR)
@@ -23,8 +33,23 @@ RUNNING_STATUS = "RUNNING"
 FINISHED_STATUS = "FINISHED"
 FAILED_STATUS = "FAILED"
 
-snapshot_download(…
-…
+snapshot_download(
+    repo_id=RESULTS_REPO,
+    revision="main",
+    local_dir=EVAL_RESULTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+snapshot_download(
+    repo_id=QUEUE_REPO,
+    revision="main",
+    local_dir=EVAL_REQUESTS_PATH_BACKEND,
+    repo_type="dataset",
+    max_workers=60,
+    token=TOKEN,
+)
+
 
 def run_auto_eval():
     current_pending_status = [PENDING_STATUS]
@@ -39,11 +64,13 @@ def run_auto_eval():
         hf_repo=QUEUE_REPO,
         local_dir=EVAL_REQUESTS_PATH_BACKEND,
         hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
+        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
     )
 
     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(…
+    eval_requests = get_eval_requests(
+        job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
+    )
     # Sort the evals by priority (first submitted first run)
     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
 
@@ -64,17 +91,12 @@ def run_auto_eval():
     )
 
     run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
+        eval_request=eval_request,
+        task_names=TASKS_HARNESS,
         local_dir=EVAL_RESULTS_PATH_BACKEND,
         results_repo=RESULTS_REPO,
-        …
-        device=DEVICE,
-        no_cache=True,
-        limit=LIMIT
-    )
+    )
 
 
 if __name__ == "__main__":
-    run_auto_eval()
+    run_auto_eval()
main_backend_lighteval.py DELETED
@@ -1,92 +0,0 @@
-import logging
-import pprint
-
-from huggingface_hub import snapshot_download
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-
-from src.backend.run_eval_suite_lighteval import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, LIMIT, TOKEN, ACCELERATOR, VENDOR, REGION
-from src.about import TASKS_LIGHTEVAL
-from src.logging import setup_logger
-
-logger = setup_logger(__name__)
-
-# logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-
-    logger.info(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-
-    if len(eval_requests) == 0:
-        return
-
-    eval_request = eval_requests[0]
-    logger.info(pp.pformat(eval_request))
-
-
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-
-    # This needs to be done
-    #instance_size, instance_type = get_instance_for_model(eval_request)
-    # For GPU
-    # instance_size, instance_type = "small", "g4dn.xlarge"
-    # For CPU
-    instance_size, instance_type = "medium", "c6i"
-    logger.info(f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_LIGHTEVAL,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        batch_size=1,
-        accelerator=ACCELERATOR,
-        region=REGION,
-        vendor=VENDOR,
-        instance_size=instance_size,
-        instance_type=instance_type,
-        limit=LIMIT
-    )
-
-    logger.info(f'Completed Evaluation of {eval_request.json_filepath} on Inference endpoints: {instance_size} {instance_type}')
-
-
-if __name__ == "__main__":
-    run_auto_eval()
requirements.txt CHANGED
@@ -2,8 +2,9 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==4.…
+gradio==4.25.0
 gradio_client
+gymnasium==0.29.1
 huggingface-hub>=0.18.0
 matplotlib==3.7.1
 numpy==1.24.2
@@ -11,16 +12,6 @@ pandas==2.0.0
 python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
-transformers
-tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
-git+https://github.com/huggingface/lighteval.git#egg=lighteval
-accelerate==0.24.1
-sentencepiece
-
-# Evaluation suites
-lighteval
-lm_eval
 
 # Log Visualizer
 BeautifulSoup4==4.12.2
scripts/create_request_file.py CHANGED
@@ -1,7 +1,6 @@
 import json
 import os
 import pprint
-import re
 from datetime import datetime, timezone
 
 import click
@@ -9,39 +8,16 @@ from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
 from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
 
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
-model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
-weight_types = ("Original", "Delta", "Adapter")
-
-
-def get_model_size(model_info, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
 
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(…
+    snapshot_download(
+        repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN
+    )
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
-    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
-    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
-    base_model = click.prompt("Enter base model", default="")
     status = click.prompt("Enter status", default="FINISHED")
 
     try:
@@ -50,8 +26,6 @@ def main():
         print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
        return 1
 
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
     try:
         license = model_info.cardData["license"]
     except Exception:
@@ -59,16 +33,10 @@ def main():
 
     eval_entry = {
         "model": model_name,
-        "base_model": base_model,
         "revision": revision,
-        "private": False,
-        "precision": precision,
-        "weight_type": weight_type,
         "status": status,
         "submitted_time": current_time,
-        "model_type": model_type,
         "likes": model_info.likes,
-        "params": model_size,
         "license": license,
     }
@@ -85,7 +53,7 @@ def main():
 
     out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(out_dir, exist_ok=True)
-    out_path = f"{out_dir}/{model_path}…
+    out_path = f"{out_dir}/{model_path}_eval_request.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
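With the precision, weight-type, and model-size prompts gone, the script now writes a much smaller request file. An entry produced by main() would look roughly like the dict below; the model id and values are illustrative, not taken from the diff:

# Illustrative eval_entry, written to {EVAL_REQUESTS_PATH}/{user_name}/{model_path}_eval_request.json
eval_entry = {
    "model": "some-user/ppo-CartPole-v1",     # hypothetical model repo id
    "revision": "main",
    "status": "PENDING",                      # the prompt defaults to "FINISHED"
    "submitted_time": "2024-04-05T12:00:00Z",
    "likes": 0,
    "license": "mit",
}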
scripts/fix_harness_import.py CHANGED
@@ -8,4 +8,4 @@ import lm_eval
 
 if __name__ == "__main__":
     lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
+    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,14 +12,16 @@ class Task:
 # Change for your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("…
-    task1 = Task("…
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task0 = Task("PongNoFrameskip-v4", "episodic_return", "PongNoFrameskip-v4")
+    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
+    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
+
 
-NUM_FEWSHOT = 0
+NUM_FEWSHOT = 0  # Change with your few shot
 
 TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
-TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-#custom|myothertask|0|0
+TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
+# custom|myothertask|0|0
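Each Tasks member now names a Gymnasium environment id, and TASKS_HARNESS is simply the list of those ids. Adding another environment to the leaderboard follows the same pattern; the extra env below is only an example, not part of this commit:

class Tasks(Enum):
    task1 = Task("BreakoutNoFrameskip-v4", "episodic_return", "BreakoutNoFrameskip-v4")
    task2 = Task("CartPole-v1", "episodic_return", "CartPole-v1")
    task3 = Task("LunarLander-v2", "episodic_return", "LunarLander-v2")  # hypothetical addition

TASKS_HARNESS = [task.value.benchmark for task in Tasks]
# -> ["BreakoutNoFrameskip-v4", "CartPole-v1", "LunarLander-v2"]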
src/backend/manage_requests.py CHANGED
@@ -9,41 +9,18 @@ from src.logging import setup_logger
 
 logger = setup_logger(__name__)
 
+
 @dataclass
 class EvalRequest:
     model: str
-    private: bool
     status: str
     json_filepath: str
-    …
-    …
-    …
-    …
-    revision: str = "main"  # commit
-    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    model_type: Optional[str] = None
+    revision: str = "main"  # commit
+    submitted_time: Optional[
+        str
+    ] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
     likes: Optional[int] = 0
-    params: Optional[int] = None
     license: Optional[str] = ""
-
-    def get_model_args(self):
-        model_args = f"pretrained={self.model},revision={self.revision}"
-
-        if self.precision in ["float16", "bfloat16", "float32"]:
-            model_args += f",dtype={self.precision}"
-        # Quantized models need some added config, the install of bits and bytes, etc
-        #elif self.precision == "8bit":
-        #    model_args += ",load_in_8bit=True"
-        #elif self.precision == "4bit":
-        #    model_args += ",load_in_4bit=True"
-        #elif self.precision == "GPTQ":
-            # A GPTQ model does not need dtype to be specified,
-            # it will be inferred from the config
-            pass
-        else:
-            raise Exception(f"Unknown precision {self.precision}.")
-
-        return model_args
 
 
 def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
@@ -74,7 +51,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
     Returns:
         `list[EvalRequest]`: a list of model info dicts.
     """
-    snapshot_download(…
+    snapshot_download(
+        repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN
+    )
     json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
 
     eval_requests = []
@@ -100,7 +79,14 @@ def check_completed_evals(
     local_dir_results: str,
 ):
     """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(…
+    snapshot_download(
+        repo_id=hf_repo_results,
+        revision="main",
+        local_dir=local_dir_results,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN,
+    )
 
     running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
 
@@ -114,12 +100,8 @@ def check_completed_evals(
     output_file_exists = len(glob.glob(output_file)) > 0
 
     if output_file_exists:
-        logger.info(
-            f"EXISTS output file exists for {model} setting it to {completed_status}"
-        )
+        logger.info(f"EXISTS output file exists for {model} setting it to {completed_status}")
         set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
     else:
-        logger.info(
-            f"No result file found for {model} setting it to {failed_status}"
-        )
+        logger.info(f"No result file found for {model} setting it to {failed_status}")
         set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
src/backend/run_eval_suite_harness.py CHANGED
@@ -3,41 +3,76 @@ import os
 import logging
 from datetime import datetime
 
-from lm_eval import tasks, evaluator, utils
-
 from src.envs import RESULTS_REPO, API
 from src.backend.manage_requests import EvalRequest
 from src.logging import setup_logger
+from src.backend.evaluate import run_evaluation
+import fnmatch
+import torch
+from torch import nn
+from huggingface_hub.utils._errors import EntryNotFoundError
+
+import gymnasium as gym
+
+
+import numpy as np
+from typing import List
+from huggingface_hub import hf_hub_download
+from src.backend.manage_requests import EvalRequest
 
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        logger.info(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
 
-    …
+def pattern_match(patterns, source_list):
+    if isinstance(patterns, str):
+        patterns = [patterns]
+
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+
+
+def run_evaluation(eval_request: EvalRequest, task_names, local_dir: str, results_repo: str):
+    tags = API.model_info(eval_request.model).tags
+    task_names = pattern_match(tags, task_names)
 
     logger.info(f"Selected Tasks: {task_names}")
 
-    results = …
-    …
+    results = {
+        "config": {
+            "model_name": eval_request.model,
+            "model_sha": eval_request.revision,
+        },
+        "results": {},
+    }
+    try:
+        agent_path = hf_hub_download(repo_id=eval_request.model, filename="agent.pt")
+    except EntryNotFoundError:
+        logger.error("Agent not found")
+        return
+    agent = torch.jit.load(agent_path)
 
+    episodic_rewards = []
+    for task_name in task_names:
+        env = gym.make(task_name)
+        for _ in range(10):
+            episodic_reward = 0.0
+            observation, info = env.reset()
+            done = False
+            while not done:
+                torch_observation = torch.from_numpy(np.array([observation]))
+                action = agent(torch_observation).numpy()[0]
+                observation, reward, terminated, truncated, info = env.step(action)
+                done = terminated or truncated
+                episodic_reward += reward
+
+            episodic_rewards.append(episodic_reward)
+
+        mean_reward = np.mean(episodic_rewards)
+        results[task_name] = {"episodic_return": mean_reward}
 
     dumped = json.dumps(results, indent=2)
     logger.info(dumped)
@@ -47,8 +82,6 @@ def run_evaluation(…)
     with open(output_path, "w") as f:
         f.write(dumped)
 
-    logger.info(evaluator.make_table(results))
-
     API.upload_file(
         path_or_fileobj=output_path,
         path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
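The new run_evaluation expects each submitted model repo to contain an agent.pt TorchScript file that maps a batch of observations to a batch of actions: it is fetched with hf_hub_download, loaded with torch.jit.load, and called as agent(obs_batch).numpy()[0] inside the Gymnasium rollout loop. A minimal CartPole-compatible export might look like the sketch below; the random policy is only a placeholder for a trained one and is not part of this commit.

# Sketch of producing an agent.pt that the backend above can evaluate (placeholder policy).
import torch
from torch import nn


class RandomCartPoleAgent(nn.Module):
    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        # (batch, 4) float observations in -> (batch,) discrete actions out
        batch_size = observations.shape[0]
        return torch.randint(0, 2, [batch_size])  # CartPole-v1 has two actions


agent = torch.jit.script(RandomCartPoleAgent())
agent.save("agent.pt")  # upload this file at the root of the model repo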
src/backend/run_eval_suite_lighteval.py DELETED
@@ -1,72 +0,0 @@
-import json
-import argparse
-import logging
-from datetime import datetime
-
-from lighteval.main_accelerate import main, EnvConfig, create_model_config, load_model
-
-from src.envs import RESULTS_REPO, CACHE_PATH, TOKEN
-from src.backend.manage_requests import EvalRequest
-from src.logging import setup_logger
-
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
-
-def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
-    if limit:
-        logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
-
-    args_dict = {
-        # Endpoint parameters
-        "endpoint_model_name":eval_request.model,
-        "accelerator": accelerator,
-        "vendor": vendor,
-        "region": region,
-        "instance_size": instance_size,
-        "instance_type": instance_type,
-        "reuse_existing": False,
-        "model_dtype": eval_request.precision,
-        "revision": eval_request.revision,
-        # Save parameters
-        "push_results_to_hub": True,
-        "save_details": True,
-        "push_details_to_hub": True,
-        "public_run": False,
-        "cache_dir": CACHE_PATH,
-        "results_org": RESULTS_REPO,
-        "output_dir": local_dir,
-        "job_id": str(datetime.now()),
-        # Experiment parameters
-        "override_batch_size": batch_size,
-        "custom_tasks": "custom_tasks.py",
-        "tasks": task_names,
-        "max_samples": limit,
-        "use_chat_template": False,
-        "system_prompt": None,
-        # Parameters which would be set to things by the kwargs if actually using argparse
-        "inference_server_address": None,
-        "model_args": None,
-        "num_fewshot_seeds": None,
-        "delta_weights": False,
-        "adapter_weights": False
-    }
-    args = argparse.Namespace(**args_dict)
-
-    try:
-        results = main(args)
-
-        results["config"]["model_dtype"] = eval_request.precision
-        results["config"]["model_name"] = eval_request.model
-        results["config"]["model_sha"] = eval_request.revision
-
-        dumped = json.dumps(results, indent=2)
-        logger.info(dumped)
-    except Exception as e:  # if eval failed, we force a cleanup
-        env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir)
-
-        model_config = create_model_config(args=args, accelerator=accelerator)
-        model, _ = load_model(config=model_config, env_config=env_config)
-        model.cleanup()
-
-
-    return results
src/backend/sort_queue.py CHANGED
@@ -9,20 +9,15 @@ from src.backend.manage_requests import EvalRequest
 @dataclass
 class ModelMetadata:
     likes: int = 0
-    size: int = 15
 
 
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    …
-    public_models = [model for model in models if not model.private]
-
-    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
+    return sort_by_submit_date(models)
 
 
 def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
     return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
 
-def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
 
 def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
+    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/display/log_visualizer.py CHANGED
@@ -12,8 +12,8 @@ from src.logging import log_file
 
 def log_file_to_html_string(reverse=True):
     with open(log_file, "rt") as f:
-        …
-        …
+        lines = f.readlines()
+        lines = lines[-NUM_LINES_VISUALIZE:]
 
     if reverse:
         lines = reversed(lines)
@@ -26,12 +26,12 @@ def log_file_to_html_string(reverse=True):
     html_content = console.export_html(inline_styles=True)
 
     # Parse the HTML content using BeautifulSoup
-    soup = BeautifulSoup(html_content, …
+    soup = BeautifulSoup(html_content, "lxml")
 
     # Modify the <pre> tag and add custom styles
     pre_tag = soup.pre
-    pre_tag[…
-    del pre_tag[…
+    pre_tag["class"] = "scrollable"
+    del pre_tag["style"]
 
     # Add your custom styles and the .scrollable CSS to the <style> tag
     style_tag = soup.style
src/envs.py CHANGED
@@ -4,13 +4,13 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN")
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
-OWNER = "open-rl-leaderboard"
+OWNER = "open-rl-leaderboard"  # Change to your org - don't forget to create a results and request file
 
 # For harness evaluations
-DEVICE = "cpu"
-LIMIT = 20
+DEVICE = "cpu"  # "cuda:0" if you add compute, for harness evaluations
+LIMIT = 20  # !!!! Should be None for actual evaluations!!!
 
 # For lighteval evaluations
 ACCELERATOR = "cpu"
@@ -23,7 +23,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
@@ -35,4 +35,3 @@ REFRESH_RATE = 1 * 60  # 1 min
 NUM_LINES_VISUALIZE = 300
 
 API = HfApi(token=TOKEN)
-
src/logging.py CHANGED
@@ -3,7 +3,7 @@ from pathlib import Path
 
 proj_dir = Path(__file__).parents[1]
 
-log_file = proj_dir/"output.log"
+log_file = proj_dir / "output.log"
 
 
 import logging
@@ -13,7 +13,7 @@ def setup_logger(name: str):
     logger = logging.getLogger(name)
     logger.setLevel(logging.INFO)
 
-    formatter = logging.Formatter(…
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     # Create a file handler to write logs to a file
     file_handler = logging.FileHandler(log_file)
@@ -29,10 +29,10 @@ def configure_root_logger():
     logging.basicConfig(level=logging.INFO)
     root_logger = logging.getLogger()
 
-    formatter = logging.Formatter(…
+    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 
     file_handler = logging.FileHandler(log_file)
     file_handler.setLevel(logging.INFO)
     file_handler.setFormatter(formatter)
 
-    root_logger.addHandler(file_handler)
+    root_logger.addHandler(file_handler)