Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Clémentine
committed on
Commit
·
0c7ef71
1
Parent(s):
9d02a6b
wip
Browse files
- app.py +42 -23
- src/envs.py +3 -0
- src/leaderboard/read_evals.py +19 -43
- src/populate.py +2 -2
- {scripts → src/scripts}/create_request_file.py +5 -20
- src/scripts/update_all_request_files.py +97 -0
- src/submission/check_validity.py +13 -7
- src/submission/submit.py +52 -3
app.py
CHANGED
|
@@ -27,7 +27,7 @@ from src.display.utils import (
|
|
| 27 |
WeightType,
|
| 28 |
Precision
|
| 29 |
)
|
| 30 |
-
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
| 31 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 32 |
from src.submission.submit import add_new_eval
|
| 33 |
from src.tools.collections import update_collections
|
|
@@ -43,33 +43,52 @@ enable_space_ci()
|
|
| 43 |
def restart_space():
|
| 44 |
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
|
|
|
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
plot_df
|
| 67 |
|
| 68 |
-
(
|
| 69 |
-
finished_eval_queue_df,
|
| 70 |
-
running_eval_queue_df,
|
| 71 |
-
pending_eval_queue_df,
|
| 72 |
-
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 73 |
|
| 74 |
|
| 75 |
# Searching and filtering
|
|
|
|
| 27 |
WeightType,
|
| 28 |
Precision
|
| 29 |
)
|
| 30 |
+
from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
| 31 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 32 |
from src.submission.submit import add_new_eval
|
| 33 |
from src.tools.collections import update_collections
|
|
|
|
| 43 |
def restart_space():
|
| 44 |
API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
|
| 45 |
|
| 46 |
+
|
| 47 |
+
def init_space():
    """Download the leaderboard datasets and build the dataframes used by the app.

    The requests, dynamic-info and results dataset repos are each mirrored into
    their local directory. If a download raises, the Space is asked to restart
    and execution continues with the next step.

    Returns:
        A tuple ``(leaderboard_df, original_df, plot_df, finished_eval_queue_df,
        running_eval_queue_df, pending_eval_queue_df)``.
    """
    # (dataset repo, local directory) pairs, fetched in this order.
    dataset_mirrors = (
        (QUEUE_REPO, EVAL_REQUESTS_PATH),
        (DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH),
        (RESULTS_REPO, EVAL_RESULTS_PATH),
    )
    for dataset_repo, target_dir in dataset_mirrors:
        try:
            print(target_dir)
            snapshot_download(
                repo_id=dataset_repo, local_dir=target_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30
            )
        except Exception:
            restart_space()

    raw_data, original_df = get_leaderboard_df(
        results_path=EVAL_RESULTS_PATH,
        requests_path=EVAL_REQUESTS_PATH,
        dynamic_path=DYNAMIC_INFO_FILE_PATH,
        cols=COLS,
        benchmark_cols=BENCHMARK_COLS
    )
    # Both consumers get their own copy of the freshly built dataframe.
    update_collections(original_df.copy())
    leaderboard_df = original_df.copy()

    plot_df = create_plot_df(create_scores_df(raw_data))

    queue_frames = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
    finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = queue_frames

    return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
|
| 90 |
|
| 91 |
+
leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
# Searching and filtering
|
src/envs.py
CHANGED
|
@@ -7,6 +7,7 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
|
| 7 |
|
| 8 |
REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
|
| 9 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
|
|
|
| 10 |
RESULTS_REPO = "open-llm-leaderboard/results"
|
| 11 |
|
| 12 |
PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
|
|
@@ -18,6 +19,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
| 18 |
|
| 19 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 20 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
|
|
|
|
|
|
| 21 |
|
| 22 |
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
| 23 |
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
|
|
|
| 7 |
|
| 8 |
REPO_ID = "HuggingFaceH4/open_llm_leaderboard"
|
| 9 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
| 10 |
+
DYNAMIC_INFO_REPO = "open-llm-leaderboard/dynamic_model_information"
|
| 11 |
RESULTS_REPO = "open-llm-leaderboard/results"
|
| 12 |
|
| 13 |
PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
|
|
|
|
| 19 |
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 22 |
+
DYNAMIC_INFO_PATH = os.path.join(CACHE_PATH, "dynamic-info")
|
| 23 |
+
DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
|
| 24 |
|
| 25 |
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
| 26 |
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -11,7 +11,6 @@ from huggingface_hub import ModelCard
|
|
| 11 |
|
| 12 |
from src.display.formatting import make_clickable_model
|
| 13 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
| 14 |
-
from src.submission.check_validity import is_model_on_hub, check_model_card
|
| 15 |
|
| 16 |
|
| 17 |
@dataclass
|
|
@@ -34,6 +33,7 @@ class EvalResult:
|
|
| 34 |
still_on_hub: bool = False
|
| 35 |
is_merge: bool = False
|
| 36 |
flagged: bool = False
|
|
|
|
| 37 |
|
| 38 |
@classmethod
|
| 39 |
def init_from_json_file(self, json_filepath):
|
|
@@ -42,13 +42,13 @@ class EvalResult:
|
|
| 42 |
data = json.load(fp)
|
| 43 |
|
| 44 |
# We manage the legacy config format
|
| 45 |
-
config = data.get("
|
| 46 |
|
| 47 |
# Precision
|
| 48 |
precision = Precision.from_str(config.get("model_dtype"))
|
| 49 |
|
| 50 |
# Get model and org
|
| 51 |
-
org_and_model = config.get("model_name"
|
| 52 |
org_and_model = org_and_model.split("/", 1)
|
| 53 |
|
| 54 |
if len(org_and_model) == 1:
|
|
@@ -61,37 +61,6 @@ class EvalResult:
|
|
| 61 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 62 |
full_model = "/".join(org_and_model)
|
| 63 |
|
| 64 |
-
still_on_hub, error, model_config = is_model_on_hub(
|
| 65 |
-
full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
|
| 66 |
-
)
|
| 67 |
-
architecture = "?"
|
| 68 |
-
if model_config is not None:
|
| 69 |
-
architectures = getattr(model_config, "architectures", None)
|
| 70 |
-
if architectures:
|
| 71 |
-
architecture = ";".join(architectures)
|
| 72 |
-
|
| 73 |
-
# If the model doesn't have a model card or a license, we consider it's deleted
|
| 74 |
-
if still_on_hub:
|
| 75 |
-
try:
|
| 76 |
-
if check_model_card(full_model)[0] is False:
|
| 77 |
-
still_on_hub = False
|
| 78 |
-
except Exception:
|
| 79 |
-
still_on_hub = False
|
| 80 |
-
|
| 81 |
-
# Check if the model is a merge
|
| 82 |
-
is_merge_from_metadata = False
|
| 83 |
-
flagged = False
|
| 84 |
-
if still_on_hub:
|
| 85 |
-
model_card = ModelCard.load(full_model)
|
| 86 |
-
|
| 87 |
-
if model_card.data.tags:
|
| 88 |
-
is_merge_from_metadata = "merge" in model_card.data.tags
|
| 89 |
-
merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
|
| 90 |
-
# If the model is a merge but not saying it in the metadata, we flag it
|
| 91 |
-
is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
|
| 92 |
-
flagged = is_merge_from_model_card and not is_merge_from_metadata
|
| 93 |
-
|
| 94 |
-
|
| 95 |
# Extract results available in this file (some results are split in several files)
|
| 96 |
results = {}
|
| 97 |
for task in Tasks:
|
|
@@ -128,10 +97,6 @@ class EvalResult:
|
|
| 128 |
results=results,
|
| 129 |
precision=precision,
|
| 130 |
revision= config.get("model_sha", ""),
|
| 131 |
-
still_on_hub=still_on_hub,
|
| 132 |
-
architecture=architecture,
|
| 133 |
-
is_merge=is_merge_from_metadata,
|
| 134 |
-
flagged=flagged,
|
| 135 |
)
|
| 136 |
|
| 137 |
def update_with_request_file(self, requests_path):
|
|
@@ -143,13 +108,21 @@ class EvalResult:
|
|
| 143 |
request = json.load(f)
|
| 144 |
self.model_type = ModelType.from_str(request.get("model_type", ""))
|
| 145 |
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
| 146 |
-
self.license = request.get("license", "?")
|
| 147 |
-
self.likes = request.get("likes", 0)
|
| 148 |
self.num_params = request.get("params", 0)
|
| 149 |
self.date = request.get("submitted_time", "")
|
|
|
|
| 150 |
except Exception:
|
| 151 |
print(f"Could not find request file for {self.org}/{self.model}")
|
| 152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
def to_dict(self):
|
| 154 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 155 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
|
@@ -158,7 +131,7 @@ class EvalResult:
|
|
| 158 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
| 159 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
| 160 |
AutoEvalColumn.merged.name: self.is_merge,
|
| 161 |
-
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
| 162 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
| 163 |
AutoEvalColumn.architecture.name: self.architecture,
|
| 164 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
@@ -170,7 +143,6 @@ class EvalResult:
|
|
| 170 |
AutoEvalColumn.params.name: self.num_params,
|
| 171 |
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
| 172 |
AutoEvalColumn.flagged.name: self.flagged
|
| 173 |
-
|
| 174 |
}
|
| 175 |
|
| 176 |
for task in Tasks:
|
|
@@ -201,7 +173,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
| 201 |
return request_file
|
| 202 |
|
| 203 |
|
| 204 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
|
| 205 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 206 |
model_result_filepaths = []
|
| 207 |
|
|
@@ -219,11 +191,15 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
| 219 |
for file in files:
|
| 220 |
model_result_filepaths.append(os.path.join(root, file))
|
| 221 |
|
|
|
|
|
|
|
|
|
|
| 222 |
eval_results = {}
|
| 223 |
for model_result_filepath in model_result_filepaths:
|
| 224 |
# Creation of result
|
| 225 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 226 |
eval_result.update_with_request_file(requests_path)
|
|
|
|
| 227 |
|
| 228 |
# Store results of same eval together
|
| 229 |
eval_name = eval_result.eval_name
|
|
|
|
| 11 |
|
| 12 |
from src.display.formatting import make_clickable_model
|
| 13 |
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
@dataclass
|
|
|
|
| 33 |
still_on_hub: bool = False
|
| 34 |
is_merge: bool = False
|
| 35 |
flagged: bool = False
|
| 36 |
+
tags: list = None
|
| 37 |
|
| 38 |
@classmethod
|
| 39 |
def init_from_json_file(self, json_filepath):
|
|
|
|
| 42 |
data = json.load(fp)
|
| 43 |
|
| 44 |
# We manage the legacy config format
|
| 45 |
+
config = data.get("config_general")
|
| 46 |
|
| 47 |
# Precision
|
| 48 |
precision = Precision.from_str(config.get("model_dtype"))
|
| 49 |
|
| 50 |
# Get model and org
|
| 51 |
+
org_and_model = config.get("model_name")
|
| 52 |
org_and_model = org_and_model.split("/", 1)
|
| 53 |
|
| 54 |
if len(org_and_model) == 1:
|
|
|
|
| 61 |
result_key = f"{org}_{model}_{precision.value.name}"
|
| 62 |
full_model = "/".join(org_and_model)
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# Extract results available in this file (some results are split in several files)
|
| 65 |
results = {}
|
| 66 |
for task in Tasks:
|
|
|
|
| 97 |
results=results,
|
| 98 |
precision=precision,
|
| 99 |
revision= config.get("model_sha", ""),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
)
|
| 101 |
|
| 102 |
def update_with_request_file(self, requests_path):
|
|
|
|
| 108 |
request = json.load(f)
|
| 109 |
self.model_type = ModelType.from_str(request.get("model_type", ""))
|
| 110 |
self.weight_type = WeightType[request.get("weight_type", "Original")]
|
|
|
|
|
|
|
| 111 |
self.num_params = request.get("params", 0)
|
| 112 |
self.date = request.get("submitted_time", "")
|
| 113 |
+
self.architecture = request["architectures"]
|
| 114 |
except Exception:
|
| 115 |
print(f"Could not find request file for {self.org}/{self.model}")
|
| 116 |
|
| 117 |
+
def update_with_dynamic_file_dict(self, file_dict):
    """Copy the hub-side metadata of this model from its dynamic-info entry.

    `file_dict` must contain the keys "still_on_hub" and "tags" (a KeyError is
    raised otherwise); "license" and "likes" fall back to "?" and 0.
    """
    self.license = file_dict.get("license", "?")
    self.likes = file_dict.get("likes", 0)
    self.still_on_hub = file_dict["still_on_hub"]

    model_tags = file_dict["tags"]
    # A model is flagged as soon as one of its tags contains "flagged".
    has_flag_tag = False
    for tag in model_tags:
        if "flagged" in tag:
            has_flag_tag = True
            break
    self.flagged = has_flag_tag
    self.is_merge = "merge" in model_tags
    self.tags = model_tags
|
| 124 |
+
|
| 125 |
+
|
| 126 |
def to_dict(self):
|
| 127 |
"""Converts the Eval Result to a dict compatible with our dataframe display"""
|
| 128 |
average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
|
|
|
|
| 131 |
AutoEvalColumn.precision.name: self.precision.value.name,
|
| 132 |
AutoEvalColumn.model_type.name: self.model_type.value.name,
|
| 133 |
AutoEvalColumn.merged.name: self.is_merge,
|
| 134 |
+
AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
|
| 135 |
AutoEvalColumn.weight_type.name: self.weight_type.value.name,
|
| 136 |
AutoEvalColumn.architecture.name: self.architecture,
|
| 137 |
AutoEvalColumn.model.name: make_clickable_model(self.full_model),
|
|
|
|
| 143 |
AutoEvalColumn.params.name: self.num_params,
|
| 144 |
AutoEvalColumn.still_on_hub.name: self.still_on_hub,
|
| 145 |
AutoEvalColumn.flagged.name: self.flagged
|
|
|
|
| 146 |
}
|
| 147 |
|
| 148 |
for task in Tasks:
|
|
|
|
| 173 |
return request_file
|
| 174 |
|
| 175 |
|
| 176 |
+
def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
|
| 177 |
"""From the path of the results folder root, extract all needed info for results"""
|
| 178 |
model_result_filepaths = []
|
| 179 |
|
|
|
|
| 191 |
for file in files:
|
| 192 |
model_result_filepaths.append(os.path.join(root, file))
|
| 193 |
|
| 194 |
+
with open(dynamic_path) as f:
|
| 195 |
+
dynamic_data = json.load(f)
|
| 196 |
+
|
| 197 |
eval_results = {}
|
| 198 |
for model_result_filepath in model_result_filepaths:
|
| 199 |
# Creation of result
|
| 200 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 201 |
eval_result.update_with_request_file(requests_path)
|
| 202 |
+
eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
|
| 203 |
|
| 204 |
# Store results of same eval together
|
| 205 |
eval_name = eval_result.eval_name
|
src/populate.py
CHANGED
|
@@ -9,8 +9,8 @@ from src.leaderboard.filter_models import filter_models
|
|
| 9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 10 |
|
| 11 |
|
| 12 |
-
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 13 |
-
raw_data = get_raw_eval_results(results_path, requests_path)
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
all_data_json.append(baseline_row)
|
| 16 |
filter_models(all_data_json)
|
|
|
|
| 9 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 10 |
|
| 11 |
|
| 12 |
+
def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 13 |
+
raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
|
| 14 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 15 |
all_data_json.append(baseline_row)
|
| 16 |
filter_models(all_data_json)
|
{scripts → src/scripts}/create_request_file.py
RENAMED
|
@@ -1,36 +1,21 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import pprint
|
| 4 |
-
import re
|
| 5 |
from datetime import datetime, timezone
|
| 6 |
|
| 7 |
import click
|
| 8 |
from colorama import Fore
|
| 9 |
from huggingface_hub import HfApi, snapshot_download
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
EVAL_REQUESTS_PATH = "eval-queue"
|
| 12 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
| 13 |
|
| 14 |
precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
|
| 15 |
-
model_types =
|
| 16 |
-
weight_types =
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
def get_model_size(model_info, precision: str):
|
| 20 |
-
size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
|
| 21 |
-
try:
|
| 22 |
-
model_size = round(model_info.safetensors["total"] / 1e9, 3)
|
| 23 |
-
except (AttributeError, TypeError):
|
| 24 |
-
try:
|
| 25 |
-
size_match = re.search(size_pattern, model_info.modelId.lower())
|
| 26 |
-
model_size = size_match.group(0)
|
| 27 |
-
model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
|
| 28 |
-
except AttributeError:
|
| 29 |
-
return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
|
| 30 |
-
|
| 31 |
-
size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
|
| 32 |
-
model_size = size_factor * model_size
|
| 33 |
-
return model_size
|
| 34 |
|
| 35 |
|
| 36 |
def main():
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import pprint
|
|
|
|
| 4 |
from datetime import datetime, timezone
|
| 5 |
|
| 6 |
import click
|
| 7 |
from colorama import Fore
|
| 8 |
from huggingface_hub import HfApi, snapshot_download
|
| 9 |
|
| 10 |
+
from src.submission.check_validity import get_model_size
|
| 11 |
+
from src.display.utils import ModelType, WeightType
|
| 12 |
+
|
| 13 |
EVAL_REQUESTS_PATH = "eval-queue"
|
| 14 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
| 15 |
|
| 16 |
precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
|
| 17 |
+
model_types = [e.name for e in ModelType]
|
| 18 |
+
weight_types = [e.name for e in WeightType]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def main():
|
src/scripts/update_all_request_files.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import HfApi, ModelFilter, snapshot_download
|
| 2 |
+
from huggingface_hub import ModelCard
|
| 3 |
+
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import time
|
| 7 |
+
import shutil
|
| 8 |
+
from src.submission.check_validity import is_model_on_hub, check_model_card, get_model_size
|
| 9 |
+
from src.envs import DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, API
|
| 10 |
+
|
| 11 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", None)

TMP_FOLDER = "tmp_requests"
# NOTE(review): the snapshot lands in TMP_FOLDER while the rest of this script works on
# DYNAMIC_INFO_FILE_PATH - confirm the two are meant to differ.
snapshot_download(
    repo_id=DYNAMIC_INFO_REPO, local_dir=TMP_FOLDER, repo_type="dataset", tqdm_class=None, etag_timeout=30
)

# Get models
start = time.time()

# update_models looks models up by their id, so index the listing by id
# instead of keeping it as a plain list.
models = {
    model.id: model
    for model in API.list_models(
        filter=ModelFilter(task="text-generation"),
        full=False,
        cardData=True,
        fetch_config=True,
    )
}

print(f"Downloaded list of models in {time.time() - start:.2f} seconds")
|
| 29 |
+
|
| 30 |
+
def update_models(file_path, models):
    """
    Refresh the dynamic model information stored in the JSON file at `file_path`.

    The file maps model ids to a dict of metadata. For every entry whose id is
    found in `models`, the likes, license, still_on_hub and tags values are
    recomputed; entries that are not found are left untouched. The file is
    rewritten in place.

    Args:
        file_path: path of the JSON file to update.
        models: either a mapping from model id to a model-info object, or an
            iterable of model-info objects exposing an `id` attribute.

    Returns:
        The list of model ids whose entry was refreshed.
    """
    # Callers may pass the raw listing; lookups below need it keyed by model id.
    if not isinstance(models, dict):
        models = {model.id: model for model in models}

    with open(file_path, "r") as f:
        model_infos = json.load(f)

    updated_ids = []
    for model_id, data in model_infos.items():
        model_cfg = models.get(model_id)
        if model_cfg is None:
            continue

        data['likes'] = model_cfg.likes
        # 'params' is not refreshed here: the get_model_size call was left commented out in the original.
        data['license'] = model_cfg.card_data.license if model_cfg.card_data is not None else ""

        # Is the model still on the hub
        still_on_hub, _, _ = is_model_on_hub(
            model_name=model_id, revision=data.get("revision"), trust_remote_code=True, test_tokenizer=False
        )
        # If the model doesn't have a model card or a license, we consider it's deleted
        if still_on_hub:
            try:
                if check_model_card(model_id)[0] is False:
                    still_on_hub = False
            except Exception:
                still_on_hub = False
        data['still_on_hub'] = still_on_hub

        if still_on_hub:
            data["tags"] = _tags_from_model_card(ModelCard.load(model_id))
        else:
            # No model card can be loaded: keep the previously stored tags,
            # only making sure the key exists for readers of the file.
            data.setdefault("tags", [])

        updated_ids.append(model_id)

    with open(file_path, 'w') as f:
        json.dump(model_infos, f, indent=2)

    return updated_ids


def _tags_from_model_card(model_card):
    """Derive the leaderboard tags ("merge", "flagged:undisclosed_merge", "moe") from a model card."""
    declared_tags = model_card.data.tags or []  # the card metadata may carry no tags at all
    is_merge_from_metadata = "merge" in declared_tags
    merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
    card_text = model_card.text.lower()
    is_merge_from_model_card = any(keyword in card_text for keyword in merge_keywords)

    tags = []
    # A merge declared in the metadata is a merge even if the card text never mentions it.
    if is_merge_from_model_card or is_merge_from_metadata:
        tags.append("merge")
        # If the model is a merge but not saying it in the metadata, we flag it
        if not is_merge_from_metadata:
            tags.append("flagged:undisclosed_merge")
    if "moe" in declared_tags:
        tags.append("moe")
    return tags
|
| 82 |
+
|
| 83 |
+
start = time.time()

# update_models may return nothing; fall back to an empty list so the summary below cannot fail.
updated_ids = update_models(DYNAMIC_INFO_FILE_PATH, models) or []

print(f"updated in {time.time() - start:.2f} seconds, updated ids: {len(updated_ids)}")

# Push the refreshed file back to the dataset repo under its bare file name.
API.upload_file(
    path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
    path_in_repo=os.path.basename(DYNAMIC_INFO_FILE_PATH),
    repo_id=DYNAMIC_INFO_REPO,
    repo_type="dataset",
    commit_message="Daily request file update.",
)

# Clean up the temporary snapshot created at the top of the script.
shutil.rmtree(TMP_FOLDER)
|
src/submission/check_validity.py
CHANGED
|
@@ -6,7 +6,7 @@ from datetime import datetime, timedelta, timezone
|
|
| 6 |
|
| 7 |
import huggingface_hub
|
| 8 |
from huggingface_hub import ModelCard
|
| 9 |
-
from huggingface_hub.hf_api import ModelInfo
|
| 10 |
from transformers import AutoConfig, AutoTokenizer
|
| 11 |
|
| 12 |
from src.envs import HAS_HIGHER_RATE_LIMIT
|
|
@@ -36,7 +36,7 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
|
|
| 36 |
return True, ""
|
| 37 |
|
| 38 |
|
| 39 |
-
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
|
| 40 |
try:
|
| 41 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
|
| 42 |
if test_tokenizer:
|
|
@@ -65,17 +65,23 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
|
|
| 65 |
|
| 66 |
def get_model_size(model_info: ModelInfo, precision: str):
|
| 67 |
size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
|
|
|
|
| 68 |
try:
|
| 69 |
-
|
| 70 |
-
except
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
try:
|
| 72 |
-
size_match = re.search(size_pattern, model_info.
|
| 73 |
model_size = size_match.group(0)
|
| 74 |
model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
|
| 75 |
-
except AttributeError:
|
| 76 |
return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
|
| 77 |
|
| 78 |
-
size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.
|
| 79 |
model_size = size_factor * model_size
|
| 80 |
return model_size
|
| 81 |
|
|
|
|
| 6 |
|
| 7 |
import huggingface_hub
|
| 8 |
from huggingface_hub import ModelCard
|
| 9 |
+
from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata
|
| 10 |
from transformers import AutoConfig, AutoTokenizer
|
| 11 |
|
| 12 |
from src.envs import HAS_HIGHER_RATE_LIMIT
|
|
|
|
| 36 |
return True, ""
|
| 37 |
|
| 38 |
|
| 39 |
+
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, AutoConfig]:
|
| 40 |
try:
|
| 41 |
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token) #, force_download=True)
|
| 42 |
if test_tokenizer:
|
|
|
|
| 65 |
|
| 66 |
def get_model_size(model_info: ModelInfo, precision: str):
    """Estimate the size of a model, in billions of parameters.

    The parameter count is read from the repository's safetensors metadata when
    it can be fetched. Otherwise a size such as "7b" or "350m" is parsed out of
    the model id. The result is multiplied by 8 when `precision` is "GPTQ" or
    the model id contains "gptq".

    Returns:
        The estimated size, or 0 when no size could be determined.
    """
    try:
        metadata = get_safetensors_metadata(model_info.id)
    except Exception as err:
        # Best effort: report the failure and fall back to parsing the model id.
        print(err)
        metadata = None

    if metadata is None:
        id_size_regex = re.compile(r"(\d+\.)?\d+(b|m)")
        try:
            # `.group` raises AttributeError when the id holds no size marker.
            size_text = re.search(id_size_regex, model_info.id.lower()).group(0)
            magnitude = float(size_text[:-1])
            if size_text[-1] != "b":
                # Sizes given in millions are converted to billions.
                magnitude = magnitude / 1e3
            model_size = round(magnitude, 3)
        except AttributeError:
            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
    else:
        model_size = round(sum(metadata.parameter_count.values()) / 1e9, 3)

    is_gptq = precision == "GPTQ" or "gptq" in model_info.id.lower()
    return (8 if is_gptq else 1) * model_size
|
| 87 |
|
src/submission/submit.py
CHANGED
|
@@ -2,8 +2,10 @@ import json
|
|
| 2 |
import os
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
|
|
|
|
|
|
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
-
from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
|
| 7 |
from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
|
| 8 |
from src.submission.check_validity import (
|
| 9 |
already_submitted_models,
|
|
@@ -65,9 +67,15 @@ def add_new_eval(
|
|
| 65 |
return styled_error(f'Base model "{base_model}" {error}')
|
| 66 |
|
| 67 |
if not weight_type == "Adapter":
|
| 68 |
-
model_on_hub, error,
|
| 69 |
if not model_on_hub:
|
| 70 |
return styled_error(f'Model "{model}" {error}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# Is the model info correctly filled?
|
| 73 |
try:
|
|
@@ -86,6 +94,22 @@ def add_new_eval(
|
|
| 86 |
modelcard_OK, error_msg = check_model_card(model)
|
| 87 |
if not modelcard_OK:
|
| 88 |
return styled_error(error_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# Seems good, creating the eval
|
| 91 |
print("Adding new eval")
|
|
@@ -96,13 +120,21 @@ def add_new_eval(
|
|
| 96 |
"revision": revision,
|
| 97 |
"private": private,
|
| 98 |
"precision": precision,
|
|
|
|
|
|
|
| 99 |
"weight_type": weight_type,
|
| 100 |
"status": "PENDING",
|
| 101 |
"submitted_time": current_time,
|
| 102 |
"model_type": model_type,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
"likes": model_info.likes,
|
| 104 |
-
"params": model_size,
|
| 105 |
"license": license,
|
|
|
|
|
|
|
| 106 |
}
|
| 107 |
|
| 108 |
# Check for duplicate submission
|
|
@@ -126,6 +158,23 @@ def add_new_eval(
|
|
| 126 |
commit_message=f"Add {model} to eval queue",
|
| 127 |
)
|
| 128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
# Remove the local file
|
| 130 |
os.remove(out_path)
|
| 131 |
|
|
|
|
| 2 |
import os
|
| 3 |
from datetime import datetime, timezone
|
| 4 |
|
| 5 |
+
from huggingface_hub import ModelCard
|
| 6 |
+
|
| 7 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 8 |
+
from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
|
| 9 |
from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
|
| 10 |
from src.submission.check_validity import (
|
| 11 |
already_submitted_models,
|
|
|
|
| 67 |
return styled_error(f'Base model "{base_model}" {error}')
|
| 68 |
|
| 69 |
if not weight_type == "Adapter":
|
| 70 |
+
model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
|
| 71 |
if not model_on_hub:
|
| 72 |
return styled_error(f'Model "{model}" {error}')
|
| 73 |
+
architecture = "?"
|
| 74 |
+
if model_config is not None:
|
| 75 |
+
architectures = getattr(model_config, "architectures", None)
|
| 76 |
+
if architectures:
|
| 77 |
+
architecture = ";".join(architectures)
|
| 78 |
+
|
| 79 |
|
| 80 |
# Is the model info correctly filled?
|
| 81 |
try:
|
|
|
|
| 94 |
modelcard_OK, error_msg = check_model_card(model)
|
| 95 |
if not modelcard_OK:
|
| 96 |
return styled_error(error_msg)
|
| 97 |
+
|
| 98 |
+
# Storing the model tags
|
| 99 |
+
tags = []
|
| 100 |
+
|
| 101 |
+
model_card = ModelCard.load(model)
|
| 102 |
+
is_merge_from_metadata = "merge" in model_card.data.tags if model_card.data.tags else False
|
| 103 |
+
merge_keywords = ["mergekit", "merged model", "merge model", "merging"]
|
| 104 |
+
# If the model is a merge but not saying it in the metadata, we flag it
|
| 105 |
+
is_merge_from_model_card = any(keyword in model_card.text.lower() for keyword in merge_keywords)
|
| 106 |
+
if is_merge_from_model_card:
|
| 107 |
+
tags.append("merge")
|
| 108 |
+
if not is_merge_from_metadata:
|
| 109 |
+
tags.append("flagged:undisclosed_merge")
|
| 110 |
+
if "moe" in model_card.data.tags:
|
| 111 |
+
tags.append("moe")
|
| 112 |
+
|
| 113 |
|
| 114 |
# Seems good, creating the eval
|
| 115 |
print("Adding new eval")
|
|
|
|
| 120 |
"revision": revision,
|
| 121 |
"private": private,
|
| 122 |
"precision": precision,
|
| 123 |
+
"params": model_size,
|
| 124 |
+
"architectures": architecture,
|
| 125 |
"weight_type": weight_type,
|
| 126 |
"status": "PENDING",
|
| 127 |
"submitted_time": current_time,
|
| 128 |
"model_type": model_type,
|
| 129 |
+
"job_id": -1,
|
| 130 |
+
"job_start_time": None,
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
supplementary_info = {
|
| 134 |
"likes": model_info.likes,
|
|
|
|
| 135 |
"license": license,
|
| 136 |
+
"still_on_hub": True,
|
| 137 |
+
"tags": tags,
|
| 138 |
}
|
| 139 |
|
| 140 |
# Check for duplicate submission
|
|
|
|
| 158 |
commit_message=f"Add {model} to eval queue",
|
| 159 |
)
|
| 160 |
|
| 161 |
+
with open(DYNAMIC_INFO_FILE_PATH) as f:
|
| 162 |
+
all_supplementary_info = json.load(f)
|
| 163 |
+
|
| 164 |
+
all_supplementary_info[model] = supplementary_info
|
| 165 |
+
with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
|
| 166 |
+
json.dump(all_supplementary_info, f, indent=2)
|
| 167 |
+
|
| 168 |
+
API.upload_file(
|
| 169 |
+
path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
|
| 170 |
+
path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
|
| 171 |
+
repo_id=DYNAMIC_INFO_REPO,
|
| 172 |
+
repo_type="dataset",
|
| 173 |
+
commit_message=f"Add {model} to dynamic info queue",
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
|
| 178 |
# Remove the local file
|
| 179 |
os.remove(out_path)
|
| 180 |
|