File size: 4,417 Bytes
d30410b a6b0f6f d30410b a6b0f6f d30410b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import os
from constants import EVAL_REQUESTS_PATH
from pathlib import Path
from huggingface_hub import HfApi, Repository
# Credentials and queue location are read from the environment.
# Each value is None when the corresponding variable is unset.
TOKEN_HUB = os.environ.get("TOKEN_HUB")
QUEUE_REPO = os.environ.get("QUEUE_REPO")
QUEUE_PATH = os.environ.get("QUEUE_PATH")

# Shared Hub client used by the helpers in this module.
hf_api = HfApi(
    endpoint="https://huggingface.co",
    token=TOKEN_HUB,
)

# Report only whether a token is present. Printing the token itself would
# leak the secret into stdout and any captured logs.
print(f"TOKEN_HUB configured: {TOKEN_HUB is not None}")

# Language code for Persian
PERSIAN_LANGUAGE_CODE = "fa"
def load_all_info_from_dataset_hub():
    """Clone/pull the evaluation queue dataset and collect its contents.

    Returns:
        tuple: ``(eval_queue_repo, requested_models, csv_results)`` where
        ``eval_queue_repo`` is the local ``Repository`` clone,
        ``requested_models`` is the list of request-file stems that passed
        ``filter_persian_models``, and ``csv_results`` is the path returned
        by ``get_csv_with_results``.

    Raises:
        ValueError: If ``TOKEN_HUB``, ``QUEUE_PATH`` or ``QUEUE_REPO`` is not
            configured, or if the results CSV could not be identified.
    """
    if TOKEN_HUB is None:
        raise ValueError("No Hugging Face token provided. Skipping evaluation requests and results.")
    if not QUEUE_PATH or not QUEUE_REPO:
        raise ValueError("QUEUE_PATH and QUEUE_REPO must both be set to pull the evaluation queue.")

    print("Pulling evaluation requests and results.")
    eval_queue_repo = Repository(
        local_dir=QUEUE_PATH,
        clone_from=QUEUE_REPO,
        use_auth_token=TOKEN_HUB,
        repo_type="dataset",
    )
    eval_queue_repo.git_pull()

    # QUEUE_PATH comes from the environment as a plain string, so it must be
    # wrapped in Path before the "/" operator can be used on it.
    queue_root = Path(QUEUE_PATH)

    # Local directory where dataset repo is cloned + folder with eval requests
    directory = queue_root / EVAL_REQUESTS_PATH
    requested_models = [p.stem for p in get_all_requested_models(directory)]

    # Filter models to only include those supporting Persian language
    requested_models = filter_persian_models(requested_models)

    # Local directory where dataset repo is cloned
    csv_results = get_csv_with_results(queue_root)
    if csv_results is None:
        # Reported separately from the missing-token case so the message
        # points at the real problem.
        raise ValueError(f"Could not identify a results CSV file in {queue_root}.")

    return eval_queue_repo, requested_models, csv_results
def upload_file(requested_model_name, path_or_fileobj):
    """Upload an evaluation-request file into the queue dataset repository.

    The file is stored under ``EVAL_REQUESTS_PATH`` in ``QUEUE_REPO`` and
    keeps its own base name.

    Args:
        requested_model_name: Model name, used only in the commit message.
        path_or_fileobj: The file to upload. Either an object exposing a
            ``name`` attribute (e.g. a ``pathlib.Path``) or a string path.
    """
    # Path-like and file objects expose ``.name``; a plain string path does
    # not, so fall back to the value itself and let Path extract the name.
    source_name = getattr(path_or_fileobj, "name", path_or_fileobj)
    file_name = Path(source_name).name

    # Build the in-repo path with forward slashes on every platform;
    # str(Path) would produce backslashes on Windows.
    dest_repo_file = (Path(EVAL_REQUESTS_PATH) / file_name).as_posix()

    hf_api.upload_file(
        path_or_fileobj=path_or_fileobj,
        path_in_repo=dest_repo_file,
        repo_id=QUEUE_REPO,
        token=TOKEN_HUB,
        repo_type="dataset",
        commit_message=f"Add {requested_model_name} to eval queue",
    )
def get_all_requested_models(directory):
    """List the ``*.txt`` files located directly inside ``directory``.

    Args:
        directory: Folder to scan, as a string or ``Path``.

    Returns:
        list: One ``Path`` per matching file; sub-folders are not searched.
    """
    request_dir = Path(directory)
    return [*request_dir.glob("*.txt")]
def get_csv_with_results(directory):
    """Locate the single ``*latest.csv`` file directly inside ``directory``.

    Args:
        directory: Folder to scan, as a string or ``Path``.

    Returns:
        The ``Path`` of the one CSV file whose stem ends with ``"latest"``,
        or ``None`` when there are zero or several such files.
    """
    candidates = [
        csv_file
        for csv_file in Path(directory).glob("*.csv")
        if csv_file.stem.endswith("latest")
    ]
    if len(candidates) == 1:
        return candidates[0]
    return None
def is_model_on_hub(model_name, revision="main") -> "tuple[bool, str | None]":
    """Check that ``model_name`` is a well-formed id that exists on the Hub.

    Args:
        model_name: Model id in ``author/model_name`` form. Spaces are
            removed before validation.
        revision: Accepted for interface compatibility; not used by this
            function.

    Returns:
        tuple: ``(True, None)`` when exactly one Hub model has this exact id,
        otherwise ``(False, message)`` where ``message`` is a sentence
        fragment explaining the failure.
    """
    invalid = (False, "is not a valid model name. Please use the format `author/model_name`.")

    if not isinstance(model_name, str):
        return invalid

    model_name = model_name.replace(" ", "")
    author, separator, model_id = model_name.partition("/")
    # Require exactly ``author/model_name``: both parts non-empty and no
    # further "/" segments. Such a name can never equal a Hub model id, so
    # reject it here instead of querying the Hub.
    if not separator or not author or not model_id or "/" in model_id:
        return invalid

    try:
        models = list(hf_api.list_models(author=author, search=model_id))
        # ``search`` is not an exact match, so compare the full id explicitly.
        matched = [m for m in models if m.modelId == model_name]
    except Exception as e:
        # Best-effort: any failure while talking to the Hub is reported as
        # "not found" rather than propagated to the caller.
        print(f"Could not get the model from the hub.: {e}")
        return False, "was not found on hub!"

    if len(matched) != 1:
        return False, "was not found on the hub!"
    return True, None
def _declared_languages(card_data):
    """Return the language codes found in a model card's metadata as a list."""
    if card_data is None:
        return []
    codes = []
    # Model cards declare languages under "language"; "languages" is also
    # read so cards using that spelling keep matching.
    for key in ("language", "languages"):
        if isinstance(card_data, dict):
            value = card_data.get(key)
        else:
            value = getattr(card_data, key, None)
        if isinstance(value, str):
            # Keep a single code as one item so the membership test is an
            # exact match ("fa" must not match inside "fas").
            codes.append(value)
        elif isinstance(value, (list, tuple, set)):
            codes.extend(item for item in value if isinstance(item, str))
    return codes


def filter_persian_models(model_list, *, language_code=None, api=None):
    """
    Filters the provided list of models to include only those that declare
    the requested language (Persian, ``fa``, by default) in their model card.
    Models whose info cannot be fetched are reported and skipped.
    Args:
        model_list (list): List of model names to filter.
        language_code (str, optional): Language code to look for. Defaults
            to ``PERSIAN_LANGUAGE_CODE``.
        api (optional): Object exposing ``model_info(model_name)``. Defaults
            to the module-level ``hf_api`` client.
    Returns:
        list: Model names, in input order, whose card metadata lists the
        language code.
    """
    wanted = PERSIAN_LANGUAGE_CODE if language_code is None else language_code
    client = hf_api if api is None else api

    persian_models = []
    for model_name in model_list:
        try:
            # Get model information from Hugging Face Hub
            model_info = client.model_info(model_name)
        except Exception as e:
            # Best-effort: one unreachable model must not abort the batch.
            print(f"Error fetching model info for {model_name}: {str(e)}")
            continue

        # Read ``cardData`` first and fall back to ``card_data`` if it is
        # missing. NOTE(review): which attribute exists depends on the
        # installed huggingface_hub version -- confirm.
        card_data = getattr(model_info, "cardData", None)
        if card_data is None:
            card_data = getattr(model_info, "card_data", None)

        if wanted in _declared_languages(card_data):
            persian_models.append(model_name)
            print(f"{model_name} supports language '{wanted}'.")
        else:
            print(f"{model_name} does not support language '{wanted}'. Skipping.")
    return persian_models
|