import os
from pathlib import Path
from typing import Optional, Tuple

from huggingface_hub import HfApi, Repository

from constants import EVAL_REQUESTS_PATH

TOKEN_HUB = os.environ.get("TOKEN_HUB")
QUEUE_REPO = os.environ.get("QUEUE_REPO")
QUEUE_PATH = os.environ.get("QUEUE_PATH")

hf_api = HfApi(
    endpoint="https://huggingface.co",
    token=TOKEN_HUB,
)

# Language code for Persian
PERSIAN_LANGUAGE_CODE = "fa"

def load_all_info_from_dataset_hub():
    if TOKEN_HUB is None:
        raise ValueError("No Hugging Face token provided. Cannot pull evaluation requests and results.")

    print("Pulling evaluation requests and results.")

    # Clone (or update) the dataset repo that holds the evaluation queue
    eval_queue_repo = Repository(
        local_dir=QUEUE_PATH,
        clone_from=QUEUE_REPO,
        use_auth_token=TOKEN_HUB,
        repo_type="dataset",
    )
    eval_queue_repo.git_pull()

    # Local directory where the dataset repo is cloned + folder with eval requests
    directory = Path(QUEUE_PATH) / EVAL_REQUESTS_PATH
    requested_models = get_all_requested_models(directory)
    requested_models = [p.stem for p in requested_models]

    # Keep only the models that declare support for Persian
    requested_models = filter_persian_models(requested_models)

    # Latest results CSV inside the local clone of the dataset repo
    csv_results = get_csv_with_results(QUEUE_PATH)
    if csv_results is None:
        raise ValueError("No results CSV ending in 'latest' was found in the evaluation queue repo.")

    return eval_queue_repo, requested_models, csv_results


def upload_file(requested_model_name, path_or_fileobj):
    dest_repo_file = Path(EVAL_REQUESTS_PATH) / path_or_fileobj.name
    hf_api.upload_file(
        path_or_fileobj=path_or_fileobj,
        path_in_repo=str(dest_repo_file),
        repo_id=QUEUE_REPO,
        token=TOKEN_HUB,
        repo_type="dataset",
        commit_message=f"Add {requested_model_name} to eval queue",
    )


def get_all_requested_models(directory):
    directory = Path(directory)
    all_requested_models = list(directory.glob("*.txt"))
    return all_requested_models


def get_csv_with_results(directory):
    directory = Path(directory)
    all_csv_files = list(directory.glob("*.csv"))
    latest = [f for f in all_csv_files if f.stem.endswith("latest")]
    if len(latest) != 1:
        return None
    return latest[0]


def is_model_on_hub(model_name, revision="main") -> Tuple[bool, Optional[str]]:
    try:
        model_name = model_name.replace(" ", "")
        author, model_id = model_name.split("/")
        if len(author) == 0 or len(model_id) == 0:
            return False, "is not a valid model name. Please use the format `author/model_name`."
    except Exception:
        return False, "is not a valid model name. Please use the format `author/model_name`."

    try:
        models = list(hf_api.list_models(author=author, search=model_id))
        matched = [m for m in models if m.modelId == model_name]
        if len(matched) != 1:
            return False, "was not found on the hub!"
        return True, None
    except Exception as e:
        print(f"Could not get the model from the hub: {e}")
        return False, "was not found on the hub!"


def filter_persian_models(model_list):
    """
    Filters the provided list of models to include only those that declare support for Persian (fa).

    Args:
        model_list (list): List of model names to filter.

    Returns:
        list: List of models that support Persian.
    """
    persian_models = []
    for model_name in model_list:
        try:
            # Get model information from the Hugging Face Hub
            model_info = hf_api.model_info(model_name)
            card_data = model_info.cardData or {}
            # Model cards declare languages under the `language` key (a string or a list);
            # fall back to `languages` in case a card uses that spelling.
            languages = card_data.get("language") or card_data.get("languages") or []
            if isinstance(languages, str):
                languages = [languages]

            # Check if Persian ('fa') is listed among the model's languages
            if PERSIAN_LANGUAGE_CODE in languages:
                persian_models.append(model_name)
                print(f"{model_name} supports Persian.")
            else:
                print(f"{model_name} does not declare Persian support. Skipping.")
        except Exception as e:
            print(f"Error fetching model info for {model_name}: {e}")

    return persian_models
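

# Hypothetical usage sketch (not part of the original module): running this file directly
# pulls the evaluation queue and prints the Persian-capable requested models. It assumes the
# TOKEN_HUB, QUEUE_REPO and QUEUE_PATH environment variables are set and that constants.py
# defines EVAL_REQUESTS_PATH.
if __name__ == "__main__":
    repo, models, results_csv = load_all_info_from_dataset_hub()
    print(f"Found {len(models)} requested model(s) with Persian support.")
    for name in models:
        ok, error = is_model_on_hub(name)
        print(f"- {name}: {'found on the hub' if ok else error}")
    print(f"Latest results file: {results_csv}")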