File size: 1,714 Bytes
b6a7e2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import huggingface_hub


def model_has_dataset(model):
    """Return whether *model* carries at least one ``dataset:`` tag.

    Args:
        model: Object exposing a ``tags`` attribute that is either an
            iterable of tag strings or ``None``.

    Returns:
        bool: ``True`` if any tag starts with ``"dataset:"``; ``False``
        otherwise, including when ``tags`` is ``None`` or empty.
    """
    # A ``None`` tags value is treated as "no tags" rather than raising
    # TypeError when iterated.
    return any(tag.startswith("dataset:") for tag in model.tags or ())


def main():
    """Export Hub models of one type that declare datasets to a local file.

    Parses ``--model_type`` (default ``text-classification``) and
    ``--output_format`` (default ``parquet``; one of parquet, csv, json),
    lists matching models sorted by likes in descending order, keeps those
    accepted by ``model_has_dataset`` and writes one row per model to
    ``models_<model_type>.<output_format>`` in the current directory.
    """
    # Imported here (not at file top) so pandas is only needed when the
    # script is actually executed.
    import pandas as pd

    parser = argparse.ArgumentParser(
        prog="Giskard Retriever", description="Retrieves HF models that are bound to datasets."
    )
    parser.add_argument(
        "--model_type",
        help="Hugging Face model types. default: text-classification",
        required=False,
        default="text-classification",
    )
    parser.add_argument(
        "--output_format",
        help="Format of the information retrieved. Default: parquet. Options: parquet, csv, json.",
        # Reject unknown formats before any network work; previously an
        # unrecognised value fetched everything and then wrote nothing.
        choices=("parquet", "csv", "json"),
        default="parquet",
    )
    args = parser.parse_args()

    model_type = args.model_type
    output_format = args.output_format

    dataset_prefix = "dataset:"
    models_with_dataset = filter(
        model_has_dataset,
        huggingface_hub.list_models(filter=model_type, sort="likes", direction=-1),
    )

    df = pd.DataFrame(
        [
            {
                "modelId": m.modelId,
                "modelType": model_type,
                "author": m.author,
                "downloads": m.downloads,
                "likes": m.likes,
                # Strip the "dataset:" prefix to keep only the dataset name.
                "datasets": [
                    t[len(dataset_prefix):] for t in m.tags if t.startswith(dataset_prefix)
                ],
            }
            for m in models_with_dataset
        ]
    )

    output_path = f"models_{model_type}.{output_format}"
    if output_format == "parquet":
        df.to_parquet(output_path, index=False)
    elif output_format == "csv":
        df.to_csv(output_path, index=False)
    else:
        # ``index=False`` is rejected by pandas for the default "columns"
        # orient; "records" emits one object per row without the index and
        # needs no ``index`` argument.
        df.to_json(output_path, orient="records")


if __name__ == "__main__":
    main()