File size: 3,322 Bytes
4cb150c
0acccaf
75abe88
 
0acccaf
 
75abe88
4cb150c
568085d
75abe88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0acccaf
75abe88
 
 
 
 
 
 
 
 
 
 
 
0acccaf
75abe88
 
 
 
 
0acccaf
4cb150c
0acccaf
 
 
4cb150c
 
75abe88
4cb150c
75abe88
 
 
4cb150c
75abe88
 
 
4cb150c
75abe88
 
4cb150c
75abe88
0acccaf
 
 
 
62d55e9
0acccaf
 
 
 
 
 
 
 
 
62d55e9
0acccaf
 
 
 
 
 
 
 
 
 
 
62d55e9
0acccaf
 
 
62d55e9
4cb150c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from huggingface_hub import HfApi
import pandas as pd
import os
import streamlit as st
import altair as alt
import numpy as np
import datetime
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES

from huggingface_hub import Repository

# Resolve the current ISO year and ISO week once at start-up; both values are
# embedded in the data file name below.
today = datetime.date.today()
year, week = today.isocalendar()[:2]

# Remote dataset repository used as the source for the local "data" directory.
DATASET_REPO_URL = "https://huggingface.co/datasets/patrickvonplaten/model-archs-downloads-space-data"
# One CSV file per (week, year) pair.
DATA_FILENAME = f"data_{week}_{year}.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)


def retrieve_model_stats():
    """Collect per-architecture download statistics from the Hub as CSV text.

    For every key of ``CONFIG_MAPPING_NAMES`` the Hub is queried with
    ``HfApi.list_models(filter=<key>)`` and four values are recorded:

    * ``num_downloads``: sum of the ``downloads`` attribute of the returned
      models, rendered as a string with thousands separators (``"1,234"``).
    * ``%_of_all_downloads``: share of the grand total, in percent.
    * ``num_models``: number of models returned by the query.
    * ``download_per_model``: ``num_downloads / num_models`` rounded to two
      decimals (0 when the query returned no model).

    Returns:
        str: CSV text whose index column is headed ``model_names``, with rows
        ordered by descending ``%_of_all_downloads``.
    """
    hf_api = HfApi()
    all_stats = {}
    total_downloads = 0

    for model_name in CONFIG_MAPPING_NAMES:
        # Materialize the result: list_models may return a lazy iterable,
        # which supports neither len() nor a second pass.
        models = list(hf_api.list_models(filter=model_name))
        num_models = len(models)
        # ``downloads`` can be missing or None on a model entry; count it as 0.
        num_downloads = sum(getattr(m, "downloads", None) or 0 for m in models)
        total_downloads += num_downloads

        # Key order defines the CSV column order.
        all_stats[model_name] = {
            "num_downloads": num_downloads,
            "%_of_all_downloads": 0,
            "num_models": num_models,
            "download_per_model": round(num_downloads / num_models, 2) if num_models else 0,
        }

    for model_stats in all_stats.values():
        downloads = model_stats["num_downloads"]
        # Guard against an all-zero total (would otherwise divide by zero).
        if total_downloads:
            # Round after scaling to percent so no float noise
            # (e.g. 12.345000000000001) ends up in the CSV.
            model_stats["%_of_all_downloads"] = round(downloads / total_downloads * 100, 3)
        model_stats["num_downloads"] = f"{downloads:,}"

    # reversed(sorted(...)) rather than sorted(reverse=True): keeps the
    # established ordering among rows with equal percentages.
    sorted_results = dict(reversed(sorted(all_stats.items(), key=lambda d: d[1]["%_of_all_downloads"])))
    dataframe = pd.DataFrame.from_dict(sorted_results, orient="index")

    # Name the index column so the model names get a proper CSV header.
    return dataframe.to_csv(index_label="model_names")


# Clone the dataset repository into the local "data" directory; DATA_FILE
# lives inside that directory.
repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL)

if not os.path.isfile(DATA_FILE):
    print("Create datafile...")
    result = retrieve_model_stats()

    # Checked again after the statistics were gathered.
    # NOTE(review): presumably guards against a concurrent run having written
    # the file in the meantime; confirm before removing.
    if not os.path.isfile(DATA_FILE):
        # Explicit UTF-8 so the file matches what pd.read_csv expects by default.
        with open(DATA_FILE, "w", encoding="utf-8") as f:
            f.write(result)

        commit_url = repo.push_to_hub()
        print(commit_url)

# read_csv opens the path itself; no separate file handle is needed.
dataframe = pd.read_csv(DATA_FILE)

def _downloads_bar_chart(downloads, model_names):
    """Build a bar chart of download counts per model architecture.

    Args:
        downloads: Sequence of integer download counts (y axis).
        model_names: Sequence of architecture names (x axis), same length.

    Returns:
        An Altair chart; ``sort=None`` keeps the bars in the given row order.
    """
    source = pd.DataFrame({
        'Number of total downloads': downloads,
        'Model architecture name': model_names,
    })
    return alt.Chart(source).mark_bar().encode(
        y="Number of total downloads",
        x=alt.X("Model architecture name", sort=None),
    )


# ``num_downloads`` is stored with thousands separators ("1,234"). str() also
# covers the case where pandas parsed the column as integers because no value
# contained a separator.
int_downloads = np.array([int(str(x).replace(",", "")) for x in dataframe["num_downloads"].values])
model_names = dataframe["model_names"].values

st.title(f"Transformers stats for year {year} and week {week}")

# print top 20 downloads (first 20 rows of the file)
st.title("Top 20 downloads last 30 days")
st.altair_chart(_downloads_bar_chart(int_downloads[:20], model_names[:20]), use_container_width=True)

# print bottom 20 downloads (last 20 rows of the file)
st.title("Bottom 20 downloads last 30 days")
st.altair_chart(_downloads_bar_chart(int_downloads[-20:], model_names[-20:]), use_container_width=True)

# print all stats
st.title("All stats last 30 days")
st.table(dataframe)