File size: 3,322 Bytes
4cb150c
0acccaf
75abe88
 
0acccaf
 
75abe88
4cb150c
568085d
75abe88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0acccaf
75abe88
 
 
 
 
 
 
 
 
 
 
 
0acccaf
75abe88
 
 
 
 
0acccaf
4cb150c
0acccaf
 
 
4cb150c
 
75abe88
4cb150c
75abe88
 
 
4cb150c
75abe88
 
 
4cb150c
75abe88
 
4cb150c
75abe88
0acccaf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cb150c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from huggingface_hub import HfApi
import pandas as pd
import os
import streamlit as st
import altair as alt
import numpy as np
import datetime
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES

from huggingface_hub import Repository

# Hub dataset repository that holds the cached statistics CSV files.
DATASET_REPO_URL = "https://huggingface.co/datasets/patrickvonplaten/model-archs-downloads-space-data"

# ISO calendar year and week of the current date; both are used to name the
# data file and to label the page titles.
today = datetime.date.today()
year, week = today.isocalendar()[:2]

# File name pattern is data_<week>_<year>.csv, located inside the local
# "data" directory.
DATA_FILENAME = "data_{}_{}.csv".format(week, year)
DATA_FILE = os.path.join("data", DATA_FILENAME)


def retrieve_model_stats():
    """Collect download statistics per model architecture and return them as CSV.

    For every key of ``CONFIG_MAPPING_NAMES`` the Hub is queried with
    ``HfApi.list_models(filter=<architecture>)``. For each architecture the
    number of matching models, their summed downloads, the average downloads
    per model and the share of all counted downloads are recorded.

    Returns:
        str: CSV text with the columns ``model_names``, ``num_downloads``
        (formatted with ``,`` as thousands separator), ``%_of_all_downloads``,
        ``num_models`` and ``download_per_model``. Rows are ordered by
        download count, highest first.
    """
    hf_api = HfApi()
    architectures = list(CONFIG_MAPPING_NAMES.keys())
    all_stats = {}
    raw_downloads = {}
    total_downloads = 0

    for model_name in architectures:
        # list_models may hand back a lazy iterable (it does in recent
        # huggingface_hub releases), which supports neither len() nor a
        # second pass, so materialise it once.
        models = list(hf_api.list_models(filter=model_name))

        # Treat a missing or None `downloads` attribute as zero downloads.
        num_downloads = sum(getattr(m, "downloads", 0) or 0 for m in models)
        download_per_model = round(num_downloads / len(models), 2) if models else 0

        raw_downloads[model_name] = num_downloads
        total_downloads += num_downloads

        # Key order defines the CSV column order, keep it stable.
        all_stats[model_name] = {
            "num_downloads": f"{num_downloads:,}",
            "%_of_all_downloads": 0.0,
            "num_models": len(models),
            "download_per_model": download_per_model,
        }

    # The percentage is only defined when something was downloaded at all;
    # otherwise every share stays at 0 instead of dividing by zero.
    if total_downloads > 0:
        for model_name in architectures:
            # Scale before rounding so the stored value carries no float
            # noise such as 12.345000000000001.
            share = 100 * raw_downloads[model_name] / total_downloads
            all_stats[model_name]["%_of_all_downloads"] = round(share, 3)

    # Order by the exact download count rather than the rounded percentage,
    # which collapses many low-traffic architectures onto the same value.
    ranking = sorted(architectures, key=raw_downloads.__getitem__, reverse=True)
    sorted_results = {model_name: all_stats[model_name] for model_name in ranking}
    dataframe = pd.DataFrame.from_dict(sorted_results, orient="index")

    # give header to model names
    return dataframe.to_csv(index_label="model_names")


# Local checkout (directory "data") of the dataset repository that stores the
# statistics CSV files.
repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL)

if not os.path.isfile(DATA_FILE):
    print("Create datafile...")
    result = retrieve_model_stats()

    # NOTE(review): the second existence check presumably guards against the
    # file having been created while the statistics were being retrieved.
    if not os.path.isfile(DATA_FILE):
        # Write as UTF-8 explicitly; pd.read_csv below decodes as UTF-8.
        with open(DATA_FILE, "w", encoding="utf-8") as f:
            f.write(result)

        commit_url = repo.push_to_hub()
        print(commit_url)

# read_csv opens the path itself, so no separate file handle is needed.
dataframe = pd.read_csv(DATA_FILE)

# "num_downloads" is stored with "," as thousands separator. When no value
# contains a separator pandas infers an integer column, hence the str() call.
int_downloads = np.array([int(str(x).replace(",", "")) for x in dataframe["num_downloads"].values])
model_names = dataframe["model_names"].values


def _show_downloads_chart(title, downloads, names):
    """Render a page title followed by a bar chart of downloads per architecture.

    Args:
        title: Text passed to ``st.title``.
        downloads: Download counts, one per bar.
        names: Architecture names, in the same order as ``downloads``.
    """
    source = pd.DataFrame({
        'Number of total downloads': downloads,
        'Model architecture name': names,
    })
    bar_chart = alt.Chart(source).mark_bar().encode(
        y="Number of total downloads",
        # sort=None keeps the row order of `source` instead of sorting by name.
        x=alt.X("Model architecture name", sort=None),
    )
    st.title(title)
    st.altair_chart(bar_chart, use_container_width=True)


# print top 20 downloads
_show_downloads_chart(
    f'Top 20 downloads for year {year} and week {week}',
    int_downloads[:20],
    model_names[:20],
)

# print bottom 20 downloads
_show_downloads_chart(
    f'Bottom 20 downloads for year {year} and week {week}',
    int_downloads[-20:],
    model_names[-20:],
)

# print all stats
st.title(f'All downloads for year {year} and week {week}')
st.table(dataframe)