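"""Streamlit app that charts the Hub's 30-day download counts per
transformers model architecture and stores a weekly CSV snapshot in a
dataset repository on the Hub."""
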
import datetime
import os

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi, Repository
from transformers.models.auto.configuration_auto import CONFIG_MAPPING_NAMES
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CTC_MAPPING_NAMES,
    MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES,
    MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES,
    MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,
    MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
    MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES,
    MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES,
    MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES,
)
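
# architectures that belong to the audio or vision modality; anything not
# listed below counts as text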
audio_models = (
    list(MODEL_FOR_CTC_MAPPING_NAMES.keys())
    + list(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES.keys())
    + list(MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.keys())
)
vision_models = (
    list(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES.keys())
    + list(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES.keys())
    + list(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES.keys())
    + list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES.keys())
    + list(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES.keys())
    + list(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES.keys())
)

today = datetime.date.today()
year, week, _ = today.isocalendar()

DATASET_REPO_URL = (
    "https://huggingface.co/datasets/patrickvonplaten/model-archs-downloads-space-data"
)
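
# one CSV per ISO calendar week, so every run within the same week reuses
# (or first creates) the same data file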
DATA_FILENAME = f"data_{week}_{year}.csv"
DATA_FILE = os.path.join("data", DATA_FILENAME)


def retrieve_model_stats():
    # gather per-architecture download counts from the Hub and return them as
    # a CSV string whose index column is named `model_names`
    hf_api = HfApi()
    all_stats = {}
    total_downloads = 0

    for model_name in list(CONFIG_MAPPING_NAMES.keys()):
        if model_name in audio_models:
            modality = "audio"
        elif model_name in vision_models:
            modality = "vision"
        else:
            modality = "text"

        model_stats = {
            "num_downloads": 0,
            "%_of_all_downloads": 0,
            "num_models": 0,
            "download_per_model": 0,
            "modality": modality,
        }

        # `list_models` can return an iterator, so materialize it before
        # taking its length
        models = list(hf_api.list_models(filter=model_name))
        model_stats["num_models"] = len(models)
        model_stats["num_downloads"] = sum(
            m.downloads for m in models if hasattr(m, "downloads")
        )
        if models:
            model_stats["download_per_model"] = round(
                model_stats["num_downloads"] / len(models), 2
            )
        total_downloads += model_stats["num_downloads"]

        # save in overall dict
        all_stats[model_name] = model_stats

    # compute each architecture's share of all downloads and format the raw
    # counts with thousands separators for display
    for model_name in list(CONFIG_MAPPING_NAMES.keys()):
        all_stats[model_name]["%_of_all_downloads"] = round(
            100 * all_stats[model_name]["num_downloads"] / total_downloads, 3
        )
        downloads = all_stats[model_name]["num_downloads"]
        all_stats[model_name]["num_downloads"] = f"{downloads:,}"

    sorted_results = dict(
        sorted(
            all_stats.items(), key=lambda d: d[1]["%_of_all_downloads"], reverse=True
        )
    )
    dataframe = pd.DataFrame.from_dict(sorted_results, orient="index")

    # give a header to the model-name index column
    result = "model_names" + dataframe.to_csv()
    return result
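

# clone the dataset repository into ./data; this assumes write credentials
# (e.g. an HF token) are configured so that `push_to_hub` can commit the
# weekly CSV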
repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL)

if not os.path.isfile(DATA_FILE):
    print("Create datafile...")
    result = retrieve_model_stats()
    with open(DATA_FILE, "w") as f:
        f.write(result)
    commit_url = repo.push_to_hub()
    print(commit_url)
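
# read this week's stats back and convert the comma-formatted download counts
# into plain integers for plotting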
dataframe = pd.read_csv(DATA_FILE)
int_downloads = np.array(
    [int(x.replace(",", "")) for x in dataframe["num_downloads"].values]
)
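
# render the dashboard: headline, top/bottom 20 charts, and the full table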
st.title(f"Transformers stats for year {year} and week {week}")


def plot_download_chart(downloads, model_names, title):
    # bar chart of download counts that keeps the given order on the x-axis
    source = pd.DataFrame(
        {
            "Number of total downloads": downloads,
            "Model architecture name": model_names,
        }
    )
    bar_chart = (
        alt.Chart(source)
        .mark_bar()
        .encode(
            y="Number of total downloads",
            x=alt.X("Model architecture name", sort=None),
        )
    )
    st.title(title)
    st.altair_chart(bar_chart, use_container_width=True)


# plot the 20 most and the 20 least downloaded architectures
plot_download_chart(
    int_downloads[:20],
    dataframe["model_names"].values[:20],
    "Top 20 downloads last 30 days",
)
plot_download_chart(
    int_downloads[-20:],
    dataframe["model_names"].values[-20:],
    "Bottom 20 downloads last 30 days",
)

# print all stats
st.title("All stats last 30 days")
st.table(dataframe)