# Source: Hugging Face Space file by davanstrien (HF Staff).
# Commit 195a3cd: "chore: Add lru_cache to improve performance of data fetching"
import gradio as gr
import httpx
from toolz import groupby
import plotly.express as px
import pandas as pd
from functools import lru_cache
# Tags offered in the dropdown out of the box. They are sorted at import time
# so the menu stays alphabetical even if new tags are appended out of order.
_DEFAULT_TAGS = (
    "art",
    "biology",
    "code",
    "distilabel",
    "fiftyone",
    "legal",
    "medical",
    "sentence-transformers",
    "synthetic",
)
choices = sorted(_DEFAULT_TAGS)
@lru_cache(maxsize=100)
def fetch_data(framework):
    """Fetch the Hub datasets matching ``framework`` and group them by author.

    Results are memoised per ``framework`` value by ``lru_cache``. A call that
    raises is not cached, so a failed request is retried on the next call.

    Args:
        framework: Tag/filter string sent as the ``filter`` query parameter of
            the Hub datasets API.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response
        (presumably a list of dataset records -- each one is indexed by
        ``"author"`` below). ``grouped`` maps each author to that author's
        records, ordered by descending number of records. Both objects are
        shared with the cache, so callers should treat them as read-only.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    # Send the tag through ``params`` so httpx URL-encodes it: the dropdown
    # accepts arbitrary custom values, which may contain spaces, '&' or '#'.
    r = httpx.get(
        "https://huggingface.co/api/datasets",
        params={"filter": framework},
    )
    # Fail loudly on an error response instead of trying to group an error
    # payload as if it were dataset records.
    r.raise_for_status()
    data = r.json()
    grouped = groupby(lambda x: x["author"], data)
    # Most prolific authors first; dicts preserve this insertion order.
    grouped = dict(sorted(grouped.items(), key=lambda x: len(x[1]), reverse=True))
    return data, grouped
def generate_dashboard(data, grouped, framework):
    """Render a Markdown summary of datasets per author.

    Args:
        data: Unused; kept so existing callers do not need to change.
        grouped: Mapping of author name to a sized collection of that author's
            datasets. Authors are listed in the mapping's iteration order.
        framework: Tag name shown in the heading.

    Returns:
        A Markdown string with the total number of datasets and authors,
        followed by one bullet per author with a nested bullet for the count.
    """
    total_datasets = sum(len(v) for v in grouped.values())
    parts = [
        f"## Hugging Face datasets for {framework} \n\n",
        f"**Total number of datasets: {total_datasets}**\n\n",
        f"**Total number of authors: {len(grouped)}**\n\n",
        "### Datasets per Author\n\n",
    ]
    for author, datasets in grouped.items():
        parts.append(f"- **Author:** [{author}](https://huggingface.co/{author})\n")
        # Four spaces of indentation: a single leading space is parsed as a
        # sibling list item, not as a child of the author bullet.
        parts.append(f"    - **Number of datasets:** {len(datasets)}\n")
    # Join once instead of growing the string with repeated ``+=``.
    return "".join(parts)
def _monthly_counts(data):
    """Aggregate dataset records into per-month, cumulative and growth columns."""
    df = pd.DataFrame(data)
    if df.empty or "createdAt" not in df.columns:
        # No records (e.g. a custom tag that matches nothing): return the
        # expected columns with zero rows instead of failing on a missing
        # "createdAt" column.
        return pd.DataFrame(
            {
                "month": pd.Series(dtype="object"),
                "count": pd.Series(dtype="int64"),
                "cumulative_count": pd.Series(dtype="int64"),
                "growth_rate": pd.Series(dtype="float64"),
            }
        )
    df["createdAt"] = pd.to_datetime(df["createdAt"])
    # "YYYY-MM" strings sort chronologically, so groupby's key ordering
    # doubles as time ordering.
    df["month"] = df["createdAt"].dt.to_period("M").astype(str)
    df_counts = df.groupby("month").size().reset_index(name="count")
    df_counts["cumulative_count"] = df_counts["count"].cumsum()
    # Month-over-month change in newly created datasets; NaN for the first month.
    df_counts["growth_rate"] = df_counts["count"].pct_change()
    return df_counts


def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and month-over-month growth for a tag.

    Args:
        data: Dataset records, each expected to carry a ``"createdAt"``
            timestamp parseable by ``pandas.to_datetime``. May be empty, in
            which case the chart has no data points.
        framework: Tag name shown in the chart title and the y-axis label.

    Returns:
        A Plotly figure with the cumulative count on the left axis and the
        growth rate on a secondary right-hand axis.
    """
    df_counts = _monthly_counts(data)
    fig = px.line(df_counts, x="month", y="cumulative_count")
    # The growth rate lives on its own percentage axis ("y2") overlaid on the
    # primary axis, since its scale is unrelated to the cumulative count.
    fig.add_scatter(
        x=df_counts["month"],
        y=df_counts["growth_rate"],
        name="Growth Rate",
        yaxis="y2",
    )
    fig.update_layout(
        xaxis_title="Month",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
        ),
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        # Subtitle rendered as a paper-anchored annotation below the title.
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )
    return fig
def update_dashboard(framework):
    """Build both dashboard outputs for the selected tag.

    Args:
        framework: Tag value coming from the dropdown.

    Returns:
        A ``(figure, markdown)`` pair, in the order the event handler's
        outputs are wired: the growth plot first, the summary text second.
    """
    records, by_author = fetch_data(framework)
    summary_md = generate_dashboard(records, by_author, framework)
    growth_fig = plot_datasets_growth(records, framework)
    return growth_fig, summary_md
# UI layout: a heading, a tag selector, then the growth plot and the
# per-author summary that are refreshed together.
with gr.Blocks() as demo:
    gr.Markdown("# View the growth of dataset frameworks/tags on the Hub")
    # Custom values are allowed so tags beyond the preset list can be typed in.
    framework_dropdown = gr.Dropdown(choices=choices, allow_custom_value=True)
    growth_plot = gr.Plot()
    summary_markdown = gr.Markdown()
    # Output order must match the (figure, markdown) tuple from update_dashboard.
    framework_dropdown.change(
        update_dashboard,
        inputs=[framework_dropdown],
        outputs=[growth_plot, summary_markdown],
    )

demo.launch()