File size: 3,388 Bytes
38fcf43
 
 
 
 
 
842e3d0
 
 
 
 
 
 
 
 
 
 
 
 
 
38fcf43
 
 
 
 
 
 
 
 
 
 
 
842e3d0
38fcf43
bb1ee8a
38fcf43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
842e3d0
38fcf43
842e3d0
38fcf43
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import gradio as gr
import httpx
from toolz import groupby
import plotly.express as px
import pandas as pd

# Tags offered in the dropdown; sorted so they appear alphabetically
# regardless of the order in which entries are added below.
choices = [
    "art",
    "biology",
    "code",
    "distilabel",
    "fiftyone",
    "legal",
    "medical",
    "sentence-transformers",
    "synthetic",
]
choices.sort()


def fetch_data(framework):
    """Fetch the Hub datasets matching a tag and group them by author.

    Args:
        framework: Tag/filter string sent as the ``filter`` query parameter
            of the Hugging Face datasets API.

    Returns:
        A ``(data, grouped)`` tuple. ``data`` is the decoded JSON response;
        ``grouped`` maps each ``"author"`` value to the list of its dataset
        records, ordered from most to fewest datasets.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
    """
    # Pass the tag through ``params`` so httpx URL-encodes it; the dropdown
    # accepts free-form text, and characters such as "&", "#" or spaces would
    # otherwise corrupt the query string.
    response = httpx.get(
        "https://huggingface.co/api/datasets",
        params={"filter": framework},
    )
    # Fail here with a clear HTTP error rather than later, when an error
    # payload would be fed into the grouping step.
    response.raise_for_status()
    data = response.json()

    grouped = groupby(lambda record: record["author"], data)
    # Rebuild the dict so iteration order is "most datasets first".
    grouped = dict(
        sorted(grouped.items(), key=lambda item: len(item[1]), reverse=True)
    )
    return data, grouped


def generate_dashboard(data, grouped, framework):
    """Build the Markdown summary shown below the chart.

    Args:
        data: Raw dataset records (accepted for interface compatibility;
            not used when rendering).
        grouped: Mapping of author name to that author's dataset records.
            Authors are listed in the mapping's iteration order.
        framework: Tag name shown in the heading.

    Returns:
        A Markdown string with overall totals followed by one bullet per
        author and a nested bullet with that author's dataset count.
    """
    dataset_total = sum(map(len, grouped.values()))

    header = (
        f"## Hugging Face datasets for {framework} \n\n"
        f"**Total number of datasets: {dataset_total}**\n\n"
        f"**Total number of authors: {len(grouped)}**\n\n"
        "### Datasets per Author\n\n"
    )

    author_entries = [
        f"- **Author:** [{author}](https://huggingface.co/{author})\n"
        f"  - **Number of datasets:** {len(datasets)}\n"
        for author, datasets in grouped.items()
    ]

    return header + "".join(author_entries)


def plot_datasets_growth(data, framework):
    """Plot cumulative dataset count and month-over-month growth for a tag.

    Args:
        data: List of dataset records. Each record is expected to carry a
            ``createdAt`` value that ``pd.to_datetime`` can parse.
        framework: Tag name; used only in the axis title and chart title.

    Returns:
        A Plotly figure with the cumulative number of datasets per month as
        a line on the left axis and the month-over-month change in new
        datasets as a second trace on a right-hand percentage axis. When
        ``data`` is empty the figure has the same layout but no points.
    """
    if not data:
        # No records means no "createdAt" column to read; fall back to an
        # empty table so an empty chart is drawn instead of raising KeyError.
        df_counts = pd.DataFrame(
            columns=["month", "count", "cumulative_count", "growth_rate"]
        )
    else:
        df = pd.DataFrame(data)
        df["createdAt"] = pd.to_datetime(df["createdAt"])
        # Bucket by calendar month; string labels keep the x axis categorical.
        df["month"] = df["createdAt"].dt.to_period("M").astype(str)
        df_counts = df.groupby("month").size().reset_index(name="count")
        df_counts["cumulative_count"] = df_counts["count"].cumsum()
        # Relative change in *new* datasets versus the previous month
        # (NaN for the first month, which has no predecessor).
        df_counts["growth_rate"] = df_counts["count"].pct_change()

    fig = px.line(df_counts, x="month", y="cumulative_count", title="Dataset Growth")
    fig.update_layout(
        xaxis_title="Month",
        yaxis=dict(title=f"Cumulative Number of Datasets ({framework})"),
        # Secondary axis for the growth-rate trace, drawn over the primary
        # one and labelled as percentages.
        yaxis2=dict(
            title="Month-over-Month Growth Rate",
            overlaying="y",
            side="right",
            tickformat=",.0%",
        ),
        legend=dict(
            title="", orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
        ),
    )

    fig.add_scatter(
        x=df_counts["month"], y=df_counts["growth_rate"], name="Growth Rate", yaxis="y2"
    )

    fig.update_layout(
        title={
            "text": f"Dataset Growth for {framework} datasets",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        title_font=dict(size=24),
        # Subtitle rendered as a paper-anchored annotation under the title.
        annotations=[
            dict(
                x=0.5,
                y=0.85,
                xref="paper",
                yref="paper",
                text="Cumulative number of datasets and month-over-month growth rate",
                showarrow=False,
                font=dict(size=14),
            )
        ],
    )

    return fig


def update_dashboard(framework):
    """Refresh both dashboard outputs for the selected tag.

    Args:
        framework: Tag name chosen (or typed) in the dropdown.

    Returns:
        A ``(figure, markdown)`` tuple: the growth chart followed by the
        Markdown summary, in the order the UI outputs are wired.
    """
    records, by_author = fetch_data(framework)
    growth_figure = plot_datasets_growth(records, framework)
    summary_markdown = generate_dashboard(records, by_author, framework)
    return growth_figure, summary_markdown


# UI definition: a heading, a tag selector, and two output areas (chart and
# Markdown summary) that are refreshed whenever the selection changes.
with gr.Blocks() as demo:
    gr.Markdown("# View the growth of dataset frameworks/tags on the Hub")
    # Selector pre-populated from `choices`. allow_custom_value=True is set,
    # presumably so tags outside the list can be typed in (confirm against
    # the Gradio Dropdown docs).
    framework = gr.Dropdown(
        choices=choices,
        allow_custom_value=True,
    )
    plot = gr.Plot()
    markdown = gr.Markdown()
    # On every change of the dropdown, call update_dashboard with its value
    # and route the returned (figure, markdown) pair to the two outputs.
    framework.change(update_dashboard, inputs=[framework], outputs=[plot, markdown])

# Launch the app when this module is executed.
demo.launch()