# pylint: disable=no-member
from collections import defaultdict

import gradio as gr
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi
from huggingface_hub.errors import RepositoryNotFoundError
import numpy as np
import pandas as pd
import plotly.express as px
import requests

HF_API = HfApi()


def apply_power_scaling(sizes, exponent=0.2):
    """Apply power scaling to the sizes so the treemap stays readable; None sizes become 0."""
    return [size**exponent if size is not None else 0 for size in sizes]
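# Illustrative example (not part of the app's runtime): power scaling compresses the
# huge spread between file sizes so small files remain visible in the treemap, e.g.
#   apply_power_scaling([1_000_000_000, 1_000, None])  # -> [~63.1, ~3.98, 0]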


def count_chunks(sizes):
    """Count the number of 64,000-byte chunks each size (in bytes) occupies, rounding up; None sizes count as 0."""
    return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
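# Illustrative example (not part of the app's runtime):
#   count_chunks([130_000, 64_000, None])  # -> [3, 1, 0]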


def build_hierarchy(siblings):
    """Builds a hierarchical structure from the list of RepoSibling objects."""
    hierarchy = defaultdict(dict)

    for sibling in siblings:
        path_parts = sibling.rfilename.split("/")
        size = sibling.lfs.size if sibling.lfs else sibling.size

        current_level = hierarchy
        for part in path_parts[:-1]:  # Traverse directories
            current_level = current_level.setdefault(part, {})
        current_level[path_parts[-1]] = size  # Assign size to the file

    return hierarchy
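# Illustrative example (not part of the app's runtime): two siblings with rfilename
# "config.json" (size 500) and "data/train.parquet" (size 2_000_000) produce a mapping
# equivalent to {"config.json": 500, "data": {"train.parquet": 2_000_000}}.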


def calculate_directory_sizes(hierarchy):
    """Recursively calculates the size of each directory as the sum of its contents."""
    total_size = 0

    for key, value in hierarchy.items():
        if isinstance(value, dict):  # Directory
            dir_size = calculate_directory_sizes(value)  # Recursively calculate size
            hierarchy[key] = {
                "__size__": dir_size,
                **value,
            }  # Add size to directory metadata
            total_size += dir_size
        else:  # File (size may be None if metadata is missing)
            total_size += value or 0

    return total_size
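# Illustrative example (not part of the app's runtime): given
#   h = {"config.json": 500, "data": {"train.parquet": 2_000_000}}
# calculate_directory_sizes(h) returns 2_000_500 and mutates h in place into
#   {"config.json": 500, "data": {"__size__": 2_000_000, "train.parquet": 2_000_000}}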


def flatten_hierarchy_with_directory_sizes(hierarchy, root_name="Repository"):
    """Flatten a nested dictionary into Plotly-compatible treemap data with a defined root node."""
    labels = []
    parents = []
    sizes = []

    # Recursively process the hierarchy
    def process_level(current_hierarchy, current_parent):
        for key, value in current_hierarchy.items():
            if isinstance(value, dict) and "__size__" in value:  # Directory
                dir_size = value.pop("__size__")  # Extract directory size
                labels.append(key)
                parents.append(current_parent)
                sizes.append(dir_size)
                process_level(value, key)  # Recurse into subdirectories
            else:  # File
                labels.append(key)
                parents.append(current_parent)
                sizes.append(value)

    # Add the root node
    total_size = calculate_directory_sizes(hierarchy)
    labels.append(root_name)
    parents.append("")  # Root has no parent
    sizes.append(total_size)

    # Process the hierarchy
    process_level(hierarchy, root_name)

    return labels, parents, sizes
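# Illustrative example (not part of the app's runtime): for the hierarchy
#   {"config.json": 500, "data": {"train.parquet": 2_000_000}}
# the function returns
#   labels  = ["Repository", "config.json", "data", "train.parquet"]
#   parents = ["", "Repository", "Repository", "data"]
#   sizes   = [2_000_500, 500, 2_000_000, 2_000_000]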


def visualize_repo_treemap(r_info):
    """Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
    siblings = r_info.siblings
    hierarchy = build_hierarchy(siblings)

    # Directory sizes are computed (and "__size__" entries attached) inside
    # flatten_hierarchy_with_directory_sizes below; calling calculate_directory_sizes
    # here as well would inflate the root total by re-counting the injected "__size__" entries.

    # Flatten the hierarchy into Plotly-compatible format
    labels, parents, sizes = flatten_hierarchy_with_directory_sizes(hierarchy)

    # Apply the chosen scaling function for visualization
    scaled_sizes = apply_power_scaling(sizes)

    # Format the original sizes using the helper function
    formatted_sizes = [
        (
            format_repo_size(size) if size is not None else None
        )  # Format both files and directories
        for size in sizes
    ]

    chunks = count_chunks(sizes)

    # Create the treemap
    fig = px.treemap(
        names=labels,
        parents=parents,
        values=scaled_sizes,
        title="Repo by Chunks",
        custom_data=[formatted_sizes, chunks],
    )

    # Add subtitle by updating the layout
    fig.update_layout(
        title={
            "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
            "x": 0.5,  # Center the title and subtitle
            "xanchor": "center",
        }
    )

    # Customize the hover template to include directory sizes
    fig.update_traces(
        hovertemplate=(
            "<b>%{label}</b><br>"  # File/Directory name
            "Size: %{customdata[0]}<br>"  # Scaled size shown in treemap
            "# of Chunks: %{customdata[1]}"  # Formatted size from custom data
        )
    )
    fig.update_traces(root_color="lightgrey")
    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))

    return fig


def format_repo_size(r_size: int) -> str:
    units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
    order = 0
    while r_size >= 1024 and order < len(units) - 1:
        r_size /= 1024
        order += 1
    return f"{r_size:.2f} {units[order]}"


def repo_files(r_type: str, r_id: str) -> tuple:
    """Aggregate file sizes and counts by extension and build the repo treemap figure."""
    r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
    fig = visualize_repo_treemap(r_info)
    files = {}
    for sibling in r_info.siblings:
        ext = sibling.rfilename.split(".")[-1]
        # Prefer the LFS size when present, matching build_hierarchy above
        size = sibling.lfs.size if sibling.lfs else sibling.size
        if ext in files:
            files[ext]["size"] += size
            files[ext]["count"] += 1
        else:
            files[ext] = {"size": size, "count": 1}
    return files, fig
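# Illustrative example (not part of the app's runtime): the returned dict is keyed by
# file extension (the text after the last "."), e.g.
#   {"json": {"size": 1_024, "count": 3}, "safetensors": {"size": 5_000_000_000, "count": 2}}
# and is returned alongside the Plotly treemap figure.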


def repo_size(r_type, r_id):
    try:
        r_refs = HF_API.list_repo_refs(repo_id=r_id, repo_type=r_type)
    except RepositoryNotFoundError:
        gr.Warning(f"Repository is gated, branch information for {r_id} not available.")
        return {}
    repo_sizes = {}
    for branch in r_refs.branches:
        try:
            response = requests.get(
                f"https://huggingface.co/api/{r_type}s/{r_id}/treesize/{branch.name}",
                timeout=1000,  # note: requests timeouts are in seconds
            )
            response = response.json()
        except Exception:
            response = {}
        if response.get("error") and (
            "restricted" in response.get("error") or "gated" in response.get("error")
        ):
            gr.Warning(f"Branch information for {r_id} not available.")
            return {}
        size = response.get("size")
        if size is not None:
            repo_sizes[branch.name] = size
    return repo_sizes
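# Illustrative example (not part of the app's runtime): the result maps branch names to
# the total size in bytes reported by the Hub's treesize endpoint, e.g. (hypothetical
# branch names and sizes)
#   {"main": 5_368_709_120, "dev": 5_368_709_120}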


def get_repo_info(r_type, r_id):
    try:
        repo_sizes = repo_size(r_type, r_id)
        repo_files_info, treemap_fig = repo_files(r_type, r_id)
    except RepositoryNotFoundError:
        gr.Warning(
            "Repository not found. Make sure you've entered a valid repo ID and type that corresponds to the repository."
        )
        return (
            gr.Row(visible=False),
            gr.Dataframe(visible=False),
            gr.Plot(visible=False),
            gr.Row(visible=False),
            gr.Dataframe(visible=False),
        )

    rf_sizes_df = (
        pd.DataFrame(repo_files_info)
        .T.reset_index(names="ext")
        .sort_values(by="size", ascending=False)
    )
    # Hide the branch-size table when no branch sizes could be fetched
    if not repo_sizes:
        r_sizes_component = gr.Dataframe(visible=False)
        b_block = gr.Row(visible=False)
    else:
        r_sizes_df = pd.DataFrame(repo_sizes, index=["size"]).T.reset_index(
            names="branch"
        )
        r_sizes_df["formatted_size"] = r_sizes_df["size"].apply(format_repo_size)
        r_sizes_df.columns = ["Branch", "bytes", "Size"]
        r_sizes_component = gr.Dataframe(
            value=r_sizes_df[["Branch", "Size"]], visible=True
        )
        b_block = gr.Row(visible=True)

    rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
    rf_sizes_df.columns = ["Extension", "bytes", "Count", "Size"]
    rf_sizes_plot = px.pie(
        rf_sizes_df,
        values="bytes",
        names="Extension",
        hover_data=["Size"],
        title=f"File Distribution in {r_id}",
        hole=0.3,
    )
    return (
        gr.Row(visible=True),
        gr.Dataframe(
            value=rf_sizes_df[["Extension", "Count", "Size"]],
            visible=True,
        ),
        # gr.Plot(rf_sizes_plot, visible=True),
        gr.Plot(treemap_fig, visible=True),
        b_block,
        r_sizes_component,
    )


with gr.Blocks(theme="ocean") as demo:
    gr.Markdown("# Repository Information")
    gr.Markdown(
        "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's files and branches."
    )
    with gr.Blocks():
        # repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
        repo_id = HuggingfaceHubSearch(
            label="Hub Repository Search (enter user, organization, or repository name to start searching)",
            placeholder="Search for model or dataset repositories on Huggingface",
            search_type=["model", "dataset"],
        )
        repo_type = gr.Radio(
            choices=["model", "dataset"],
            label="Repository Type",
            value="model",
        )
        search_button = gr.Button(value="Search")
    with gr.Blocks():
        with gr.Row(visible=False) as results_block:
            with gr.Column():
                gr.Markdown("## File Information")
                file_info_plot = gr.Plot(visible=False)
                with gr.Row():
                    file_info = gr.Dataframe(visible=False)
                    # file_info_plot = gr.Plot(visible=False)
                with gr.Row(visible=False) as branch_block:
                    with gr.Column():
                        gr.Markdown("## Branch Sizes")
                        branch_sizes = gr.Dataframe(visible=False)

    search_button.click(
        get_repo_info,
        inputs=[repo_type, repo_id],
        outputs=[results_block, file_info, file_info_plot, branch_block, branch_sizes],
    )

demo.launch()