Spaces:
Running
Running
| <html> | |
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width">
  <title>xet-repo-dedupe</title>
  <link rel="stylesheet" href="style.css">
  <!--
    Vega runtime, Vega-Lite compiler, and the vega-embed helper, loaded in
    that order as plain blocking scripts. The inline script in the body calls
    vegaEmbed() as soon as it is parsed, so these must not be made async or
    deferred without also deferring that call.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  <style>
    /* Chart container: full page width, inline content centered. */
    #vis { width: 100%; text-align: center; }
  </style>
</head>
| <body> | |
<!-- Intro card: explains what the chart shows and how to interact with it. -->
<div class="card">
  <h1>Visualizing Repo-level Dedupe</h1>
  <p>This visualization demonstrates the amount of <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a> across all public repos.</p>
  <p>"Dedupe factor" is defined as the number of re-uses of a given "xorb". A "xorb" is a collection of content-defined chunks, typically around 1,000 chunks comprising up to 64 MB of total data.</p>
  <!--
    The list is a sibling of the "Interactions:" paragraph, not a child:
    a <ul> is not permitted inside <p>, so nesting it makes the parser end
    the paragraph early and leaves a stray empty paragraph after the list.
  -->
  <p>Interactions:</p>
  <ul>
    <!-- The color named here must match the highlight color in the chart's color condition ("orange"). -->
    <li>
      Hover to select a xorb, and highlight the same xorb in all other repos in <strong><span style="color: orange">orange</span></strong>.
    </li>
    <li>
      Click to select a row (repo), and fade out all repos that don't contain any overlapping data. Double-click to clear selection.
    </li>
  </ul>
</div>
| <div id="vis"></div> | |
| <script> | |
// Vega-Lite spec for the dedupe chart: one facet row per repo, one rect per
// xorb, colored by dedupe factor. Kept as a `var` so `vlSpec` stays a global.
var vlSpec = {
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  // Every facet row gets its own x scale instead of sharing one across repos.
  "resolve": {"scale": {"x": "independent"}},
  "width": 600,
  "height": 12,
  "params": [
    {
      // Hover selection keyed on xorb_id; read by the color condition below.
      "name": "highlight",
      "select": {"type": "point", "fields": ["xorb_id"], "on": "pointerover"}
    },
    {
      // Click selection keyed on repo. `toggle` is the boolean false (never
      // add to / remove from the selection, always replace it); the string
      // "false" only worked because it was evaluated as an expression.
      "name": "select",
      "select": {"type": "point", "fields": ["repo"], "toggle": false}
    },
    {
      // repo_xorb_selected of every row in the 'source_0' dataset.
      // NOTE(review): 'source_0' is a generated dataset name, not declared in
      // this spec; confirm it is still correct after a Vega-Lite upgrade.
      "name": "xorbs_selected",
      "expr": "pluck(data('source_0'), 'repo_xorb_selected')"
    },
    // True when the plucked list has a non-null minimum, i.e. at least one
    // row carries a selected xorb id.
    {"name": "any_xorbs_selected", "expr": "extent(xorbs_selected)[0] != null"}
  ],
  "transform": [
    {
      // 0 when nothing is selected or this row's repo is not in the
      // selection; otherwise the repo's 1-based position in the selection.
      "calculate": "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
      "as": "repo_selected"
    },
    {
      // The row's xorb_id when its repo is selected, null otherwise.
      "calculate": "if(datum.repo_selected > 0, datum.xorb_id, null)",
      "as": "repo_xorb_selected"
    }
  ],
  "data": {
    "url": "xorbs.json"
  },
  "mark": "rect",
  "encoding": {
    "x": {
      "field": "xorb_id",
      "axis": null,
      "stack": "normalize"
    },
    "color": {
      // The hovered xorb is drawn orange wherever it occurs; everything else
      // is colored by dedupe_factor on a scale with domain [0, 10].
      "condition": [
        {"test": "datum.xorb_id == highlight.xorb_id", "value": "orange"}
      ],
      "field": "dedupe_factor",
      "type": "quantitative",
      "scale": {"domain": [0, 10]}
    },
    "opacity": {
      // With a repo selected, fade every xorb that the selection does not contain.
      "condition": [
        {
          "test": "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
          "value": 0.2
        }
      ]
    },
    "tooltip": {"field": "dedupe_factor"},
    "row": {
      "field": "repo",
      "spacing": 1,
      "header": {"labelAngle": 0, "labelAlign": "left"},
      "sort": {"field": "dedupe_factor", "order": "descending"}
    }
  }
};

// vegaEmbed returns a promise; log a failed embed instead of leaving the
// rejection unhandled.
vegaEmbed('#vis', vlSpec).catch(console.error);
| </script> | |
| </body> | |
| </html> | |