Spaces:
Running
Running
File size: 5,028 Bytes
3f63dc8 6e5511f 3f63dc8 35da923 3f63dc8 35da923 1288276 6a1da3c 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 35da923 3f63dc8 35da923 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 72714de 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 728801d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Repo-Level Dedupe Visualization</title>
<link rel="stylesheet" href="style.css" />
<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
</head>
<body>
<div class="container">
<div class="header">
<h1>Visualizing Repo-Level Dedupe</h1>
<p>
This visualization demonstrates block-level deduplication across all
models in
<a
target="_blank"
href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
>bartowski/gemma-2-9b-it-GGUF</a
>.
</p>
<p>
Each row represents a file in the repository grouped into blocks of up
to 64 MB. The color of each block represents the deduplication ratio
for the block, which is a function of how often the chunks in the
block are shared between files. The darker the color, the more
frequently the content is shared and the better the overall upload and
download times for a given file! The deduplication savings here take a
191 GB repo and cut it down to 97 GB, helping to shave a few hours off
the upload time.
</p>
<p>
You can read more about chunks, blocks, and the nitty-gritty details
of how we make this all work in our accompanying
<a
target="_blank"
href="https://huggingface.co/blog/from-chunks-to-blocks"
>blog post</a
>.
</p>
To explore the visualization:
<ul>
<li>
<strong>Hover</strong> over a block in an individual file to
highlight it and see where else it appears in the repository.
</li>
<li>
<strong>Click</strong> any block in a file to see all other files
that share blocks with it.
</li>
<li>
<strong>Double-click</strong> anywhere on any file to reset and
continue exploring.
</li>
</ul>
</div>
<div class="heatmap-container">
<div id="vis"></div>
</div>
</div>
<script>
// Vega-Lite specification for the dedupe heatmap: the chart is faceted into
// one row per `repo` value, and every record is drawn as a rect positioned by
// `xorb_id` and colored by `dedupe_factor`.
const vlSpec = {
  $schema: "https://vega.github.io/schema/vega-lite/v5.json",
  resolve: { scale: { x: "independent" } },
  width: 800,
  height: 25,
  params: [
    {
      // Hover selection keyed on xorb_id; drives the orange highlight below.
      name: "highlight",
      select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
    },
    {
      // Click selection keyed on the (basename) repo field. Toggling is
      // disabled with a real boolean so each click replaces the selection.
      name: "select",
      select: { type: "point", fields: ["repo"], toggle: false },
    },
    {
      // One entry per record: the xorb_id when the record belongs to the
      // selected file, otherwise null (see the transforms below).
      // NOTE(review): 'source_0' looks like the dataset name Vega-Lite
      // generates for this spec -- re-check it if the spec layout changes.
      name: "xorbs_selected",
      expr: "pluck(data('source_0'), 'repo_xorb_selected')",
    },
    {
      // True once at least one non-null xorb_id is present in xorbs_selected.
      // The loose `!= null` is deliberate: it also rejects undefined.
      name: "any_xorbs_selected",
      expr: "extent(xorbs_selected)[0] != null",
    },
  ],
  transform: [
    {
      // Reduce the path in `repo` to its last '/'-separated segment. This has
      // to run FIRST: the click selection stores the basename it reads from
      // the rendered marks, so the comparison in the next step must also see
      // the basename or files inside sub-directories never match.
      calculate:
        "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
      as: "repo",
    },
    {
      // 1-based position of this record's repo in the click selection,
      // or 0 when nothing is selected / the repo is not selected.
      calculate:
        "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
      as: "repo_selected",
    },
    {
      // Keep the xorb_id only for records of the selected file.
      calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
      as: "repo_xorb_selected",
    },
  ],
  data: {
    url: "xorbs.json",
  },
  mark: "rect",
  encoding: {
    x: {
      field: "xorb_id",
      axis: null,
      sort: { field: "dedupe_factor", order: "descending" },
      stack: "normalize",
    },
    color: {
      condition: [
        // NOTE(review): loose `==` is kept on purpose -- the selection field
        // appears to resolve to an array, which `==` coerces for the
        // comparison; confirm before tightening to `===`.
        { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
      ],
      field: "dedupe_factor",
      type: "quantitative",
      scale: { scheme: "blues", domain: [0, 10] },
    },
    opacity: {
      condition: [
        {
          // Dim every block that does not occur in the selected file.
          test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
          value: 0.2,
        },
      ],
    },
    tooltip: [
      { field: "repo", type: "nominal", title: "File" },
      { field: "xorb_id", type: "nominal", title: "Block Hash" },
      {
        field: "dedupe_factor",
        type: "quantitative",
        title: "Dedupe Factor",
      },
    ],
    row: {
      field: "repo",
      title: "",
      spacing: 1,
      header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
      sort: { field: "repo", order: "ascending" },
    },
  },
};

// vegaEmbed returns a promise; surface failures instead of leaving the
// rejection unhandled.
vegaEmbed("#vis", vlSpec).catch((err) => {
  console.error("Failed to render the dedupe visualization", err);
});
</script>
</body>
</html>
|