<!--
quantization-dedup / index.html
jsulz's picture
jsulz HF staff
updating intro text
1288276
-->
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Repo-Level Dedupe Visualization</title>
  <link rel="stylesheet" href="style.css">
  <!--
    Charting libraries. They are loaded synchronously (no defer/async) because
    the inline script at the end of the body calls vegaEmbed() as soon as it
    is parsed; keep them blocking unless that call is deferred as well.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
</head>
<body>
<div class="container">
  <!-- Introductory copy: what the chart shows and how to interact with it. -->
  <div class="header">
    <h1>Visualizing Repo-Level Dedupe</h1>
    <p>
      This visualization demonstrates block-level deduplication across all
      models in
      <a
        href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
        target="_blank"
        rel="noopener"
        >bartowski/gemma-2-9b-it-GGUF</a
      >.
    </p>
    <p>
      Each row represents a file in the repository grouped into blocks of up
      to 64MB. The color of each block represents the deduplication ratio
      for the block, which is a function of how often the chunks in the
      block are shared between files. The darker the color, the more
      frequently content is shared, the better the overall upload and
      download times for a given file! The deduplication savings here take a
      191GB repo and cut it down to 97GB, helping to shave a few hours off
      the upload time.
    </p>
    <p>
      You can read more about chunks, blocks, and the nitty gritty details
      of how we make this all work in our accompanying
      <a
        href="https://huggingface.co/blog/from-chunks-to-blocks"
        target="_blank"
        rel="noopener"
        >blog post</a
      >.
    </p>
    <!-- Lead-in sentence lives in its own paragraph rather than as a bare text node. -->
    <p>To explore the visualization:</p>
    <ul>
      <li>
        <strong>Hover</strong> over a block in an individual file to
        highlight it and see where else it appears in the repository.
      </li>
      <li>
        <strong>Click</strong> any block in a file to see all other files
        that share blocks.
      </li>
      <li>
        <strong>Double-click</strong> anywhere on any file to reset and
        continue exploring.
      </li>
    </ul>
  </div>
  <!-- Mount point: the inline script that follows renders the chart into #vis. -->
  <div class="heatmap-container">
    <div id="vis"></div>
  </div>
</div>
<script>
// Vega-Lite specification for the dedupe heatmap: the data is faceted into one
// row per file ("repo"), and every record becomes a rect keyed by its block id
// ("xorb_id") and colored by its "dedupe_factor".
var vlSpec = {
  $schema: "https://vega.github.io/schema/vega-lite/v5.json",
  // Give each facet row its own x scale instead of one shared across rows.
  resolve: { scale: { x: "independent" } },
  width: 800,
  height: 25,
  params: [
    {
      // Hover selection on the block id; drives the orange highlight below.
      name: "highlight",
      select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
    },
    {
      // Click selection on the file name. Toggling is switched off with a real
      // boolean (the string "false" is truthy in JavaScript and only behaved
      // correctly because Vega-Lite evaluates strings as expressions).
      name: "select",
      select: { type: "point", fields: ["repo"], toggle: false },
    },
    {
      // The repo_xorb_selected value of every record: a block id for records
      // of the selected file, null for all others.
      // NOTE(review): 'source_0' is presumably the dataset name generated by
      // the Vega-Lite compiler — confirm if the data/transform setup changes.
      name: "xorbs_selected",
      expr: "pluck(data('source_0'), 'repo_xorb_selected')",
    },
    {
      // True once xorbs_selected holds at least one non-null block id
      // (assumes extent() skips null entries — see the Vega expression docs).
      name: "any_xorbs_selected",
      expr: "extent(xorbs_selected)[0] != null",
    },
  ],
  transform: [
    {
      // Shorten repo to the last '/'-separated segment. This runs first so the
      // comparison against the `select` param below sees the same value that
      // the row facet (and therefore the click selection) is built from.
      calculate:
        "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
      as: "repo",
    },
    {
      // 1-based position of this record's file in the click selection,
      // or 0 when nothing is selected or the file is not part of it.
      calculate:
        "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
      as: "repo_selected",
    },
    {
      // Expose the block id only for records that belong to a selected file.
      calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
      as: "repo_xorb_selected",
    },
  ],
  data: {
    url: "xorbs.json",
  },
  mark: "rect",
  encoding: {
    x: {
      field: "xorb_id",
      axis: null,
      // Order blocks within a row by descending dedupe factor.
      sort: { field: "dedupe_factor", order: "descending" },
      stack: "normalize",
    },
    color: {
      // NOTE(review): highlight.xorb_id is presumably an array of selected
      // values, so this match relies on `==` coercion — confirm before
      // tightening the comparison.
      condition: [
        { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
      ],
      field: "dedupe_factor",
      type: "quantitative",
      scale: { scheme: "blues", domain: [0, 10] },
    },
    opacity: {
      // When a file is selected, fade every block whose id is not one of that
      // file's blocks; blocks shared with the selected file stay fully visible.
      condition: [
        {
          test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
          value: 0.2,
        },
      ],
    },
    tooltip: [
      { field: "repo", type: "nominal", title: "File" },
      { field: "xorb_id", type: "nominal", title: "Block Hash" },
      {
        field: "dedupe_factor",
        type: "quantitative",
        title: "Dedupe Factor",
      },
    ],
    row: {
      field: "repo",
      title: "",
      spacing: 1,
      header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
      sort: { field: "repo", order: "ascending" },
    },
  },
};

// Render into #vis. vegaEmbed returns a promise; report a failed embed on the
// console instead of leaving the rejection unhandled.
vegaEmbed("#vis", vlSpec).catch(console.error);
</script>
</body>
</html>