File size: 5,028 Bytes
3f63dc8
 
 
 
 
 
 
6e5511f
 
 
3f63dc8
 
 
 
 
 
 
 
35da923
 
 
3f63dc8
 
 
 
 
35da923
 
 
 
1288276
 
 
 
 
 
 
6a1da3c
 
 
 
 
3f63dc8
 
6e5511f
 
3f63dc8
 
6e5511f
 
3f63dc8
35da923
3f63dc8
 
35da923
3f63dc8
6e5511f
 
3f63dc8
 
 
 
 
 
6e5511f
 
3f63dc8
 
 
 
 
6e5511f
3f63dc8
 
6e5511f
 
3f63dc8
 
6e5511f
 
3f63dc8
 
 
 
 
 
6e5511f
 
3f63dc8
 
 
 
 
 
6e5511f
3f63dc8
 
6e5511f
 
3f63dc8
 
 
 
6e5511f
3f63dc8
 
6e5511f
3f63dc8
 
 
 
 
 
 
6e5511f
3f63dc8
 
 
6e5511f
3f63dc8
 
 
6e5511f
3f63dc8
 
6e5511f
3f63dc8
 
 
 
6e5511f
3f63dc8
 
 
 
 
 
 
 
 
 
 
 
 
 
72714de
3f63dc8
 
6e5511f
3f63dc8
6e5511f
3f63dc8
728801d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Repo-Level Dedupe Visualization</title>
    <link rel="stylesheet" href="style.css" />
    <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  </head>
  <body>
    <div class="container">
      <div class="header">
        <h1>Visualizing Repo-Level Dedupe</h1>
        <p>
          This visualization demonstrates block-level deduplication across all
          models in
          <a
            target="_blank"
            href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
            >bartowski/gemma-2-9b-it-GGUF</a
          >.
        </p>
        <p>
          Each row represents a file in the repository grouped into blocks of up
          to 64MB. The color of each block represents the deduplication ratio
          for the block, which is a function of how often the chunks in the
          block are shared between files. The darker the color, the more
          frequently content is shared, and the better the overall upload and
          download times for a given file! The deduplication savings here take a
          191GB repo and cut it down to 97GB, helping to shave a few hours off
          the upload time.
        </p>
        <p>
          You can read more about chunks, blocks, and the nitty-gritty details
          of how we make this all work in our accompanying
          <a
            target="_blank"
            href="https://huggingface.co/blog/from-chunks-to-blocks"
            >blog post</a
          >.
        </p>
        To explore the visualization:
        <ul>
          <li>
            <strong>Hover</strong> over a block in an individual file to
            highlight it and see where else it appears in the repository.
          </li>
          <li>
            <strong>Click</strong> any block in a file to see all other files
            that share blocks with it.
          </li>
          <li>
            <strong>Double-click</strong> anywhere on any file to reset and
            continue exploring.
          </li>
        </ul>
      </div>

      <div class="heatmap-container">
        <div id="vis"></div>
      </div>
    </div>
    <script>
      // Vega-Lite specification for the dedupe heatmap: one facet row per file
      // (`repo`), one rect per block (`xorb_id`), colored by `dedupe_factor`.
      const vlSpec = {
        $schema: "https://vega.github.io/schema/vega-lite/v5.json",
        // Every facet row gets its own x scale instead of sharing one.
        resolve: { scale: { x: "independent" } },
        width: 800,
        height: 25,
        params: [
          {
            // Hover selection keyed on the block id; drives the orange highlight.
            name: "highlight",
            select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
          },
          {
            // Click selection keyed on the file. Toggling is disabled with the
            // boolean `false` (rather than the string expression "false").
            name: "select",
            select: { type: "point", fields: ["repo"], toggle: false },
          },
          {
            // `repo_xorb_selected` of every row in the `source_0` dataset.
            // NOTE(review): `source_0` is presumably the name Vega-Lite gives
            // the compiled dataset — confirm if the data definition changes.
            name: "xorbs_selected",
            expr: "pluck(data('source_0'), 'repo_xorb_selected')",
          },
          {
            // Whether `xorbs_selected` holds at least one usable value; relies
            // on extent() yielding a null/undefined minimum otherwise.
            name: "any_xorbs_selected",
            expr: "extent(xorbs_selected)[0] != null",
          },
        ],
        transform: [
          {
            // 1-based position of this row's file in the click selection, or 0
            // when nothing is selected or the file is not part of it.
            // NOTE(review): `repo` is overwritten by the last calculate below;
            // confirm this comparison sees the same form of the name as
            // `select.repo` does.
            calculate:
              "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
            as: "repo_selected",
          },
          {
            // Block id when the row's file is selected, null otherwise.
            calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
            as: "repo_xorb_selected",
          },
          {
            // Keep only the last '/'-separated segment of `repo`.
            calculate:
              "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
            as: "repo",
          },
        ],
        data: {
          url: "xorbs.json",
        },
        mark: "rect",
        encoding: {
          x: {
            field: "xorb_id",
            axis: null,
            sort: { field: "dedupe_factor", order: "descending" },
            stack: "normalize",
          },
          color: {
            // The hovered block is painted orange; everything else uses the
            // blues scheme over `dedupe_factor`.
            condition: [
              { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
            ],
            field: "dedupe_factor",
            type: "quantitative",
            scale: { scheme: "blues", domain: [0, 10] },
          },
          opacity: {
            // While a file is selected, fade blocks whose id is not among the
            // selected file's block ids.
            condition: [
              {
                test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
                value: 0.2,
              },
            ],
          },
          tooltip: [
            { field: "repo", type: "nominal", title: "File" },
            { field: "xorb_id", type: "nominal", title: "Block Hash" },
            {
              field: "dedupe_factor",
              type: "quantitative",
              title: "Dedupe Factor",
            },
          ],
          row: {
            field: "repo",
            title: "",
            spacing: 1,
            header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
            sort: { field: "repo", order: "ascending" },
          },
        },
      };

      // vegaEmbed returns a promise; report failures (e.g. xorbs.json failing
      // to load) instead of leaving the rejection unhandled.
      vegaEmbed("#vis", vlSpec).catch((err) => {
        console.error("Failed to render the dedupe visualization", err);
      });
    </script>
  </body>
</html>