File size: 4,773 Bytes
3f63dc8
 
 
 
 
 
 
6e5511f
 
 
3f63dc8
 
 
 
 
 
 
 
35da923
 
 
3f63dc8
 
 
 
 
35da923
 
 
 
 
 
 
 
3f63dc8
 
6e5511f
 
3f63dc8
 
6e5511f
 
3f63dc8
35da923
3f63dc8
 
35da923
3f63dc8
6e5511f
 
3f63dc8
 
 
 
 
 
6e5511f
 
3f63dc8
 
 
 
 
6e5511f
3f63dc8
 
6e5511f
 
3f63dc8
 
6e5511f
 
3f63dc8
 
 
 
 
 
6e5511f
 
3f63dc8
 
 
 
 
 
6e5511f
3f63dc8
 
6e5511f
 
3f63dc8
 
 
 
6e5511f
3f63dc8
 
6e5511f
3f63dc8
 
 
 
 
 
 
6e5511f
3f63dc8
 
 
6e5511f
3f63dc8
 
 
6e5511f
3f63dc8
 
6e5511f
3f63dc8
 
 
 
6e5511f
3f63dc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e5511f
3f63dc8
6e5511f
3f63dc8
728801d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Repo-Level Dedupe Visualization</title>
    <link rel="stylesheet" href="style.css" />
    <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  </head>
  <body>
    <div class="container">
      <div class="header">
        <h1>Visualizing Repo-Level Dedupe</h1>
        <p>
          This visualization demonstrates block-level deduplication across all
          models in
          <a
            target="_blank"
            href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
            >bartowski/gemma-2-9b-it-GGUF</a
          >.
        </p>
        <p>
          Each row represents a file in the repository grouped into blocks of up
          to 64MB. The color of each block represents the deduplication ratio
          for the block, which is a function of how often the chunks in the
          block are shared between files. The darker the color, the more
          frequently the block's content is shared and the better the overall
          upload and download times for a given file! You can read more about
          chunks, blocks, and the nitty-gritty details of how we make this all
          work in our accompanying
          <a target="_blank" href="#">blog post</a>.
        </p>
        To explore the visualization:
        <ul>
          <li>
            <strong>Hover</strong> over a block in an individual file to
            highlight it and see where else it appears in the repository.
          </li>
          <li>
            <strong>Click</strong> any block in a file to see all other files
            that share blocks with it.
          </li>
          <li>
            <strong>Double-click</strong> anywhere on any file to reset and
            continue exploring.
          </li>
        </ul>
      </div>

      <div class="heatmap-container">
        <div id="vis"></div>
      </div>
    </div>
    <script>
      // Vega-Lite spec for the dedupe heatmap: one facet row per file (`repo`)
      // and one rect per block (`xorb_id`), colored by `dedupe_factor`.
      const vlSpec = {
        $schema: "https://vega.github.io/schema/vega-lite/v5.json",
        // Each row gets its own x scale, so a row only lays out its own blocks.
        resolve: { scale: { x: "independent" } },
        width: 800,
        height: 25,
        params: [
          {
            // Hovering a block selects its xorb_id; the color encoding below
            // paints matching blocks orange.
            name: "highlight",
            select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
          },
          {
            // Selecting a block picks its file (`repo`). Toggling is disabled
            // so a new selection replaces the previous one.
            name: "select",
            select: { type: "point", fields: ["repo"], toggle: false },
          },
          {
            // One entry per data row: the row's xorb_id when its file is
            // selected, otherwise null (see `repo_xorb_selected` below).
            // NOTE(review): 'source_0' is presumably the dataset name generated
            // by the Vega-Lite compiler — confirm if the spec's data layout changes.
            name: "xorbs_selected",
            expr: "pluck(data('source_0'), 'repo_xorb_selected')",
          },
          {
            // True once `xorbs_selected` contains at least one non-null value.
            name: "any_xorbs_selected",
            expr: "extent(xorbs_selected)[0] != null",
          },
        ],
        transform: [
          {
            // Reduce `repo` to the last path segment first, so the selection
            // test below compares against the same value that is displayed in
            // the row header and stored by the `select` parameter.
            calculate:
              "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
            as: "repo",
          },
          {
            // Positive when this row's file is part of the current selection,
            // 0 when it is not or nothing is selected.
            calculate:
              "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
            as: "repo_selected",
          },
          {
            // Expose the block id only for rows that belong to a selected file.
            calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
            as: "repo_xorb_selected",
          },
        ],
        data: {
          url: "xorbs.json",
        },
        mark: "rect",
        encoding: {
          x: {
            field: "xorb_id",
            axis: null,
            sort: { field: "dedupe_factor", order: "descending" },
            stack: "normalize",
          },
          color: {
            condition: [
              // NOTE(review): keep the loose `==` here; the selection value may
              // be array-wrapped, in which case `===` would never match —
              // confirm before tightening.
              { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
            ],
            field: "dedupe_factor",
            type: "quantitative",
            scale: { scheme: "blues", domain: [0, 10] },
          },
          opacity: {
            condition: [
              {
                // While a file is selected, fade every block that is not one
                // of that file's blocks.
                test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
                value: 0.2,
              },
            ],
          },
          tooltip: [
            { field: "repo", type: "nominal", title: "File" },
            { field: "xorb_id", type: "nominal", title: "Block Hash" },
            {
              field: "dedupe_factor",
              type: "quantitative",
              title: "Dedupe Factor",
            },
          ],
          row: {
            field: "repo",
            title: "",
            spacing: 1,
            header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
            sort: { field: "dedupe_factor", order: "descending" },
          },
        },
      };

      // vegaEmbed returns a promise; report failures instead of leaving the
      // rejection unhandled.
      vegaEmbed("#vis", vlSpec).catch((error) => {
        console.error("Failed to render the dedupe visualization:", error);
      });
    </script>
  </body>
</html>