jsulz HF staff committed on
Commit
3f63dc8
·
1 Parent(s): 1c35e78

updating viz, space layout, and data

Browse files
Files changed (3) hide show
  1. index.html +110 -74
  2. style.css +50 -17
  3. xorbs.json +0 -0
index.html CHANGED
@@ -1,105 +1,141 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>xet-repo-dedupe</title>
7
- <link rel="stylesheet" href="style.css" />
8
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
9
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
10
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
11
- <style>
12
- #vis {
13
- width: 100%;
14
- text-align: center;
15
- }
16
- </style>
17
- </head>
18
- <body>
19
- <div class="card">
20
- <h1>Visualizing Repo-level Dedupe</h1>
21
- <p>This visualization demonstrates the amount of <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a> across all public repos.</p>
22
- <p>"Dedupe factor" is defined as the number of re-uses of a given "xorb". A "xorb" is a collection of content-defined chunks, typically around 1,000 chunks comprising up to 64 MB of total data.</p>
23
- <p>Interactions:
 
 
 
 
 
 
 
 
 
24
  <ul>
25
  <li>
26
- Hover to select a xorb, and highlight the same xorb in all other repos in <strong><span style="color: red">red</span></strong>.
 
27
  </li>
28
  <li>
29
- Click to select a row (repo), and fade out all repos that don't contain any overlapping data. Double-click to clear selection.
 
 
 
 
 
30
  </li>
31
  </ul>
32
- </p>
33
- </div>
34
- <div id="vis"></div>
 
 
 
35
  <script>
36
  var vlSpec = {
37
- "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
38
- "resolve": {"scale": {"x": "independent"}},
39
- "width": 600,
40
- "height": 12,
41
- "params": [
42
  {
43
- "name": "highlight",
44
- "select": {"type": "point", "fields": ["xorb_id"], "on": "pointerover"}
45
  },
46
  {
47
- "name": "select",
48
- "select": {"type": "point", "fields": ["repo"], "toggle": "false"}
49
  },
50
  {
51
- "name": "xorbs_selected",
52
- "expr": "pluck(data('source_0'), 'repo_xorb_selected')"
 
 
 
 
53
  },
54
- {"name": "any_xorbs_selected", "expr": "extent(xorbs_selected)[0] != null"}
55
  ],
56
- "transform": [
 
 
 
 
 
57
  {
58
- "calculate": "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
59
- "as": "repo_selected"
60
  },
61
  {
62
- "calculate": "if(datum.repo_selected > 0, datum.xorb_id, null)",
63
- "as": "repo_xorb_selected"
64
- }
 
65
  ],
66
- "data": {
67
- "url": "xorbs.json"
68
  },
69
- "mark": "rect",
70
- "encoding": {
71
- "x": {
72
- "field": "xorb_id",
73
- "axis": null,
74
- "sort": {"field": "dedupe_factor", "order": "descending"},
75
- "stack": "normalize"
76
  },
77
- "color": {
78
- "condition": [
79
- {"test": "datum.xorb_id == highlight.xorb_id", "value": "orange"}
80
  ],
81
- "field": "dedupe_factor",
82
- "type": "quantitative",
83
- "scale": {"domain": [0, 10]}
84
  },
85
- "opacity": {
86
- "condition": [
87
  {
88
- "test": "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
89
- "value": 0.2
90
- }
91
- ]
92
  },
93
- "tooltip": {"field": "dedupe_factor"},
94
- "row": {
95
- "field": "repo",
96
- "spacing": 1,
97
- "header": {"labelAngle": 0, "labelAlign": "left"},
98
- "sort": {"field": "dedupe_factor", "order": "descending"}
99
- }
100
- }
 
 
 
 
 
 
 
 
 
101
  };
102
- vegaEmbed('#vis', vlSpec);
103
  </script>
104
- </body>
105
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>Repo-Level Dedupe Visualization</title>
7
+ <link rel="stylesheet" href="style.css" />
8
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
9
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
10
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
11
+ </head>
12
+ <body>
13
+ <div class="container">
14
+ <div class="header">
15
+ <h1>Visualizing Repo-Level Dedupe</h1>
16
+ <p>
17
+ This visualization demonstrates block-level deduplication across all
18
+ models in
19
+ <a href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
20
+ >bartowski/gemma-2-9b-it-GGUF</a
21
+ >.
22
+ </p>
23
+ <p>
24
+ Each row represents a file in the repository grouped into blocks of up
25
+ to 64MB. The color of each block represents the dedupe factor for the
26
+ block, which is a function of how many chunks in the block are present
27
+ across all files in the repository. The darker the color, the more the
28
+ block shares content. You can read more about chunks, blocks, and how
29
+ the dedupe factor is calculated
30
+ <a href="#">in this blog post</a>.
31
+ </p>
32
+ To explore the visualization:
33
  <ul>
34
  <li>
35
+ <strong>Hover</strong> over a block in an individual file to
36
+ highlight it and see where else it appears in the repository.
37
  </li>
38
  <li>
39
+ <strong>Click</strong> any block in a file to see all other files
40
+ that share blocks with it.
41
+ </li>
42
+ <li>
43
+ <strong>Double-click</strong> anywhere on a file to reset and
44
+ continue exploring.
45
  </li>
46
  </ul>
47
+ </div>
48
+
49
+ <div class="heatmap-container">
50
+ <div id="vis"></div>
51
+ </div>
52
+ </div>
53
  <script>
54
  var vlSpec = {
55
+ $schema: "https://vega.github.io/schema/vega-lite/v5.json",
56
+ resolve: { scale: { x: "independent" } },
57
+ width: 800,
58
+ height: 25,
59
+ params: [
60
  {
61
+ name: "highlight",
62
+ select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
63
  },
64
  {
65
+ name: "select",
66
+ select: { type: "point", fields: ["repo"], toggle: "false" },
67
  },
68
  {
69
+ name: "xorbs_selected",
70
+ expr: "pluck(data('source_0'), 'repo_xorb_selected')",
71
+ },
72
+ {
73
+ name: "any_xorbs_selected",
74
+ expr: "extent(xorbs_selected)[0] != null",
75
  },
 
76
  ],
77
+ transform: [
78
+ {
79
+ calculate:
80
+ "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
81
+ as: "repo_selected",
82
+ },
83
  {
84
+ calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
85
+ as: "repo_xorb_selected",
86
  },
87
  {
88
+ calculate:
89
+ "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
90
+ as: "repo",
91
+ },
92
  ],
93
+ data: {
94
+ url: "xorbs.json",
95
  },
96
+ mark: "rect",
97
+ encoding: {
98
+ x: {
99
+ field: "xorb_id",
100
+ axis: null,
101
+ sort: { field: "dedupe_factor", order: "descending" },
102
+ stack: "normalize",
103
  },
104
+ color: {
105
+ condition: [
106
+ { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
107
  ],
108
+ field: "dedupe_factor",
109
+ type: "quantitative",
110
+ scale: { scheme: "blues", domain: [0, 10] },
111
  },
112
+ opacity: {
113
+ condition: [
114
  {
115
+ test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
116
+ value: 0.2,
117
+ },
118
+ ],
119
  },
120
+ tooltip: [
121
+ { field: "repo", type: "nominal", title: "File" },
122
+ { field: "xorb_id", type: "nominal", title: "Block Hash" },
123
+ {
124
+ field: "dedupe_factor",
125
+ type: "quantitative",
126
+ title: "Dedupe Factor",
127
+ },
128
+ ],
129
+ row: {
130
+ field: "repo",
131
+ title: "",
132
+ spacing: 1,
133
+ header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
134
+ sort: { field: "dedupe_factor", order: "descending" },
135
+ },
136
+ },
137
  };
138
+ vegaEmbed("#vis", vlSpec);
139
  </script>
140
+ </body>
141
  </html>
style.css CHANGED
@@ -1,28 +1,61 @@
1
  body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
 
 
 
4
  }
5
 
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
 
 
 
 
 
 
 
 
 
9
  }
10
 
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
  margin-bottom: 10px;
15
- margin-top: 5px;
16
  }
17
 
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
- .card p:last-child {
27
- margin-bottom: 0;
 
 
 
28
  }
 
1
  body {
2
+ font-family: Arial, sans-serif;
3
+ margin: 0;
4
+ padding: 20px;
5
+ background-color: #f8f9fa;
6
+ color: #333;
7
  }
8
 
9
+ .container {
10
+ width: 85%;
11
+ margin: auto;
12
+ text-align: center;
13
+ }
14
+
15
+ .header {
16
+ background: white;
17
+ padding: 15px;
18
+ border-radius: 8px;
19
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
20
+ text-align: left;
21
  }
22
 
23
+ .header h1 {
24
+ font-size: 20px;
 
25
  margin-bottom: 10px;
 
26
  }
27
 
28
+ .header a {
29
+ color: #007bff;
30
+ text-decoration: none;
31
+ }
32
+
33
+ .heatmap-container {
34
+ display: flex;
35
+ justify-content: center;
36
+ align-items: start;
37
+ margin-top: 20px;
38
+ }
39
+
40
+ #heatmap {
41
+ flex: 1;
42
+ padding: 10px;
43
+ }
44
+
45
+ .legend {
46
+ width: 150px;
47
+ text-align: left;
48
+ margin-left: 20px;
49
+ }
50
+
51
+ .legend h3 {
52
+ font-size: 16px;
53
+ margin-bottom: 8px;
54
  }
55
 
56
+ .color-scale {
57
+ width: 100%;
58
+ height: 20px;
59
+ background: linear-gradient(to right, #e0f3f8, #abd9e9, #74add1, #4575b4);
60
+ border-radius: 5px;
61
  }
xorbs.json CHANGED
The diff for this file is too large to render. See raw diff