PeterPinetree committed on
Commit
c994801
·
verified ·
1 Parent(s): 2cb5f4f

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +248 -11
index.html CHANGED
@@ -1,14 +1,251 @@
1
  <!doctype html>
2
  <html lang="en">
3
- <head>
4
- <meta charset="UTF-8" />
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
- <title>The Tokenizer Playground</title>
7
- <script type="module" crossorigin src="/assets/index-DEbmRw68.js"></script>
8
- <link rel="stylesheet" crossorigin href="/assets/index-Dhl4q2CV.css">
9
- </head>
10
-
11
- <body>
12
- <div id="root"></div>
13
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  </html>
 
1
  <!doctype html>
2
  <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8" />
5
+ <meta
6
+ name="viewport"
7
+ content="width=device-width, initial-scale=1"
8
+ />
9
+ <title>TokenVisualizer — Two-Pane</title>
10
+ <link rel="preconnect" href="https://fonts.googleapis.com">
11
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
12
+ <style>
13
+ :root{
14
+ --bg:#0b0f14; /* black-leaning bg */
15
+ --text:#ffffff; /* white */
16
+ --muted:#9aa4b2;
17
+ --accent:#38bdf8; /* sky blue */
18
+ --card:#0e1624;
19
+ --border:#1f2a3a;
20
+ --chip:#111827;
21
+ --chip-border:#263246;
22
+ --chip-hover:#1a2434;
23
+ --mono:'JetBrains Mono', ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
24
+ --sans:Inter, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, 'Helvetica Neue', Arial;
25
+ }
26
+ *{box-sizing:border-box}
27
+ body{margin:0;background:radial-gradient(900px 500px at 10% -10%, #07314a, transparent),var(--bg);color:var(--text);font-family:var(--sans)}
28
+ .container{max-width:1100px;margin:0 auto;padding:1.25rem}
29
+ header{padding-top:1.5rem}
30
+ h1{margin:.2rem 0 .4rem;font-size:1.9rem}
31
+ .sub{color:var(--muted);margin:.25rem 0 1rem}
32
+ .card{background:linear-gradient(180deg,#0c1624,#0a1220);border:1px solid var(--border);border-radius:14px;padding:1rem;box-shadow:0 10px 40px rgba(0,0,0,.35)}
33
+ label span{color:var(--muted);font-size:.9rem}
34
+ select,textarea{width:100%;border-radius:10px;border:1px solid var(--border);background:#0a1220;color:var(--text);padding:.7rem .85rem;outline:none}
35
+ select:focus,textarea:focus{border-color:var(--accent)}
36
+ .controls{display:grid;gap:.8rem;margin-bottom:1rem}
37
+ .row{display:flex;gap:.75rem;align-items:center}
38
+ .btn{background:var(--accent);color:#07222d;border:0;border-radius:10px;padding:.55rem .95rem;font-weight:600;cursor:pointer}
39
+ .btn.secondary{background:#152236;color:var(--text);border:1px solid var(--border)}
40
+ .status{color:var(--muted)}
41
+ .grid{display:grid;gap:1rem;grid-template-columns:1fr}
42
+ @media (min-width:900px){.grid{grid-template-columns:1fr 1fr}}
43
+ .head{display:flex;align-items:center;justify-content:space-between;margin-bottom:.5rem}
44
+ .actions .link{background:none;border:none;color:var(--accent);cursor:pointer;margin-left:.5rem}
45
+ .tokens{display:flex;flex-wrap:wrap;gap:.5rem;max-height:360px;overflow:auto;padding:.25rem}
46
+ .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
47
+ .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
48
+ .chip.active{outline:2px solid var(--accent)}
49
+ pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
50
+ footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
51
+ a{color:var(--accent)}
52
+ </style>
53
+ </head>
54
+ <body>
55
+ <header class="container">
56
+ <h1>TokenVisualizer</h1>
57
+ <p class="sub">See tokens and token IDs at the same time. (Runs entirely in your browser.)</p>
58
+ </header>
59
+
60
+ <main class="container">
61
+ <section class="card controls">
62
+ <label>
63
+ <span>Model</span>
64
+ <select id="model">
65
+ <option value="Xenova/gpt2">GPT-2 (BPE)</option>
66
+ <option value="Xenova/llama2-tokenizer">Llama-2 (SentencePiece/BPE)</option>
67
+ <option value="Xenova/mistral-tokenizer">Mistral (SentencePiece/BPE)</option>
68
+ <option value="Xenova/gemma-tokenizer">Gemma (SentencePiece/BPE)</option>
69
+ <option value="Xenova/bert-base-uncased">BERT Base Uncased (WordPiece)</option>
70
+ </select>
71
+ </label>
72
+ <label>
73
+ <span>Text</span>
74
+ <textarea id="input" rows="3">Hello world! This is a tokenizer demo.</textarea>
75
+ </label>
76
+ <div class="row">
77
+ <button id="tokenize" class="btn" type="button">Tokenize</button>
78
+ <span id="status" class="status" role="status">Loading tokenizer…</span> <!-- live region: script-written status messages are announced by screen readers -->
79
+ </div>
80
+ </section>
81
+
82
+ <section class="grid">
83
+ <article class="card">
84
+ <div class="head">
85
+ <h3>Tokens</h3>
86
+ <div class="actions">
87
+ <button id="copyTokens" class="link">Copy</button>
88
+ <button id="exportTokens" class="link">Export JSON</button>
89
+ </div>
90
+ </div>
91
+ <div id="tokens" class="tokens"></div>
92
+ </article>
93
+
94
+ <article class="card">
95
+ <div class="head">
96
+ <h3>Token IDs</h3>
97
+ <div class="actions">
98
+ <button id="copyIds" class="link">Copy</button>
99
+ <button id="exportIds" class="link">Export JSON</button>
100
+ </div>
101
+ </div>
102
+ <pre id="ids" class="ids"></pre>
103
+ </article>
104
+ </section>
105
+
106
+ <section class="container" style="text-align:right;margin:1rem 0">
107
+ <button id="exportCSV" class="btn secondary">Download CSV (index, token, id)</button>
108
+ </section>
109
+ </main>
110
+
111
+ <footer class="container">
112
+ <small>Built by Peter Adams • Powered by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
113
+ </footer>
114
+
115
+ <!-- Transformers.js (ESM) -->
116
+ <script type="module">
117
+ // Load transformers.js from CDN
118
+ const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/[email protected]/dist/transformers.min.mjs');
119
+ tf.env.useBrowserCache = true;
120
+ tf.env.allowLocalModels = false;
121
+
122
+ const $ = sel => document.querySelector(sel); // shorthand: first element matching a CSS selector
123
+ const el = { // page elements used by the script, looked up once at start-up
124
+ model: $('#model'), // model <select>
125
+ input: $('#input'), // text <textarea>
126
+ btn: $('#tokenize'), // "Tokenize" button
127
+ status: $('#status'), // status line written by status()
128
+ tokens: $('#tokens'), // container that receives one chip per token
129
+ ids: $('#ids'), // <pre> that shows the token IDs
130
+ copyTokens: $('#copyTokens'),
131
+ exportTokens: $('#exportTokens'),
132
+ copyIds: $('#copyIds'),
133
+ exportIds: $('#exportIds'),
134
+ exportCSV: $('#exportCSV'),
135
+ };
136
+
137
+ let tokenizer = null; // active tokenizer; null until loadTokenizer() assigns it, and reset to null when the model changes
138
+ let last = { tokens: [], ids: [] }; // most recent result, read by render() and the copy/export handlers
139
+ let runId = 0; // incremented by every tokenize() call so an overtaken call can detect it is stale
140
+
141
+ function status(msg){ el.status.textContent = msg; } // show msg in the status line
142
+ function debounce(fn, ms=250){ let t; return (...a)=>{ clearTimeout(t); t=setTimeout(()=>fn(...a), ms); }; } // wrapper that runs fn only after ms milliseconds pass without another call; fn's return value is discarded
143
+
144
+ async function loadTokenizer(modelId){
145
+ status('Loading tokenizer…');
146
+ tokenizer = await tf.AutoTokenizer.from_pretrained(modelId);
147
+ status('Tokenizer ready.');
148
+ }
149
+
150
+ async function tokenize(){
151
+ const myRun = ++runId;
152
+ if (!tokenizer) await loadTokenizer(el.model.value);
153
+
154
+ const text = el.input.value ?? '';
155
+ if (!text.trim()){
156
+ last = { tokens: [], ids: [] };
157
+ render();
158
+ status('Type to tokenize…');
159
+ return;
160
+ }
161
+
162
+ status('Tokenizing…');
163
+ try{
164
+ const enc = await tokenizer.encode(text);
165
+ if (myRun !== runId) return; // stale run
166
+ last.tokens = enc.tokens;
167
+ last.ids = enc.ids;
168
+ render();
169
+ status(`Done. ${last.tokens.length} tokens.`);
170
+ }catch(err){
171
+ console.error(err);
172
+ status('Error tokenizing. See console.');
173
+ }
174
+ }
175
+
176
+ function render(){ // rebuild the token chips and the ID list from `last`
177
+ // Tokens pane
178
+ el.tokens.innerHTML = ''; // remove the chips left by the previous render
179
+ last.tokens.forEach((tok, i)=>{
180
+ const span = document.createElement('span');
181
+ span.className = 'chip';
182
+ span.dataset.idx = i; // lets highlightID() find this chip through its data-idx attribute
183
+ span.textContent = tok; // assigned as text, so token content is never parsed as HTML
184
+ span.addEventListener('mouseenter', ()=>highlightID(i,true));
185
+ span.addEventListener('mouseleave', ()=>highlightID(i,false));
186
+ el.tokens.appendChild(span);
187
+ });
188
+
189
+ // IDs pane
190
+ el.ids.textContent = last.ids.join(' '); // IDs separated by single spaces
191
+ if (!last.tokens.length) status('Type to tokenize…'); // empty result: prompt for input
192
+ }
193
+
194
+ function highlightID(i, on){ // mark (on=true) or unmark (on=false) token i in both panes
195
+ if (!last.ids.length) return; // nothing rendered yet
196
+ const parts = last.ids.map((id, idx)=> (idx===i && on ? `[${id}]` : String(id))); // the highlighted ID is wrapped in square brackets
197
+ el.ids.textContent = parts.join(' '); // rewrites the whole ID list on every call
198
+ const chip = el.tokens.querySelector(`[data-idx="${i}"]`);
199
+ if (chip) chip.classList.toggle('active', on); // the second argument forces the class on or off
200
+ }
201
+
202
+ // Copy / Export
203
+ el.copyTokens.addEventListener('click', async ()=>{
204
+ if (!last.tokens.length) return;
205
+ await navigator.clipboard.writeText(last.tokens.join(' '));
206
+ status('Tokens copied.');
207
+ });
208
+ el.exportTokens.addEventListener('click', ()=>{
209
+ download('tokens.json', JSON.stringify(last.tokens, null, 2), 'application/json');
210
+ });
211
+ el.copyIds.addEventListener('click', async ()=>{
212
+ if (!last.ids.length) return;
213
+ await navigator.clipboard.writeText(last.ids.join(' '));
214
+ status('IDs copied.');
215
+ });
216
+ el.exportIds.addEventListener('click', ()=>{
217
+ download('ids.json', JSON.stringify(last.ids, null, 2), 'application/json');
218
+ });
219
+ el.exportCSV.addEventListener('click', ()=>{
220
+ if (!last.tokens.length) return;
221
+ const rows = last.tokens.map((t,i)=>[i,t,last.ids[i]]);
222
+ const csv = [['index','token','id'], ...rows].map(r=>r.map(csvCell).join(',')).join('\n');
223
+ download('tokens_and_ids.csv', csv, 'text/csv');
224
+ });
225
+
226
+ function csvCell(v){
227
+ const s = String(v);
228
+ return /[",\n]/.test(s) ? `"${s.replace(/"/g,'""')}"` : s;
229
+ }
230
+ function download(name, data, type){
231
+ const blob = new Blob([data], {type});
232
+ const url = URL.createObjectURL(blob);
233
+ const a = Object.assign(document.createElement('a'), {href:url, download:name});
234
+ a.click(); URL.revokeObjectURL(url);
235
+ }
236
+
237
+ // Events
238
+ el.btn.addEventListener('click', tokenize); // explicit run from the button
239
+ el.input.addEventListener('input', debounce(tokenize, 250)); // re-run 250 ms after the last edit
240
+ el.model.addEventListener('change', async ()=>{ // model switched: load its tokenizer, then re-tokenize
241
+ tokenizer = null; // a tokenize() call that starts meanwhile sees no tokenizer and loads one itself
242
+ await loadTokenizer(el.model.value);
243
+ tokenize();
244
+ });
245
+
246
+ // Initial load: fetch the tokenizer for the model selected at start-up, then tokenize the textarea's current text.
247
+ await loadTokenizer(el.model.value); // top-level await: the module pauses here until the tokenizer is ready
248
+ tokenize();
249
+ </script>
250
+ </body>
251
  </html>