Spaces:
Running
Running
<!doctype html>
<html lang="en">
<head>
<!-- Character encoding comes first so it is seen before any other content. -->
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>TokenVisualizer — Two-Pane</title>
<!-- Google Fonts: the stylesheet is served from fonts.googleapis.com, the font
     files themselves from fonts.gstatic.com (fetched with CORS, hence crossorigin). -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600&family=JetBrains+Mono:wght@400;600&display=swap" rel="stylesheet">
<style>
/* Design tokens: colours and font stacks shared by every rule below. */
:root {
  --bg: #0b0f14; /* black-leaning bg */
  --text: #ffffff; /* white */
  --muted: #9aa4b2;
  --accent: #38bdf8; /* sky blue */
  --card: #0e1624;
  --border: #1f2a3a;
  --chip: #111827;
  --chip-border: #263246;
  --chip-hover: #1a2434;
  --mono: 'JetBrains Mono', ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
  /* Ends with the generic family so a sans-serif face is still used when none of the named fonts is installed. */
  --sans: Inter, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, 'Helvetica Neue', Arial, sans-serif;
}

/* Page scaffolding */
* { box-sizing: border-box; }
body { margin: 0; background: radial-gradient(900px 500px at 10% -10%, #07314a, transparent), var(--bg); color: var(--text); font-family: var(--sans); }
.container { max-width: 1100px; margin: 0 auto; padding: 1.25rem; }
/* header/footer also carry .container; the compound selectors keep these paddings
   from being overridden by the .container padding shorthand above. */
header.container { padding-top: 1.5rem; }
h1 { margin: .2rem 0 .4rem; font-size: 1.9rem; }
.sub { color: var(--muted); margin: .25rem 0 1rem; }
.card { background: linear-gradient(180deg, #0c1624, #0a1220); border: 1px solid var(--border); border-radius: 14px; padding: 1rem; box-shadow: 0 10px 40px rgba(0, 0, 0, .35); }

/* Form controls */
label span { color: var(--muted); font-size: .9rem; }
select, textarea { width: 100%; border-radius: 10px; border: 1px solid var(--border); background: #0a1220; color: var(--text); padding: .7rem .85rem; outline: none; }
select:focus, textarea:focus { border-color: var(--accent); }
/* The default outline is removed above, so give keyboard focus a clearly visible ring. */
select:focus-visible, textarea:focus-visible { outline: 2px solid var(--accent); outline-offset: 1px; }
.controls { display: grid; gap: .8rem; margin-bottom: 1rem; }
.row { display: flex; gap: .75rem; align-items: center; }
.btn { background: var(--accent); color: #07222d; border: 0; border-radius: 10px; padding: .55rem .95rem; font-weight: 600; cursor: pointer; }
.btn.secondary { background: #152236; color: var(--text); border: 1px solid var(--border); }
.status { color: var(--muted); }

/* Result panes: one column on narrow screens, two from 900px up. */
.grid { display: grid; gap: 1rem; grid-template-columns: 1fr; }
@media (min-width: 900px) { .grid { grid-template-columns: 1fr 1fr; } }
.head { display: flex; align-items: center; justify-content: space-between; margin-bottom: .5rem; }
.actions .link { background: none; border: none; color: var(--accent); cursor: pointer; margin-left: .5rem; }
.tokens { display: flex; flex-wrap: wrap; gap: .5rem; max-height: 360px; overflow: auto; padding: .25rem; }
.chip { border: 1px solid var(--chip-border); background: var(--chip); padding: .35rem .5rem; border-radius: 10px; font-family: var(--mono); font-size: .9rem; transition: background .12s, border-color .12s; }
.chip:hover { background: var(--chip-hover); border-color: var(--accent); }
.chip.active { outline: 2px solid var(--accent); }
pre.ids { font-family: var(--mono); background: #0a1220; border: 1px solid var(--border); border-radius: 10px; padding: .75rem; max-height: 360px; overflow: auto; white-space: pre-wrap; }

footer.container { color: var(--muted); text-align: center; padding: 1.25rem 0 2rem; }
a { color: var(--accent); }
</style>
| </head> | |
| <body> | |
<!-- Page banner: application name and a one-line description. -->
<header class="container">
  <h1>TokenVisualizer</h1>
  <p class="sub">See tokens and token IDs at the same time. (Runs entirely in your browser.)</p>
</header>
<main class="container">
  <!-- Controls: model picker, input text and the manual "Tokenize" trigger. -->
  <section class="card controls" aria-label="Tokenizer controls">
    <label>
      <span>Model</span>
      <select id="model">
        <option value="Xenova/gpt2">GPT-2 (BPE)</option>
        <option value="Xenova/llama2-tokenizer">Llama-2 (SentencePiece/BPE)</option>
        <option value="Xenova/mistral-tokenizer">Mistral (SentencePiece/BPE)</option>
        <option value="Xenova/gemma-tokenizer">Gemma (SentencePiece/BPE)</option>
        <option value="Xenova/bert-base-uncased">BERT Base Uncased (WordPiece)</option>
      </select>
    </label>
    <label>
      <span>Text</span>
      <textarea id="input" rows="3">Hello world! This is a tokenizer demo.</textarea>
    </label>
    <div class="row">
      <button id="tokenize" class="btn" type="button">Tokenize</button>
      <!-- The script rewrites this text asynchronously; role="status" lets
           assistive technology announce each change. -->
      <span id="status" class="status" role="status">Loading tokenizer…</span>
    </div>
  </section>

  <!-- Result panes. The script fills #tokens with chips and #ids with text.
       The Copy/Export buttons appear twice, so each gets a distinct accessible name. -->
  <section class="grid" aria-label="Tokenization results">
    <article class="card">
      <div class="head">
        <h3>Tokens</h3>
        <div class="actions">
          <button id="copyTokens" class="link" type="button" aria-label="Copy tokens">Copy</button>
          <button id="exportTokens" class="link" type="button" aria-label="Export JSON of tokens">Export JSON</button>
        </div>
      </div>
      <div id="tokens" class="tokens"></div>
    </article>
    <article class="card">
      <div class="head">
        <h3>Token IDs</h3>
        <div class="actions">
          <button id="copyIds" class="link" type="button" aria-label="Copy token IDs">Copy</button>
          <button id="exportIds" class="link" type="button" aria-label="Export JSON of token IDs">Export JSON</button>
        </div>
      </div>
      <pre id="ids" class="ids"></pre>
    </article>
  </section>

  <!-- Combined export of both panes. -->
  <section class="container" style="text-align:right;margin:1rem 0">
    <button id="exportCSV" class="btn secondary" type="button">Download CSV (index, token, id)</button>
  </section>
</main>
<!-- Attribution footer; the library link opens in a new tab. -->
<footer class="container">
  <small>Built by Peter Adams • Powered by <a href="https://github.com/xenova/transformers.js" target="_blank" rel="noreferrer">Transformers.js</a></small>
</footer>
| <!-- Transformers.js (ESM) --> | |
| <script type="module"> | |
// Load Transformers.js as an ES module from the jsDelivr CDN.
// NOTE(review): the version pin in the previous URL had been destroyed by
// e-mail obfuscation and could not be fetched. The pinned 2.17.2 release and
// the package-root URL follow the form shown in the library README — confirm
// this is the release the page was written against.
const tf = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2');
// Keep downloaded tokenizer files in the browser cache between visits.
tf.env.useBrowserCache = true;
// Never look for model files on the page's own origin; always use the hub.
tf.env.allowLocalModels = false;
// Shorthand: first element matching a CSS selector.
const $ = (selector) => document.querySelector(selector);

// Cached references to every DOM node the script reads or updates,
// built from a [property name, selector] table.
const el = {};
for (const [key, selector] of [
  ['model', '#model'],
  ['input', '#input'],
  ['btn', '#tokenize'],
  ['status', '#status'],
  ['tokens', '#tokens'],
  ['ids', '#ids'],
  ['copyTokens', '#copyTokens'],
  ['exportTokens', '#exportTokens'],
  ['copyIds', '#copyIds'],
  ['exportIds', '#exportIds'],
  ['exportCSV', '#exportCSV'],
]){
  el[key] = $(selector);
}

// Mutable page state.
let tokenizer = null;                // tokenizer currently in use; null until one is loaded
let last = { tokens: [], ids: [] };  // most recent tokenization result shown in the panes
let runId = 0;                       // bumped by every tokenize() call to detect stale results
/** Replace the text of the status line with `msg`. */
function status(msg){
  const { status: statusLine } = el;
  statusLine.textContent = msg;
}
/**
 * Wrap `fn` so it only runs once `ms` milliseconds have passed without another call.
 * Every call cancels the previously scheduled one; the arguments of the most
 * recent call are the ones forwarded to `fn`. The wrapper itself returns nothing.
 */
function debounce(fn, ms = 250){
  let pending;
  return function debounced(...args){
    clearTimeout(pending);
    pending = setTimeout(function fire(){
      fn(...args);
    }, ms);
  };
}
/**
 * Fetch the tokenizer for `modelId` and make it the current one.
 *
 * Downloads can overlap when the user switches models quickly, and they may
 * finish out of order. The result is therefore adopted only while `modelId`
 * is still the model selected in the picker, so a slow, outdated download can
 * never replace the tokenizer of a newer selection.
 *
 * On failure the status line reports the error (instead of staying on
 * "Loading tokenizer…") and the error is re-thrown, so callers still observe
 * the rejection.
 */
async function loadTokenizer(modelId){
  status('Loading tokenizer…');
  let loaded;
  try{
    loaded = await tf.AutoTokenizer.from_pretrained(modelId);
  }catch(err){
    // Only report failures for the model that is still selected.
    if (modelId === el.model.value) status('Error loading tokenizer. See console.');
    throw err;
  }
  if (modelId !== el.model.value) return; // selection changed while downloading
  tokenizer = loaded;
  status('Tokenizer ready.');
}
/**
 * Tokenize the textarea content with the current tokenizer and refresh both panes.
 *
 * Loads the tokenizer first when none is available. Blank input clears the
 * panes. A run whose result arrives after a newer run has started is dropped.
 *
 * Transformers.js `encode()` returns a plain array of token IDs, so the token
 * strings are derived from those IDs. An object result carrying
 * `tokens` / `ids` / `input_ids` is accepted as well.
 */
async function tokenize(){
  const myRun = ++runId;
  if (!tokenizer) await loadTokenizer(el.model.value);
  const text = el.input.value ?? '';
  if (!text.trim()){
    last = { tokens: [], ids: [] };
    render();
    status('Type to tokenize…');
    return;
  }
  status('Tokenizing…');
  try{
    const enc = await tokenizer.encode(text);
    if (myRun !== runId) return; // stale run
    // Normalise the IDs to plain numbers whatever container they arrive in.
    const rawIds = Array.isArray(enc) ? enc : (enc?.ids ?? enc?.input_ids ?? []);
    const ids = Array.from(rawIds, Number);
    let tokens;
    if (Array.isArray(enc?.tokens)){
      tokens = enc.tokens;
    }else if (typeof tokenizer.model?.convert_ids_to_tokens === 'function'){
      // Vocabulary entry for each ID.
      tokens = tokenizer.model.convert_ids_to_tokens(ids);
    }else{
      // Fallback: decode every ID on its own.
      tokens = ids.map(id => tokenizer.decode([id]));
    }
    last = { tokens, ids };
    render();
    status(`Done. ${last.tokens.length} tokens.`);
  }catch(err){
    console.error(err);
    status('Error tokenizing. See console.');
  }
}
/**
 * Redraw both panes from `last`: one hoverable chip per token, and the IDs
 * as a space-separated line. Shows the idle hint when there are no tokens.
 */
function render(){
  // Tokens pane: start from an empty container and add one chip per token.
  el.tokens.innerHTML = '';
  for (const [i, tok] of last.tokens.entries()){
    const chip = document.createElement('span');
    chip.className = 'chip';
    chip.dataset.idx = i;
    chip.textContent = tok;
    // Hovering a chip marks the ID at the same position in the other pane.
    chip.addEventListener('mouseenter', ()=>highlightID(i,true));
    chip.addEventListener('mouseleave', ()=>highlightID(i,false));
    el.tokens.appendChild(chip);
  }
  // IDs pane
  el.ids.textContent = last.ids.join(' ');
  if (last.tokens.length === 0) status('Type to tokenize…');
}
/**
 * Turn the highlight for position `i` on or off: the matching ID is wrapped
 * in square brackets in the IDs pane and the matching chip gets the `active` class.
 */
function highlightID(i, on){
  if (last.ids.length === 0) return;
  const rendered = last.ids.map((id, idx)=>{
    if (on && idx === i) return `[${id}]`;
    return String(id);
  });
  el.ids.textContent = rendered.join(' ');
  const chip = el.tokens.querySelector(`[data-idx="${i}"]`);
  chip?.classList.toggle('active', on);
}
// Copy / Export

/**
 * Write `text` to the clipboard and show `doneMsg` on success.
 * The clipboard API may be missing or the write may be rejected; in either
 * case the failure is logged and reported in the status line instead of
 * surfacing as an unhandled rejection.
 */
async function copyToClipboard(text, doneMsg){
  try{
    await navigator.clipboard.writeText(text);
    status(doneMsg);
  }catch(err){
    console.error(err);
    status('Copy failed. See console.');
  }
}

// Copy the tokens, space-separated.
el.copyTokens.addEventListener('click', async ()=>{
  if (!last.tokens.length) return;
  await copyToClipboard(last.tokens.join(' '), 'Tokens copied.');
});
// Download the tokens as a pretty-printed JSON array.
el.exportTokens.addEventListener('click', ()=>{
  download('tokens.json', JSON.stringify(last.tokens, null, 2), 'application/json');
});
// Copy the token IDs, space-separated.
el.copyIds.addEventListener('click', async ()=>{
  if (!last.ids.length) return;
  await copyToClipboard(last.ids.join(' '), 'IDs copied.');
});
// Download the token IDs as a pretty-printed JSON array.
el.exportIds.addEventListener('click', ()=>{
  download('ids.json', JSON.stringify(last.ids, null, 2), 'application/json');
});
// Download a CSV with one row per token: its index, the token text and its ID.
el.exportCSV.addEventListener('click', ()=>{
  if (!last.tokens.length) return;
  const toLine = (cells) => cells.map(csvCell).join(',');
  const lines = [toLine(['index','token','id'])];
  last.tokens.forEach((t, i)=>{
    lines.push(toLine([i, t, last.ids[i]]));
  });
  download('tokens_and_ids.csv', lines.join('\n'), 'text/csv');
});
/**
 * Format one value as a CSV field.
 * The value is converted with String(); if it contains a double quote, a
 * comma, a line feed or a carriage return it is wrapped in double quotes and
 * embedded quotes are doubled. Anything else is returned unchanged.
 */
function csvCell(v){
  const s = String(v);
  // \r is included so a bare carriage return cannot break a row apart.
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g,'""')}"` : s;
}
/**
 * Offer `data` to the user as a file download.
 *
 * @param name  suggested file name
 * @param data  file content, passed to the Blob constructor
 * @param type  MIME type of the content
 */
function download(name, data, type){
  const blob = new Blob([data], {type});
  const url = URL.createObjectURL(blob);
  const a = Object.assign(document.createElement('a'), {href:url, download:name});
  // Attach the link before clicking: a click on a detached anchor is not
  // honoured everywhere.
  a.style.display = 'none';
  document.body.appendChild(a);
  a.click();
  a.remove();
  // Release the blob URL later rather than immediately, so the browser has
  // had the chance to start reading it.
  setTimeout(()=>URL.revokeObjectURL(url), 1000);
}
// Events
// Manual trigger, plus live tokenization shortly after the user stops typing.
el.btn.addEventListener('click', tokenize);
el.input.addEventListener('input', debounce(tokenize, 250));
// Switching models drops the current tokenizer, loads the new one and re-runs.
el.model.addEventListener('change', async ()=>{
  tokenizer = null;
  try{
    await loadTokenizer(el.model.value);
  }catch(err){
    // A failed download must not leave the page silently stuck.
    console.error(err);
    status('Error loading tokenizer. See console.');
    return;
  }
  tokenize();
});
// Initial load: fetch the tokenizer for the preselected model, then tokenize
// the sample text. A failed download is reported in the status line instead
// of rejecting the module's top-level await.
try{
  await loadTokenizer(el.model.value);
  tokenize();
}catch(err){
  console.error(err);
  status('Error loading tokenizer. See console.');
}
| </script> | |
| </body> | |
| </html> | |