Spaces:
Sleeping
Sleeping
File size: 4,038 Bytes
b253e66 8443315 b253e66 8443315 b253e66 a114a02 b253e66 a114a02 8443315 228bed3 fe36eff f962dd0 c290138 8d171c2 b6ab215 dd5d2e0 b6ab215 f962dd0 b6ab215 e5222c4 dd5d2e0 8443315 b6ab215 e5222c4 f962dd0 b6ab215 f962dd0 b6ab215 8443315 b6ab215 8443315 b253e66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
from enum import Enum
from pathlib import Path
import streamlit as st
import streamlit.components.v1 as components
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding
# Directory that contains this script; the component's front-end build is
# looked up relative to it.
root_dir = Path(__file__).resolve().parent

# Declare the custom "highlighted_text" Streamlit component, served from the
# "highlighted_text/build" directory next to this file.
highlighted_text_component = components.declare_component(
    "highlighted_text",
    path=root_dir.joinpath("highlighted_text", "build"),
)
def get_windows_batched(
    examples: BatchEncoding,
    window_len: int,
    stride: int = 1,
    pad_id: int = 0,
) -> BatchEncoding:
    """Cut every example of a batch into fixed-length sliding windows.

    For each example ``i`` and each start offset ``j`` in
    ``range(0, len(examples["input_ids"][i]) - 1, stride)``, every field of
    ``examples`` is sliced to ``[j : j + window_len]``. Windows that run past
    the end of the sequence are right-padded up to ``window_len``: with
    ``pad_id`` for the ``"input_ids"`` field and with ``0`` for every other
    field.

    Args:
        examples: Batch whose fields are indexable per example and whose
            per-example values support slicing and list concatenation.
        window_len: Length of each produced window.
        stride: Step between consecutive window start offsets.
        pad_id: Fill value used for the ``"input_ids"`` field.

    Returns:
        A ``BatchEncoding`` with the same keys, where each value is the flat
        list of windows of all examples, in example order and then start
        offset order.
    """
    windowed = {}
    for key, sequences in examples.items():
        fill_value = pad_id if key == "input_ids" else 0
        rows = []
        for example_idx in range(len(examples["input_ids"])):
            sequence = sequences[example_idx]
            # Start offsets are always derived from the length of "input_ids",
            # so that all fields yield the same number of windows.
            num_starts = len(examples["input_ids"][example_idx]) - 1
            for start in range(0, num_starts, stride):
                end = start + window_len
                # A non-positive repeat count yields an empty padding list.
                padding = [fill_value] * (end - len(sequence))
                rows.append(sequence[start:end] + padding)
        windowed[key] = rows
    return BatchEncoding(windowed)
# U+FFFD: looked for in decoded text below to decide whether the ids decoded
# so far form a complete piece of text.
BAD_CHAR = "\N{REPLACEMENT CHARACTER}"


def ids_to_readable_tokens(tokenizer, ids, strip_whitespace=False):
    """Decode token ids into one display string per id.

    Ids are decoded incrementally. While the decoded text of the pending ids
    contains ``BAD_CHAR``, an empty string is emitted for the current id and
    the id stays pending; once the pending ids decode without ``BAD_CHAR``,
    the decoded text is emitted for the current id and the pending list is
    emptied.

    Args:
        tokenizer: Object with a ``decode(list_of_ids) -> str`` method.
        ids: Iterable of token ids.
        strip_whitespace: If true, strip leading and trailing whitespace from
            each emitted non-empty string.

    Returns:
        A list of strings with exactly one entry per element of ``ids``.
    """
    pending = []
    readable = []
    for token_id in ids:
        pending.append(token_id)
        text = tokenizer.decode(pending)
        if BAD_CHAR in text:
            # Not decodable yet: emit a placeholder and keep accumulating.
            readable.append("")
            continue
        readable.append(text.strip() if strip_whitespace else text)
        pending.clear()
    return readable
# ---- Page header and user controls ----
st.header("Context length probing")

model_name = st.selectbox(
    "Model",
    ["distilgpt2", "gpt2", "EleutherAI/gpt-neo-125m"],
)
metric_name = st.selectbox(
    "Metric",
    ["KL divergence", "Cross entropy"],
    index=1,  # preselect "Cross entropy"
)
window_len = st.select_slider(
    r"Window size ($c_\text{max}$)",
    options=[2 ** exponent for exponent in range(3, 11)],  # 8, 16, ..., 1024
    value=512,
)
text = st.text_area(
    "Input text",
    "The complex houses married and single soldiers and their families.",
)

# Only the cross-entropy metric is implemented; stop the script run otherwise.
if metric_name == "KL divergence":
    st.error("KL divergence is not supported yet. Stay tuned!", icon="😭")
    st.stop()

# ---- Tokenizer / model loading (wrapped with Streamlit's resource cache) ----
load_tokenizer = st.cache_resource(AutoTokenizer.from_pretrained, show_spinner=False)
load_model = st.cache_resource(AutoModelForCausalLM.from_pretrained, show_spinner=False)
tokenizer = load_tokenizer(model_name)
model = load_model(model_name)

# Tokenize the text as a batch of one and unpack its single id sequence.
inputs = tokenizer([text])
(input_ids,) = inputs["input_ids"]

if len(input_ids) < 2:
    st.error("Please enter at least 2 tokens.", icon="🚨")
    st.stop()

# A window can never be longer than the tokenized input itself.
window_len = min(window_len, len(input_ids))
@st.cache_data(show_spinner=False)
@torch.inference_mode()
def run_context_length_probing(model_name, text, window_len):
    """Run the model over sliding windows of the input and compute scores.

    The function reads the module-level ``model``, ``tokenizer``, ``inputs``
    and ``input_ids``; its arguments serve as the cache key for
    ``st.cache_data``.

    Args:
        model_name: Must equal ``model.name_or_path`` (checked with an
            assertion).
        text: Not used in the computation; present so that the cache entry
            depends on the input text.
        window_len: Window length passed to ``get_windows_batched`` and used
            to re-align the model outputs.

    Returns:
        A float16 tensor. With ``N = len(input_ids)`` the indexing below gives
        it shape ``(N - 1, window_len - 1)``: one row per token after the
        first, each row divided by its largest absolute value (plus 1e-9).
        NOTE(review): the interpretation "row = target token, column =
        increase of context length by one token" assumes ``model(...).logits``
        is laid out as (batch, position, vocab) - confirm.
    """
    assert model.name_or_path == model_name
    del text  # only needed as a cache key; the computation reads `inputs` directly

    windows = get_windows_batched(
        inputs,
        window_len=window_len,
        pad_id=tokenizer.eos_token_id,
    ).convert_to_tensors("pt")

    batch_size = 8
    num_items = len(windows["input_ids"])
    pbar = st.progress(0.)

    # Forward passes in mini-batches; the first 90% of the bar tracks them.
    outputs = []
    for start in range(0, num_items, batch_size):
        pbar.progress(start / num_items * 0.9, f"Running model… ({start}/{num_items})")
        stop = start + batch_size
        batch = {name: tensor[start:stop] for name, tensor in windows.items()}
        # Stored in half precision; converted back to float32 further below.
        outputs.append(model(**batch).logits.to(torch.float16))

    pbar.progress(0.9, "Computing scores…")

    # Concatenate the batches and swap the first two axes.
    outputs = torch.cat(outputs, dim=0).permute(1, 0, 2)
    # Append `window_len` NaN entries along the middle axis, then flatten,
    # drop the last `window_len` rows and reshape with a row length that is
    # one shorter: this shifts row r by r places along the middle axis, and
    # the vacated places hold NaN.
    outputs = F.pad(outputs, (0, 0, 0, window_len, 0, 0), value=torch.nan)
    vocab_size = outputs.shape[-1]
    outputs = outputs.view(-1, vocab_size)[:-window_len]
    outputs = outputs.view(window_len, len(input_ids) + window_len - 2, vocab_size)

    log_probs = outputs.to(torch.float32).log_softmax(dim=-1)
    # For each of the first N - 1 places keep only the entry of the token
    # that follows it in the input.
    targets = input_ids[1:]
    log_probs = log_probs[:, torch.arange(len(targets)), targets]

    # Differences between consecutive rows, NaNs replaced by zero, then each
    # row of the transposed result scaled by its maximum absolute value.
    scores = log_probs.diff(dim=0).transpose(0, 1).nan_to_num()
    scores /= scores.abs().max(dim=1, keepdim=True).values + 1e-9
    scores = scores.to(torch.float16)

    pbar.progress(1., "Done!")
    return scores
# Compute (or fetch from the Streamlit cache) the scores for the current
# model, text and window length.
scores = run_context_length_probing(model_name=model_name, text=text, window_len=window_len)

# One display string per input token id, shown by the custom component
# together with the scores converted to nested Python lists.
tokens = ids_to_readable_tokens(tokenizer, input_ids)
highlighted_text_component(
    tokens=tokens,
    scores=scores.tolist(),
)
|