import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util
# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)
# Initialize session state
if 'text' not in st.session_state:
    st.session_state.text = ""
if 'token_results' not in st.session_state:
    st.session_state.token_results = None
# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"
# Special tokens and their IDs
SPECIAL_TOKENS = {
    "<uppercase>": 0,  # Uppercase letter marker
    "<space>": 1,      # Space character
    "<newline>": 2,    # Newline character
    "<tab>": 3,        # Tab character
    "<unknown>": 4     # Unknown token
}
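
# These IDs occupy the 'special' range (0-4) declared in TOKEN_RANGES below;
# all other IDs come from the root-word, suffix, and BPE vocabularies.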
# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
    "<uppercase>": "[uppercase]",  # Shown in place of the uppercase marker
    "<space>": "[space]",          # Shown in place of a space
    "<newline>": "[newline]",      # Shown in place of a newline
    "<tab>": "[tab]",              # Shown in place of a tab
    "<unknown>": "[unknown]"       # Shown for unknown tokens
}
# Colors for special tokens
SPECIAL_COLORS = {
    "<uppercase>": "#FF9999",  # Light red for uppercase markers
    "<space>": "#CCCCCC",      # Gray for spaces
    "<newline>": "#CCCCCC",    # Gray for newlines
    "<tab>": "#CCCCCC",        # Gray for tabs
    "<unknown>": "#FF0000"     # Red for unknown tokens
}
# Required files mapping
REQUIRED_FILES = {
    'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
    'kokler_v05.json': 'turkish_tokenizer/kokler_v05.json',
    'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
    'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}
# Token ID ranges
TOKEN_RANGES = {
    'special': (0, 4),           # Special tokens
    'root_words': (5, 20000),    # Root words
    'suffixes': (22268, 22767),  # Suffixes
    'bpe': (20000, None)         # BPE tokens (20000+)
}
def generate_colors(n):
    """Generate n visually distinct colors."""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + (i % 3) * 0.1  # Vary saturation between 0.3-0.5
        value = 0.95 - (i % 2) * 0.1      # Vary value between 0.85-0.95
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255),
            int(rgb[1] * 255),
            int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors
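
# Example (illustrative): generate_colors(3) returns three light, pastel hex
# strings such as "#f2a9a9", with evenly spaced hues; these are used below as
# background colors for regular (non-special) tokens.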
def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from GitHub repository."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url)
    if response.status_code == 200:
        content = base64.b64decode(response.json()['content']).decode('utf-8')
        return content
    else:
        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
        return None
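
# The GitHub contents API returns the file body base64-encoded under the
# 'content' key, hence the decode step above. Example call (illustrative):
#   code = fetch_github_file("turkish_tokenizer/turkish_tokenizer.py")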
@st.cache_resource
def load_tokenizer():
    """Load and initialize the tokenizer from GitHub."""
    temp_dir = Path("temp_tokenizer")
    temp_dir.mkdir(exist_ok=True)

    # Fetch required files
    for local_name, github_path in REQUIRED_FILES.items():
        content = fetch_github_file(github_path)
        if content is None:
            return None
        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
            f.write(content)

    # Modify tokenizer to use correct paths
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()
    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)

    # Load module
    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)
    return module.tokenize
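
# The cached callable is used below as tokenize(text). Based on how its result
# is consumed in this app, it is expected to return a dict with a "tokens" list
# and a matching "ids" list; the exact format is defined by turkish_tokenizer.py.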
@st.cache_data(ttl=3600)
def get_commit_history():
    """Fetch commit history from GitHub."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            commits = response.json()
            versions = []
            for commit in commits[:10]:
                date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                sha = commit['sha'][:7]
                message = commit['commit']['message'].split('\n')[0][:50]
                versions.append(f"{date} - {sha} - {message}")
            return versions
        return ["latest"]
    except Exception as e:
        st.warning(f"Could not fetch commit history: {str(e)}")
        return ["latest"]
def render_tokens(tokens, token_colors):
    """Render colored token visualization."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use symbol for special tokens
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
        )
    return " ".join(html_tokens)
# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
    st.error("Failed to load tokenizer from GitHub")
    st.stop()
# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")
# Model selection
versions = get_commit_history()
model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
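# Note: the selected entry is not currently passed to fetch_github_file or
# load_tokenizer; files are always fetched from GITHUB_BRANCH.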
# Main layout
col1, col2 = st.columns([0.4, 0.6])
# Input column
with col1:
    text = st.text_area(
        "Enter Turkish text to tokenize",
        value=st.session_state.text,
        height=200,
        key="text_input",
        label_visibility="collapsed",
        placeholder="Enter Turkish text to tokenize"
    )

    if st.button("Tokenize", type="primary"):
        st.session_state.text = text
        if text.strip():
            try:
                st.session_state.token_results = tokenize(text)
            except Exception as e:
                st.session_state.token_results = None
                st.error(f"Error tokenizing text: {str(e)}")
        else:
            st.session_state.token_results = None
# Results column
with col2:
    st.markdown("Token count")
    if st.session_state.token_results is not None:
        result = st.session_state.token_results
        token_count = len(result["tokens"])
        st.markdown(f"### {token_count}")

        st.markdown("Tokenized text")
        # Generate token colors
        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
        token_colors = {**SPECIAL_COLORS, **regular_token_colors}

        # Render tokens
        with st.container():
            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)

        st.markdown("Token IDs")
        st.code(", ".join(map(str, result["ids"])), language=None)
    else:
        st.markdown("### 0")
        st.markdown("Tokenized text")
        st.markdown("")
        st.markdown("Token IDs")
        st.text("")
# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
<a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)
# Cleanup
def cleanup():
    if Path("temp_tokenizer").exists():
        shutil.rmtree("temp_tokenizer")

atexit.register(cleanup)