import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util
# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
page_title="Turkish Tiktokenizer",
page_icon="🇹🇷",
layout="wide"
)
# Initialize session state
if 'text' not in st.session_state:
st.session_state.text = "Akademisyenler ve aileleri birlikte çalışıyorlar."
if 'token_results' not in st.session_state:
st.session_state.token_results = None
# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"
# Special tokens and their IDs
SPECIAL_TOKENS = {
"<uppercase>": 0, # Uppercase letter marker
"<space>": 1, # Space character
"<newline>": 2, # Newline character
"<tab>": 3, # Tab character
"<unknown>": 4 # Unknown token
}
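# IDs 0-4 are reserved for special tokens; they line up with the 'special' range in TOKEN_RANGES below.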
# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
"<uppercase>": "[uppercase]", # Up arrow for uppercase
"<space>": "[space]", # Space symbol
"<newline>": "[newline]", # Return symbol
"<tab>": "[tab]", # Tab symbol
"<unknown>": "[unknown]" # Question mark for unknown
}
# Colors for special tokens
SPECIAL_COLORS = {
"<uppercase>": "#FF9999", # Light red for uppercase markers
"<space>": "#CCCCCC", # Gray for spaces
"<newline>": "#CCCCCC", # Gray for newlines
"<tab>": "#CCCCCC", # Gray for tabs
"<unknown>": "#FF0000" # Red for unknown tokens
}
# Required files mapping
REQUIRED_FILES = {
'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
'kokler_v07.json': 'turkish_tokenizer/kokler_v07.json',
'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}
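# These files are fetched from GITHUB_REPO at runtime and written into a local
# temp_tokenizer/ directory by load_tokenizer() below.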
# Token ID ranges
TOKEN_RANGES = {
'special': (0, 4), # Special tokens
'root_words': (5, 20000), # Root words
'suffixes': (22268, 22767), # Suffixes
'bpe': (20000, None) # BPE tokens (20000+)
}
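# Note: TOKEN_RANGES documents the vocabulary layout for reference only; it is not
# used elsewhere in this script.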
def generate_colors(n):
"""Generate n visually distinct colors."""
colors = []
for i in range(n):
hue = i / n
saturation = 0.3 + (i % 3) * 0.1 # Vary saturation between 0.3-0.5
value = 0.95 - (i % 2) * 0.1 # Vary value between 0.85-0.95
rgb = colorsys.hsv_to_rgb(hue, saturation, value)
hex_color = "#{:02x}{:02x}{:02x}".format(
int(rgb[0] * 255),
int(rgb[1] * 255),
int(rgb[2] * 255)
)
colors.append(hex_color)
return colors
def fetch_github_file(path, ref=GITHUB_BRANCH):
"""Fetch file content from GitHub repository."""
url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
response = requests.get(url, timeout=10)
if response.status_code == 200:
content = base64.b64decode(response.json()['content']).decode('utf-8')
return content
else:
st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
return None
@st.cache_resource
def load_tokenizer():
"""Load and initialize the tokenizer from GitHub."""
temp_dir = Path("temp_tokenizer")
temp_dir.mkdir(exist_ok=True)
# Fetch required files
for local_name, github_path in REQUIRED_FILES.items():
content = fetch_github_file(github_path)
if content is None:
return None
with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
f.write(content)
# Modify tokenizer to use correct paths
tokenizer_path = temp_dir / "tokenizer.py"
with open(tokenizer_path, 'r', encoding='utf-8') as f:
tokenizer_code = f.read()
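# Patch load_json so the tokenizer resolves its data files inside temp_dir.
# The injected body returns before the original body (which becomes dead code);
# this assumes the injected indentation matches the original function body and
# that tokenizer.py already imports os and json.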
modified_code = tokenizer_code.replace(
'def load_json(filename):',
f'''def load_json(filename):
full_path = os.path.join("{temp_dir.absolute()}", filename)
with open(full_path, 'r', encoding='utf-8') as file:
return json.load(file)'''
)
with open(tokenizer_path, 'w', encoding='utf-8') as f:
f.write(modified_code)
# Load module
spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
module = importlib.util.module_from_spec(spec)
sys.modules["tokenizer"] = module
spec.loader.exec_module(module)
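# The returned tokenize callable is expected to take a string and return a dict
# with "tokens" and "ids" keys, as consumed in the results column below.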
return module.tokenize
@st.cache_data(ttl=3600)
def get_commit_history():
"""Fetch commit history from GitHub."""
url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
try:
response = requests.get(url, timeout=10)
if response.status_code == 200:
commits = response.json()
versions = []
for commit in commits[:10]:
date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
sha = commit['sha'][:7]
message = commit['commit']['message'].split('\n')[0][:50]
versions.append(f"{date} - {sha} - {message}")
return versions
return ["latest"]
except Exception as e:
st.warning(f"Could not fetch commit history: {str(e)}")
return ["latest"]
def render_tokens(tokens, token_colors):
"""Render colored token visualization."""
html_tokens = []
for token in tokens:
color = token_colors[token]
display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token) # Use symbol for special tokens
html_tokens.append(
f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
)
return " ".join(html_tokens)
# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
st.error("Failed to load tokenizer from GitHub")
st.stop()
# Tokenize example text on startup if no results exist
if st.session_state.token_results is None and st.session_state.text:
try:
st.session_state.token_results = tokenize(st.session_state.text)
except Exception as e:
st.error(f"Error tokenizing text: {str(e)}")
# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")
# Model selection
versions = get_commit_history()
model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
# Main layout
col1, col2 = st.columns([0.4, 0.6])
# Input column
with col1:
text = st.text_area(
"Enter Turkish text to tokenize",
value=st.session_state.text,
height=200,
key="text_input",
label_visibility="collapsed",
placeholder="Enter Turkish text to tokenize"
)
if st.button("Tokenize", type="primary"):
st.session_state.text = text
if text.strip():
try:
st.session_state.token_results = tokenize(text)
except Exception as e:
st.session_state.token_results = None
st.error(f"Error tokenizing text: {str(e)}")
else:
st.session_state.token_results = None
# Results column
with col2:
st.markdown("Token count")
if st.session_state.token_results is not None:
result = st.session_state.token_results
token_count = len(result["tokens"])
st.markdown(f"### {token_count}")
st.markdown("Tokenized text")
# Generate token colors
regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
token_colors = {**SPECIAL_COLORS, **regular_token_colors}
# Render tokens
with st.container():
st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
st.markdown("Token IDs")
st.code(", ".join(map(str, result["ids"])), language=None)
else:
st.markdown("### 0")
st.markdown("Tokenized text")
st.markdown("")
st.markdown("Token IDs")
st.text("")
# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
<a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)
# Cleanup: remove the downloaded tokenizer files on normal interpreter exit
# (atexit handlers do not run if the process is killed).
def cleanup():
if Path("temp_tokenizer").exists():
shutil.rmtree("temp_tokenizer")
atexit.register(cleanup)