import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util
# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)
# Initialize session state
if 'text' not in st.session_state:
    st.session_state.text = ""
if 'token_results' not in st.session_state:
    st.session_state.token_results = None
# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"
# Special tokens and their IDs
SPECIAL_TOKENS = {
    "<uppercase>": 0,  # Uppercase letter marker
    "<space>": 1,      # Space character
    "<newline>": 2,    # Newline character
    "<tab>": 3,        # Tab character
    "<unknown>": 4     # Unknown token
}
# Plain-text labels shown in place of special tokens in the rendered output
SPECIAL_TOKEN_SYMBOLS = {
    "<uppercase>": "[uppercase]",
    "<space>": "[space]",
    "<newline>": "[newline]",
    "<tab>": "[tab]",
    "<unknown>": "[unknown]"
}
# Colors for special tokens
SPECIAL_COLORS = {
    "<uppercase>": "#FF9999",  # Light red for uppercase markers
    "<space>": "#CCCCCC",      # Gray for spaces
    "<newline>": "#CCCCCC",    # Gray for newlines
    "<tab>": "#CCCCCC",        # Gray for tabs
    "<unknown>": "#FF0000"     # Red for unknown tokens
}
# Required files mapping
REQUIRED_FILES = {
    'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
    'kokler_v05.json': 'turkish_tokenizer/kokler_v05.json',
    'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
    'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}
# Token ID ranges
TOKEN_RANGES = {
    'special': (0, 4),           # Special tokens
    'root_words': (5, 20000),    # Root words
    'suffixes': (22268, 22767),  # Suffixes
    'bpe': (20000, None)         # BPE tokens (20000+)
}
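# TOKEN_RANGES is not referenced elsewhere in this app; below is a minimal sketch
# of how it could be used to label an ID. The boundaries are taken from the dict
# as-is, and overlaps are resolved in favor of the more specific range - an
# assumption for illustration, not part of the original tokenizer.
def classify_token_id(token_id):
    if TOKEN_RANGES['special'][0] <= token_id <= TOKEN_RANGES['special'][1]:
        return 'special'
    if TOKEN_RANGES['suffixes'][0] <= token_id <= TOKEN_RANGES['suffixes'][1]:
        return 'suffixes'
    if token_id >= TOKEN_RANGES['bpe'][0]:
        return 'bpe'
    return 'root_words'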
def generate_colors(n):
    """Generate n visually distinct colors."""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + (i % 3) * 0.1  # Vary saturation between 0.3 and 0.5
        value = 0.95 - (i % 2) * 0.1      # Vary value between 0.85 and 0.95
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255),
            int(rgb[1] * 255),
            int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors
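# Usage sketch: generate_colors(5) returns five "#rrggbb" strings spaced evenly
# around the hue wheel, with slight saturation/value variation so that tokens
# rendered next to each other stay visually distinguishable.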
def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from GitHub repository."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url)
    if response.status_code == 200:
        content = base64.b64decode(response.json()['content']).decode('utf-8')
        return content
    else:
        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
        return None
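# Note: these are unauthenticated GitHub API calls, which are rate-limited
# (currently on the order of 60 requests/hour per IP). If the Space hits that
# limit, a token could be sent via an Authorization header - an optional
# hardening step, not something this app does by default.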
def load_tokenizer():
    """Load and initialize the tokenizer from GitHub."""
    temp_dir = Path("temp_tokenizer")
    temp_dir.mkdir(exist_ok=True)
    # Fetch required files
    for local_name, github_path in REQUIRED_FILES.items():
        content = fetch_github_file(github_path)
        if content is None:
            return None
        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
            f.write(content)
    # Modify tokenizer to use correct paths
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()
    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)
    # Load module
    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)
    return module.tokenize
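# Note: the string replacement above assumes the upstream turkish_tokenizer.py
# defines a `def load_json(filename):` helper and imports os/json at module
# level. If that header ever changes upstream, str.replace finds no match and
# the file is loaded unmodified.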
def get_commit_history():
    """Fetch commit history from GitHub."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            commits = response.json()
            versions = []
            for commit in commits[:10]:
                date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                sha = commit['sha'][:7]
                message = commit['commit']['message'].split('\n')[0][:50]
                versions.append(f"{date} - {sha} - {message}")
            return versions
        return ["latest"]
    except Exception as e:
        st.warning(f"Could not fetch commit history: {str(e)}")
        return ["latest"]
def render_tokens(tokens, token_colors):
    """Render colored token visualization."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use label for special tokens
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
        )
    return " ".join(html_tokens)
# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
    st.error("Failed to load tokenizer from GitHub")
    st.stop()
# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")
# Model selection
versions = get_commit_history()
model = st.selectbox("Model version", versions, key="model_selection", label_visibility="collapsed")
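# Note: the selected commit is only displayed in the dropdown; load_tokenizer()
# above always fetches GITHUB_BRANCH, so switching versions here does not reload
# the tokenizer at that commit (fetch_github_file's `ref` parameter would support
# that if wired through).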
# Main layout
col1, col2 = st.columns([0.4, 0.6])
# Input column
with col1:
    text = st.text_area(
        "Enter Turkish text to tokenize",
        value=st.session_state.text,
        height=200,
        key="text_input",
        label_visibility="collapsed",
        placeholder="Enter Turkish text to tokenize"
    )
    if st.button("Tokenize", type="primary"):
        st.session_state.text = text
        if text.strip():
            try:
                st.session_state.token_results = tokenize(text)
            except Exception as e:
                st.session_state.token_results = None
                st.error(f"Error tokenizing text: {str(e)}")
        else:
            st.session_state.token_results = None
# Results column
with col2:
    st.markdown("Token count")
    if st.session_state.token_results is not None:
        result = st.session_state.token_results
        token_count = len(result["tokens"])
        st.markdown(f"### {token_count}")
        st.markdown("Tokenized text")
        # Generate token colors
        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
        token_colors = {**SPECIAL_COLORS, **regular_token_colors}
        # Render tokens
        with st.container():
            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
        st.markdown("Token IDs")
        st.code(", ".join(map(str, result["ids"])), language=None)
    else:
        st.markdown("### 0")
        st.markdown("Tokenized text")
        st.markdown("")
        st.markdown("Token IDs")
        st.text("")
# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
    <a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)
# Cleanup
def cleanup():
    if Path("temp_tokenizer").exists():
        shutil.rmtree("temp_tokenizer")
atexit.register(cleanup)