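"""Streamlit app for visualizing Turkish tokenization.

Fetches the tokenizer script and its vocabulary files from the
malibayram/tokenizer GitHub repository, tokenizes user input, and
renders the resulting tokens with color-coded highlighting.
"""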
import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import html
import shutil
import atexit
import requests
import importlib.util
# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)
# Initialize session state
if 'text' not in st.session_state:
st.session_state.text = ""
if 'token_results' not in st.session_state:
st.session_state.token_results = None
# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"
# Special tokens and their IDs
SPECIAL_TOKENS = {
"<uppercase>": 0, # Uppercase letter marker
"<space>": 1, # Space character
"<newline>": 2, # Newline character
"<tab>": 3, # Tab character
"<unknown>": 4 # Unknown token
}
# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
"<uppercase>": "[uppercase]", # Up arrow for uppercase
"<space>": "[space]", # Space symbol
"<newline>": "[newline]", # Return symbol
"<tab>": "[tab]", # Tab symbol
"<unknown>": "[unknown]" # Question mark for unknown
}
# Colors for special tokens
SPECIAL_COLORS = {
"<uppercase>": "#FF9999", # Light red for uppercase markers
"<space>": "#CCCCCC", # Gray for spaces
"<newline>": "#CCCCCC", # Gray for newlines
"<tab>": "#CCCCCC", # Gray for tabs
"<unknown>": "#FF0000" # Red for unknown tokens
}
# Required files mapping
REQUIRED_FILES = {
'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
'kokler_v05.json': 'turkish_tokenizer/kokler_v05.json',
'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}
# Token ID ranges (informational; not referenced elsewhere in this app)
TOKEN_RANGES = {
    'special': (0, 4),           # Special tokens
    'root_words': (5, 20000),    # Root words
    'suffixes': (22268, 22767),  # Suffixes
    'bpe': (20000, None)         # BPE tokens (20000+)
}
def generate_colors(n):
"""Generate n visually distinct colors."""
colors = []
for i in range(n):
hue = i / n
saturation = 0.3 + (i % 3) * 0.1 # Vary saturation between 0.3-0.5
value = 0.95 - (i % 2) * 0.1 # Vary value between 0.85-0.95
rgb = colorsys.hsv_to_rgb(hue, saturation, value)
hex_color = "#{:02x}{:02x}{:02x}".format(
int(rgb[0] * 255),
int(rgb[1] * 255),
int(rgb[2] * 255)
)
colors.append(hex_color)
return colors
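# Example: generate_colors(3) -> ['#f2a9a9', '#82d882', '#7979f2']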
def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from the GitHub repository via the contents API."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        # The contents API returns the file body base64-encoded
        return base64.b64decode(response.json()['content']).decode('utf-8')
    st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
    return None
@st.cache_resource
def load_tokenizer():
"""Load and initialize the tokenizer from GitHub."""
temp_dir = Path("temp_tokenizer")
temp_dir.mkdir(exist_ok=True)
# Fetch required files
for local_name, github_path in REQUIRED_FILES.items():
content = fetch_github_file(github_path)
if content is None:
return None
with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
f.write(content)
    # Patch the fetched tokenizer so load_json resolves the vocabulary
    # files from the temp directory. The injected body returns immediately,
    # turning the original body below it into unreachable dead code
    # (assumes tokenizer.py already imports os and json).
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()
    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)
    # Import the patched file as a module and return its tokenize function
    spec = importlib.util.spec_from_file_location("tokenizer", str(tokenizer_path))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)
    return module.tokenize
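# tokenize() is expected to return a dict with "tokens" (surface strings)
# and "ids" (integer token IDs), as consumed in the results column below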
@st.cache_data(ttl=3600)
def get_commit_history():
"""Fetch commit history from GitHub."""
url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
try:
response = requests.get(url)
if response.status_code == 200:
commits = response.json()
versions = []
for commit in commits[:10]:
date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
sha = commit['sha'][:7]
message = commit['commit']['message'].split('\n')[0][:50]
versions.append(f"{date} - {sha} - {message}")
return versions
return ["latest"]
except Exception as e:
st.warning(f"Could not fetch commit history: {str(e)}")
return ["latest"]
def render_tokens(tokens, token_colors):
    """Render tokens as color-coded HTML spans."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        # Show the bracketed symbol for special tokens; escape everything
        # so token text cannot break out of the surrounding markup
        display_text = html.escape(SPECIAL_TOKEN_SYMBOLS.get(token, token))
        title = html.escape(token, quote=True)
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; '
            f'margin: 2px; border-radius: 3px;" title="{title}">{display_text}</span>'
        )
    return " ".join(html_tokens)
# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
st.error("Failed to load tokenizer from GitHub")
st.stop()
# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")
# Version selector. Note: the tokenizer itself is always loaded from the
# main branch; the selected commit is informational only.
versions = get_commit_history()
model = st.selectbox("Tokenizer version", versions, key="model_selection", label_visibility="collapsed")
# Main layout
col1, col2 = st.columns([0.4, 0.6])
# Input column
with col1:
text = st.text_area(
"Enter Turkish text to tokenize",
value=st.session_state.text,
height=200,
key="text_input",
label_visibility="collapsed",
placeholder="Enter Turkish text to tokenize"
)
if st.button("Tokenize", type="primary"):
st.session_state.text = text
if text.strip():
try:
st.session_state.token_results = tokenize(text)
except Exception as e:
st.session_state.token_results = None
st.error(f"Error tokenizing text: {str(e)}")
else:
st.session_state.token_results = None
# Results column
with col2:
st.markdown("Token count")
if st.session_state.token_results is not None:
result = st.session_state.token_results
token_count = len(result["tokens"])
st.markdown(f"### {token_count}")
st.markdown("Tokenized text")
# Generate token colors
regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
token_colors = {**SPECIAL_COLORS, **regular_token_colors}
# Render tokens
with st.container():
st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
st.markdown("Token IDs")
st.code(", ".join(map(str, result["ids"])), language=None)
else:
st.markdown("### 0")
st.markdown("Tokenized text")
st.markdown("")
st.markdown("Token IDs")
st.text("")
# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
<a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)
# Remove the fetched tokenizer files when the process exits
def cleanup():
if Path("temp_tokenizer").exists():
shutil.rmtree("temp_tokenizer")
atexit.register(cleanup)