import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util

# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)

# Initialize session state
if 'text' not in st.session_state:
    st.session_state.text = ""
if 'token_results' not in st.session_state:
    st.session_state.token_results = None

# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"

# Special tokens and their IDs.
# NOTE: the original token string literals were stripped during extraction;
# the angle-bracket names below are assumed placeholders and must match the
# markers actually emitted by the tokenizer.
SPECIAL_TOKENS = {
    "<uppercase>": 0,  # Uppercase letter marker
    "<space>": 1,      # Space character
    "<newline>": 2,    # Newline character
    "<tab>": 3,        # Tab character
    "<unknown>": 4     # Unknown token
}

# Special token display symbols (human-readable labels shown in the UI)
SPECIAL_TOKEN_SYMBOLS = {
    "<uppercase>": "[uppercase]",
    "<space>": "[space]",
    "<newline>": "[newline]",
    "<tab>": "[tab]",
    "<unknown>": "[unknown]"
}

# Colors for special tokens
SPECIAL_COLORS = {
    "<uppercase>": "#FF9999",  # Light red for uppercase markers
    "<space>": "#CCCCCC",      # Gray for spaces
    "<newline>": "#CCCCCC",    # Gray for newlines
    "<tab>": "#CCCCCC",        # Gray for tabs
    "<unknown>": "#FF0000"     # Red for unknown tokens
}

# Required files mapping (local name -> path in the GitHub repository)
REQUIRED_FILES = {
    'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
    'kokler_v05.json': 'turkish_tokenizer/kokler_v05.json',
    'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
    'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}

# Token ID ranges
TOKEN_RANGES = {
    'special': (0, 4),           # Special tokens
    'root_words': (5, 20000),    # Root words
    'suffixes': (22268, 22767),  # Suffixes
    'bpe': (20000, None)         # BPE tokens (20000+)
}


def generate_colors(n):
    """Generate n visually distinct colors."""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + (i % 3) * 0.1  # Vary saturation between 0.3-0.5
        value = 0.95 - (i % 2) * 0.1      # Vary value between 0.85-0.95
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255), int(rgb[1] * 255), int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors


def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from the GitHub repository via the contents API."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url)
    if response.status_code == 200:
        content = base64.b64decode(response.json()['content']).decode('utf-8')
        return content
    else:
        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
        return None


@st.cache_resource
def load_tokenizer():
    """Load and initialize the tokenizer from GitHub."""
    temp_dir = Path("temp_tokenizer")
    temp_dir.mkdir(exist_ok=True)

    # Fetch required files and write them to the temp directory
    for local_name, github_path in REQUIRED_FILES.items():
        content = fetch_github_file(github_path)
        if content is None:
            return None
        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
            f.write(content)

    # Patch the tokenizer's load_json so it resolves the JSON vocabularies
    # from the temp directory instead of relative paths
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()

    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )

    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)

    # Load the patched tokenizer as a module
    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)

    return module.tokenize


@st.cache_data(ttl=3600)
def get_commit_history():
    """Fetch recent commit history from GitHub for the model-version selector."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            commits = response.json()
            versions = []
            for commit in commits[:10]:
                date = datetime.strptime(
                    commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ'
                ).strftime('%Y-%m-%d')
                sha = commit['sha'][:7]
                message = commit['commit']['message'].split('\n')[0][:50]
                versions.append(f"{date} - {sha} - {message}")
            return versions
        return ["latest"]
    except Exception as e:
        st.warning(f"Could not fetch commit history: {str(e)}")
        return ["latest"]


def render_tokens(tokens, token_colors):
    """Render colored token visualization as inline HTML."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use symbol for special tokens
        # NOTE: the original inline HTML markup was stripped during extraction;
        # this span is an assumed reconstruction that highlights each token
        # with its assigned color.
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; '
            f'border-radius: 3px;">{display_text}</span>'
        )
    return " ".join(html_tokens)


# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
    st.error("Failed to load tokenizer from GitHub")
    st.stop()

# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")

# Model selection
versions = get_commit_history()
model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")

# Main layout
col1, col2 = st.columns([0.4, 0.6])

# Input column
with col1:
    text = st.text_area(
        "Enter Turkish text to tokenize",
        value=st.session_state.text,
        height=200,
        key="text_input",
        label_visibility="collapsed",
        placeholder="Enter Turkish text to tokenize"
    )

    if st.button("Tokenize", type="primary"):
        st.session_state.text = text
        if text.strip():
            try:
                st.session_state.token_results = tokenize(text)
            except Exception as e:
                st.session_state.token_results = None
                st.error(f"Error tokenizing text: {str(e)}")
        else:
            st.session_state.token_results = None

# Results column
with col2:
    st.markdown("Token count")
    if st.session_state.token_results is not None:
        result = st.session_state.token_results
        token_count = len(result["tokens"])
        st.markdown(f"### {token_count}")

        st.markdown("Tokenized text")
        # Generate colors for regular (non-special) tokens
        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
        token_colors = {**SPECIAL_COLORS, **regular_token_colors}

        # Render tokens
        with st.container():
            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)

        st.markdown("Token IDs")
        st.code(", ".join(map(str, result["ids"])), language=None)
    else:
        st.markdown("### 0")
        st.markdown("Tokenized text")
        st.markdown("")
        st.markdown("Token IDs")
        st.text("")

# Footer
st.markdown("""
""", unsafe_allow_html=True)


# Cleanup
def cleanup():
    if Path("temp_tokenizer").exists():
        shutil.rmtree("temp_tokenizer")


atexit.register(cleanup)
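# Usage note (the filename below is an assumption; use whatever name this
# script is saved under, with `streamlit` and `requests` installed):
#
#   streamlit run app.py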