import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util
# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
page_title="Turkish Tiktokenizer",
page_icon="🇹🇷",
layout="wide"
)
# Initialize session state
if 'text' not in st.session_state:
st.session_state.text = "Akademisyenler ve aileleri birlikte çalışıyorlar."
if 'token_results' not in st.session_state:
st.session_state.token_results = None
# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"
# Special tokens and their IDs
SPECIAL_TOKENS = {
"<uppercase>": 0, # Uppercase letter marker
"<space>": 1, # Space character
"<newline>": 2, # Newline character
"<tab>": 3, # Tab character
"<unknown>": 4 # Unknown token
}
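# IDs 0-4 are reserved for special tokens; they line up with the 'special' range in TOKEN_RANGES below.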
# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
"<uppercase>": "[uppercase]", # Up arrow for uppercase
"<space>": "[space]", # Space symbol
"<newline>": "[newline]", # Return symbol
"<tab>": "[tab]", # Tab symbol
"<unknown>": "[unknown]" # Question mark for unknown
}
# Colors for special tokens
SPECIAL_COLORS = {
"<uppercase>": "#FF9999", # Light red for uppercase markers
"<space>": "#CCCCCC", # Gray for spaces
"<newline>": "#CCCCCC", # Gray for newlines
"<tab>": "#CCCCCC", # Gray for tabs
"<unknown>": "#FF0000" # Red for unknown tokens
}
# Required files mapping
REQUIRED_FILES = {
'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
'kokler_v07.json': 'turkish_tokenizer/kokler_v07.json',
'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}
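# These files are fetched from GITHUB_REPO at runtime and written into a local
# temp_tokenizer/ directory by load_tokenizer() below.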
# Token ID ranges
TOKEN_RANGES = {
'special': (0, 4), # Special tokens
'root_words': (5, 20000), # Root words
'suffixes': (22268, 22767), # Suffixes
'bpe': (20000, None) # BPE tokens (20000+)
}
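# Note: TOKEN_RANGES documents the vocabulary layout for reference only; it is not
# used elsewhere in this script.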
def generate_colors(n):
"""Generate n visually distinct colors."""
colors = []
for i in range(n):
hue = i / n
saturation = 0.3 + (i % 3) * 0.1 # Vary saturation between 0.3-0.5
value = 0.95 - (i % 2) * 0.1 # Vary value between 0.85-0.95
rgb = colorsys.hsv_to_rgb(hue, saturation, value)
hex_color = "#{:02x}{:02x}{:02x}".format(
int(rgb[0] * 255),
int(rgb[1] * 255),
int(rgb[2] * 255)
)
colors.append(hex_color)
return colors
def fetch_github_file(path, ref=GITHUB_BRANCH):
"""Fetch file content from GitHub repository."""
url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
response = requests.get(url, timeout=10)
if response.status_code == 200:
content = base64.b64decode(response.json()['content']).decode('utf-8')
return content
else:
st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
return None
@st.cache_resource
def load_tokenizer():
"""Load and initialize the tokenizer from GitHub."""
temp_dir = Path("temp_tokenizer")
temp_dir.mkdir(exist_ok=True)
# Fetch required files
for local_name, github_path in REQUIRED_FILES.items():
content = fetch_github_file(github_path)
if content is None:
return None
with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
f.write(content)
# Modify tokenizer to use correct paths
tokenizer_path = temp_dir / "tokenizer.py"
with open(tokenizer_path, 'r', encoding='utf-8') as f:
tokenizer_code = f.read()
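# Patch load_json so the tokenizer resolves its data files inside temp_dir.
# The injected body returns before the original body (which becomes dead code);
# this assumes the injected indentation matches the original function body and
# that tokenizer.py already imports os and json.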
modified_code = tokenizer_code.replace(
'def load_json(filename):',
f'''def load_json(filename):
full_path = os.path.join("{temp_dir.absolute()}", filename)
with open(full_path, 'r', encoding='utf-8') as file:
return json.load(file)'''
)
with open(tokenizer_path, 'w', encoding='utf-8') as f:
f.write(modified_code)
# Load module
spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
module = importlib.util.module_from_spec(spec)
sys.modules["tokenizer"] = module
spec.loader.exec_module(module)
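# The returned tokenize callable is expected to take a string and return a dict
# with "tokens" and "ids" keys, as consumed in the results column below.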
return module.tokenize
@st.cache_data(ttl=3600)
def get_commit_history():
"""Fetch commit history from GitHub."""
url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
try:
response = requests.get(url, timeout=10)
if response.status_code == 200:
commits = response.json()
versions = []
for commit in commits[:10]:
date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
sha = commit['sha'][:7]
message = commit['commit']['message'].split('\n')[0][:50]
versions.append(f"{date} - {sha} - {message}")
return versions
return ["latest"]
except Exception as e:
st.warning(f"Could not fetch commit history: {str(e)}")
return ["latest"]
def render_tokens(tokens, token_colors):
"""Render colored token visualization."""
html_tokens = []
for token in tokens:
color = token_colors[token]
display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token) # Use symbol for special tokens
html_tokens.append(
f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
)
return " ".join(html_tokens)
# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
st.error("Failed to load tokenizer from GitHub")
st.stop()
# Tokenize example text on startup if no results exist
if st.session_state.token_results is None and st.session_state.text:
try:
st.session_state.token_results = tokenize(st.session_state.text)
except Exception as e:
st.error(f"Error tokenizing text: {str(e)}")
# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")
# Model selection
versions = get_commit_history()
model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
# Main layout
col1, col2 = st.columns([0.4, 0.6])
# Input column
with col1:
text = st.text_area(
"Enter Turkish text to tokenize",
value=st.session_state.text,
height=200,
key="text_input",
label_visibility="collapsed",
placeholder="Enter Turkish text to tokenize"
)
if st.button("Tokenize", type="primary"):
st.session_state.text = text
if text.strip():
try:
st.session_state.token_results = tokenize(text)
except Exception as e:
st.session_state.token_results = None
st.error(f"Error tokenizing text: {str(e)}")
else:
st.session_state.token_results = None
# Results column
with col2:
st.markdown("Token count")
if st.session_state.token_results is not None:
result = st.session_state.token_results
token_count = len(result["tokens"])
st.markdown(f"### {token_count}")
st.markdown("Tokenized text")
# Generate token colors
regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
token_colors = {**SPECIAL_COLORS, **regular_token_colors}
# Render tokens
with st.container():
st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
st.markdown("Token IDs")
st.code(", ".join(map(str, result["ids"])), language=None)
else:
st.markdown("### 0")
st.markdown("Tokenized text")
st.markdown("")
st.markdown("Token IDs")
st.text("")
# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
<a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)
# Cleanup: remove the downloaded tokenizer files on normal interpreter exit
# (atexit handlers do not run if the process is killed).
def cleanup():
if Path("temp_tokenizer").exists():
shutil.rmtree("temp_tokenizer")
atexit.register(cleanup)