import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util

# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)

# Initialize session state
if 'text' not in st.session_state:
    st.session_state.text = ""
if 'token_results' not in st.session_state:
    st.session_state.token_results = None

# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"

# Special tokens and their IDs
SPECIAL_TOKENS = {
    "<uppercase>": 0,    # Uppercase letter marker
    "<space>": 1,       # Space character
    "<newline>": 2,     # Newline character
    "<tab>": 3,         # Tab character
    "<unknown>": 4      # Unknown token
}

# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
    "<uppercase>": "[uppercase]",    # Up arrow for uppercase
    "<space>": "[space]",        # Space symbol
    "<newline>": "[newline]",      # Return symbol
    "<tab>": "[tab]",          # Tab symbol
    "<unknown>": "[unknown]"       # Question mark for unknown
}

# Colors for special tokens
SPECIAL_COLORS = {
    "<uppercase>": "#FF9999",  # Light red for uppercase markers
    "<space>": "#CCCCCC",      # Gray for spaces
    "<newline>": "#CCCCCC",    # Gray for newlines
    "<tab>": "#CCCCCC",        # Gray for tabs
    "<unknown>": "#FF0000"     # Red for unknown tokens
}

# Required files mapping
REQUIRED_FILES = {
    'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
    'kokler_v05.json': 'turkish_tokenizer/kokler_v05.json',
    'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
    'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}

# Token ID ranges
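# (Informational only; these ranges are not referenced elsewhere in this script.)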
TOKEN_RANGES = {
    'special': (0, 4),          # Special tokens
    'root_words': (5, 20000),   # Root words
    'suffixes': (22268, 22767), # Suffixes
    'bpe': (20000, None)        # BPE tokens (20000+)
}

def generate_colors(n):
    """Generate n visually distinct colors."""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + (i % 3) * 0.1  # Vary saturation between 0.3-0.5
        value = 0.95 - (i % 2) * 0.1      # Vary value between 0.85-0.95
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255),
            int(rgb[1] * 255),
            int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors

def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from GitHub repository."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url)
    if response.status_code == 200:
        content = base64.b64decode(response.json()['content']).decode('utf-8')
        return content
    else:
        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
        return None

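# Cache the loaded tokenizer so the GitHub files are fetched and patched only
# once per server process rather than on every rerun.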
@st.cache_resource
def load_tokenizer():
    """Load and initialize the tokenizer from GitHub."""
    temp_dir = Path("temp_tokenizer")
    temp_dir.mkdir(exist_ok=True)
    
    # Fetch required files
    for local_name, github_path in REQUIRED_FILES.items():
        content = fetch_github_file(github_path)
        if content is None:
            return None
        
        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
            f.write(content)
    
    # Modify tokenizer to use correct paths
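    # Inject an absolute-path body at the top of load_json so it reads the
    # vocabulary JSON files from the temp directory; the injected return makes
    # the original function body unreachable.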
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()
    
    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )
    
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)
    
    # Load module
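    # Import the patched file dynamically and register it in sys.modules under
    # the name "tokenizer" so any internal imports resolve to this module.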
    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)
    
    return module.tokenize

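# Cache the commit list for an hour (ttl=3600) to limit GitHub API calls.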
@st.cache_data(ttl=3600)
def get_commit_history():
    """Fetch commit history from GitHub."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            commits = response.json()
            versions = []
            for commit in commits[:10]:
                date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                sha = commit['sha'][:7]
                message = commit['commit']['message'].split('\n')[0][:50]
                versions.append(f"{date} - {sha} - {message}")
            return versions
        return ["latest"]
    except Exception as e:
        st.warning(f"Could not fetch commit history: {str(e)}")
        return ["latest"]

def render_tokens(tokens, token_colors):
    """Render colored token visualization."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use symbol for special tokens
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
        )
    return " ".join(html_tokens)

# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
    st.error("Failed to load tokenizer from GitHub")
    st.stop()

# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")

# Model selection
versions = get_commit_history()
model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
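# Note: the selected version is currently display-only; load_tokenizer() always
# fetches files from GITHUB_BRANCH regardless of this choice.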

# Main layout
col1, col2 = st.columns([0.4, 0.6])

# Input column
with col1:
    text = st.text_area(
        "Enter Turkish text to tokenize",
        value=st.session_state.text,
        height=200,
        key="text_input",
        label_visibility="collapsed",
        placeholder="Enter Turkish text to tokenize"
    )
    
    if st.button("Tokenize", type="primary"):
        st.session_state.text = text
        if text.strip():
            try:
                st.session_state.token_results = tokenize(text)
            except Exception as e:
                st.session_state.token_results = None
                st.error(f"Error tokenizing text: {str(e)}")
        else:
            st.session_state.token_results = None

# Results column
with col2:
    st.markdown("Token count")
    if st.session_state.token_results is not None:
        result = st.session_state.token_results
        token_count = len(result["tokens"])
        st.markdown(f"### {token_count}")
        
        st.markdown("Tokenized text")
        
        # Generate token colors
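        # Special tokens keep their fixed colors; every other distinct token
        # gets a visually distinct HSV-generated color.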
        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
        token_colors = {**SPECIAL_COLORS, **regular_token_colors}
        
        # Render tokens
        with st.container():
            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
        
        st.markdown("Token IDs")
        st.code(", ".join(map(str, result["ids"])), language=None)
    else:
        st.markdown("### 0")
        st.markdown("Tokenized text")
        st.markdown("")
        st.markdown("Token IDs")
        st.text("")

# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
    <a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)

# Cleanup
def cleanup():
    if Path("temp_tokenizer").exists():
        shutil.rmtree("temp_tokenizer")

atexit.register(cleanup)