import streamlit as st
import sys
from datetime import datetime
from pathlib import Path
import base64
import colorsys
import shutil
import atexit
import requests
import importlib.util

# Set page config - MUST BE FIRST STREAMLIT COMMAND
st.set_page_config(
    page_title="Turkish Tiktokenizer",
    page_icon="🇹🇷",
    layout="wide"
)

# Initialize session state
if 'text' not in st.session_state:
    st.session_state.text = ""
if 'token_results' not in st.session_state:
    st.session_state.token_results = None

# Constants
GITHUB_REPO = "malibayram/tokenizer"
GITHUB_BRANCH = "main"

# Special tokens and their IDs
SPECIAL_TOKENS = {
    "<uppercase>": 0,    # Uppercase letter marker
    "<space>": 1,       # Space character
    "<newline>": 2,     # Newline character
    "<tab>": 3,         # Tab character
    "<unknown>": 4      # Unknown token
}

# Special token display symbols
SPECIAL_TOKEN_SYMBOLS = {
    "<uppercase>": "[uppercase]",    # Up arrow for uppercase
    "<space>": "[space]",        # Space symbol
    "<newline>": "[newline]",      # Return symbol
    "<tab>": "[tab]",          # Tab symbol
    "<unknown>": "[unknown]"       # Question mark for unknown
}

# Colors for special tokens
SPECIAL_COLORS = {
    "<uppercase>": "#FF9999",  # Light red for uppercase markers
    "<space>": "#CCCCCC",      # Gray for spaces
    "<newline>": "#CCCCCC",    # Gray for newlines
    "<tab>": "#CCCCCC",        # Gray for tabs
    "<unknown>": "#FF0000"     # Red for unknown tokens
}

# Required files mapping
REQUIRED_FILES = {
    'tokenizer.py': 'turkish_tokenizer/turkish_tokenizer.py',
    'kokler_v05.json': 'turkish_tokenizer/kokler_v05.json',
    'ekler_v05.json': 'turkish_tokenizer/ekler_v05.json',
    'bpe_v05.json': 'turkish_tokenizer/bpe_v05.json'
}

# Token ID ranges
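# (Informational only; these ranges are not referenced elsewhere in this script.)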
TOKEN_RANGES = {
    'special': (0, 4),          # Special tokens
    'root_words': (5, 20000),   # Root words
    'suffixes': (22268, 22767), # Suffixes
    'bpe': (20000, None)        # BPE tokens (20000+)
}

def generate_colors(n):
    """Generate n visually distinct colors."""
    colors = []
    for i in range(n):
        hue = i / n
        saturation = 0.3 + (i % 3) * 0.1  # Vary saturation between 0.3-0.5
        value = 0.95 - (i % 2) * 0.1      # Vary value between 0.85-0.95
        rgb = colorsys.hsv_to_rgb(hue, saturation, value)
        hex_color = "#{:02x}{:02x}{:02x}".format(
            int(rgb[0] * 255),
            int(rgb[1] * 255),
            int(rgb[2] * 255)
        )
        colors.append(hex_color)
    return colors

def fetch_github_file(path, ref=GITHUB_BRANCH):
    """Fetch file content from GitHub repository."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/contents/{path}?ref={ref}"
    response = requests.get(url)
    if response.status_code == 200:
        content = base64.b64decode(response.json()['content']).decode('utf-8')
        return content
    else:
        st.error(f"Could not fetch {path} from GitHub: {response.status_code}")
        return None

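# Cache the loaded tokenizer so the GitHub files are fetched and patched only
# once per server process rather than on every rerun.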
@st.cache_resource
def load_tokenizer():
    """Load and initialize the tokenizer from GitHub."""
    temp_dir = Path("temp_tokenizer")
    temp_dir.mkdir(exist_ok=True)
    
    # Fetch required files
    for local_name, github_path in REQUIRED_FILES.items():
        content = fetch_github_file(github_path)
        if content is None:
            return None
        
        with open(temp_dir / local_name, 'w', encoding='utf-8') as f:
            f.write(content)
    
    # Modify tokenizer to use correct paths
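    # Inject an absolute-path body at the top of load_json so it reads the
    # vocabulary JSON files from the temp directory; the injected return makes
    # the original function body unreachable.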
    tokenizer_path = temp_dir / "tokenizer.py"
    with open(tokenizer_path, 'r', encoding='utf-8') as f:
        tokenizer_code = f.read()
    
    modified_code = tokenizer_code.replace(
        'def load_json(filename):',
        f'''def load_json(filename):
    full_path = os.path.join("{temp_dir.absolute()}", filename)
    with open(full_path, 'r', encoding='utf-8') as file:
        return json.load(file)'''
    )
    
    with open(tokenizer_path, 'w', encoding='utf-8') as f:
        f.write(modified_code)
    
    # Load module
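    # Import the patched file dynamically and register it in sys.modules under
    # the name "tokenizer" so any internal imports resolve to this module.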
    spec = importlib.util.spec_from_file_location("tokenizer", str(temp_dir / "tokenizer.py"))
    module = importlib.util.module_from_spec(spec)
    sys.modules["tokenizer"] = module
    spec.loader.exec_module(module)
    
    return module.tokenize

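# Cache the commit list for an hour (ttl=3600) to limit GitHub API calls.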
@st.cache_data(ttl=3600)
def get_commit_history():
    """Fetch commit history from GitHub."""
    url = f"https://api.github.com/repos/{GITHUB_REPO}/commits"
    try:
        response = requests.get(url)
        if response.status_code == 200:
            commits = response.json()
            versions = []
            for commit in commits[:10]:
                date = datetime.strptime(commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d')
                sha = commit['sha'][:7]
                message = commit['commit']['message'].split('\n')[0][:50]
                versions.append(f"{date} - {sha} - {message}")
            return versions
        return ["latest"]
    except Exception as e:
        st.warning(f"Could not fetch commit history: {str(e)}")
        return ["latest"]

def render_tokens(tokens, token_colors):
    """Render colored token visualization."""
    html_tokens = []
    for token in tokens:
        color = token_colors[token]
        display_text = SPECIAL_TOKEN_SYMBOLS.get(token, token)  # Use symbol for special tokens
        html_tokens.append(
            f'<span style="background-color: {color}; padding: 2px 4px; margin: 2px; border-radius: 3px;" title="{token}">{display_text}</span>'
        )
    return " ".join(html_tokens)

# Load tokenizer
tokenize = load_tokenizer()
if tokenize is None:
    st.error("Failed to load tokenizer from GitHub")
    st.stop()

# UI Layout
st.title("🇹🇷 Turkish Tiktokenizer")

# Model selection
versions = get_commit_history()
model = st.selectbox("", versions, key="model_selection", label_visibility="collapsed")
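# Note: the selected version is currently display-only; load_tokenizer() always
# fetches files from GITHUB_BRANCH regardless of this choice.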

# Main layout
col1, col2 = st.columns([0.4, 0.6])

# Input column
with col1:
    text = st.text_area(
        "Enter Turkish text to tokenize",
        value=st.session_state.text,
        height=200,
        key="text_input",
        label_visibility="collapsed",
        placeholder="Enter Turkish text to tokenize"
    )
    
    if st.button("Tokenize", type="primary"):
        st.session_state.text = text
        if text.strip():
            try:
                st.session_state.token_results = tokenize(text)
            except Exception as e:
                st.session_state.token_results = None
                st.error(f"Error tokenizing text: {str(e)}")
        else:
            st.session_state.token_results = None

# Results column
with col2:
    st.markdown("Token count")
    if st.session_state.token_results is not None:
        result = st.session_state.token_results
        token_count = len(result["tokens"])
        st.markdown(f"### {token_count}")
        
        st.markdown("Tokenized text")
        
        # Generate token colors
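        # Special tokens keep their fixed colors; every other distinct token
        # gets a visually distinct HSV-generated color.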
        regular_tokens = [t for t in result["tokens"] if t not in SPECIAL_COLORS]
        regular_token_colors = dict(zip(regular_tokens, generate_colors(len(regular_tokens))))
        token_colors = {**SPECIAL_COLORS, **regular_token_colors}
        
        # Render tokens
        with st.container():
            st.markdown(render_tokens(result["tokens"], token_colors), unsafe_allow_html=True)
        
        st.markdown("Token IDs")
        st.code(", ".join(map(str, result["ids"])), language=None)
    else:
        st.markdown("### 0")
        st.markdown("Tokenized text")
        st.markdown("")
        st.markdown("Token IDs")
        st.text("")

# Footer
st.markdown("""
<div style="position: fixed; bottom: 0; width: 100%; text-align: center; padding: 10px; background-color: white;">
    <a href="https://github.com/malibayram/tokenizer" target="_blank">View on GitHub</a>
</div>
""", unsafe_allow_html=True)

# Cleanup
def cleanup():
    if Path("temp_tokenizer").exists():
        shutil.rmtree("temp_tokenizer")

atexit.register(cleanup)