File size: 7,748 Bytes
2d0a0f5
5dca0b0
 
e88497a
 
 
0d3569b
 
 
 
2d0a0f5
e88497a
5dca0b0
 
0d3569b
 
 
 
 
 
 
 
 
 
e88497a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dca0b0
0d3569b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e43644
0d3569b
6e43644
0d3569b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e43644
0d3569b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dca0b0
0d3569b
5dca0b0
0d3569b
5dca0b0
e88497a
 
 
0d3569b
 
e88497a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d3569b
 
 
e88497a
b9a925b
 
 
 
 
 
e88497a
 
 
 
 
 
0d3569b
e88497a
0d3569b
e88497a
 
b9a925b
 
 
 
e88497a
 
 
0d3569b
 
e88497a
 
 
 
 
 
 
b9a925b
 
 
 
 
 
 
 
 
 
e88497a
 
0d3569b
 
e88497a
2d0a0f5
86eb1ac
e88497a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import gradio as gr
from transformers import AutoTokenizer
import ast
from collections import Counter
import re
import plotly.graph_objs as go
import html
import random
import tiktoken
import anthropic

# Path prefix prepended to a model name when loading a Hugging Face tokenizer.
model_path = "models/"

# Models offered in the UI dropdown.
# NOTE: every entry needs its own comma -- adjacent string literals are
# silently concatenated by Python, which would merge two model names into one.
MODELS = [
    "Meta-Llama-3.1-8B",
    "gemma-2b",
    "gpt-3.5-turbo",
    "gpt-4",
    "gpt-4o",
    "Claude-3-Sonnet",
]
# Subset of MODELS that is tokenized with tiktoken.
openai_models = ["gpt-3.5-turbo", "gpt-4", "gpt-4o"]
# Color palette visible on both light and dark themes
COLOR_PALETTE = [
    "#e6194B", "#3cb44b", "#ffe119", "#4363d8",
    "#f58231", "#911eb4", "#42d4f4", "#f032e6",
    "#bfef45", "#fabed4", "#469990", "#dcbeff",
    "#9A6324", "#fffac8", "#800000", "#aaffc3",
    "#808000", "#ffd8b1", "#000075", "#a9a9a9"
]

def create_vertical_histogram(data, title):
    """Build a Plotly bar chart from ``(label, count)`` pairs.

    Args:
        data: Sequence of 2-item pairs; the first items become the x values
            and the second items the y values. May be empty, in which case
            an empty chart is produced.
        title: Text shown as the chart title.

    Returns:
        The configured ``go.Figure``.
    """
    if data:
        categories, counts = zip(*data)
    else:
        categories, counts = [], []

    bars = go.Bar(x=categories, y=counts)
    figure = go.Figure(bars)

    layout_options = dict(
        title=title,
        xaxis_title="Item",
        yaxis_title="Count",
        height=400,
        # Tilt the tick labels so neighbouring labels do not overlap.
        xaxis=dict(tickangle=-45),
    )
    figure.update_layout(**layout_options)
    return figure

def process_text(text: str, model_name: str, api_key: str = None):
    """Tokenize *text* with the tokenizer that belongs to *model_name*.

    Args:
        text: The input string to tokenize.
        model_name: One of the supported model names; selects the backend
            (Hugging Face ``AutoTokenizer``, ``tiktoken`` or the Anthropic
            client tokenizer).
        api_key: Anthropic API key; only consulted for "Claude-3-Sonnet".

    Returns:
        A ``(text, tokens, token_ids)`` tuple: the unchanged input text, the
        per-token strings and the token ids.

    Raises:
        ValueError: If the model is not supported, or a Claude model is
            requested without an API key.
    """
    # Hugging Face tokenizers are loaded from model_path + model_name.
    if model_name in ["Meta-Llama-3.1-8B", "gemma-2b"]:
        hf_tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)
        token_ids = hf_tokenizer.encode(text, add_special_tokens=True)
        pieces = hf_tokenizer.convert_ids_to_tokens(token_ids)
        return text, pieces, token_ids

    # OpenAI models: decode every id on its own to get one string per token.
    if model_name in openai_models:
        encoding = tiktoken.encoding_for_model(model_name=model_name)
        token_ids = encoding.encode(text)
        pieces = [encoding.decode([token_id]) for token_id in token_ids]
        return text, pieces, token_ids

    if model_name == "Claude-3-Sonnet":
        if not api_key:
            raise ValueError("API key is required for Claude models")
        client = anthropic.Anthropic(api_key=api_key)
        claude_tokenizer = client.get_tokenizer()
        token_ids = claude_tokenizer.encode(text).ids
        pieces = [claude_tokenizer.decode([token_id]) for token_id in token_ids]
        return text, pieces, token_ids

    raise ValueError(f"Unsupported model: {model_name}")

def process_ids(ids: str, model_name: str, api_key: str = None):
    """Decode a textual list of token ids back into text and tokens.

    Args:
        ids: String holding a Python literal list/tuple of integers, e.g.
            ``"[1, 2, 3]"``. A single bare integer is accepted as a
            one-element list.
        model_name: One of the supported model names; selects the backend
            (Hugging Face ``AutoTokenizer``, ``tiktoken`` or the Anthropic
            client tokenizer).
        api_key: Anthropic API key; only consulted for "Claude-3-Sonnet".

    Returns:
        A ``(text, tokens, token_ids)`` tuple: the decoded text, the
        per-token strings and the parsed ids as a list.

    Raises:
        ValueError: If *ids* is not a list of integers, the model is not
            supported, or a Claude model is requested without an API key.
    """
    invalid_ids_message = "Token IDs must be a list of integers, e.g. [1, 2, 3]"

    # literal_eval only evaluates Python literals, so text typed into the UI
    # is parsed without executing arbitrary code.
    try:
        parsed = ast.literal_eval(ids)
    except (ValueError, SyntaxError) as err:
        raise ValueError(invalid_ids_message) from err

    # bool is a subclass of int; reject it explicitly so True/False are not
    # treated as the ids 1/0.
    def _is_token_id(value):
        return isinstance(value, int) and not isinstance(value, bool)

    if _is_token_id(parsed):
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple)) or not all(map(_is_token_id, parsed)):
        raise ValueError(invalid_ids_message)
    token_ids = list(parsed)

    if model_name in ["Meta-Llama-3.1-8B", "gemma-2b"]:
        tokenizer = AutoTokenizer.from_pretrained(model_path + model_name)
        text = tokenizer.decode(token_ids)
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
    # Membership test: comparing the name to the list with == is always False
    # and would send every OpenAI model to the "Unsupported model" branch.
    elif model_name in openai_models:
        encoding = tiktoken.encoding_for_model(model_name=model_name)
        text = encoding.decode(token_ids)
        tokens = [encoding.decode([token_id]) for token_id in token_ids]
    elif model_name == "Claude-3-Sonnet":
        # Same precondition as process_text: fail early with a clear message.
        if not api_key:
            raise ValueError("API key is required for Claude models")
        client = anthropic.Anthropic(api_key=api_key)
        tokenizer = client.get_tokenizer()
        text = tokenizer.decode(token_ids)
        tokens = [tokenizer.decode([token_id]) for token_id in token_ids]
    else:
        raise ValueError(f"Unsupported model: {model_name}")

    return text, tokens, token_ids

def get_token_color(token, token_colors):
    """Pick the background color used to render *token*.

    Fixed colors are used for three categories, checked in this order:
    tokens wrapped in ``<...>``, bare space tokens, and tokens that are not
    purely alphanumeric. Any other token gets a random palette color that is
    remembered in *token_colors* so repeated tokens share a color.

    Args:
        token: The token string.
        token_colors: Mutable mapping of token -> color; updated in place
            when a new alphanumeric token is seen.

    Returns:
        A hex color string.
    """
    looks_like_special_token = token.startswith('<') and token.endswith('>')
    if looks_like_special_token:
        return "#42d4f4"  # Cyan for special tokens

    if token in ('▁', ' '):
        return "#3cb44b"  # Green for space tokens

    if not token.isalnum():
        return "#f032e6"  # Magenta for special characters

    # Reuse the color already assigned to this token, if any.
    if token in token_colors:
        return token_colors[token]

    chosen = random.choice(COLOR_PALETTE)
    token_colors[token] = chosen
    return chosen

def create_html_tokens(tokens):
    """Render *tokens* as colored ``<span>`` elements inside a styled ``<div>``.

    Each token is HTML-escaped and given a background color chosen by
    ``get_token_color``; one color cache is shared across the whole call.

    Args:
        tokens: Iterable of token strings.

    Returns:
        The HTML markup as a single string.
    """
    container_open = '<div style="font-family: monospace; border: 1px solid #ccc; padding: 10px; border-radius: 5px; background-color: #f9f9f9; white-space: pre-wrap; word-break: break-all;">'
    container_close = '</div>'

    color_cache = {}
    spans = []
    for piece in tokens:
        background = get_token_color(piece, color_cache)
        safe_text = html.escape(piece)
        spans.append(
            f'<span style="background-color: {background}; color: black; padding: 2px 4px; margin: 1px; border-radius: 3px; display: inline-block;">{safe_text}</span>'
        )

    return container_open + "".join(spans) + container_close

def process_input(input_type, input_value, model_name, api_key):
    """Tokenize or detokenize the user input and compute summary statistics.

    Args:
        input_type: Either "Text" (tokenize *input_value*) or "Token IDs"
            (decode *input_value* as a list of ids).
        input_value: The raw string entered by the user.
        model_name: Name of the model whose tokenizer should be used.
        api_key: API key forwarded to the tokenizer helpers.

    Returns:
        A 7-tuple ``(analysis, text, html_tokens, token_ids_str, words_hist,
        special_chars_hist, numbers_hist)``: a multi-line statistics string,
        the text, the colored-token HTML, ``str(token_ids)`` and three
        histogram figures.

    Raises:
        ValueError: If *input_type* is not one of the two supported values
            (or whatever the tokenizer helpers raise).
    """
    if input_type == "Text":
        text, tokens, token_ids = process_text(text=input_value, model_name=model_name, api_key=api_key)
    elif input_type == "Token IDs":
        text, tokens, token_ids = process_ids(ids=input_value, model_name=model_name, api_key=api_key)
    else:
        # Without this branch an unknown (or unset) input type would surface
        # later as an UnboundLocalError on `text`.
        raise ValueError(f"Unsupported input type: {input_type!r}")

    character_count = len(text)
    word_count = len(text.split())

    # Token-level statistics: bare space tokens vs. other non-alphanumeric tokens.
    space_tokens = ('▁', ' ')
    space_count = sum(1 for token in tokens if token in space_tokens)
    special_char_count = sum(
        1 for token in tokens if not token.isalnum() and token not in space_tokens
    )

    # Text-level statistics used for the three histograms.
    words = re.findall(r'\b\w+\b', text.lower())
    special_chars = re.findall(r'[^\w\s]', text)
    numbers = re.findall(r'\d+', text)

    most_common_words = Counter(words).most_common(10)
    most_common_special_chars = Counter(special_chars).most_common(10)
    most_common_numbers = Counter(numbers).most_common(10)

    words_hist = create_vertical_histogram(most_common_words, "Most Common Words")
    special_chars_hist = create_vertical_histogram(most_common_special_chars, "Most Common Special Characters")
    numbers_hist = create_vertical_histogram(most_common_numbers, "Most Common Numbers")

    analysis = "\n".join([
        f"Token count: {len(tokens)}",
        f"Character count: {character_count}",
        f"Word count: {word_count}",
        f"Space tokens: {space_count}",
        f"Special character tokens: {special_char_count}",
        f"Other tokens: {len(tokens) - space_count - special_char_count}",
    ])

    html_tokens = create_html_tokens(tokens)

    return analysis, text, html_tokens, str(token_ids), words_hist, special_chars_hist, numbers_hist

def text_example():
    """Return the sample sentence loaded by the "Load Text Example" button."""
    sample_text = "Hello, world! This is an example text input for tokenization."
    return sample_text

def token_ids_example():
    """Return the sample id list loaded by the "Load Token IDs Example" button.

    The value is a string holding a Python list literal of integers.
    """
    sample_ids = "[128000, 9906, 11, 1917, 0, 1115, 374, 459, 3187, 1495, 1988, 369, 4037, 2065, 13]"
    return sample_ids

# Gradio UI: inputs at the top, a "Process" button, then the analysis outputs.
with gr.Blocks() as iface:
    gr.Markdown("# LLM Tokenization - Convert Text to tokens and vice versa!")
    gr.Markdown("Enter text or token IDs and select a model to see the results, including word count, token analysis, and histograms of most common elements.")

    with gr.Row():
        input_type = gr.Radio(["Text", "Token IDs"], label="Input Type", value="Text")
        model_name = gr.Dropdown(choices=MODELS, label="Select Model", value=MODELS[0])

    # Label previously read "API Key Claude models)" with an unbalanced parenthesis.
    api_key = gr.Textbox(label="API Key (Claude models)", type="password")
    input_text = gr.Textbox(lines=5, label="Input")

    with gr.Row():
        text_example_button = gr.Button("Load Text Example")
        token_ids_example_button = gr.Button("Load Token IDs Example")

    submit_button = gr.Button("Process")

    analysis_output = gr.Textbox(label="Analysis", lines=6)
    text_output = gr.Textbox(label="Text", lines=6)
    tokens_output = gr.HTML(label="Tokens")
    token_ids_output = gr.Textbox(label="Token IDs", lines=2)

    with gr.Row():
        words_plot = gr.Plot(label="Most Common Words")
        special_chars_plot = gr.Plot(label="Most Common Special Characters")
        numbers_plot = gr.Plot(label="Most Common Numbers")

    # Each example button fills the input box and switches the input-type
    # radio to the matching mode.
    text_example_button.click(
        lambda: (text_example(), "Text"),
        outputs=[input_text, input_type]
    )

    token_ids_example_button.click(
        lambda: (token_ids_example(), "Token IDs"),
        outputs=[input_text, input_type]
    )

    # The order of `outputs` must match the tuple returned by process_input.
    submit_button.click(
        process_input,
        inputs=[input_type, input_text, model_name, api_key],
        outputs=[analysis_output, text_output, tokens_output, token_ids_output, words_plot, special_chars_plot, numbers_plot]
    )

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()