# TokenizerViz / app.py
# Author: prasanna kumar
# Added Llama 3 models: tokenize text and show tokens, or pass token IDs
# back to recover the original text.
# Commit: 5dca0b0
import ast

import gradio as gr
from transformers import AutoTokenizer

# Directory containing locally downloaded model checkpoints; the selected
# model name from the UI is appended to this path.
model_path = "models/"

# Models offered in the UI dropdown (folder names under ``model_path``).
MODELS = ["Meta-Llama-3.1-8B"]
# Tokenizers are expensive to load from disk; keep one instance per model
# name for the lifetime of the process instead of reloading per request.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return a cached tokenizer for *model_name*, loading it on first use."""
    if model_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name] = AutoTokenizer.from_pretrained(
            model_path + model_name
        )
    return _TOKENIZER_CACHE[model_name]


def _parse_token_ids(raw):
    """Parse the textbox contents into a list of token IDs.

    Accepts either a Python literal (e.g. ``[1, 2, 3]`` or a bare ``42``)
    or space/comma-separated integers (e.g. ``1 2 3`` / ``1, 2, 3``).

    Raises:
        ValueError/SyntaxError/TypeError: if the input cannot be parsed.
    """
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Fall back to the space/comma-separated form the UI hint suggests.
        parsed = [int(tok) for tok in raw.replace(",", " ").split()]
    if isinstance(parsed, int):
        # A bare integer literal is treated as a single-token list.
        parsed = [parsed]
    return list(parsed)


def process_input(input_type, input_value, model_name):
    """Tokenize text or decode token IDs with the selected model's tokenizer.

    Args:
        input_type: ``"Text"`` to tokenize, ``"Token IDs"`` to decode.
        input_value: Raw contents of the input textbox.
        model_name: Folder name of the model under ``model_path``.

    Returns:
        A 3-tuple shown in the three output boxes:
        (token-count summary, tokens or decoded text, token IDs).
        On bad input the first element is ``"Error"``.
    """
    tokenizer = _get_tokenizer(model_name)
    if input_type == "Text":
        tokens = tokenizer.tokenize(input_value)
        token_ids = tokenizer.encode(input_value)
        return f"Total tokens: {len(tokens)}", tokens, token_ids
    if input_type == "Token IDs":
        try:
            token_ids = _parse_token_ids(input_value)
        except (ValueError, SyntaxError, TypeError):
            return (
                "Error",
                "Invalid input. Please enter a list like [1, 2, 3] or "
                "space-separated integers for Token IDs.",
                "",
            )
        text = tokenizer.decode(token_ids)
        return f"Total tokens: {len(token_ids)}", text, input_value
    # Defensive default: the Radio widget should make this unreachable.
    return "Error", f"Unknown input type: {input_type}", ""
# ---- Gradio UI wiring ----
# Build the input/output widgets up front, then assemble the Interface.
input_type_radio = gr.Radio(["Text", "Token IDs"], label="Input Type", value="Text")
input_box = gr.Textbox(lines=5, label="Input")
model_dropdown = gr.Dropdown(choices=MODELS, label="Select Model")

count_output = gr.Textbox(label="Token Count")
tokens_output = gr.Textbox(label="Tokens", lines=10)
ids_output = gr.Textbox(label="Token IDS", lines=5)

iface = gr.Interface(
    fn=process_input,
    inputs=[input_type_radio, input_box, model_dropdown],
    outputs=[count_output, tokens_output, ids_output],
    title="LLM Tokenization and Token ID Converter",
    description="Enter text or token IDs and select a model to see the conversion results.",
)

# Launch the app (share=True also serves a temporary public URL).
iface.launch(share=True)