import gradio as gr
from transformers import AutoTokenizer
import pandas as pd
import json


def process_text(
    model_name, text, include_special_tokens=False, show_attention_mask=False
):
    """
    Tokenize `text` with the selected Hugging Face tokenizer and return a
    per-token DataFrame, a statistics summary, and a JSON dump of the result.
    """
    try:
        # Dynamically load the tokenizer based on the selected model name
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        return (
            pd.DataFrame(
                [
                    {
                        "Error": (
                            f"Could not load tokenizer for '{model_name}': {e}. "
                            "Please ensure the model name is correct and accessible "
                            "(e.g., through the Hugging Face Hub or a local path)."
                        )
                    }
                ]
            ),
            "",
            "",
        )

    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)

    # Use tokenizer.tokenize and tokenizer.encode for consistency and general compatibility
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.encode(text)

    # Adjust special-token handling based on the flag
    if not include_special_tokens:
        # Attempt to remove special tokens by decoding and then re-encoding without them.
        # This aims for a general solution but may behave differently for tokenizers
        # with complex special-token handling or tokens that are meant to be inseparable.
        try:
            decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
            token_ids = tokenizer.encode(decoded_text, add_special_tokens=False)
            tokens = tokenizer.tokenize(decoded_text, add_special_tokens=False)
        except Exception as e:
            # Fallback: if stripping special tokens fails, keep the original tokens
            # and IDs (including special tokens) rather than failing the whole request.
            print(f"Warning: Could not remove special tokens for {model_name}. Error: {e}")
            tokens = tokenizer.tokenize(text)
            token_ids = tokenizer.encode(text)
    else:
        # encode() adds special tokens but tokenize() does not, so rebuild the token
        # strings from the IDs to keep both lists aligned in the output table.
        tokens = tokenizer.convert_ids_to_tokens(token_ids)

    token_info = []
    # Ensure tokens and token_ids have matching lengths before pairing them up
    min_len = min(len(tokens), len(token_ids))
    for i in range(min_len):
        info = {
            "Token": tokens[i],
            "ID": token_ids[i],
        }
        # Check that the attention mask covers position i before accessing it.
        # Note: positions are only approximate when special tokens were stripped above.
        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
            info["Attention Mask"] = encoding["attention_mask"][0][i]
        token_info.append(info)

    df = pd.DataFrame(token_info)

    # Guard against empty input so the tokens/character ratio does not divide by zero
    ratio = len(tokens) / len(text) if text else 0.0
    stats = f"""
Number of tokens: {len(tokens)}
Input text length: {len(text)}
Tokens/character ratio: {ratio:.2f}
Vocabulary size: {tokenizer.vocab_size}
"""

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
        ensure_ascii=False,  # keep non-ASCII characters unescaped in the JSON output
    )

    return df, stats, json_output


# Models available in the dropdown (Hub repo IDs or local paths)
model_choices = [
    "roberta-base",
    "klue/roberta-large",
    "distilbert/distilbert-base-uncased",
    "BAAI/bge-m3-retromae",
    "DTAI-KULeuven/robbert-2023-dutch-base",
    "DTAI-KULeuven/robbert-2023-dutch-large",
]

iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Dropdown(
            choices=model_choices,
            value="roberta-base",
            label="Select Model",
        ),
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="Hugging Face Tokenizer Playground",
    description="""
    An interactive demonstration of various Hugging Face tokenizers.
    Select a model from the dropdown to see how it tokenizes your input text.
    """,
    theme="default",
)

if __name__ == "__main__":
    iface.launch(share=True)
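
# Quick sanity check (illustrative sketch, not part of the Gradio app itself):
# process_text can also be called directly; this assumes the "roberta-base"
# tokenizer is available locally or can be downloaded from the Hugging Face Hub.
#
#   df, stats, json_output = process_text("roberta-base", "Hello, world!")
#   print(df)           # per-token table: Token, ID (and Attention Mask if requested)
#   print(stats)        # token count, input length, tokens/character ratio, vocab size
#   print(json_output)  # the same tokens and IDs as a JSON string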