File size: 1,791 Bytes
9715926 ca4f672 9715926 76d1dbc ca4f672 76d1dbc ca4f672 76d1dbc e5aaf6d 9715926 b6678bf 9715926 76d1dbc 44c45da 9715926 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
from transformers import AutoTokenizer
# NOTE(review): this module-level component is not passed to the gr.Interface
# built later in this file (that call constructs its own gr.HTML output), and
# tokenize_text assigns a local variable with the same name, which shadows this
# one inside the function. It looks like a leftover; confirm before removing.
chart_html = gr.HTML(label="Token Frequency Chart")
def create_token_frequency_chart(tokens):
    """Render a token-frequency bar chart as an HTML table.

    Args:
        tokens: Sequence of token strings, in the order they occur.

    Returns:
        An HTML string with one table row per distinct token, ordered from
        most to least frequent (ties keep first-seen order). A short
        placeholder paragraph is returned when ``tokens`` is empty.
    """
    # Function-scope imports keep this helper self-contained (stdlib only).
    from collections import Counter
    from html import escape

    if not tokens:
        return "<p>No tokens to display.</p>"

    counts = Counter(tokens)
    max_count = max(counts.values())
    rows = []
    for token, count in counts.most_common():
        # Bar width is relative to the most frequent token.
        width = 100 * count / max_count
        rows.append(
            "<tr>"
            # Tokens originate from user-supplied text, so escape them before
            # embedding them in markup.
            f"<td>{escape(str(token))}</td>"
            '<td style="width:60%">'
            f'<div style="background:#4f8cc9;height:1em;width:{width:.0f}%"></div>'
            "</td>"
            f"<td>{count}</td>"
            "</tr>"
        )
    return '<table style="width:100%">' + "".join(rows) + "</table>"


# Define a function to tokenize text and create visualization
def tokenize_text(text, tokenizer_name):
    """Tokenize ``text`` with the selected tokenizer and build the UI outputs.

    Args:
        text: The text entered by the user.
        tokenizer_name: Hugging Face model id whose tokenizer should be loaded.

    Returns:
        A 4-tuple ``(tokens, input_ids, decoded_text, chart_html)`` with one
        value per output component of the interface: the token list and the
        id list rendered as strings, the text decoded back from the ids, and
        an HTML token-frequency chart.

    Raises:
        gr.Error: If no tokenizer has been selected.
    """
    if not tokenizer_name:
        # The dropdown yields no value until the user picks an entry.
        raise gr.Error("Please select a tokenizer.")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenized_text = tokenizer.tokenize(text or "")
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    decoded_text = tokenizer.decode(input_ids)
    # Create visualization HTML
    chart_html = create_token_frequency_chart(tokenized_text)
    # One return value per output component, in the same order.
    return (
        str(tokenized_text),
        str(input_ids),
        decoded_text,
        chart_html,
    )
# Hugging Face model ids whose tokenizers can be picked in the UI.
tokenizer_names = [
    # riotu-lab ArabianGPT checkpoints
    "riotu-lab/ArabianGPT-01B",
    "riotu-lab/ArabianGPT-03B",
    "riotu-lab/ArabianGPT-08B",
    # FreedomIntelligence AceGPT checkpoints
    "FreedomIntelligence/AceGPT-13B",
    "FreedomIntelligence/AceGPT-7B",
    # inception-mbzuai Jais checkpoint
    "inception-mbzuai/jais-13b",
    # aubmindlab AraGPT2 checkpoints
    "aubmindlab/aragpt2-base",
    "aubmindlab/aragpt2-medium",
    "aubmindlab/aragpt2-large",
    "aubmindlab/aragpt2-mega",
]
# Create the Gradio interface.
# Gradio maps the values returned by `fn` onto the output components
# positionally, so `fn` must return one value per component listed below.
iface = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
    ],
    outputs=[
        gr.Textbox(label="Tokenized Text"),
        gr.Textbox(label="Input IDs"),
        gr.Textbox(label="Decoded Text"),
        gr.HTML(label="Token Frequency Chart"),  # Include chart_html
    ],  # this comma was missing, which made the whole call a SyntaxError
    title="Kalemat: Explore Arabic Tokenizers",
    description="This interactive tool allows you to experiment with different Arabic tokenizers and see how they break down text into individual units. Try out various tokenizers and observe the tokenized form, input IDs, and decoded text to gain insights into the tokenization process",
)
# Launch the app
iface.launch()