Spaces:

Omartificial-Intelligence-Space
/

Kalemat

Sleeping

App Files Files Community

Kalemat / app.py

Omartificial-Intelligence-Space

Update app.py

76d1dbc verified over 1 year ago

raw

history blame

1.79 kB

	import html
	from collections import Counter

	import gradio as gr
	from transformers import AutoTokenizer

	# Standalone HTML component labelled "Token Frequency Chart".
	# NOTE(review): the gr.Interface call further down constructs its own
	# gr.HTML output instead of passing this object — confirm whether this
	# module-level component is still needed.
	chart_html = gr.HTML(label="Token Frequency Chart")

	def create_token_frequency_chart(tokens, top_n=20):
	    """Render the most frequent tokens as an HTML bar chart.

	    Args:
	        tokens: Iterable of token strings.
	        top_n: Maximum number of distinct tokens to draw.

	    Returns:
	        An HTML fragment: a table with one row per token (most frequent
	        first) holding the token, a bar scaled relative to the most
	        frequent token, and the raw count. A short placeholder paragraph is
	        returned when there are no tokens.
	    """
	    counts = Counter(tokens).most_common(top_n)
	    if not counts:
	        return "<p>No tokens to display.</p>"

	    peak = counts[0][1]  # most_common() sorts by count, so the first is the max
	    rows = []
	    for token, count in counts:
	        width = round(100 * count / peak)
	        # Tokens come from user-supplied text, so escape them before they
	        # are embedded in markup.
	        label = html.escape(str(token))
	        rows.append(
	            "<tr>"
	            f"<td style='padding-right:8px'><code>{label}</code></td>"
	            "<td style='width:100%'>"
	            f"<div style='background:#4c78a8;height:12px;width:{width}%'></div>"
	            "</td>"
	            f"<td style='padding-left:8px'>{count}</td>"
	            "</tr>"
	        )
	    return "<table style='width:100%'>" + "".join(rows) + "</table>"


	# Define a function to tokenize text and create visualization
	def tokenize_text(text, tokenizer_name):
	    """Tokenize *text* with the chosen tokenizer and summarise the result.

	    Args:
	        text: The text to tokenize.
	        tokenizer_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

	    Returns:
	        A 4-tuple ``(tokens, input_ids, decoded_text, chart_html)`` — one
	        value per output component declared on the Gradio interface. The
	        first two are the string forms of the token list and the id list.

	    Raises:
	        gr.Error: If no tokenizer has been selected.
	    """
	    if not tokenizer_name:
	        # An untouched dropdown yields no value; report that to the user
	        # instead of failing inside from_pretrained.
	        raise gr.Error("Please select a tokenizer.")

	    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
	    tokens = tokenizer.tokenize(text)
	    input_ids = tokenizer.convert_tokens_to_ids(tokens)
	    decoded_text = tokenizer.decode(input_ids)

	    return (
	        str(tokens),
	        str(input_ids),
	        decoded_text,
	        create_token_frequency_chart(tokens),
	    )


	# Tokenizer checkpoints offered to the user, grouped by publisher.
	tokenizer_names = [
	    # riotu-lab
	    "riotu-lab/ArabianGPT-01B",
	    "riotu-lab/ArabianGPT-03B",
	    "riotu-lab/ArabianGPT-08B",
	    # FreedomIntelligence
	    "FreedomIntelligence/AceGPT-13B",
	    "FreedomIntelligence/AceGPT-7B",
	    # inception-mbzuai
	    "inception-mbzuai/jais-13b",
	    # aubmindlab
	    "aubmindlab/aragpt2-base",
	    "aubmindlab/aragpt2-medium",
	    "aubmindlab/aragpt2-large",
	    "aubmindlab/aragpt2-mega",
	]

	# Create the Gradio interface
	iface = gr.Interface(
	    fn=tokenize_text,
	    inputs=[
	        gr.Textbox(label="Enter Text"),
	        gr.Dropdown(choices=tokenizer_names, label="Select Tokenizer"),
	    ],
	    # Gradio expects fn to return one value per output component, in this
	    # order.
	    outputs=[
	        gr.Textbox(label="Tokenized Text"),
	        gr.Textbox(label="Input IDs"),
	        gr.Textbox(label="Decoded Text"),
	        gr.HTML(label="Token Frequency Chart"),  # Include chart_html
	    ],  # the comma here was missing, which made the whole call a SyntaxError
	    title="Kalemat: Explore Arabic Tokenizers",
	    description="This interactive tool allows you to experiment with different Arabic tokenizers and see how they break down text into individual units. Try out various tokenizers and observe the tokenized form, input IDs, and decoded text to gain insights into the tokenization process",
	)

	# Launch the app
	iface.launch()