import gradio as gr
import json

# Load the tokenizer vocabulary
def load_tokenizer():
    with open('kannada_tokenizer.json', 'r', encoding='utf-8') as f:
        tokenizer = json.load(f)
    # Reconstruct itos (integer-to-string) from stoi so both mappings are available
    tokenizer['itos'] = {str(v): k for k, v in tokenizer['stoi'].items()}
    return tokenizer
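
# For reference, kannada_tokenizer.json is assumed to look roughly like this
# (a sketch; the real file carries a much larger vocabulary):
#
#   {
#       "stoi": {"ನ": 0, "ಮ": 1, "ನಮ": 2, ...},
#       "max_token_length": 4
#   }
#
# Only 'stoi' and 'max_token_length' need to be stored; 'itos' is rebuilt
# above from 'stoi'.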
def encode_text(text: str) -> str:
    """Convert text to token indices"""
    tokenizer = load_tokenizer()
    try:
        # Convert text to tokens using the stoi (string-to-integer) mapping
        tokens = []
        current_pos = 0
        while current_pos < len(text):
            # Greedily try to match the longest possible token at this position
            found = False
            for token_length in range(min(tokenizer['max_token_length'], len(text) - current_pos), 0, -1):
                substr = text[current_pos:current_pos + token_length]
                if substr in tokenizer['stoi']:
                    tokens.append(tokenizer['stoi'][substr])
                    current_pos += token_length
                    found = True
                    break
            if not found:
                return f"Error: Unable to encode character at position {current_pos}"
        return str(tokens)
    except Exception as e:
        return f"Error encoding text: {str(e)}"
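
# Illustrative example of the greedy longest-match above (token IDs are made
# up; real values depend on the vocabulary): with stoi = {"ನ": 1, "ಮ": 2, "ನಮ": 7},
# encode_text("ನಮ") matches the two-character token first and returns "[7]"
# rather than "[1, 2]".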
def decode_tokens(token_string: str) -> str:
    """Convert token indices back to text"""
    tokenizer = load_tokenizer()
    try:
        # Clean the input string and convert it to a list of integers
        token_string = token_string.strip('[]').replace(' ', '')
        if not token_string:
            return "Error: Empty input"
        tokens = [int(t) for t in token_string.split(',') if t]
        # Convert tokens back to text using the itos (integer-to-string) mapping
        result = ""
        for token in tokens:
            token_str = str(token)
            if token_str not in tokenizer['itos']:
                return f"Error: Invalid token {token}"
            result += tokenizer['itos'][token_str]
        return result
    except ValueError:
        return "Error: Invalid input format. Please enter numbers separated by commas"
    except Exception as e:
        return f"Error decoding tokens: {str(e)}"
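
# Quick round-trip check (token IDs below are placeholders; actual values come
# from kannada_tokenizer.json):
#
#   encoded = encode_text("ನಮಸ್ಕಾರ")   # e.g. "[120, 135, 171]"
#   decode_tokens(encoded)              # -> "ನಮಸ್ಕಾರ"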
# Create the Gradio interface
def create_interface():
    with gr.Blocks(title="Kannada Text Tokenizer") as interface:
        gr.Markdown("# Kannada Text Tokenizer")
        with gr.Tab("Encode"):
            with gr.Row():
                input_text = gr.Textbox(label="Enter Kannada Text", lines=5)
            encode_button = gr.Button("Encode")
            encoded_output = gr.Textbox(label="Encoded Tokens", lines=5)
            encode_button.click(fn=encode_text, inputs=input_text, outputs=encoded_output)
        with gr.Tab("Decode"):
            with gr.Row():
                input_tokens = gr.Textbox(label="Enter Token List (e.g., [120, 135, 171])", lines=5)
            decode_button = gr.Button("Decode")
            decoded_output = gr.Textbox(label="Decoded Text", lines=5)
            decode_button.click(fn=decode_tokens, inputs=input_tokens, outputs=decoded_output)
        gr.Markdown("""
        ### Instructions:
        - **Encode**: Enter Kannada text in the input box and click 'Encode' to get token indices
        - **Decode**: Enter a list of token indices and click 'Decode' to get back the original text
        - Token indices must be in the format: [123, 456, 789]
        """)
    return interface
if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=False)
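
# To try the app without the real vocabulary, a throwaway character-level
# kannada_tokenizer.json can be generated like this (a sketch, not part of the
# Space itself):
#
#   import json
#   vocab = {ch: i for i, ch in enumerate("ನಮಸ್ಕಾರ ")}
#   with open('kannada_tokenizer.json', 'w', encoding='utf-8') as f:
#       json.dump({'stoi': vocab, 'max_token_length': 1}, f, ensure_ascii=False)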