import gradio as gr import json # Load the tokenizer vocabulary def load_tokenizer(): with open('kannada_tokenizer.json', 'r', encoding='utf-8') as f: tokenizer = json.load(f) # Reconstruct itos from stoi tokenizer['itos'] = {str(v): k for k, v in tokenizer['stoi'].items()} return tokenizer def encode_text(text: str) -> str: """Convert text to token indices""" tokenizer = load_tokenizer() try: # Convert text to tokens using the stoi (string to integer) mapping tokens = [] current_pos = 0 while current_pos < len(text): # Try to match the longest possible token found = False for token_length in range(min(tokenizer['max_token_length'], len(text) - current_pos), 0, -1): substr = text[current_pos:current_pos + token_length] if substr in tokenizer['stoi']: tokens.append(tokenizer['stoi'][substr]) current_pos += token_length found = True break if not found: return f"Error: Unable to encode character at position {current_pos}" return str(tokens) except Exception as e: return f"Error encoding text: {str(e)}" def decode_tokens(token_string: str) -> str: """Convert token indices back to text""" tokenizer = load_tokenizer() try: # Clean the input string and convert to list of integers token_string = token_string.strip('[]').replace(' ', '') if not token_string: return "Error: Empty input" tokens = [int(t) for t in token_string.split(',') if t] # Convert tokens back to text using the itos (integer to string) mapping result = "" for token in tokens: token_str = str(token) if token_str not in tokenizer['itos']: return f"Error: Invalid token {token}" result += tokenizer['itos'][token_str] return result except ValueError: return "Error: Invalid input format. Please enter numbers separated by commas" except Exception as e: return f"Error decoding tokens: {str(e)}" # Create Gradio interface def create_interface(): with gr.Blocks(title="Kannada Text Tokenizer") as interface: gr.Markdown("# Kannada Text Tokenizer") with gr.Tab("Encode"): with gr.Row(): input_text = gr.Textbox(label="Enter Kannada Text", lines=5) encode_button = gr.Button("Encode") encoded_output = gr.Textbox(label="Encoded Tokens", lines=5) encode_button.click(fn=encode_text, inputs=input_text, outputs=encoded_output) with gr.Tab("Decode"): with gr.Row(): input_tokens = gr.Textbox(label="Enter Token List (e.g., [120, 135, 171])", lines=5) decode_button = gr.Button("Decode") decoded_output = gr.Textbox(label="Decoded Text", lines=5) decode_button.click(fn=decode_tokens, inputs=input_tokens, outputs=decoded_output) gr.Markdown(""" ### Instructions: - **Encode**: Enter Kannada text in the input box and click 'Encode' to get token indices - **Decode**: Enter a list of token indices and click 'Decode' to get back the original text - Token indices must be in the format: [123, 456, 789] """) return interface if __name__ == "__main__": interface = create_interface() interface.launch(share=False)