import functools
import json

import gradio as gr

@functools.lru_cache(maxsize=1)
def load_tokenizer():
    """Load the tokenizer vocabulary from ``kannada_tokenizer.json``.

    The result is cached after the first call: previously the file was
    re-read and the inverse mapping rebuilt on every encode/decode
    request, even though the vocabulary never changes while the app runs.

    Returns:
        dict: The tokenizer data, containing at least ``stoi`` (token
        string -> integer id) plus a reconstructed ``itos`` mapping whose
        keys are the ids as *strings* (JSON object keys are strings, so
        this keeps lookups consistent after any round-trip).
    """
    with open('kannada_tokenizer.json', 'r', encoding='utf-8') as f:
        tokenizer = json.load(f)
    # Build the inverse (id -> token) mapping from stoi.
    tokenizer['itos'] = {str(v): k for k, v in tokenizer['stoi'].items()}
    return tokenizer

def encode_text(text: str) -> str:
    """Encode *text* into a list of token ids using greedy longest-match.

    At each position the longest vocabulary entry (up to the tokenizer's
    ``max_token_length``) that matches the remaining text is consumed.
    Returns the id list rendered as a string, or an "Error: ..." message
    suitable for display in the UI.
    """
    tokenizer = load_tokenizer()
    try:
        stoi = tokenizer['stoi']
        max_len = tokenizer['max_token_length']
        total = len(text)
        tokens = []
        pos = 0
        while pos < total:
            # Try the longest candidate first, shrinking one char at a time.
            for length in range(min(max_len, total - pos), 0, -1):
                piece = text[pos:pos + length]
                if piece in stoi:
                    tokens.append(stoi[piece])
                    pos += length
                    break
            else:
                # No vocabulary entry matched even a single character.
                return f"Error: Unable to encode character at position {pos}"

        return str(tokens)
    except Exception as e:
        return f"Error encoding text: {str(e)}"

def decode_tokens(token_string: str) -> str:
    """Decode a comma-separated (optionally bracketed) id list back to text.

    Accepts input like ``[120, 135, 171]`` or ``120,135,171``. Returns the
    decoded string, or an "Error: ..." message suitable for display in
    the UI when the input is malformed or contains unknown ids.
    """
    tokenizer = load_tokenizer()
    try:
        # Drop surrounding brackets and any spaces before parsing.
        cleaned = token_string.strip('[]').replace(' ', '')
        if not cleaned:
            return "Error: Empty input"

        token_ids = [int(piece) for piece in cleaned.split(',') if piece]

        # Look each id up in itos (keys are ids as strings).
        itos = tokenizer['itos']
        pieces = []
        for token_id in token_ids:
            key = str(token_id)
            if key not in itos:
                return f"Error: Invalid token {token_id}"
            pieces.append(itos[key])

        return ''.join(pieces)
    except ValueError:
        return "Error: Invalid input format. Please enter numbers separated by commas"
    except Exception as e:
        return f"Error decoding tokens: {str(e)}"

def create_interface():
    """Build the two-tab (Encode / Decode) Gradio Blocks UI.

    Returns the assembled ``gr.Blocks`` object; the caller is responsible
    for launching it.
    """
    with gr.Blocks(title="Kannada Text Tokenizer") as demo:
        gr.Markdown("# Kannada Text Tokenizer")

        # Encode tab: Kannada text in, token-id list out.
        with gr.Tab("Encode"):
            with gr.Row():
                text_in = gr.Textbox(label="Enter Kannada Text", lines=5)
                btn_encode = gr.Button("Encode")
                tokens_out = gr.Textbox(label="Encoded Tokens", lines=5)
            btn_encode.click(fn=encode_text, inputs=text_in, outputs=tokens_out)

        # Decode tab: token-id list in, Kannada text out.
        with gr.Tab("Decode"):
            with gr.Row():
                tokens_in = gr.Textbox(label="Enter Token List (e.g., [120, 135, 171])", lines=5)
                btn_decode = gr.Button("Decode")
                text_out = gr.Textbox(label="Decoded Text", lines=5)
            btn_decode.click(fn=decode_tokens, inputs=tokens_in, outputs=text_out)

        gr.Markdown("""

        ### Instructions:

        - **Encode**: Enter Kannada text in the input box and click 'Encode' to get token indices

        - **Decode**: Enter a list of token indices and click 'Decode' to get back the original text

        - Token indices must be in the format: [123, 456, 789]

        """)

    return demo

if __name__ == "__main__":
    # Serve locally only; share=False avoids opening a public Gradio tunnel.
    app = create_interface()
    app.launch(share=False)