Tousifahamed commited on
Commit
35bb218
·
verified ·
1 Parent(s): 7b58fbe

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +91 -0
  2. kannada_tokenizer.json +0 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+
4
# Cache the parsed tokenizer so the JSON file is read and parsed only once
# per process instead of on every encode/decode request.
_tokenizer_cache = None

def load_tokenizer():
    """Load the Kannada tokenizer vocabulary from ``kannada_tokenizer.json``.

    Returns a dict containing at least:
      - ``stoi``: mapping of token string -> integer index
      - ``itos``: reverse mapping (string of index -> token), rebuilt here
        because only ``stoi`` is persisted in the JSON file
      - ``max_token_length``: longest token length, used by the greedy encoder

    The result is cached after the first call; subsequent calls return the
    same dict without touching the filesystem.
    """
    global _tokenizer_cache
    if _tokenizer_cache is None:
        with open('kannada_tokenizer.json', 'r', encoding='utf-8') as f:
            tokenizer = json.load(f)
        # Reconstruct itos from stoi (keys stringified to match JSON-style keys)
        tokenizer['itos'] = {str(v): k for k, v in tokenizer['stoi'].items()}
        _tokenizer_cache = tokenizer
    return _tokenizer_cache
11
+
12
def encode_text(text: str) -> str:
    """Greedily tokenize *text* into vocabulary indices (longest match first).

    Scans left to right; at each position tries the longest possible
    substring present in the vocabulary before shorter ones. Returns the
    token list rendered as a string (e.g. ``"[12, 34]"``), or an
    ``"Error: ..."`` message if a position cannot be matched.
    """
    tokenizer = load_tokenizer()
    try:
        stoi = tokenizer['stoi']
        max_len = tokenizer['max_token_length']
        total = len(text)
        indices = []
        pos = 0
        while pos < total:
            # Longest-match-first: shrink the candidate window until a
            # vocabulary entry is found.
            matched = None
            for span in range(min(max_len, total - pos), 0, -1):
                piece = text[pos:pos + span]
                if piece in stoi:
                    matched = piece
                    break
            if matched is None:
                return f"Error: Unable to encode character at position {pos}"
            indices.append(stoi[matched])
            pos += len(matched)
        return str(indices)
    except Exception as e:
        return f"Error encoding text: {str(e)}"
35
+
36
def decode_tokens(token_string: str) -> str:
    """Map a comma-separated list of token indices back to Kannada text.

    Accepts input like ``"[120, 135, 171]"`` (brackets and spaces are
    optional). Returns the decoded string, or an ``"Error: ..."`` message
    for empty input, non-numeric pieces, or unknown token indices.
    """
    tokenizer = load_tokenizer()
    try:
        # Normalize: drop surrounding brackets and all spaces.
        cleaned = token_string.strip('[]').replace(' ', '')
        if not cleaned:
            return "Error: Empty input"

        indices = [int(piece) for piece in cleaned.split(',') if piece]

        # Look each index up in itos; keys are stringified integers.
        itos = tokenizer['itos']
        decoded = []
        for idx in indices:
            key = str(idx)
            if key not in itos:
                return f"Error: Invalid token {idx}"
            decoded.append(itos[key])

        return "".join(decoded)
    except ValueError:
        return "Error: Invalid input format. Please enter numbers separated by commas"
    except Exception as e:
        return f"Error decoding tokens: {str(e)}"
60
+
61
# Build the Gradio UI
def create_interface():
    """Assemble the two-tab (Encode / Decode) Gradio interface.

    NOTE(review): layout reconstructed from an indentation-mangled diff —
    the exact nesting of widgets inside gr.Row is assumed; confirm against
    the deployed app. Event wiring (which button calls which function) is
    unambiguous and preserved.
    """
    with gr.Blocks(title="Kannada Text Tokenizer") as demo:
        gr.Markdown("# Kannada Text Tokenizer")

        with gr.Tab("Encode"):
            with gr.Row():
                text_in = gr.Textbox(label="Enter Kannada Text", lines=5)
            run_encode = gr.Button("Encode")
            tokens_out = gr.Textbox(label="Encoded Tokens", lines=5)
            run_encode.click(fn=encode_text, inputs=text_in, outputs=tokens_out)

        with gr.Tab("Decode"):
            with gr.Row():
                tokens_in = gr.Textbox(label="Enter Token List (e.g., [120, 135, 171])", lines=5)
            run_decode = gr.Button("Decode")
            text_out = gr.Textbox(label="Decoded Text", lines=5)
            run_decode.click(fn=decode_tokens, inputs=tokens_in, outputs=text_out)

        gr.Markdown("""
        ### Instructions:
        - **Encode**: Enter Kannada text in the input box and click 'Encode' to get token indices
        - **Decode**: Enter a list of token indices and click 'Decode' to get back the original text
        - Token indices must be in the format: [123, 456, 789]
        """)

    return demo
88
+
89
if __name__ == "__main__":
    # Build the UI and serve it locally (no public share link).
    app = create_interface()
    app.launch(share=False)
kannada_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
File without changes