Upload 3 files

- app.py +91 -0
- kannada_tokenizer.json +0 -0
- requirements.txt +0 -0
app.py
ADDED
@@ -0,0 +1,91 @@
import gradio as gr
import json

# Load the tokenizer vocabulary
def load_tokenizer():
    with open('kannada_tokenizer.json', 'r', encoding='utf-8') as f:
        tokenizer = json.load(f)
    # Reconstruct itos from stoi
    tokenizer['itos'] = {str(v): k for k, v in tokenizer['stoi'].items()}
    return tokenizer

def encode_text(text: str) -> str:
    """Convert text to token indices"""
    tokenizer = load_tokenizer()
    try:
        # Convert text to tokens using the stoi (string to integer) mapping
        tokens = []
        current_pos = 0
        while current_pos < len(text):
            # Try to match the longest possible token
            found = False
            for token_length in range(min(tokenizer['max_token_length'], len(text) - current_pos), 0, -1):
                substr = text[current_pos:current_pos + token_length]
                if substr in tokenizer['stoi']:
                    tokens.append(tokenizer['stoi'][substr])
                    current_pos += token_length
                    found = True
                    break
            if not found:
                return f"Error: Unable to encode character at position {current_pos}"

        return str(tokens)
    except Exception as e:
        return f"Error encoding text: {str(e)}"

def decode_tokens(token_string: str) -> str:
    """Convert token indices back to text"""
    tokenizer = load_tokenizer()
    try:
        # Clean the input string and convert to a list of integers
        token_string = token_string.strip('[]').replace(' ', '')
        if not token_string:
            return "Error: Empty input"

        tokens = [int(t) for t in token_string.split(',') if t]

        # Convert tokens back to text using the itos (integer to string) mapping
        result = ""
        for token in tokens:
            token_str = str(token)
            if token_str not in tokenizer['itos']:
                return f"Error: Invalid token {token}"
            result += tokenizer['itos'][token_str]

        return result
    except ValueError:
        return "Error: Invalid input format. Please enter numbers separated by commas"
    except Exception as e:
        return f"Error decoding tokens: {str(e)}"

# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Kannada Text Tokenizer") as interface:
        gr.Markdown("# Kannada Text Tokenizer")

        with gr.Tab("Encode"):
            with gr.Row():
                input_text = gr.Textbox(label="Enter Kannada Text", lines=5)
            encode_button = gr.Button("Encode")
            encoded_output = gr.Textbox(label="Encoded Tokens", lines=5)
            encode_button.click(fn=encode_text, inputs=input_text, outputs=encoded_output)

        with gr.Tab("Decode"):
            with gr.Row():
                input_tokens = gr.Textbox(label="Enter Token List (e.g., [120, 135, 171])", lines=5)
            decode_button = gr.Button("Decode")
            decoded_output = gr.Textbox(label="Decoded Text", lines=5)
            decode_button.click(fn=decode_tokens, inputs=input_tokens, outputs=decoded_output)

        gr.Markdown("""
        ### Instructions:
        - **Encode**: Enter Kannada text in the input box and click 'Encode' to get token indices
        - **Decode**: Enter a list of token indices and click 'Decode' to get back the original text
        - Token indices must be in the format: [123, 456, 789]
        """)

    return interface

if __name__ == "__main__":
    interface = create_interface()
    interface.launch(share=False)
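Outside the Gradio UI, the two functions round-trip cleanly because encode_text returns the same "[1, 2, 3]" string format that decode_tokens parses. The sketch below is a hypothetical smoke test, not part of this commit; it assumes kannada_tokenizer.json is in the working directory and that the sample string is fully covered by the vocabulary (actual indices depend on that file):

# Hypothetical smoke test for app.py (not part of this commit).
# Assumes kannada_tokenizer.json is present and covers the sample text.
from app import encode_text, decode_tokens

sample = "ನಮಸ್ಕಾರ"              # a common Kannada greeting
encoded = encode_text(sample)    # e.g. "[7, 2, 3, 4, 5, 6]" -- ids depend on the vocabulary
decoded = decode_tokens(encoded)
print(decoded == sample)         # True when every substring was found in stoi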
kannada_tokenizer.json
ADDED
The diff for this file is too large to render.
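Although the vocabulary itself is too large to show, app.py only relies on kannada_tokenizer.json exposing a 'stoi' (string-to-index) mapping and a 'max_token_length' bound for the greedy matcher; 'itos' is rebuilt in load_tokenizer(). A minimal stand-in file could be written like this (made-up entries for illustration, not the real vocabulary):

# Write a tiny stand-in tokenizer file (made-up entries; the shipped
# vocabulary is far larger). Only 'stoi' and 'max_token_length' are read
# by app.py -- 'itos' is reconstructed from 'stoi' at load time.
import json

tiny_tokenizer = {
    "stoi": {"ನ": 0, "ಮ": 1, "ಸ": 2, "್": 3, "ಕ": 4, "ಾ": 5, "ರ": 6, "ನಮ": 7},
    "max_token_length": 2,  # longest key in 'stoi'; bounds the greedy longest-match loop
}

with open("kannada_tokenizer.json", "w", encoding="utf-8") as f:
    json.dump(tiny_tokenizer, f, ensure_ascii=False)

With this stand-in, encode_text("ನಮಸ್ಕಾರ") would greedily match the two-character token "ನಮ" first, then fall back to single characters, returning "[7, 2, 3, 4, 5, 6]".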
requirements.txt
ADDED
File without changes