Tousifahamed commited on
Commit
35bb218
·
verified ·
1 Parent(s): 7b58fbe

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +91 -0
  2. kannada_tokenizer.json +0 -0
  3. requirements.txt +0 -0
app.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+
4
# Cache the parsed tokenizer so the JSON file is read and parsed only once
# per process instead of on every encode/decode request.
_tokenizer_cache = None

def load_tokenizer():
    """Load the Kannada tokenizer vocabulary from ``kannada_tokenizer.json``.

    Returns a dict containing at least:
      - ``stoi``: mapping of token string -> integer index
      - ``itos``: reverse mapping (string of index -> token), rebuilt here
        because only ``stoi`` is persisted in the JSON file
      - ``max_token_length``: longest token length, used by the greedy encoder

    The result is cached after the first call; subsequent calls return the
    same dict without touching the filesystem.
    """
    global _tokenizer_cache
    if _tokenizer_cache is None:
        with open('kannada_tokenizer.json', 'r', encoding='utf-8') as f:
            tokenizer = json.load(f)
        # Reconstruct itos from stoi (keys stringified to match JSON-style keys)
        tokenizer['itos'] = {str(v): k for k, v in tokenizer['stoi'].items()}
        _tokenizer_cache = tokenizer
    return _tokenizer_cache
11
+
12
def encode_text(text: str) -> str:
    """Greedily tokenize *text* into vocabulary indices (longest match first).

    Scans left to right; at each position tries the longest possible
    substring present in the vocabulary before shorter ones. Returns the
    token list rendered as a string (e.g. ``"[12, 34]"``), or an
    ``"Error: ..."`` message if a position cannot be matched.
    """
    tokenizer = load_tokenizer()
    try:
        stoi = tokenizer['stoi']
        max_len = tokenizer['max_token_length']
        total = len(text)
        indices = []
        pos = 0
        while pos < total:
            # Longest-match-first: shrink the candidate window until a
            # vocabulary entry is found.
            matched = None
            for span in range(min(max_len, total - pos), 0, -1):
                piece = text[pos:pos + span]
                if piece in stoi:
                    matched = piece
                    break
            if matched is None:
                return f"Error: Unable to encode character at position {pos}"
            indices.append(stoi[matched])
            pos += len(matched)
        return str(indices)
    except Exception as e:
        return f"Error encoding text: {str(e)}"
35
+
36
def decode_tokens(token_string: str) -> str:
    """Map a comma-separated list of token indices back to Kannada text.

    Accepts input like ``"[120, 135, 171]"`` (brackets and spaces are
    optional). Returns the decoded string, or an ``"Error: ..."`` message
    for empty input, non-numeric pieces, or unknown token indices.
    """
    tokenizer = load_tokenizer()
    try:
        # Normalize: drop surrounding brackets and all spaces.
        cleaned = token_string.strip('[]').replace(' ', '')
        if not cleaned:
            return "Error: Empty input"

        indices = [int(piece) for piece in cleaned.split(',') if piece]

        # Look each index up in itos; keys are stringified integers.
        itos = tokenizer['itos']
        decoded = []
        for idx in indices:
            key = str(idx)
            if key not in itos:
                return f"Error: Invalid token {idx}"
            decoded.append(itos[key])

        return "".join(decoded)
    except ValueError:
        return "Error: Invalid input format. Please enter numbers separated by commas"
    except Exception as e:
        return f"Error decoding tokens: {str(e)}"
60
+
61
# Build the Gradio UI
def create_interface():
    """Assemble the two-tab (Encode / Decode) Gradio interface.

    NOTE(review): layout reconstructed from an indentation-mangled diff —
    the exact nesting of widgets inside gr.Row is assumed; confirm against
    the deployed app. Event wiring (which button calls which function) is
    unambiguous and preserved.
    """
    with gr.Blocks(title="Kannada Text Tokenizer") as demo:
        gr.Markdown("# Kannada Text Tokenizer")

        with gr.Tab("Encode"):
            with gr.Row():
                text_in = gr.Textbox(label="Enter Kannada Text", lines=5)
            run_encode = gr.Button("Encode")
            tokens_out = gr.Textbox(label="Encoded Tokens", lines=5)
            run_encode.click(fn=encode_text, inputs=text_in, outputs=tokens_out)

        with gr.Tab("Decode"):
            with gr.Row():
                tokens_in = gr.Textbox(label="Enter Token List (e.g., [120, 135, 171])", lines=5)
            run_decode = gr.Button("Decode")
            text_out = gr.Textbox(label="Decoded Text", lines=5)
            run_decode.click(fn=decode_tokens, inputs=tokens_in, outputs=text_out)

        gr.Markdown("""
        ### Instructions:
        - **Encode**: Enter Kannada text in the input box and click 'Encode' to get token indices
        - **Decode**: Enter a list of token indices and click 'Decode' to get back the original text
        - Token indices must be in the format: [123, 456, 789]
        """)

    return demo
88
+
89
if __name__ == "__main__":
    # Build the UI and serve it locally (no public share link).
    app = create_interface()
    app.launch(share=False)
kannada_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
File without changes