Spaces:

atiwari751
/

Hindi-tokenizer

Sleeping

App Files Files Community

atiwari751 committed on Jan 10

Commit

1e8ebcb

1 Parent(s): c128a5f

basic regex

Browse files

Files changed (3) hide show

BPE.py +15 -7
decoded_output.txt +2 -3
encode_decode.py +1 -1

BPE.py CHANGED Viewed

@@ -1,12 +1,20 @@
 import pickle
 from tqdm import tqdm  # Import tqdm for progress bar
 # Read text from a file
-with open('text_file.txt', 'r', encoding='utf-8') as file:
     text = file.read()
-tokens = text.encode("utf-8")  # raw bytes
-tokens = list(map(int, tokens))  # convert to a list of integers in range 0..255 for convenience
 def get_stats(ids):
     counts = {}
@@ -28,7 +36,7 @@ def merge(ids, pair, idx):
     return newids
 def perform_bpe():
-    vocab_size = 3500  # the desired final vocabulary size
     num_merges = vocab_size - 256
     ids = list(tokens)  # copy so we don't destroy the original list
@@ -54,9 +62,9 @@ if __name__ == "__main__":
     print("length of tokens:", len(tokens))
     # Run BPE and save results
-    merges, ids, num_merges = perform_bpe()
     # Save merges and vocab to a file
-    with open('bpe_results.pkl', 'wb') as f:
-        pickle.dump((merges, ids, num_merges), f)

 import pickle
+import regex as re
 from tqdm import tqdm  # Import tqdm for progress bar
 # Read text from a file
+with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
     text = file.read()
+# Define the GPT-2 regex pattern
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+# Apply the regex pattern to tokenize the text
+tokens = re.findall(gpt2pat, text)
+# Convert tokens to a list of integers in range 0..255 for convenience
+tokens = [ord(char) for token in tokens for char in token]
+print(tokens)
 def get_stats(ids):
     counts = {}
     return newids
 def perform_bpe():
+    vocab_size = 1500  # the desired final vocabulary size
     num_merges = vocab_size - 256
     ids = list(tokens)  # copy so we don't destroy the original list
     print("length of tokens:", len(tokens))
     # Run BPE and save results
+    #merges, ids, num_merges = perform_bpe()
     # Save merges and vocab to a file
+    #with open('bpe_results.pkl', 'wb') as f:
+        #pickle.dump((merges, ids, num_merges), f)

decoded_output.txt CHANGED Viewed

@@ -1,3 +1,2 @@
-"    प    ठ    ान    क    ोट     पहुंच    े     PM     मोद    ी,     ए    यर    ब    ेस     का �    �    ाय    ज    ा ल    े ब    ॉर्ड    र �    �लाक    ों     क    ा कर    ेंग    े ह    वाई     सर्व    े    ","    प्र    धानमंत्र    ी नरेंद्र मोद    ी प    ठ    ान    क    ोट     ए    यर    ब    ेस     पहुंच     गए     हैं. �    �े �    �यर    ब    ेस     में स    ुरक्ष    ा के �    �ाल    ात     का �    �    ाय    ज    ा ल    े रह    े हैं �    �र �    �    ाय    ुस    ेन    ाक    र्म    ियों से �    �िल     रह    े हैं    . स    ुबह     करीब     स    व    ा द    स     बजे �    �्रधानमंत्र    ी प    ंजाब     के प    ठ    ान    क    ोट     के लिए     रव    ाना �    �ुए    . �    �यर    ब    ेस     का �    �    ाय    ज    ा ल    े    ने के बाद     प्रधानमंत्र    ी ब    ॉर्ड    र �    �लाक    ों का �    �    वाई     सर्व    ेक    ्ष    ण     भ    ी कर    ेंगे. �    �    ठ    ान    क    ोट     ए    यर    ब    ेस     पर �    �िछले �    �    फ्त    े �    �तंक    ियों ने �    �मल    ा क    िया था. �    �ाकिस्तान     से �    �ए     आतंक    ियों के �    �मल    े को �    �ि�    �    ल     कर द    िया गय    ा थ    ा. स    भ    ी     6     पाकिस्तान    ी �    �तंक    ी म    ारे �    �ए     थे    .     7     सुरक्ष    ाबल     भी �    �ह    ीद     हुए     थे. �    �ारत     ने �    �ाकिस्तान     को स    ब    ूत     स    ौंप    ते हुए     द    ोष    ियों के �    �िलाफ     स    ख्त     कार    ्रवाई     करने क    ो कह    ा है.
-    ज    ानकार    ी के म    ुताबिक    ,     प्रधानमंत्र    ी के साथ     आ    र्म    ी और �    �    य    रफ    ो    र्स     के �    �    ीफ     भी �    �ौजूद     रह     सक    ते हैं. �    �यर    ब    ेस     पर
-प    ाकिस्तान    ी �    �तंक    ियों


1	+ NIUS:
2	+ done before

encode_decode.py CHANGED Viewed

@@ -21,7 +21,7 @@ def decode(ids):
     return text
 # Example: Decode a list of IDs
-set_of_ids = [34, 293, 474, 298, 275, 575, 1271, 260, 778, 1298, 763, 611, 1921, 310, 424, 352, 156, 347, 318, 947, 1410, 1832, 276, 2984, 314, 262, 770, 639, 2516, 1020, 3054, 260, 795, 1072, 993, 2392, 499, 474, 298, 275, 575, 611, 1921, 310, 424, 1271, 854, 940, 1761, 3036, 310, 424, 932, 1060, 661, 918, 342, 352, 156, 347, 318, 947, 1453, 1483, 324, 181, 347, 863, 591, 412, 606, 2234, 789, 481, 751, 587, 2039, 1750, 289, 301, 565, 278, 2675, 1532, 499, 1898, 820, 474, 298, 275, 575, 410, 3428, 1195, 569, 295, 3036, 310, 424, 352, 156, 347, 318, 947, 260, 1033, 2697, 495, 1832, 276, 2984, 761, 185, 1020, 3054, 377, 401, 430, 471, 953, 2232, 170, 474, 298, 275, 575, 611, 1921, 310, 424, 419, 2029, 185, 869, 268, 1254, 1998, 1842, 317, 2214, 1630, 376, 1141, 1709, 1909, 1842, 1200, 514, 171, 281, 798, 904, 510, 1865, 418, 264, 890, 1877, 272, 1254, 485, 1069, 967, 2100, 1046, 55, 2355, 2555, 563, 3352, 654, 1477, 1622, 708, 405, 1630, 859, 310, 1130, 289, 2817, 762, 336, 979, 1909, 867, 289, 3316, 1205, 1811, 1266, 2718, 886, 318, 1303, 1861, 1029, 44, 2697, 2033, 390, 606, 686, 143, 311, 1537, 271, 806, 286, 154, 1106, 563, 2460, 481, 786, 1061, 3036, 310, 424, 810, 2250, 881, 272, 1254, 1135]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)

     return text
 # Example: Decode a list of IDs
+set_of_ids = [25, 345, 992, 1353]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)