Spaces:

atiwari751
/

Hindi-tokenizer

Sleeping

App Files Files Community

atiwari751 committed on Jan 11

Commit

fa753cb

2 Parent(s): fa76461 c9f3e85

removed pkl file to address merge conflict

Browse files

Files changed (5) hide show

.gitignore +1 -1
BPE.py +65 -26
decoded_output.txt +3 -1
encode_decode.py +38 -15
encode_input.txt +3 -1

.gitignore CHANGED Viewed

@@ -3,4 +3,4 @@ __pycache__
 test.csv
 GPT2_encoder.py
 Hindi_Regex.txt
-Hindi_no_Regex.txt

 test.csv
 GPT2_encoder.py
 Hindi_Regex.txt
+Hindi_no_Regex.txt

BPE.py CHANGED Viewed

@@ -1,51 +1,90 @@
 import pickle
-from tqdm import tqdm  # Import tqdm for progress bar
 # Read text from a file
 with open('text_file_eng_long.txt', 'r', encoding='utf-8') as file:
     text = file.read()
-tokens = text.encode("utf-8")  # raw bytes
-tokens = list(map(int, tokens))  # convert to a list of integers in range 0..255 for convenience
-def get_stats(ids):
     counts = {}
-    for pair in zip(ids, ids[1:]):
-        counts[pair] = counts.get(pair, 0) + 1
     return counts
-def merge(ids, pair, idx):
-    # in the list of ints (ids), replace all consecutive occurrences of pair with the new token idx
     newids = []
-    i = 0
-    while i < len(ids):
-        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
-            newids.append(idx)
-            i += 2
-        else:
-            newids.append(ids[i])
-            i += 1
     return newids
 def perform_bpe():
-    vocab_size = 3500  # the desired final vocabulary size
     num_merges = vocab_size - 256
-    ids = list(tokens)  # copy so we don't destroy the original list
     merges = {}  # (int, int) -> int
-    # Use tqdm to add a progress bar
     for i in tqdm(range(num_merges), desc="Performing BPE", unit="merge"):
-        stats = get_stats(ids)
         pair = max(stats, key=stats.get)
         idx = 256 + i
-        ids = merge(ids, pair, idx)
         merges[pair] = idx
     print("---")
-    print("ids length:", len(ids))
-    print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
-    return merges, ids, num_merges
 if __name__ == "__main__":
     print('---')

 import pickle
+import regex as re
+from tqdm import tqdm
 # Read text from a file
 with open('text_file_eng_long.txt', 'r', encoding='utf-8') as file:
     text = file.read()
+# Hindi-focused pattern
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+# Apply the regex pattern to the raw text to tokenize it
+tokens = re.findall(gpt2pat, text)
+# Convert tokens to byte sequences
+byte_tokens = [token.encode('utf-8') for token in tokens]
+# Create a list of byte sequences, each representing a token
+tokens = [list(token) for token in byte_tokens]
def get_stats(token_list):
    """Count how often each adjacent symbol pair occurs inside the tokens.

    Pairs are only formed between neighbours within one token; the last
    symbol of a token is never paired with the first symbol of the next.

    Args:
        token_list: iterable of sequences (e.g. lists of ints), one per token.

    Returns:
        A dict mapping ``(left, right)`` pairs to their total count. Keys are
        inserted in order of first occurrence, so ``max(stats, key=stats.get)``
        breaks ties by first-seen pair.
    """
    from collections import Counter

    counts = Counter()
    for token in token_list:
        # zip() yields nothing for tokens shorter than two symbols, so no
        # explicit length guard is needed.
        counts.update(zip(token, token[1:]))
    # Return a plain dict so callers see the same type as before.
    return dict(counts)
def merge(token_list, pair, idx):
    """Replace every occurrence of ``pair`` inside each token with ``idx``.

    Scanning is left to right and non-overlapping, and never crosses a token
    boundary.

    Args:
        token_list: iterable of symbol sequences, one per token.
        pair: 2-tuple ``(left, right)`` of symbols to fuse.
        idx: symbol id that replaces each matched pair.

    Returns:
        A new list of tokens. Tokens with fewer than two symbols are passed
        through as the same object; all others are rebuilt as new lists.
    """
    merged_tokens = []
    for token in token_list:
        # Too short to contain any pair: pass through untouched.
        if len(token) < 2:
            merged_tokens.append(token)
            continue
        rebuilt = []
        last = len(token) - 1
        pos = 0
        while pos <= last:
            if pos < last and (token[pos], token[pos + 1]) == pair:
                rebuilt.append(idx)
                pos += 2  # skip both halves of the fused pair
                continue
            rebuilt.append(token[pos])
            pos += 1
        merged_tokens.append(rebuilt)
    return merged_tokens
 def perform_bpe():
+    vocab_size = 4000  # the desired final vocabulary size
     num_merges = vocab_size - 256
+    token_list = list(tokens)  # copy so we don't destroy the original list
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
     merges = {}  # (int, int) -> int
     for i in tqdm(range(num_merges), desc="Performing BPE", unit="merge"):
+        stats = get_stats(token_list)
+        if not stats:  # No more pairs to merge
+            break
+        # Find most frequent pair
         pair = max(stats, key=stats.get)
         idx = 256 + i
+        # Perform the merge
+        token_list = merge(token_list, pair, idx)
         merges[pair] = idx
+    # Calculate total bytes after compression
+    total_bytes_after = sum(len(token) for token in token_list)
     print("---")
+    print("Total bytes before:", total_bytes_before)
+    print("Total bytes after:", total_bytes_after)
+    print(f"Compression ratio: {total_bytes_before / total_bytes_after:.2f}X")
+    # Flatten for storage, but maintain token boundaries
+    flat_ids = []
+    for token in token_list:
+        flat_ids.extend(token)
+    return merges, flat_ids, num_merges
 if __name__ == "__main__":
     print('---')

decoded_output.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- Th ere 's a ch anc e this is not work ing, is n 't it ? Th ere ' re many pa per s, why will this work ? I ' ve got to make su re . I ' m now thin king some thing 's w ron g . I t 'll be sa d if there 's something w r ong and I mis s it, ~~I'll be sor r y~~. ~~I t 'd bet ter be re view ed well~~ , ~~I 'd want to be cer tain~~ .


1	+ अम ज द के परिवार की तीन पी ढ़ ियां चांद नी चौ क निर्� � ाचन क्षेत्र में ह वे ली आज ़ म ख ां के नाम से पहच ाने जाने वाले एक दम स ट कर बने घ रों के झ ुण ्ड में रहती हैं . यह इलाक ा दिल्ली की ऐ त िहास िक ज ामा मस्जिद से पै दल की द ूरी पर है , और इस परिवार के 23 सदस्य मतदान केंद्र 10 पर प ंज ीक ृत मत द ाता हैं . लेकिन पिछले साल लोकसभा चुनावों के दौरान अम ज द को पता चला कि वह अपने परिवार के उन 20 लोगों में से एक हैं , जिन का नाम मत द ाता सू ची से इस वजह से काट दिया गया कि उन्होंने अपना घर बदल लिया है .
2	+
3	+ 5 5 वर्षीय अम ज द ने न्यू ज़ ल ॉन ्ड ्री को बताया , " हम ारे सामने ये पहली बार हुआ है . लेकिन नाम कट ने के बारे में सबसे ज्यादा निर ाश ाज न क बात ये थी कि इसका पता मतदान के दिन ही चला . जब हम पहली बार बू थ 10 पर गए तो उन्होंने हमें बताया कि उन्हें मत द ाता सू ची में हमारा नाम नहीं मिला . इसलिए हमें ज ामा मस्जिद में किसी दूसरे बू थ पर जाकर देखना चाहिए . वहां से हमें दूसरे बू थ पर भेज दिया गया . इस तरह हमने पांच से छह बू थ ों का दौर ा किया . और फिर अंत में हमें जो कारण बताया गया , वो यह था कि शायद घर - घ र जाकर सर्वे क्षण के दौरान बी एल ओ ( ब ू थ ले वल ऑफिस र ) को हम घर पर नहीं मिले इसलिए उन्होंने हमारे नाम काट दिए .”

encode_decode.py CHANGED Viewed

@@ -1,10 +1,14 @@
 import pickle
 from BPE import get_stats, merge
 # Load merges and vocab from the file
 with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
@@ -12,7 +16,7 @@ for (p0, p1), idx in merges.items():
 def decode(ids):
     # given ids (list of integers), return Python string
     tokens = [vocab[idx].decode("utf-8", errors="replace") for idx in ids]
-    text = '    '.join(tokens)  # Join tokens with a single space
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -21,7 +25,7 @@ def decode(ids):
     return text
 # Example: Decode a list of IDs
-set_of_ids = [312, 1366, 565, 278, 302, 717, 256, 429, 1496, 1687, 808, 411, 110, 2862, 289, 670, 312, 1366, 39, 1281, 1191, 2358, 456, 374, 2453, 574, 429, 1687, 670, 73, 39, 353, 1176, 286, 904, 367, 279, 2310, 39, 695, 1398, 999, 806, 1271, 3455, 565, 119, 1902, 103, 2310, 116, 851, 403, 379, 260, 846, 2713, 565, 3466, 119, 114, 588, 292, 360, 1263, 258, 1285, 1402, 403, 3305, 114, 1278, 73, 116, 887, 773, 363, 403, 279, 2035, 274, 1150, 3273, 887, 2398, 1219, 1031, 2514, 46]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
@@ -30,18 +34,37 @@ def encode():
     with open('encode_input.txt', 'r', encoding='utf-8') as f:
         text = f.read()
-    # given a string, return list of integers (the tokens)
-    tokens = list(text.encode("utf-8"))
-    while len(tokens) >= 2:
-        stats = get_stats(tokens)
-        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
-        if pair not in merges:
-            break  # nothing else can be merged
-        idx = merges[pair]
-        tokens = merge(tokens, pair, idx)
-    return tokens
 # Example: Encode text from a file
-encoded_tokens = encode()
-print(encoded_tokens)

 import pickle
 from BPE import get_stats, merge
+import regex as re
 # Load merges and vocab from the file
 with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
+# Define the Hindi-focused, GPT-2-style regex pattern (must stay identical to the one in BPE.py)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
 def decode(ids):
     # given ids (list of integers), return Python string
     tokens = [vocab[idx].decode("utf-8", errors="replace") for idx in ids]
+    text = '\t'.join(tokens)  # Join tokens with tabs
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
     return text
 # Example: Decode a list of IDs
+set_of_ids = [2342, 307, 295, 286, 1413, 302, 839, 644, 574, 982, 3877, 405, 1086, 272, 978, 181, 3927, 1171, 294, 274, 964, 438, 767, 337, 284, 361, 332, 286, 776, 315, 2331, 429, 841, 631, 385, 1694, 273, 310, 418, 1607, 445, 935, 286, 962, 1244, 698, 294, 3069, 347, 46, 450, 1462, 259, 646, 302, 554, 276, 2252, 334, 292, 2835, 2500, 315, 1006, 3367, 302, 296, 1299, 330, 289, 44, 327, 345, 1413, 286, 2911, 1906, 2592, 1322, 888, 330, 279, 711, 1474, 997, 1068, 295, 1236, 347, 46, 513, 1067, 579, 1194, 2596, 286, 847, 732, 307, 295, 309, 1423, 1953, 340, 555, 563, 1413, 286, 376, 466, 596, 294, 315, 385, 347, 44, 1001, 478, 776, 1068, 295, 1236, 919, 1216, 315, 345, 1115, 315, 3189, 481, 437, 340, 557, 1125, 1135, 1501, 857, 289, 46, 10, 10, 53, 53, 2794, 732, 307, 295, 317, 2705, 2246, 280, 1308, 698, 486, 309, 739, 44, 32, 34, 808, 830, 1015, 516, 1315, 544, 667, 289, 46, 513, 776, 1914, 311, 286, 948, 294, 856, 915, 2438, 658, 367, 271, 272, 564, 516, 472, 340, 1571, 1423, 2592, 286, 638, 416, 1953, 46, 586, 462, 1315, 544, 3075, 583, 888, 330, 588, 444, 557, 1448, 739, 340, 737, 1068, 295, 1236, 919, 1216, 294, 3253, 776, 391, 1410, 46, 1496, 1448, 292, 2835, 2500, 294, 738, 1374, 3075, 583, 330, 2660, 3252, 904, 46, 1441, 315, 1448, 1374, 3075, 583, 330, 1473, 481, 437, 46, 345, 778, 1758, 1307, 315, 2210, 3075, 583, 299, 333, 751, 259, 420, 46, 327, 766, 1200, 294, 1448, 499, 1394, 739, 437, 44, 707, 450, 413, 340, 3602, 1135, 45, 864, 261, 2660, 2749, 1930, 286, 847, 447, 1782, 1633, 510, 308, 306, 583, 399, 1508, 2632, 261, 41, 309, 462, 1135, 330, 391, 1193, 1496, 557, 1574, 776, 3189, 1340, 3435]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
     with open('encode_input.txt', 'r', encoding='utf-8') as f:
         text = f.read()
+    # Tokenize the text using the regex pattern
+    tokens = re.findall(gpt2pat, text)
+    # Convert tokens to byte sequences and maintain grouping
+    byte_tokens = [token.encode('utf-8') for token in tokens]
+    token_list = [list(token) for token in byte_tokens]
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
+    # Process each token
+    final_tokens = []
+    for token in token_list:
+        current_token = list(token)
+        while len(current_token) >= 2:
+            stats = get_stats([current_token])
+            if not stats:
+                break
+            pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+            if pair not in merges:
+                break
+            idx = merges[pair]
+            current_token = merge([current_token], pair, idx)[0]
+        final_tokens.extend(current_token)
+    # Calculate compression ratio
+    compression_ratio = total_bytes_before / len(final_tokens)
+    print(f"Compression ratio: {compression_ratio:.2f}X")
+    return final_tokens, compression_ratio
 # Example: Encode text from a file
+encoded_tokens, ratio = encode()
+print(f"Encoded tokens: {encoded_tokens}")

encode_input.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- ~~There's~~ a ~~chance~~ ~~this~~ is ~~not~~ ~~working,~~ ~~isn't~~ ~~it?~~ ~~There're~~ ~~many~~ ~~papers,~~ ~~why~~ ~~will~~ ~~this~~ ~~work?~~ ~~I've~~ ~~got~~ to ~~make~~ ~~sure.~~ ~~I'm~~ ~~now~~ ~~thinking~~ ~~something's~~ ~~wrong.~~ ~~It'll~~ be ~~sad~~ if ~~there's~~ ~~something~~ ~~wrong~~ ~~and~~ I ~~miss~~ ~~it,~~ ~~I'll~~ be ~~sorry.~~ ~~It'd~~ ~~better~~ be ~~reviewed~~ ~~well,~~ ~~I'd~~ ~~want~~ to be ~~certain.~~


1	+ अमजद के परिवार की तीन पीढ़ियां चांदनी चौक निर्वाचन क्षेत्र में हवेली आज़म खां के नाम से पहचाने जाने वाले एकदम सटकर बने घरों के झुण्ड में रहती हैं. यह इलाका दिल्ली की ऐतिहासिक जामा मस्जिद से पैदल की दूरी पर है, और इस परिवार के 23 सदस्य मतदान केंद्र 10 पर पंजीकृत मतदाता हैं. लेकिन पिछले साल लोकसभा चुनावों के दौरान अमजद को पता चला कि वह अपने परिवार के उन 20 लोगों में से एक हैं, जिनका नाम मतदाता सूची से इस वजह से काट दिया गया कि उन्होंने अपना घर बदल लिया है.
2	+
3	+ 55 वर्षीय अमजद ने न्यूज़लॉन्ड्री को बताया, "हमारे सामने ये पहली बार हुआ है. लेकिन नाम कटने के बारे में सबसे ज्यादा निराशाजनक बात ये थी कि इसका पता मतदान के दिन ही चला. जब हम पहली बार बूथ 10 पर गए तो उन्होंने हमें बताया कि उन्हें मतदाता सूची में हमारा नाम नहीं मिला. इसलिए हमें जामा मस्जिद में किसी दूसरे बूथ पर जाकर देखना चाहिए. वहां से हमें दूसरे बूथ पर भेज दिया गया. इस तरह हमने पांच से छह बूथों का दौरा किया. और फिर अंत में हमें जो कारण बताया गया, वो यह था कि शायद घर-घर जाकर सर्वेक्षण के दौरान बीएलओ (बूथ लेवल ऑफिसर) को हम घर पर नहीं मिले इसलिए उन्होंने हमारे नाम काट दिए.”