Spaces:

atiwari751
/

Hindi-tokenizer

Sleeping

App Files Files Community

atiwari751 committed on Jan 11

Commit

c9f3e85

1 Parent(s): 850b586

Hindi regex cheat code applied

Browse files

Files changed (3) hide show

BPE.py +2 -36
decoded_output.txt +3 -3
encode_decode.py +15 -39

BPE.py CHANGED Viewed

@@ -7,41 +7,7 @@ with open('text_file.txt', 'r', encoding='utf-8') as file:
     text = file.read()
 # Hindi-focused pattern
-gpt2pat = re.compile(r"""
-    # Simpler syllable-based grouping
-    (?:[\p{Devanagari}&&[क-ह]][ा-ौ\u093C\u0901-\u0903]?)  # Consonant + modifiers
-    # This part matches:
-    #   - Any consonant [क-ह]
-    #   - Optionally followed by:
-    #     - maatras [ा-ौ] (like ा ि ी ु ू े ै ो ौ)
-    #     - OR nukta (\u093C = ़)
-    #     - OR chandrabindu (\u0901 = ँ)
-    #     - OR anusvara (\u0902 = ं)
-    #     - OR visarga (\u0903 = ः)
-    |[\u0905-\u0914]    # Independent vowels
-    # Matches standalone vowels like अ आ इ ई उ ऊ ए ऐ ओ औ
-    |[क-ह]्[क-ह]       # Basic conjuncts
-    # Matches basic consonant conjuncts:
-    #   - First consonant + halant (्) + second consonant
-    #   - Examples: क्क, न्न, त्त
-    |\p{N}+            # Numbers
-    # Matches one or more digits
-    |\s+               # Whitespace
-    # Matches spaces, tabs, newlines
-    |[।॥]             # Punctuation
-    # Matches Hindi punctuation marks
-    |[^\s\p{Devanagari}\p{N}]+  # Other characters
-    # Matches any sequence of characters that aren't:
-    #   - whitespace
-    #   - Devanagari script
-    #   - numbers
-    """, re.VERBOSE)
 # Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)
@@ -84,7 +50,7 @@ def merge(token_list, pair, idx):
     return newids
 def perform_bpe():
-    vocab_size = 3500  # the desired final vocabulary size
     num_merges = vocab_size - 256
     token_list = list(tokens)  # copy so we don't destroy the original list

     text = file.read()
 # Hindi-focused pattern
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 # Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)
     return newids
 def perform_bpe():
+    vocab_size = 4000  # the desired final vocabulary size
     num_merges = vocab_size - 256
     token_list = list(tokens)  # copy so we don't destroy the original list

decoded_output.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-अ        र्व क्षत्र   आ        ए     ण्ड   .  इ ल्ल  ऐ  स्ज      , औ इ   23 स्य  द्र 10    .        अ      अ   उ 20    ए ,      इ       उन्ह अ    .
-55 र्ष अ  न्यन्ड  , "     आ .        ज्य      इ      .      10  ए  उन्ह    उन्ह       . इए   स्ज        ए.         . इ         . औ  अ      ,      -  र्वक्ष   एओ (  ऑ)       इए उन्ह    ए.”


1	+ अम ज द के परिवार की तीन पी ढ़ ियां चांद नी चौ क निर्� � ाचन क्षेत्र में ह वे ली आज ़ म ख ां के नाम से पहच ाने जाने वाले एक दम स ट कर बने घ रों के झ ुण ्ड में रहती हैं . यह इलाक ा दिल्ली की ऐ त िहास िक ज ामा मस्जिद से पै दल की द ूरी पर है , और इस परिवार के 23 सदस्य मतदान केंद्र 10 पर प ंज ीक ृत मत द ाता हैं . लेकिन पिछले साल लोकसभा चुनावों के दौरान अम ज द को पता चला कि वह अपने परिवार के उन 20 लोगों में से एक हैं , जिन का नाम मत द ाता सू ची से इस वजह से काट दिया गया कि उन्होंने अपना घर बदल लिया है .
2	+
3	+ 5 5 वर्षीय अम ज द ने न्यू ज़ ल ॉन ्ड ्री को बताया , " हम ारे सामने ये पहली बार हुआ है . लेकिन नाम कट ने के बारे में सबसे ज्यादा निर ाश ाज न क बात ये थी कि इसका पता मतदान के दिन ही चला . जब हम पहली बार बू थ 10 पर गए तो उन्होंने हमें बताया कि उन्हें मत द ाता सू ची में हमारा नाम नहीं मिला . इसलिए हमें ज ामा मस्जिद में किसी दूसरे बू थ पर जाकर देखना चाहिए . वहां से हमें दूसरे बू थ पर भेज दिया गया . इस तरह हमने पांच से छह बू थ ों का दौर ा किया . और फिर अंत में हमें जो कारण बताया गया , वो यह था कि शायद घर - घ र जाकर सर्वे क्षण के दौरान बी एल ओ ( ब ू थ ले वल ऑफिस र ) को हम घर पर नहीं मिले इसलिए उन्होंने हमारे नाम काट दिए .”

encode_decode.py CHANGED Viewed

@@ -7,47 +7,16 @@ with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 # Define the GPT-2 regex pattern (same as in BPE.py)
-gpt2pat = re.compile(r"""
-    # Simpler syllable-based grouping
-    (?:[\p{Devanagari}&&[क-ह]][ा-ौ\u093C\u0901-\u0903]?)  # Consonant + modifiers
-    |[\u0905-\u0914]    # Independent vowels
-    |[क-ह]्[क-ह]       # Basic conjuncts
-    |\p{N}+            # Numbers
-    |\s+               # Whitespace
-    |[।॥]             # Punctuation
-    |[^\s\p{Devanagari}\p{N}]+  # Other characters
-    """, re.VERBOSE)
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
 def decode(ids):
-    # Debug printing
-    print("Vocabulary contents:")
-    for idx, byte_seq in vocab.items():
-        try:
-            char = byte_seq.decode('utf-8')
-            print(f"ID {idx}: bytes {list(byte_seq)} -> '{char}'")
-        except UnicodeDecodeError:
-            print(f"ID {idx}: bytes {list(byte_seq)} -> [INVALID UTF-8]")
-    print("\nDecoding sequence:")
-    tokens = []
-    for idx in ids:
-        if idx in vocab:
-            token_bytes = vocab[idx]
-            try:
-                char = token_bytes.decode('utf-8')
-                print(f"ID {idx} -> '{char}'")
-            except UnicodeDecodeError:
-                print(f"ID {idx} -> [INVALID UTF-8] {list(token_bytes)}")
-            tokens.append(token_bytes)
-        else:
-            print(f"Missing ID: {idx}")
-    # Original decoding logic
-    text = b''.join(tokens).decode('utf-8', errors='replace')
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -56,7 +25,7 @@ def decode(ids):
     return text
 # Example: Decode a list of IDs
-set_of_ids = [262, 32, 32, 32, 32, 32, 32, 32, 32, 342, 32, 287, 281, 32, 32, 32, 266, 32, 32, 32, 32, 32, 32, 32, 32, 260, 32, 32, 32, 32, 32, 1719, 32, 32, 32, 46, 32, 32, 265, 32, 308, 32, 32, 317, 32, 32, 639, 32, 32, 32, 32, 32, 32, 44, 32, 272, 32, 265, 32, 32, 32, 611, 32, 410, 32, 32, 313, 32, 354, 32, 32, 32, 32, 46, 32, 32, 32, 32, 32, 32, 32, 32, 262, 32, 32, 32, 32, 32, 32, 262, 32, 32, 32, 267, 32, 297, 32, 32, 32, 32, 260, 32, 44, 32, 32, 32, 32, 32, 32, 265, 32, 32, 32, 32, 32, 32, 32, 267, 293, 32, 262, 32, 32, 32, 32, 46, 270, 666, 32, 396, 32, 262, 32, 32, 353, 829, 32, 32, 44, 32, 34, 32, 32, 32, 32, 32, 266, 32, 46, 32, 32, 32, 32, 32, 32, 32, 32, 314, 32, 32, 32, 32, 32, 32, 265, 32, 32, 32, 32, 32, 32, 46, 32, 32, 32, 32, 32, 32, 354, 32, 32, 260, 32, 32, 267, 293, 32, 32, 32, 32, 267, 293, 32, 32, 32, 32, 32, 32, 32, 46, 32, 265, 260, 32, 32, 32, 639, 32, 32, 32, 32, 32, 32, 32, 32, 260, 46, 32, 32, 32, 32, 32, 32, 32, 32, 32, 46, 32, 265, 32, 32, 32, 32, 32, 32, 32, 32, 32, 46, 32, 272, 32, 32, 262, 32, 32, 32, 32, 32, 32, 44, 32, 32, 32, 32, 32, 32, 45, 32, 32, 342, 287, 32, 32, 32, 260, 298, 32, 40, 32, 32, 351, 41, 32, 32, 32, 32, 32, 32, 32, 265, 260, 32, 267, 293, 32, 32, 32, 32, 260, 760]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
@@ -72,6 +41,9 @@ def encode():
     byte_tokens = [token.encode('utf-8') for token in tokens]
     token_list = [list(token) for token in byte_tokens]
     # Process each token
     final_tokens = []
     for token in token_list:
@@ -87,8 +59,12 @@ def encode():
             current_token = merge([current_token], pair, idx)[0]
         final_tokens.extend(current_token)
-    return final_tokens
 # Example: Encode text from a file
-#encoded_tokens = encode()
-#print(encoded_tokens)

     merges, ids, num_merges = pickle.load(f)
 # Define the GPT-2 regex pattern (same as in BPE.py)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
 def decode(ids):
+    # given ids (list of integers), return Python string
+    tokens = [vocab[idx].decode("utf-8", errors="replace") for idx in ids]
+    text = '\t'.join(tokens)  # Join tokens with tabs
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
     return text
 # Example: Decode a list of IDs
+set_of_ids = [2342, 307, 295, 286, 1413, 302, 839, 644, 574, 982, 3877, 405, 1086, 272, 978, 181, 3927, 1171, 294, 274, 964, 438, 767, 337, 284, 361, 332, 286, 776, 315, 2331, 429, 841, 631, 385, 1694, 273, 310, 418, 1607, 445, 935, 286, 962, 1244, 698, 294, 3069, 347, 46, 450, 1462, 259, 646, 302, 554, 276, 2252, 334, 292, 2835, 2500, 315, 1006, 3367, 302, 296, 1299, 330, 289, 44, 327, 345, 1413, 286, 2911, 1906, 2592, 1322, 888, 330, 279, 711, 1474, 997, 1068, 295, 1236, 347, 46, 513, 1067, 579, 1194, 2596, 286, 847, 732, 307, 295, 309, 1423, 1953, 340, 555, 563, 1413, 286, 376, 466, 596, 294, 315, 385, 347, 44, 1001, 478, 776, 1068, 295, 1236, 919, 1216, 315, 345, 1115, 315, 3189, 481, 437, 340, 557, 1125, 1135, 1501, 857, 289, 46, 10, 10, 53, 53, 2794, 732, 307, 295, 317, 2705, 2246, 280, 1308, 698, 486, 309, 739, 44, 32, 34, 808, 830, 1015, 516, 1315, 544, 667, 289, 46, 513, 776, 1914, 311, 286, 948, 294, 856, 915, 2438, 658, 367, 271, 272, 564, 516, 472, 340, 1571, 1423, 2592, 286, 638, 416, 1953, 46, 586, 462, 1315, 544, 3075, 583, 888, 330, 588, 444, 557, 1448, 739, 340, 737, 1068, 295, 1236, 919, 1216, 294, 3253, 776, 391, 1410, 46, 1496, 1448, 292, 2835, 2500, 294, 738, 1374, 3075, 583, 330, 2660, 3252, 904, 46, 1441, 315, 1448, 1374, 3075, 583, 330, 1473, 481, 437, 46, 345, 778, 1758, 1307, 315, 2210, 3075, 583, 299, 333, 751, 259, 420, 46, 327, 766, 1200, 294, 1448, 499, 1394, 739, 437, 44, 707, 450, 413, 340, 3602, 1135, 45, 864, 261, 2660, 2749, 1930, 286, 847, 447, 1782, 1633, 510, 308, 306, 583, 399, 1508, 2632, 261, 41, 309, 462, 1135, 330, 391, 1193, 1496, 557, 1574, 776, 3189, 1340, 3435]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
     byte_tokens = [token.encode('utf-8') for token in tokens]
     token_list = [list(token) for token in byte_tokens]
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
     # Process each token
     final_tokens = []
     for token in token_list:
             current_token = merge([current_token], pair, idx)[0]
         final_tokens.extend(current_token)
+    # Calculate compression ratio
+    compression_ratio = total_bytes_before / len(final_tokens)
+    print(f"Compression ratio: {compression_ratio:.2f}X")
+    return final_tokens, compression_ratio
 # Example: Encode text from a file
+encoded_tokens, ratio = encode()
+print(f"Encoded tokens: {encoded_tokens}")