Spaces:

atiwari751
/

Hindi-tokenizer

Sleeping

App Files Files Community

atiwari751 committed on Jan 10

Commit

850b586

1 Parent(s): b0f8dcf

Hindi regex brutality

Browse files

Files changed (4) hide show

BPE.py +37 -3
decoded_output.txt +3 -1
encode_decode.py +38 -9
encode_input.txt +3 -1

BPE.py CHANGED Viewed

@@ -3,11 +3,45 @@ import regex as re
 from tqdm import tqdm
 # Read text from a file
-with open('text_file_eng_long.txt', 'r', encoding='utf-8') as file:
     text = file.read()
-# Define the GPT-2 regex pattern
-gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 # Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)

 from tqdm import tqdm
 # Read text from a file
+with open('text_file.txt', 'r', encoding='utf-8') as file:
     text = file.read()
+# Hindi-focused pattern
+gpt2pat = re.compile(r"""
+    # Simpler syllable-based grouping
+    (?:[\p{Devanagari}&&[क-ह]][ा-ौ\u093C\u0901-\u0903]?)  # Consonant + modifiers
+    # This part matches:
+    #   - Any consonant [क-ह]
+    #   - Optionally followed by:
+    #     - maatras [ा-ौ] (like ा ि ी ु ू े ै ो ौ)
+    #     - OR nukta (\u093C = ़)
+    #     - OR chandrabindu (\u0901 = ँ)
+    #     - OR anusvara (\u0902 = ं)
+    #     - OR visarga (\u0903 = ः)
+    |[\u0905-\u0914]    # Independent vowels
+    # Matches standalone vowels like अ आ इ ई उ ऊ ए ऐ ओ औ
+    |[क-ह]्[क-ह]       # Basic conjuncts
+    # Matches basic consonant conjuncts:
+    #   - First consonant + halant (्) + second consonant
+    #   - Examples: क्क, न्न, त्त
+    |\p{N}+            # Numbers
+    # Matches one or more digits
+    |\s+               # Whitespace
+    # Matches spaces, tabs, newlines
+    |[।॥]             # Punctuation
+    # Matches Hindi punctuation marks
+    |[^\s\p{Devanagari}\p{N}]+  # Other characters
+    # Matches any sequence of characters that aren't:
+    #   - whitespace
+    #   - Devanagari script
+    #   - numbers
+    """, re.VERBOSE)
 # Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)

decoded_output.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- ~~There 's a chance this is not working , isn 't it ? There ' re many p ap ers , why will this work ? I ' ve got to make su re~~ . ~~I ' m now th in king something 's wr ong . It 'll be s ad if there 's something wr ong and I miss it~~ , ~~I 'll be s or ry~~ . ~~It 'd better be re vi ew ed well~~ , ~~I 'd want to be certain~~ .


1	+ अ र्व क्षत्र आ ए ण्ड . इ ल्ल ऐ स्ज , औ इ 23 स्य द्र 10 . अ अ उ 20 ए , इ उन्ह अ .
2	+
3	+ 55 र्ष अ न्यन्ड , " आ . ज्य इ . 10 ए उन्ह उन्ह . इए स्ज ए. . इ . औ अ , - र्वक्ष एओ ( ऑ) इए उन्ह ए.”

encode_decode.py CHANGED Viewed

@@ -7,18 +7,47 @@ with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 # Define the GPT-2 regex pattern (same as in BPE.py)
-gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
 def decode(ids):
-    # given ids (list of integers), return Python string
-    tokens = [vocab[idx] for idx in ids]
-    # Decode each token separately and join with tabs
-    decoded_tokens = [token.decode("utf-8", errors="replace") for token in tokens]
-    text = '\t'.join(decoded_tokens)
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -27,7 +56,7 @@ def decode(ids):
     return text
 # Example: Decode a list of IDs
-set_of_ids = [2532, 522, 258, 3103, 425, 332, 374, 2797, 44, 2391, 1508, 369, 63, 1375, 39, 261, 972, 277, 641, 385, 44, 2208, 553, 425, 1592, 63, 330, 39, 318, 1088, 285, 843, 405, 261, 46, 330, 39, 109, 1070, 325, 259, 888, 2913, 522, 1796, 524, 46, 966, 824, 306, 262, 354, 820, 726, 522, 2913, 1796, 524, 294, 330, 2827, 369, 44, 330, 824, 306, 262, 279, 551, 46, 966, 672, 2988, 306, 301, 3188, 451, 270, 814, 44, 330, 672, 1726, 285, 306, 1475, 46]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
@@ -61,5 +90,5 @@ def encode():
     return final_tokens
 # Example: Encode text from a file
-encoded_tokens = encode()
-print(encoded_tokens)

     merges, ids, num_merges = pickle.load(f)
 # Define the GPT-2 regex pattern (same as in BPE.py)
+gpt2pat = re.compile(r"""
+    # Simpler syllable-based grouping
+    (?:[\p{Devanagari}&&[क-ह]][ा-ौ\u093C\u0901-\u0903]?)  # Consonant + modifiers
+    |[\u0905-\u0914]    # Independent vowels
+    |[क-ह]्[क-ह]       # Basic conjuncts
+    |\p{N}+            # Numbers
+    |\s+               # Whitespace
+    |[।॥]             # Punctuation
+    |[^\s\p{Devanagari}\p{N}]+  # Other characters
+    """, re.VERBOSE)
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
 def decode(ids):
+    # Debug printing
+    print("Vocabulary contents:")
+    for idx, byte_seq in vocab.items():
+        try:
+            char = byte_seq.decode('utf-8')
+            print(f"ID {idx}: bytes {list(byte_seq)} -> '{char}'")
+        except UnicodeDecodeError:
+            print(f"ID {idx}: bytes {list(byte_seq)} -> [INVALID UTF-8]")
+    print("\nDecoding sequence:")
+    tokens = []
+    for idx in ids:
+        if idx in vocab:
+            token_bytes = vocab[idx]
+            try:
+                char = token_bytes.decode('utf-8')
+                print(f"ID {idx} -> '{char}'")
+            except UnicodeDecodeError:
+                print(f"ID {idx} -> [INVALID UTF-8] {list(token_bytes)}")
+            tokens.append(token_bytes)
+        else:
+            print(f"Missing ID: {idx}")
+    # Original decoding logic
+    text = b''.join(tokens).decode('utf-8', errors='replace')
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
     return text
 # Example: Decode a list of IDs
+set_of_ids = [262, 32, 32, 32, 32, 32, 32, 32, 32, 342, 32, 287, 281, 32, 32, 32, 266, 32, 32, 32, 32, 32, 32, 32, 32, 260, 32, 32, 32, 32, 32, 1719, 32, 32, 32, 46, 32, 32, 265, 32, 308, 32, 32, 317, 32, 32, 639, 32, 32, 32, 32, 32, 32, 44, 32, 272, 32, 265, 32, 32, 32, 611, 32, 410, 32, 32, 313, 32, 354, 32, 32, 32, 32, 46, 32, 32, 32, 32, 32, 32, 32, 32, 262, 32, 32, 32, 32, 32, 32, 262, 32, 32, 32, 267, 32, 297, 32, 32, 32, 32, 260, 32, 44, 32, 32, 32, 32, 32, 32, 265, 32, 32, 32, 32, 32, 32, 32, 267, 293, 32, 262, 32, 32, 32, 32, 46, 270, 666, 32, 396, 32, 262, 32, 32, 353, 829, 32, 32, 44, 32, 34, 32, 32, 32, 32, 32, 266, 32, 46, 32, 32, 32, 32, 32, 32, 32, 32, 314, 32, 32, 32, 32, 32, 32, 265, 32, 32, 32, 32, 32, 32, 46, 32, 32, 32, 32, 32, 32, 354, 32, 32, 260, 32, 32, 267, 293, 32, 32, 32, 32, 267, 293, 32, 32, 32, 32, 32, 32, 32, 46, 32, 265, 260, 32, 32, 32, 639, 32, 32, 32, 32, 32, 32, 32, 32, 260, 46, 32, 32, 32, 32, 32, 32, 32, 32, 32, 46, 32, 265, 32, 32, 32, 32, 32, 32, 32, 32, 32, 46, 32, 272, 32, 32, 262, 32, 32, 32, 32, 32, 32, 44, 32, 32, 32, 32, 32, 32, 45, 32, 32, 342, 287, 32, 32, 32, 260, 298, 32, 40, 32, 32, 351, 41, 32, 32, 32, 32, 32, 32, 32, 265, 260, 32, 267, 293, 32, 32, 32, 32, 260, 760]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
     return final_tokens
 # Example: Encode text from a file
+#encoded_tokens = encode()
+#print(encoded_tokens)

encode_input.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- ~~There's~~ a ~~chance~~ ~~this~~ is ~~not~~ ~~working,~~ ~~isn't~~ ~~it?~~ ~~There're~~ ~~many~~ ~~papers,~~ ~~why~~ ~~will~~ ~~this~~ ~~work?~~ ~~I've~~ ~~got~~ to ~~make~~ ~~sure.~~ ~~I'm~~ ~~now~~ ~~thinking~~ ~~something's~~ ~~wrong.~~ ~~It'll~~ be ~~sad~~ if ~~there's~~ ~~something~~ ~~wrong~~ ~~and~~ I ~~miss~~ ~~it,~~ ~~I'll~~ be ~~sorry.~~ ~~It'd~~ ~~better~~ be ~~reviewed~~ ~~well,~~ ~~I'd~~ ~~want~~ to be ~~certain.~~


1	+ अमजद के परिवार की तीन पीढ़ियां चांदनी चौक निर्वाचन क्षेत्र में हवेली आज़म खां के नाम से पहचाने जाने वाले एकदम सटकर बने घरों के झुण्ड में रहती हैं. यह इलाका दिल्ली की ऐतिहासिक जामा मस्जिद से पैदल की दूरी पर है, और इस परिवार के 23 सदस्य मतदान केंद्र 10 पर पंजीकृत मतदाता हैं. लेकिन पिछले साल लोकसभा चुनावों के दौरान अमजद को पता चला कि वह अपने परिवार के उन 20 लोगों में से एक हैं, जिनका नाम मतदाता सूची से इस वजह से काट दिया गया कि उन्होंने अपना घर बदल लिया है.
2	+
3	+ 55 वर्षीय अमजद ने न्यूज़लॉन्ड्री को बताया, "हमारे सामने ये पहली बार हुआ है. लेकिन नाम कटने के बारे में सबसे ज्यादा निराशाजनक बात ये थी कि इसका पता मतदान के दिन ही चला. जब हम पहली बार बूथ 10 पर गए तो उन्होंने हमें बताया कि उन्हें मतदाता सूची में हमारा नाम नहीं मिला. इसलिए हमें जामा मस्जिद में किसी दूसरे बूथ पर जाकर देखना चाहिए. वहां से हमें दूसरे बूथ पर भेज दिया गया. इस तरह हमने पांच से छह बूथों का दौरा किया. और फिर अंत में हमें जो कारण बताया गया, वो यह था कि शायद घर-घर जाकर सर्वेक्षण के दौरान बीएलओ (बूथ लेवल ऑफिसर) को हम घर पर नहीं मिले इसलिए उन्होंने हमारे नाम काट दिए.”