atiwari751 committed
Commit 76f084f · Parent(s): 781de59

Regex working
Files changed (5):
  1. BPE.py +54 -23
  2. decoded_output.txt +1 -2
  3. encode_decode.py +24 -14
  4. encode_input.txt +1 -3
  5. text_file_eng_short.txt +1 -0
BPE.py CHANGED
@@ -9,51 +9,82 @@ with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
 # Define the GPT-2 regex pattern
 gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-# Tokenize the text using the regex pattern
+# Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)
 
 # Convert tokens to byte sequences
 byte_tokens = [token.encode('utf-8') for token in tokens]
 
-# Flatten the list of byte sequences into a single list of bytes
-tokens = [b for token in byte_tokens for b in token]
+# Create a list of byte sequences, each representing a token
+tokens = [list(token) for token in byte_tokens]
 
-def get_stats(ids):
+def get_stats(token_list):
+    """Count frequency of pairs across all tokens"""
     counts = {}
-    for pair in zip(ids, ids[1:]):
-        counts[pair] = counts.get(pair, 0) + 1
+    # Count pairs within each token
+    for token in token_list:
+        if len(token) < 2:
+            continue
+        for pair in zip(token, token[1:]):
+            counts[pair] = counts.get(pair, 0) + 1
     return counts
 
-def merge(ids, pair, idx):
+def merge(token_list, pair, idx):
+    """Merge all occurrences of pair within each token"""
     newids = []
-    i = 0
-    while i < len(ids):
-        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
-            newids.append(idx)
-            i += 2
-        else:
-            newids.append(ids[i])
-            i += 1
+    for token in token_list:
+        if len(token) < 2:
+            newids.append(token)
+            continue
+
+        new_token = []
+        i = 0
+        while i < len(token):
+            if i < len(token) - 1 and (token[i], token[i+1]) == pair:
+                new_token.append(idx)
+                i += 2
+            else:
+                new_token.append(token[i])
+                i += 1
+        newids.append(new_token)
     return newids
 
 def perform_bpe():
     vocab_size = 1500 # the desired final vocabulary size
     num_merges = vocab_size - 256
-    ids = list(tokens) # copy so we don't destroy the original list
-
+    token_list = list(tokens) # copy so we don't destroy the original list
+
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
+
     merges = {} # (int, int) -> int
     for i in tqdm(range(num_merges), desc="Performing BPE", unit="merge"):
-        stats = get_stats(ids)
+        stats = get_stats(token_list)
+        if not stats: # No more pairs to merge
+            break
+
+        # Find most frequent pair
         pair = max(stats, key=stats.get)
         idx = 256 + i
-        ids = merge(ids, pair, idx)
+
+        # Perform the merge
+        token_list = merge(token_list, pair, idx)
         merges[pair] = idx
-
+
+    # Calculate total bytes after compression
+    total_bytes_after = sum(len(token) for token in token_list)
+
     print("---")
-    print("ids length:", len(ids))
-    print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
-
-    return merges, ids, num_merges
+    print("Total bytes before:", total_bytes_before)
+    print("Total bytes after:", total_bytes_after)
+    print(f"Compression ratio: {total_bytes_before / total_bytes_after:.2f}X")
+
+    # Flatten for storage, but maintain token boundaries
+    flat_ids = []
+    for token in token_list:
+        flat_ids.extend(token)
+
+    return merges, flat_ids, num_merges
 
 if __name__ == "__main__":
     print('---')
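
Aside (not part of the commit): a minimal sketch of what the new per-token pair counting and merging do on toy data. Pairs are now counted and merged only inside each regex token, never across token boundaries. The byte values and the first merge id 256 follow the conventions in BPE.py; get_stats and merge below are trimmed copies of the functions above so the snippet runs on its own.

def get_stats(token_list):
    # count adjacent-byte pairs within each token, never across tokens
    counts = {}
    for token in token_list:
        for pair in zip(token, token[1:]):
            counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(token_list, pair, idx):
    # replace every occurrence of `pair` inside each token with `idx`
    newids = []
    for token in token_list:
        new_token, i = [], 0
        while i < len(token):
            if i < len(token) - 1 and (token[i], token[i+1]) == pair:
                new_token.append(idx)
                i += 2
            else:
                new_token.append(token[i])
                i += 1
        newids.append(new_token)
    return newids

tokens = [[104, 105], [32, 104, 105]]    # b"hi" and b" hi" as byte lists
stats = get_stats(tokens)
print(stats)                             # {(104, 105): 2, (32, 104): 1}
pair = max(stats, key=stats.get)         # (104, 105), the most frequent pair
print(merge(tokens, pair, 256))          # [[256], [32, 256]]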
decoded_output.txt CHANGED
@@ -1,2 +1 @@
-NIUS:
-done before
+There 'll be chan g es after the war . I ' ve never been more h o pe ful . I t 'd be a more pe ace ful world , people 'll be ha p p ier .
 
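The spaced-out pieces above ("chan g es", "pe ace ful") come from the new decode, which joins each decoded token with a tab; the word boundaries themselves come from the GPT-2 split pattern. As a quick illustration (not from the commit) of what that pattern yields on a sample sentence, using the third-party regex module as in BPE.py:

import regex as re  # third-party 'regex' module; stdlib 're' lacks \p{L}/\p{N}

gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(re.findall(gpt2pat, "There'll be changes after the war."))
# ['There', "'ll", ' be', ' changes', ' after', ' the', ' war', '.']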
encode_decode.py CHANGED
@@ -6,6 +6,9 @@ import regex as re
 with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 
+# Define the GPT-2 regex pattern (same as in BPE.py)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
@@ -13,7 +16,9 @@ for (p0, p1), idx in merges.items():
 def decode(ids):
     # given ids (list of integers), return Python string
     tokens = [vocab[idx] for idx in ids]
-    text = b''.join(tokens).decode("utf-8", errors="replace")
+    # Decode each token separately and join with tabs
+    decoded_tokens = [token.decode("utf-8", errors="replace") for token in tokens]
+    text = '\t'.join(decoded_tokens)
 
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -22,7 +27,7 @@ def decode(ids):
     return text
 
 # Example: Decode a list of IDs
-set_of_ids = [25, 345, 992, 1353]
+set_of_ids = [1072, 415, 308, 1406, 103, 279, 999, 260, 550, 46, 301, 39, 299, 1076, 1172, 562, 284, 111, 414, 1460, 46, 301, 116, 373, 308, 259, 562, 798, 832, 1460, 1449, 44, 892, 415, 308, 311, 112, 112, 549, 46]
 decoded_text = decode(set_of_ids) # Pass the list of IDs
 print(decoded_text)
 
@@ -34,21 +39,26 @@ def encode():
     # Tokenize the text using the regex pattern
     tokens = re.findall(gpt2pat, text)
 
-    # Convert tokens to byte sequences
+    # Convert tokens to byte sequences and maintain grouping
     byte_tokens = [token.encode('utf-8') for token in tokens]
+    token_list = [list(token) for token in byte_tokens]
 
-    # Flatten the list of byte sequences into a single list of bytes
-    tokens = [b for token in byte_tokens for b in token]
-
-    while len(tokens) >= 2:
-        stats = get_stats(tokens)
-        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
-        if pair not in merges:
-            break # nothing else can be merged
-        idx = merges[pair]
-        tokens = merge(tokens, pair, idx)
+    # Process each token
+    final_tokens = []
+    for token in token_list:
+        current_token = list(token)
+        while len(current_token) >= 2:
+            stats = get_stats([current_token])
+            if not stats:
+                break
+            pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+            if pair not in merges:
+                break
+            idx = merges[pair]
+            current_token = merge([current_token], pair, idx)[0]
+        final_tokens.extend(current_token)
 
-    return tokens
+    return final_tokens
 
 # Example: Encode text from a file
 encoded_tokens = encode()
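
To see the new encode and decode working together, here is a hedged, self-contained round-trip sketch. The toy merges table and the sample string are assumptions for illustration, not values from bpe_results.pkl; the loop applies the same greedy rule as encode above (always merge the earliest-learned applicable pair), then expands the ids back through vocab.

merges = {(104, 105): 256}                   # toy merge table: "hi" -> 256 (assumption)
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

token = list("hi there hi".encode("utf-8"))  # pretend this is one regex token
while len(token) >= 2:
    pairs = set(zip(token, token[1:]))
    # prefer the pair that was learned earliest (lowest merge id)
    pair = min(pairs, key=lambda p: merges.get(p, float("inf")))
    if pair not in merges:
        break                                # nothing else can be merged
    idx, out, i = merges[pair], [], 0
    while i < len(token):
        if i < len(token) - 1 and (token[i], token[i+1]) == pair:
            out.append(idx); i += 2
        else:
            out.append(token[i]); i += 1
    token = out

print(token)  # [256, 32, 116, 104, 101, 114, 101, 32, 256]
# decode side: expand ids through vocab (the commit's decode joins pieces with tabs)
print(b''.join(vocab[i] for i in token).decode("utf-8"))  # hi there hi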
encode_input.txt CHANGED
@@ -1,3 +1 @@
-"पठानकोट पहुंचे PM मोदी, एयरबेस का जायजा ले बॉर्डर इलाकों का करेंगे हवाई सर्वे","प्रधानमंत्री नरेंद्र मोदी पठानकोट एयरबेस पहुंच गए हैं. वे एयरबेस में सुरक्षा के हालात का जायजा ले रहे हैं और वायुसेनाकर्मियों से मिल रहे हैं. सुबह करीब सवा दस बजे प्रधानमंत्री पंजाब के पठानकोट के लिए रवाना हुए. एयरबेस का जायजा लेने के बाद प्रधानमंत्री बॉर्डर इलाकों का हवाई सर्वेक्षण भी करेंगे. पठानकोट एयरबेस पर पिछले हफ्ते आतंकियों ने हमला किया था. पाकिस्तान से आए आतंकियों के हमले को विफल कर दिया गया था. सभी 6 पाकिस्तानी आतंकी मारे गए थे. 7 सुरक्षाबल भी शहीद हुए थे. भारत ने पाकिस्तान को सबूत सौंपते हुए दोषियों के खिलाफ सख्त कार्रवाई करने को कहा है.
-जानकारी के मुताबिक, प्रधानमंत्री के साथ आर्मी और एयरफोर्स के चीफ भी मौजूद रह सकते हैं. एयरबेस पर
-पाकिस्तानी आतंकियों
+There'll be changes after the war. I've never been more hopeful. It'd be a more peaceful world, people'll be happier.
 
 
text_file_eng_short.txt ADDED
@@ -0,0 +1 @@
+There's a chance this is not working, isn't it? There're many papers, why will this work? I've got to make sure. I'm now thinking something's wrong. It'll be sad if there's something wrong and I miss it, I'll be sorry. It'd better be reviewed well, I'd want to be certain.