atiwari751 committed
Commit 781de59 · Parent: cae9627

regex on byte sequences

Files changed (3):
  1. .gitignore +1 -0
  2. BPE.py +7 -7
  3. encode_decode.py +14 -6
.gitignore CHANGED
@@ -1,3 +1,4 @@
 .venv
 __pycache__
 test.csv
+GPT2_encoder.py
BPE.py CHANGED
@@ -1,6 +1,6 @@
 import pickle
 import regex as re
-from tqdm import tqdm # Import tqdm for progress bar
+from tqdm import tqdm
 
 # Read text from a file
 with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
@@ -9,12 +9,14 @@ with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
 # Define the GPT-2 regex pattern
 gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-# Apply the regex pattern to tokenize the text
+# Tokenize the text using the regex pattern
 tokens = re.findall(gpt2pat, text)
 
-# Convert tokens to a list of integers in range 0..255 for convenience
-tokens = [ord(char) for token in tokens for char in token]
-#print(tokens)
+# Convert tokens to byte sequences
+byte_tokens = [token.encode('utf-8') for token in tokens]
+
+# Flatten the list of byte sequences into a single list of bytes
+tokens = [b for token in byte_tokens for b in token]
 
 def get_stats(ids):
     counts = {}
@@ -23,7 +25,6 @@ def get_stats(ids):
     return counts
 
 def merge(ids, pair, idx):
-    # in the list of ints (ids), replace all consecutive occurrences of pair with the new token idx
     newids = []
     i = 0
     while i < len(ids):
@@ -41,7 +42,6 @@ def perform_bpe():
     ids = list(tokens) # copy so we don't destroy the original list
 
     merges = {} # (int, int) -> int
-    # Use tqdm to add a progress bar
     for i in tqdm(range(num_merges), desc="Performing BPE", unit="merge"):
         stats = get_stats(ids)
         pair = max(stats, key=stats.get)
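
Why the BPE.py change matters: ord() returns Unicode code points, not bytes, so the old tokens list was only "in range 0..255" for ASCII-ish text; any character beyond U+00FF broke the 256-symbol base vocabulary of byte-level BPE, and the result could not round-trip through a UTF-8 decode. Encoding each regex token to UTF-8 and flattening keeps every starting id within 0..255. A quick standalone sketch (not part of the commit):

    text = "€5"
    print([ord(c) for c in text])      # [8364, 53] -- 8364 is far outside 0..255
    print(list(text.encode('utf-8')))  # [226, 130, 172, 53] -- every value fits in a byte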
encode_decode.py CHANGED
@@ -1,5 +1,6 @@
 import pickle
 from BPE import get_stats, merge
+import regex as re
 
 # Load merges and vocab from the file
 with open('bpe_results.pkl', 'rb') as f:
@@ -11,8 +12,8 @@ for (p0, p1), idx in merges.items():
 
 def decode(ids):
     # given ids (list of integers), return Python string
-    tokens = [vocab[idx].decode("utf-8", errors="replace") for idx in ids]
-    text = ' '.join(tokens) # Join tokens with a single space
+    tokens = [vocab[idx] for idx in ids]
+    text = b''.join(tokens).decode("utf-8", errors="replace")
 
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -30,8 +31,15 @@ def encode():
     with open('encode_input.txt', 'r', encoding='utf-8') as f:
         text = f.read()
 
-    # given a string, return list of integers (the tokens)
-    tokens = list(text.encode("utf-8"))
+    # Tokenize the text using the regex pattern
+    tokens = re.findall(gpt2pat, text)
+
+    # Convert tokens to byte sequences
+    byte_tokens = [token.encode('utf-8') for token in tokens]
+
+    # Flatten the list of byte sequences into a single list of bytes
+    tokens = [b for token in byte_tokens for b in token]
+
     while len(tokens) >= 2:
         stats = get_stats(tokens)
         pair = min(stats, key=lambda p: merges.get(p, float("inf")))
@@ -43,5 +51,5 @@ def encode():
     return tokens
 
 # Example: Encode text from a file
-#encoded_tokens = encode()
-#print(encoded_tokens)
+encoded_tokens = encode()
+print(encoded_tokens)
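
Two notes on the encode_decode.py changes. First, the rewritten decode() is the substantive fix: the old version decoded each token's bytes separately and joined the pieces with spaces, which inserted spaces that were never in the input and replaced any multi-byte character whose bytes were split across tokens. Joining the raw bytes first and decoding once handles both. A minimal standalone sketch with a hypothetical two-token vocab (not the commit's actual vocab):

    # 'é' is two bytes in UTF-8 (b'\xc3\xa9'); suppose a merge left them in separate tokens
    vocab = {0: b'\xc3', 1: b'\xa9'}
    ids = [0, 1]

    # old approach: per-token decode, then join with spaces
    print(' '.join(vocab[i].decode('utf-8', errors='replace') for i in ids))  # '\ufffd \ufffd'

    # new approach: join the bytes, decode once
    print(b''.join(vocab[i] for i in ids).decode('utf-8', errors='replace'))  # 'é'

Second, encode() now calls re.findall(gpt2pat, text), but the diff only adds import regex as re; unless gpt2pat is defined elsewhere in encode_decode.py, this raises a NameError at runtime. One way to resolve it, assuming nothing else in the file defines the pattern, is to import the compiled pattern alongside the helpers already taken from BPE:

    # assumption, not part of this commit: BPE.py defines gpt2pat at module level
    from BPE import get_stats, merge, gpt2pat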