"""Train a byte-level BPE (byte-pair encoding) merge table on a text corpus.

The corpus is split with the GPT-2 pre-tokenisation regex, every chunk is
encoded to UTF-8 and the resulting byte values (0..255) are concatenated into
one flat id sequence.  ``perform_bpe`` then repeatedly replaces the most
frequent adjacent pair of ids with a fresh id, starting at 256.

Importing this module performs no I/O.  The legacy module attributes
``text``, ``tokens`` and ``gpt2pat`` are still available, but they are
computed lazily on first access (see ``__getattr__``).

NOTE(review): because the chunks are flattened into a single sequence, the
id stream no longer carries the regex chunk boundaries, so merges can span
them.  Confirm whether chunk-local merging (as in GPT-2) was intended.
"""
import pickle
from collections import Counter
from functools import lru_cache

# Corpus read by the script and by perform_bpe() when no tokens are passed.
TEXT_PATH = 'text_file_eng.txt'

# GPT-2 pre-tokenisation pattern.  The \p{L} / \p{N} classes require the
# third-party `regex` module; the standard-library `re` cannot compile it.
GPT2_SPLIT_PATTERN = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""


@lru_cache(maxsize=None)
def _compiled_gpt2pat():
    """Return the compiled GPT-2 pattern (compiled once, on first use)."""
    # Deferred import: keeps get_stats/merge/perform_bpe importable on
    # machines where `regex` is not installed.
    import regex as re
    return re.compile(GPT2_SPLIT_PATTERN)


@lru_cache(maxsize=None)
def _default_text():
    """Read and cache the contents of TEXT_PATH."""
    with open(TEXT_PATH, 'r', encoding='utf-8') as file:
        return file.read()


@lru_cache(maxsize=None)
def _default_tokens():
    """Tokenise and cache the default corpus as a flat list of byte values."""
    return text_to_tokens(_default_text())


def chunks_to_bytes(chunks):
    """Flatten an iterable of strings into a list of UTF-8 byte values.

    Every returned integer is in range 0..255, so ids handed out by
    ``perform_bpe`` (256 and up) can never collide with an input symbol.
    Using ``ord`` here would yield Unicode code points, which exceed 255
    for any non-ASCII character.
    """
    return [byte for chunk in chunks for byte in chunk.encode('utf-8')]


def text_to_tokens(text):
    """Split ``text`` with the GPT-2 pattern and return its UTF-8 byte ids."""
    return chunks_to_bytes(_compiled_gpt2pat().findall(text))


# Legacy module-level names, mapped to the loader that produces each one.
_LAZY_ATTRS = {
    'gpt2pat': _compiled_gpt2pat,
    'text': _default_text,
    'tokens': _default_tokens,
}


def __getattr__(name):
    """Provide ``text``, ``tokens`` and ``gpt2pat`` lazily (PEP 562).

    Keeps ``module.tokens`` / ``from module import tokens`` working without
    reading the corpus at import time.
    """
    try:
        loader = _LAZY_ATTRS[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None
    return loader()


def get_stats(ids):
    """Count how often each adjacent pair occurs in ``ids``.

    Args:
        ids: sequence of ints (must support slicing).

    Returns:
        dict mapping ``(left, right)`` to its number of occurrences, keyed in
        order of first appearance.  Empty when ``ids`` has fewer than two
        elements.
    """
    return dict(Counter(zip(ids, ids[1:])))


def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of ``pair`` with ``idx``.

    The scan runs left to right, so in ``[a, a, a]`` with pair ``(a, a)`` the
    first two elements are merged and the third is kept.

    Args:
        ids: sequence of ints.
        pair: ``(left, right)`` tuple to look for.
        idx: replacement id.

    Returns:
        A new list; ``ids`` is not modified.
    """
    left, right = pair
    last = len(ids) - 1
    newids = []
    i = 0
    while i < len(ids):
        if i < last and ids[i] == left and ids[i + 1] == right:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids


def _progress(iterable, **kwargs):
    """Wrap ``iterable`` in a tqdm progress bar when tqdm is installed."""
    try:
        from tqdm import tqdm
    except ImportError:
        # The bar is cosmetic; training works without it.
        return iterable
    return tqdm(iterable, **kwargs)


def perform_bpe(tokens=None, vocab_size=1500):
    """Learn BPE merges until the vocabulary reaches ``vocab_size``.

    Args:
        tokens: iterable of ints in range 0..255 to train on.  When ``None``
            the default corpus (TEXT_PATH) is loaded and tokenised.
        vocab_size: desired final vocabulary size; the first 256 entries are
            the raw byte values, so ``vocab_size - 256`` merges are attempted.

    Returns:
        ``(merges, ids, num_merges)`` where ``merges`` maps ``(int, int)`` to
        the new id, ``ids`` is the compressed sequence and ``num_merges`` is
        the number of merges actually performed.  That equals
        ``vocab_size - 256`` unless the sequence ran out of pairs first.

    Raises:
        ValueError: if ``vocab_size`` is smaller than 256.
    """
    if vocab_size < 256:
        raise ValueError("vocab_size must be at least 256")
    if tokens is None:
        tokens = _default_tokens()

    ids = list(tokens)  # copy so we don't destroy the original list
    original_length = len(ids)
    merges = {}  # (int, int) -> int

    for i in _progress(range(vocab_size - 256), desc="Performing BPE", unit="merge"):
        stats = get_stats(ids)
        if not stats:
            # Fewer than two ids left: nothing more to merge.
            break
        pair = max(stats, key=stats.get)
        idx = 256 + i
        ids = merge(ids, pair, idx)
        merges[pair] = idx

    print("---")
    print("ids length:", len(ids))
    if ids:  # avoid dividing by zero on an empty corpus
        print(f"compression ratio: {original_length / len(ids):.2f}X")

    return merges, ids, len(merges)


def main():
    """Load the default corpus and print its token list and basic sizes."""
    text = _default_text()
    tokens = _default_tokens()
    print(tokens)

    print('---')
    print("length of text:", len(text))
    print('---')
    print("length of tokens:", len(tokens))

    # Run BPE and save results
    # merges, ids, num_merges = perform_bpe()

    # Save merges and vocab to a file
    # with open('bpe_results.pkl', 'wb') as f:
    #     pickle.dump((merges, ids, num_merges), f)


if __name__ == "__main__":
    main()