atiwari751 committed
Commit 76f084f · Parent(s): 781de59

Regex working
Files changed (5):
  1. BPE.py +54 -23
  2. decoded_output.txt +1 -2
  3. encode_decode.py +24 -14
  4. encode_input.txt +1 -3
  5. text_file_eng_short.txt +1 -0
BPE.py CHANGED
@@ -9,51 +9,82 @@ with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
 # Define the GPT-2 regex pattern
 gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-# Tokenize the text using the regex pattern
+# Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)
 
 # Convert tokens to byte sequences
 byte_tokens = [token.encode('utf-8') for token in tokens]
 
-# Flatten the list of byte sequences into a single list of bytes
-tokens = [b for token in byte_tokens for b in token]
+# Create a list of byte sequences, each representing a token
+tokens = [list(token) for token in byte_tokens]
 
-def get_stats(ids):
+def get_stats(token_list):
+    """Count frequency of pairs across all tokens"""
     counts = {}
-    for pair in zip(ids, ids[1:]):
-        counts[pair] = counts.get(pair, 0) + 1
+    # Count pairs within each token
+    for token in token_list:
+        if len(token) < 2:
+            continue
+        for pair in zip(token, token[1:]):
+            counts[pair] = counts.get(pair, 0) + 1
     return counts
 
-def merge(ids, pair, idx):
+def merge(token_list, pair, idx):
+    """Merge all occurrences of pair within each token"""
     newids = []
-    i = 0
-    while i < len(ids):
-        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
-            newids.append(idx)
-            i += 2
-        else:
-            newids.append(ids[i])
-            i += 1
+    for token in token_list:
+        if len(token) < 2:
+            newids.append(token)
+            continue
+
+        new_token = []
+        i = 0
+        while i < len(token):
+            if i < len(token) - 1 and (token[i], token[i+1]) == pair:
+                new_token.append(idx)
+                i += 2
+            else:
+                new_token.append(token[i])
+                i += 1
+        newids.append(new_token)
     return newids
 
 def perform_bpe():
     vocab_size = 1500 # the desired final vocabulary size
     num_merges = vocab_size - 256
-    ids = list(tokens) # copy so we don't destroy the original list
-
+    token_list = list(tokens) # copy so we don't destroy the original list
+
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
+
     merges = {} # (int, int) -> int
     for i in tqdm(range(num_merges), desc="Performing BPE", unit="merge"):
-        stats = get_stats(ids)
+        stats = get_stats(token_list)
+        if not stats: # No more pairs to merge
+            break
+
+        # Find most frequent pair
         pair = max(stats, key=stats.get)
         idx = 256 + i
-        ids = merge(ids, pair, idx)
+
+        # Perform the merge
+        token_list = merge(token_list, pair, idx)
         merges[pair] = idx
-
+
+    # Calculate total bytes after compression
+    total_bytes_after = sum(len(token) for token in token_list)
+
     print("---")
-    print("ids length:", len(ids))
-    print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
-
-    return merges, ids, num_merges
+    print("Total bytes before:", total_bytes_before)
+    print("Total bytes after:", total_bytes_after)
+    print(f"Compression ratio: {total_bytes_before / total_bytes_after:.2f}X")
+
+    # Flatten for storage, but maintain token boundaries
+    flat_ids = []
+    for token in token_list:
+        flat_ids.extend(token)
+
+    return merges, flat_ids, num_merges
 
 if __name__ == "__main__":
     print('---')
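
Aside (not part of the commit): a minimal sketch of what the new per-token pair counting and merging do on toy data. Pairs are now counted and merged only inside each regex token, never across token boundaries. The byte values and the first merge id 256 follow the conventions in BPE.py; get_stats and merge below are trimmed copies of the functions above so the snippet runs on its own.

def get_stats(token_list):
    # count adjacent-byte pairs within each token, never across tokens
    counts = {}
    for token in token_list:
        for pair in zip(token, token[1:]):
            counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(token_list, pair, idx):
    # replace every occurrence of `pair` inside each token with `idx`
    newids = []
    for token in token_list:
        new_token, i = [], 0
        while i < len(token):
            if i < len(token) - 1 and (token[i], token[i+1]) == pair:
                new_token.append(idx)
                i += 2
            else:
                new_token.append(token[i])
                i += 1
        newids.append(new_token)
    return newids

tokens = [[104, 105], [32, 104, 105]]    # b"hi" and b" hi" as byte lists
stats = get_stats(tokens)
print(stats)                             # {(104, 105): 2, (32, 104): 1}
pair = max(stats, key=stats.get)         # (104, 105), the most frequent pair
print(merge(tokens, pair, 256))          # [[256], [32, 256]]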
decoded_output.txt CHANGED
@@ -1,2 +1 @@
-NIUS:
-done before
+There 'll be chan g es after the war . I ' ve never been more h o pe ful . I t 'd be a more pe ace ful world , people 'll be ha p p ier .
 
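The spaced-out pieces above ("chan g es", "pe ace ful") come from the new decode, which joins each decoded token with a tab; the word boundaries themselves come from the GPT-2 split pattern. As a quick illustration (not from the commit) of what that pattern yields on a sample sentence, using the third-party regex module as in BPE.py:

import regex as re  # third-party 'regex' module; stdlib 're' lacks \p{L}/\p{N}

gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(re.findall(gpt2pat, "There'll be changes after the war."))
# ['There', "'ll", ' be', ' changes', ' after', ' the', ' war', '.']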
encode_decode.py CHANGED
@@ -6,6 +6,9 @@ import regex as re
 with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 
+# Define the GPT-2 regex pattern (same as in BPE.py)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
@@ -13,7 +16,9 @@ for (p0, p1), idx in merges.items():
 def decode(ids):
     # given ids (list of integers), return Python string
     tokens = [vocab[idx] for idx in ids]
-    text = b''.join(tokens).decode("utf-8", errors="replace")
+    # Decode each token separately and join with tabs
+    decoded_tokens = [token.decode("utf-8", errors="replace") for token in tokens]
+    text = '\t'.join(decoded_tokens)
 
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -22,7 +27,7 @@ def decode(ids):
     return text
 
 # Example: Decode a list of IDs
-set_of_ids = [25, 345, 992, 1353]
+set_of_ids = [1072, 415, 308, 1406, 103, 279, 999, 260, 550, 46, 301, 39, 299, 1076, 1172, 562, 284, 111, 414, 1460, 46, 301, 116, 373, 308, 259, 562, 798, 832, 1460, 1449, 44, 892, 415, 308, 311, 112, 112, 549, 46]
 decoded_text = decode(set_of_ids) # Pass the list of IDs
 print(decoded_text)
 
@@ -34,21 +39,26 @@ def encode():
     # Tokenize the text using the regex pattern
     tokens = re.findall(gpt2pat, text)
 
-    # Convert tokens to byte sequences
+    # Convert tokens to byte sequences and maintain grouping
     byte_tokens = [token.encode('utf-8') for token in tokens]
+    token_list = [list(token) for token in byte_tokens]
 
-    # Flatten the list of byte sequences into a single list of bytes
-    tokens = [b for token in byte_tokens for b in token]
-
-    while len(tokens) >= 2:
-        stats = get_stats(tokens)
-        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
-        if pair not in merges:
-            break # nothing else can be merged
-        idx = merges[pair]
-        tokens = merge(tokens, pair, idx)
+    # Process each token
+    final_tokens = []
+    for token in token_list:
+        current_token = list(token)
+        while len(current_token) >= 2:
+            stats = get_stats([current_token])
+            if not stats:
+                break
+            pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+            if pair not in merges:
+                break
+            idx = merges[pair]
+            current_token = merge([current_token], pair, idx)[0]
+        final_tokens.extend(current_token)
 
-    return tokens
+    return final_tokens
 
 # Example: Encode text from a file
 encoded_tokens = encode()
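
To see the new encode and decode working together, here is a hedged, self-contained round-trip sketch. The toy merges table and the sample string are assumptions for illustration, not values from bpe_results.pkl; the loop applies the same greedy rule as encode above (always merge the earliest-learned applicable pair), then expands the ids back through vocab.

merges = {(104, 105): 256}                   # toy merge table: "hi" -> 256 (assumption)
vocab = {idx: bytes([idx]) for idx in range(256)}
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

token = list("hi there hi".encode("utf-8"))  # pretend this is one regex token
while len(token) >= 2:
    pairs = set(zip(token, token[1:]))
    # prefer the pair that was learned earliest (lowest merge id)
    pair = min(pairs, key=lambda p: merges.get(p, float("inf")))
    if pair not in merges:
        break                                # nothing else can be merged
    idx, out, i = merges[pair], [], 0
    while i < len(token):
        if i < len(token) - 1 and (token[i], token[i+1]) == pair:
            out.append(idx); i += 2
        else:
            out.append(token[i]); i += 1
    token = out

print(token)  # [256, 32, 116, 104, 101, 114, 101, 32, 256]
# decode side: expand ids through vocab (the commit's decode joins pieces with tabs)
print(b''.join(vocab[i] for i in token).decode("utf-8"))  # hi there hi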
encode_input.txt CHANGED
@@ -1,3 +1 @@
-"पठानकोट पहुंचे PM मोदी, एयरबेस का जायजा ले बॉर्डर इलाकों का करेंगे हवाई सर्वे","प्रधानमंत्री नरेंद्र मोदी पठानकोट एयरबेस पहुंच गए हैं. वे एयरबेस में सुरक्षा के हालात का जायजा ले रहे हैं और वायुसेनाकर्मियों से मिल रहे हैं. सुबह करीब सवा दस बजे प्रधानमंत्री पंजाब के पठानकोट के लिए रवाना हुए. एयरबेस का जायजा लेने के बाद प्रधानमंत्री बॉर्डर इलाकों का हवाई सर्वेक्षण भी करेंगे. पठानकोट एयरबेस पर पिछले हफ्ते आतंकियों ने हमला किया था. पाकिस्तान से आए आतंकियों के हमले को विफल कर दिया गया था. सभी 6 पाकिस्तानी आतंकी मारे गए थे. 7 सुरक्षाबल भी शहीद हुए थे. भारत ने पाकिस्तान को सबूत सौंपते हुए दोषियों के खिलाफ सख्त कार्रवाई करने को कहा है.
-जानकारी के मुताबिक, प्रधानमंत्री के साथ आर्मी और एयरफोर्स के चीफ भी मौजूद रह सकते हैं. एयरबेस पर
-पाकिस्तानी आतंकियों
+There'll be changes after the war. I've never been more hopeful. It'd be a more peaceful world, people'll be happier.
 
 
text_file_eng_short.txt ADDED
@@ -0,0 +1 @@
+There's a chance this is not working, isn't it? There're many papers, why will this work? I've got to make sure. I'm now thinking something's wrong. It'll be sad if there's something wrong and I miss it, I'll be sorry. It'd better be reviewed well, I'd want to be certain.