Commit 76f084f · Regex working
Parent(s): 781de59
Files changed:
- BPE.py +54 -23
- decoded_output.txt +1 -2
- encode_decode.py +24 -14
- encode_input.txt +1 -3
- text_file_eng_short.txt +1 -0
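The commit's main change is pretokenizing the input with the GPT-2 regex before running BPE, so merge statistics are gathered inside regex-delimited chunks rather than across the whole byte stream. A minimal sketch of what that pattern does to a sample sentence (the sample string is taken from encode_input.txt; the pattern is the one BPE.py compiles, and \p{L}/\p{N} require the third-party regex module, not the stdlib re):

import regex as re

# GPT-2 pretokenization pattern, as compiled in BPE.py
gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

print(re.findall(gpt2pat, "There'll be changes after the war."))
# ['There', "'ll", ' be', ' changes', ' after', ' the', ' war', '.']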
BPE.py CHANGED
@@ -9,51 +9,82 @@ with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
 # Define the GPT-2 regex pattern
 gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
-#
+# Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)
 
 # Convert tokens to byte sequences
 byte_tokens = [token.encode('utf-8') for token in tokens]
 
-#
-tokens = [
+# Create a list of byte sequences, each representing a token
+tokens = [list(token) for token in byte_tokens]
 
-def get_stats(
+def get_stats(token_list):
+    """Count frequency of pairs across all tokens"""
     counts = {}
-
-
+    # Count pairs within each token
+    for token in token_list:
+        if len(token) < 2:
+            continue
+        for pair in zip(token, token[1:]):
+            counts[pair] = counts.get(pair, 0) + 1
     return counts
 
-def merge(
+def merge(token_list, pair, idx):
+    """Merge all occurrences of pair within each token"""
     newids = []
-
-
-
-
-
-
-
-
+    for token in token_list:
+        if len(token) < 2:
+            newids.append(token)
+            continue
+
+        new_token = []
+        i = 0
+        while i < len(token):
+            if i < len(token) - 1 and (token[i], token[i+1]) == pair:
+                new_token.append(idx)
+                i += 2
+            else:
+                new_token.append(token[i])
+                i += 1
+        newids.append(new_token)
     return newids
 
 def perform_bpe():
     vocab_size = 1500 # the desired final vocabulary size
     num_merges = vocab_size - 256
-
-
+    token_list = list(tokens) # copy so we don't destroy the original list
+
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
+
     merges = {} # (int, int) -> int
     for i in tqdm(range(num_merges), desc="Performing BPE", unit="merge"):
-        stats = get_stats(
+        stats = get_stats(token_list)
+        if not stats: # No more pairs to merge
+            break
+
+        # Find most frequent pair
         pair = max(stats, key=stats.get)
         idx = 256 + i
-
+
+        # Perform the merge
+        token_list = merge(token_list, pair, idx)
         merges[pair] = idx
-
+
+    # Calculate total bytes after compression
+    total_bytes_after = sum(len(token) for token in token_list)
+
     print("---")
-    print("
-    print(
+    print("Total bytes before:", total_bytes_before)
+    print("Total bytes after:", total_bytes_after)
+    print(f"Compression ratio: {total_bytes_before / total_bytes_after:.2f}X")
+
+    # Flatten for storage, but maintain token boundaries
+    flat_ids = []
+    for token in token_list:
+        flat_ids.extend(token)
 
-    return merges,
+    return merges, flat_ids, num_merges
 
 if __name__ == "__main__":
     print('---')
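Since get_stats and merge now operate on a list of per-token byte lists, pairs are never counted or merged across token boundaries. A quick sanity check, assuming the two functions are importable from BPE.py (a hypothetical snippet, not part of the commit; the byte values are just those of "low" and "lo"):

token_list = [[108, 111, 119], [108, 111]]   # bytes of "low" and "lo"
stats = get_stats(token_list)
print(stats)                                  # {(108, 111): 2, (111, 119): 1}
pair = max(stats, key=stats.get)              # (108, 111), i.e. "lo"
print(merge(token_list, pair, 256))           # [[256, 119], [256]], no merge across tokens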
decoded_output.txt CHANGED
@@ -1,2 +1 @@
-
-done before
+There 'll be chan g es after the war . I ' ve never been more h o pe ful . I t 'd be a more pe ace ful world , people 'll be ha p p ier .
encode_decode.py CHANGED
@@ -6,6 +6,9 @@ import regex as re
 with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 
+# Define the GPT-2 regex pattern (same as in BPE.py)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
@@ -13,7 +16,9 @@ for (p0, p1), idx in merges.items():
 def decode(ids):
     # given ids (list of integers), return Python string
     tokens = [vocab[idx] for idx in ids]
-
+    # Decode each token separately and join with tabs
+    decoded_tokens = [token.decode("utf-8", errors="replace") for token in tokens]
+    text = '\t'.join(decoded_tokens)
 
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -22,7 +27,7 @@ def decode(ids):
     return text
 
 # Example: Decode a list of IDs
-set_of_ids = [
+set_of_ids = [1072, 415, 308, 1406, 103, 279, 999, 260, 550, 46, 301, 39, 299, 1076, 1172, 562, 284, 111, 414, 1460, 46, 301, 116, 373, 308, 259, 562, 798, 832, 1460, 1449, 44, 892, 415, 308, 311, 112, 112, 549, 46]
 decoded_text = decode(set_of_ids) # Pass the list of IDs
 print(decoded_text)
 
@@ -34,21 +39,26 @@ def encode():
     # Tokenize the text using the regex pattern
     tokens = re.findall(gpt2pat, text)
 
-    # Convert tokens to byte sequences
+    # Convert tokens to byte sequences and maintain grouping
     byte_tokens = [token.encode('utf-8') for token in tokens]
+    token_list = [list(token) for token in byte_tokens]
 
-    #
-
-
-
-
-
-
-
-
-
+    # Process each token
+    final_tokens = []
+    for token in token_list:
+        current_token = list(token)
+        while len(current_token) >= 2:
+            stats = get_stats([current_token])
+            if not stats:
+                break
+            pair = min(stats, key=lambda p: merges.get(p, float("inf")))
+            if pair not in merges:
+                break
+            idx = merges[pair]
+            current_token = merge([current_token], pair, idx)[0]
+        final_tokens.extend(current_token)
 
-    return
+    return final_tokens
 
 # Example: Encode text from a file
 encoded_tokens = encode()
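The new encode() replays the learned merges greedily, always applying the earliest-learned pair first, which is what the min(stats, key=lambda p: merges.get(p, float("inf"))) line enforces. A toy trace of that inner loop with a hypothetical two-entry merges table (the real one is loaded from bpe_results.pkl; get_stats and merge are assumed importable from BPE.py):

merges = {(104, 105): 256, (256, 33): 257}  # hypothetical: "hi" -> 256, "hi!" -> 257

current_token = [104, 105, 33]              # bytes of "hi!"
while len(current_token) >= 2:
    stats = get_stats([current_token])
    # earliest-learned mergeable pair wins
    pair = min(stats, key=lambda p: merges.get(p, float("inf")))
    if pair not in merges:
        break
    current_token = merge([current_token], pair, merges[pair])[0]

print(current_token)                        # [257]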
encode_input.txt CHANGED
@@ -1,3 +1 @@
-
-जानकारी के मुताबिक, प्रधानमंत्री के साथ आर्मी और एयरफोर्स के चीफ भी मौजूद रह सकते हैं. एयरबेस पर
-पाकिस्तानी आतंकियों
+There'll be changes after the war. I've never been more hopeful. It'd be a more peaceful world, people'll be happier.
text_file_eng_short.txt ADDED
@@ -0,0 +1 @@
+There's a chance this is not working, isn't it? There're many papers, why will this work? I've got to make sure. I'm now thinking something's wrong. It'll be sad if there's something wrong and I miss it, I'll be sorry. It'd better be reviewed well, I'd want to be certain.