Spaces:
Sleeping
Sleeping
Commit
·
1e8ebcb
1
Parent(s):
c128a5f
basic regex
Browse files
- BPE.py +15 -7
- decoded_output.txt +2 -3
- encode_decode.py +1 -1
BPE.py
CHANGED
@@ -1,12 +1,20 @@
|
|
1 |
import pickle
|
|
|
2 |
from tqdm import tqdm # Import tqdm for progress bar
|
3 |
|
4 |
# Read text from a file
|
5 |
-
with open('
|
6 |
text = file.read()
|
7 |
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
def get_stats(ids):
|
12 |
counts = {}
|
@@ -28,7 +36,7 @@ def merge(ids, pair, idx):
|
|
28 |
return newids
|
29 |
|
30 |
def perform_bpe():
|
31 |
-
vocab_size =
|
32 |
num_merges = vocab_size - 256
|
33 |
ids = list(tokens) # copy so we don't destroy the original list
|
34 |
|
@@ -54,9 +62,9 @@ if __name__ == "__main__":
|
|
54 |
print("length of tokens:", len(tokens))
|
55 |
|
56 |
# Run BPE and save results
|
57 |
-
merges, ids, num_merges = perform_bpe()
|
58 |
|
59 |
# Save merges and vocab to a file
|
60 |
-
with open('bpe_results.pkl', 'wb') as f:
|
61 |
-
pickle.dump((merges, ids, num_merges), f)
|
62 |
|
|
|
1 |
import pickle
|
2 |
+
import regex as re
|
3 |
from tqdm import tqdm # Import tqdm for progress bar
|
4 |
|
5 |
# Read text from a file
|
6 |
+
with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
|
7 |
text = file.read()
|
8 |
|
9 |
+
# Define the GPT-2 regex pattern
|
10 |
+
gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
|
11 |
+
|
12 |
+
# Apply the regex pattern to tokenize the text
|
13 |
+
tokens = re.findall(gpt2pat, text)
|
14 |
+
|
15 |
+
# Convert tokens to a list of integer Unicode code points via ord() (within 0..255 only for ASCII/Latin-1 text)
|
16 |
+
tokens = [ord(char) for token in tokens for char in token]
|
17 |
+
print(tokens)
|
18 |
|
19 |
def get_stats(ids):
|
20 |
counts = {}
|
|
|
36 |
return newids
|
37 |
|
38 |
def perform_bpe():
|
39 |
+
vocab_size = 1500 # the desired final vocabulary size
|
40 |
num_merges = vocab_size - 256
|
41 |
ids = list(tokens) # copy so we don't destroy the original list
|
42 |
|
|
|
62 |
print("length of tokens:", len(tokens))
|
63 |
|
64 |
# Run BPE and save results
|
65 |
+
#merges, ids, num_merges = perform_bpe()
|
66 |
|
67 |
# Save merges and vocab to a file
|
68 |
+
#with open('bpe_results.pkl', 'wb') as f:
|
69 |
+
#pickle.dump((merges, ids, num_merges), f)
|
70 |
|
decoded_output.txt
CHANGED
@@ -1,3 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
प ाकिस्तान ी � �तंक ियों
|
|
|
1 |
+
NIUS:
|
2 |
+
done before
|
|
encode_decode.py
CHANGED
@@ -21,7 +21,7 @@ def decode(ids):
|
|
21 |
return text
|
22 |
|
23 |
# Example: Decode a list of IDs
|
24 |
-
set_of_ids = [
|
25 |
decoded_text = decode(set_of_ids) # Pass the list of IDs
|
26 |
print(decoded_text)
|
27 |
|
|
|
21 |
return text
|
22 |
|
23 |
# Example: Decode a list of IDs
|
24 |
+
set_of_ids = [25, 345, 992, 1353]
|
25 |
decoded_text = decode(set_of_ids) # Pass the list of IDs
|
26 |
print(decoded_text)
|
27 |
|