Spaces:
Sleeping
Sleeping
Commit
·
cae9627
1
Parent(s):
1e8ebcb
eng test re
Browse files
BPE.py
CHANGED
@@ -14,7 +14,7 @@ tokens = re.findall(gpt2pat, text)
|
|
14 |
|
15 |
# Convert tokens to a flat list of integer code points via ord() (values stay in 0..255 only for ASCII/Latin-1 text)
|
16 |
tokens = [ord(char) for token in tokens for char in token]
|
17 |
-
print(tokens)
|
18 |
|
19 |
def get_stats(ids):
|
20 |
counts = {}
|
@@ -62,9 +62,9 @@ if __name__ == "__main__":
|
|
62 |
print("length of tokens:", len(tokens))
|
63 |
|
64 |
# Run BPE and save results
|
65 |
-
|
66 |
|
67 |
# Save merges and vocab to a file
|
68 |
-
|
69 |
-
|
70 |
|
|
|
14 |
|
15 |
# Convert tokens to a flat list of integer code points via ord() (values stay in 0..255 only for ASCII/Latin-1 text)
|
16 |
tokens = [ord(char) for token in tokens for char in token]
|
17 |
+
#print(tokens)
|
18 |
|
19 |
def get_stats(ids):
|
20 |
counts = {}
|
|
|
62 |
print("length of tokens:", len(tokens))
|
63 |
|
64 |
# Run BPE and save results
|
65 |
+
merges, ids, num_merges = perform_bpe()
|
66 |
|
67 |
# Save merges, ids and num_merges to a pickle file
|
68 |
+
with open('bpe_results.pkl', 'wb') as f:
|
69 |
+
pickle.dump((merges, ids, num_merges), f)
|
70 |
|