Spaces:
Sleeping
Sleeping
Commit
·
fa76461
1
Parent(s):
25f17ee
english trial long
Browse files
- BPE.py +1 -1
- decoded_output.txt +1 -1
- encode_decode.py +1 -1
- text_file_eng_long.txt +0 -0
BPE.py
CHANGED
@@ -2,7 +2,7 @@ import pickle
|
|
2 |
from tqdm import tqdm # Import tqdm for progress bar
|
3 |
|
4 |
# Read text from a file
|
5 |
-
with open('
|
6 |
text = file.read()
|
7 |
|
8 |
tokens = text.encode("utf-8") # raw bytes
|
|
|
2 |
from tqdm import tqdm # Import tqdm for progress bar
|
3 |
|
4 |
# Read text from a file
|
5 |
+
with open('text_file_eng_long.txt', 'r', encoding='utf-8') as file:
|
6 |
text = file.read()
|
7 |
|
8 |
tokens = text.encode("utf-8") # raw bytes
|
decoded_output.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
Th ere 's a ch
|
|
|
1 |
+
Th ere 's a ch anc e this is not work ing, is n 't it ? Th ere ' re many pa per s, why will this work ? I ' ve got to make su re . I ' m now thin king some thing 's w ron g . I t 'll be sa d if there 's something w r ong and I mis s it, I'll be sor r y. I t 'd bet ter be re view ed well , I 'd want to be cer tain .
|
encode_decode.py
CHANGED
@@ -21,7 +21,7 @@ def decode(ids):
|
|
21 |
return text
|
22 |
|
23 |
# Example: Decode a list of IDs
|
24 |
-
set_of_ids = [
|
25 |
decoded_text = decode(set_of_ids) # Pass the list of IDs
|
26 |
print(decoded_text)
|
27 |
|
|
|
21 |
return text
|
22 |
|
23 |
# Example: Decode a list of IDs
|
24 |
+
set_of_ids = [312, 1366, 565, 278, 302, 717, 256, 429, 1496, 1687, 808, 411, 110, 2862, 289, 670, 312, 1366, 39, 1281, 1191, 2358, 456, 374, 2453, 574, 429, 1687, 670, 73, 39, 353, 1176, 286, 904, 367, 279, 2310, 39, 695, 1398, 999, 806, 1271, 3455, 565, 119, 1902, 103, 2310, 116, 851, 403, 379, 260, 846, 2713, 565, 3466, 119, 114, 588, 292, 360, 1263, 258, 1285, 1402, 403, 3305, 114, 1278, 73, 116, 887, 773, 363, 403, 279, 2035, 274, 1150, 3273, 887, 2398, 1219, 1031, 2514, 46]
|
25 |
decoded_text = decode(set_of_ids) # Pass the list of IDs
|
26 |
print(decoded_text)
|
27 |
|
text_file_eng_long.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|