atiwari751 committed on
Commit
fa76461
·
1 Parent(s): 25f17ee

english trial long

Browse files
Files changed (4) hide show
  1. BPE.py +1 -1
  2. decoded_output.txt +1 -1
  3. encode_decode.py +1 -1
  4. text_file_eng_long.txt +0 -0
BPE.py CHANGED
@@ -2,7 +2,7 @@ import pickle
2
  from tqdm import tqdm # Import tqdm for progress bar
3
 
4
  # Read text from a file
5
- with open('text_file_eng.txt', 'r', encoding='utf-8') as file:
6
  text = file.read()
7
 
8
  tokens = text.encode("utf-8") # raw bytes
 
2
  from tqdm import tqdm # Import tqdm for progress bar
3
 
4
  # Read text from a file
5
+ with open('text_file_eng_long.txt', 'r', encoding='utf-8') as file:
6
  text = file.read()
7
 
8
  tokens = text.encode("utf-8") # raw bytes
decoded_output.txt CHANGED
@@ -1 +1 @@
1
- Th ere 's a ch ance this is not wor king , is n 't it ? Th ere ' re many p ap er s, why will this work ? I ' ve go t to make su re . I ' m now thin king s om e th ing 's w r ong . I t 'll be sa d if th ere 's s om e thing w r ong and I mis s it, I'll be s or r y. I t 'd better be re vi e w ed well , I 'd w ant to be cer t ain .
 
1
+ Th ere 's a ch anc e this is not work ing, is n 't it ? Th ere ' re many pa per s, why will this work ? I ' ve got to make su re . I ' m now thin king some thing 's w ron g . I t 'll be sa d if there 's something w r ong and I mis s it, I'll be sor r y. I t 'd bet ter be re view ed well , I 'd want to be cer tain .
encode_decode.py CHANGED
@@ -21,7 +21,7 @@ def decode(ids):
21
  return text
22
 
23
  # Example: Decode a list of IDs
24
- set_of_ids = [297, 562, 373, 322, 310, 1454, 609, 1518, 405, 908, 261, 463, 110, 974, 316, 466, 297, 562, 39, 689, 1953, 112, 563, 268, 352, 1494, 1023, 587, 1509, 466, 73, 39, 307, 484, 1166, 525, 398, 315, 314, 73, 39, 347, 435, 815, 1421, 115, 284, 101, 257, 305, 373, 119, 114, 1061, 314, 73, 116, 436, 418, 465, 262, 712, 273, 562, 373, 115, 284, 101, 1316, 119, 114, 791, 295, 296, 1447, 259, 732, 595, 418, 115, 278, 114, 1333, 73, 116, 447, 1982, 418, 315, 717, 101, 119, 330, 517, 261, 73, 447, 119, 741, 304, 418, 1109, 116, 464, 46]
25
  decoded_text = decode(set_of_ids) # Pass the list of IDs
26
  print(decoded_text)
27
 
 
21
  return text
22
 
23
  # Example: Decode a list of IDs
24
+ set_of_ids = [312, 1366, 565, 278, 302, 717, 256, 429, 1496, 1687, 808, 411, 110, 2862, 289, 670, 312, 1366, 39, 1281, 1191, 2358, 456, 374, 2453, 574, 429, 1687, 670, 73, 39, 353, 1176, 286, 904, 367, 279, 2310, 39, 695, 1398, 999, 806, 1271, 3455, 565, 119, 1902, 103, 2310, 116, 851, 403, 379, 260, 846, 2713, 565, 3466, 119, 114, 588, 292, 360, 1263, 258, 1285, 1402, 403, 3305, 114, 1278, 73, 116, 887, 773, 363, 403, 279, 2035, 274, 1150, 3273, 887, 2398, 1219, 1031, 2514, 46]
25
  decoded_text = decode(set_of_ids) # Pass the list of IDs
26
  print(decoded_text)
27
 
text_file_eng_long.txt ADDED
The diff for this file is too large to render. See raw diff