Spaces:
Sleeping
Sleeping
Commit
·
b0f8dcf
1
Parent(s):
dea5ea1
english trial long final
Browse files
- BPE.py +2 -2
- decoded_output.txt +1 -1
- encode_decode.py +1 -1
- text_file_eng_long.txt +0 -0
- text_file_eng_short.txt +0 -1
BPE.py
CHANGED
@@ -3,7 +3,7 @@ import regex as re
|
|
3 |
from tqdm import tqdm
|
4 |
|
5 |
# Read text from a file
|
6 |
-
with open('
|
7 |
text = file.read()
|
8 |
|
9 |
# Define the GPT-2 regex pattern
|
@@ -50,7 +50,7 @@ def merge(token_list, pair, idx):
|
|
50 |
return newids
|
51 |
|
52 |
def perform_bpe():
|
53 |
-
vocab_size =
|
54 |
num_merges = vocab_size - 256
|
55 |
token_list = list(tokens) # copy so we don't destroy the original list
|
56 |
|
|
|
3 |
from tqdm import tqdm
|
4 |
|
5 |
# Read text from a file
|
6 |
+
with open('text_file_eng_long.txt', 'r', encoding='utf-8') as file:
|
7 |
text = file.read()
|
8 |
|
9 |
# Define the GPT-2 regex pattern
|
|
|
50 |
return newids
|
51 |
|
52 |
def perform_bpe():
|
53 |
+
vocab_size = 3500 # the desired final vocabulary size
|
54 |
num_merges = vocab_size - 256
|
55 |
token_list = list(tokens) # copy so we don't destroy the original list
|
56 |
|
decoded_output.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
There 's a
|
|
|
1 |
+
There 's a chance this is not working , isn 't it ? There ' re many p ap ers , why will this work ? I ' ve got to make su re . I ' m now th in king something 's wr ong . It 'll be s ad if there 's something wr ong and I miss it , I 'll be s or ry . It 'd better be re vi ew ed well , I 'd want to be certain .
|
encode_decode.py
CHANGED
@@ -27,7 +27,7 @@ def decode(ids):
|
|
27 |
return text
|
28 |
|
29 |
# Example: Decode a list of IDs
|
30 |
-
set_of_ids = [
|
31 |
decoded_text = decode(set_of_ids) # Pass the list of IDs
|
32 |
print(decoded_text)
|
33 |
|
|
|
27 |
return text
|
28 |
|
29 |
# Example: Decode a list of IDs
|
30 |
+
set_of_ids = [2532, 522, 258, 3103, 425, 332, 374, 2797, 44, 2391, 1508, 369, 63, 1375, 39, 261, 972, 277, 641, 385, 44, 2208, 553, 425, 1592, 63, 330, 39, 318, 1088, 285, 843, 405, 261, 46, 330, 39, 109, 1070, 325, 259, 888, 2913, 522, 1796, 524, 46, 966, 824, 306, 262, 354, 820, 726, 522, 2913, 1796, 524, 294, 330, 2827, 369, 44, 330, 824, 306, 262, 279, 551, 46, 966, 672, 2988, 306, 301, 3188, 451, 270, 814, 44, 330, 672, 1726, 285, 306, 1475, 46]
|
31 |
decoded_text = decode(set_of_ids) # Pass the list of IDs
|
32 |
print(decoded_text)
|
33 |
|
text_file_eng_long.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
text_file_eng_short.txt
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
There's a chance this is not working, isn't it? There're many papers, why will this work? I've got to make sure. I'm now thinking something's wrong. It'll be sad if there's something wrong and I miss it, I'll be sorry. It'd better be reviewed well, I'd want to be certain.
|
|
|
|