Update train.py
train.py CHANGED

@@ -40,23 +40,20 @@ class Space:
 space = Space()
 
 def load_data():
-
-
-
-
-    dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
+    if not INSTRUCT_FINETUNE_BOOL:
+        dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
+    else:
+        dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
 
-
-
-
-
-
+    start = INIT * SHARD_SIZE
+    data_list = list(islice(dataset, start, start + SHARD_SIZE))
+
+    dataset = Dataset.from_dict({'text': [example['text'] for example in data_list]})
+    return dataset
 
 def create_tokenizer(training_corpus):
     tokenizer = ByteLevelBPETokenizer()
     special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
-    if INSTRUCT_FINETUNE_BOOL:
-        special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])
     tokenizer.train_from_iterator(
         training_corpus,
         vocab_size=VOCAB_SIZE,
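Taken on its own, the new load_data() streams the chosen corpus and materializes exactly one contiguous shard of it. A minimal sketch of that behavior, assuming INIT and SHARD_SIZE are integer constants defined elsewhere in train.py and substituting a placeholder repo id for INPUT_DATASET:

from itertools import islice

from datasets import Dataset, load_dataset

INPUT_DATASET = "HuggingFaceTB/smollm-corpus"  # placeholder; the real id lives elsewhere in train.py
INIT = 0             # assumed: index of the shard this run should process
SHARD_SIZE = 10_000  # assumed: number of examples per shard

# streaming=True returns an IterableDataset, so nothing is downloaded up
# front; islice then materializes only the half-open window
# [INIT * SHARD_SIZE, (INIT + 1) * SHARD_SIZE).
stream = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
start = INIT * SHARD_SIZE
data_list = list(islice(stream, start, start + SHARD_SIZE))

# Re-wrap the slice as a regular in-memory Dataset holding only the text column.
dataset = Dataset.from_dict({"text": [example["text"] for example in data_list]})
print(len(dataset))  # == SHARD_SIZE when the stream has enough examples

One caveat of this approach: islice has to iterate past the first start examples before yielding anything, so selecting a later shard (larger INIT) pays a linear skip cost over the stream.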
@@ -117,7 +114,8 @@ def configure_tokenizer(tokenizer):
         "eos_token": "</s>",
         "unk_token": "<unk>",
         "pad_token": "<pad>",
-        "mask_token": "<mask>"
+        "mask_token": "<mask>",
+        "additional_special_tokens": []
     }
     if INSTRUCT_FINETUNE_BOOL:
         special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
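The configure_tokenizer hunk seeds "additional_special_tokens" with an empty list, so the key is present in the map whether or not the instruct branch overwrites it. How the map is consumed is not visible in this diff; the sketch below assumes it is handed to the tokenizer's add_special_tokens, which is the usual pattern in transformers:

from transformers import PreTrainedTokenizerFast

INSTRUCT_FINETUNE_BOOL = True  # assumed flag defined elsewhere in train.py

special_tokens = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
    "additional_special_tokens": [],
}
if INSTRUCT_FINETUNE_BOOL:
    special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]

# Hypothetical tokenizer file; the real configure_tokenizer receives a
# tokenizer object as its argument instead.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
num_added = tokenizer.add_special_tokens(special_tokens)
# num_added counts tokens that were not already in the vocabulary; if it is
# nonzero, the model's embedding matrix must be resized to match.

Always including the key also means downstream code can read special_tokens["additional_special_tokens"] without guarding against a missing entry.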
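Back in the first hunk, create_tokenizer loses its INSTRUCT_FINETUNE_BOOL branch, mirroring the configure_tokenizer change: the chat markers <|user|>, <|bot|>, and <|end|> are no longer baked in at BPE-training time and are attached afterwards as additional special tokens instead. After the commit the function reduces to roughly the sketch below; the diff truncates the train_from_iterator call, so the special_tokens kwarg and the return are assumptions:

from tokenizers import ByteLevelBPETokenizer

VOCAB_SIZE = 32_000  # assumed; defined elsewhere in train.py

def create_tokenizer(training_corpus):
    tokenizer = ByteLevelBPETokenizer()
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    tokenizer.train_from_iterator(
        training_corpus,
        vocab_size=VOCAB_SIZE,
        special_tokens=special_tokens,  # assumed: reserves ids for the base specials
    )
    return tokenizer

Deferring the chat markers keeps the base vocabulary and its token ids identical across pretraining and instruct runs; the instruct markers are simply appended as new ids later.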