nroggendorff committed on
Commit
5ec318d
·
verified ·
1 Parent(s): 5720fe4

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +11 -13
train.py CHANGED
@@ -40,23 +40,20 @@ class Space:
40
  space = Space()
41
 
42
  def load_data():
43
- try:
44
- if not INSTRUCT_FINETUNE_BOOL:
45
- dataset = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)
46
- else:
47
- dataset = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
48
 
49
- start = INIT * SHARD_SIZE
50
- data_list = list(islice(dataset, start, start + SHARD_SIZE))
51
-
52
- dataset = Dataset.from_dict({'text': [example['text'] for example in data_list]})
53
- return dataset
54
 
55
  def create_tokenizer(training_corpus):
56
  tokenizer = ByteLevelBPETokenizer()
57
  special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
58
- if INSTRUCT_FINETUNE_BOOL:
59
- special_tokens.extend(["<|user|>", "<|bot|>", "<|end|>"])
60
  tokenizer.train_from_iterator(
61
  training_corpus,
62
  vocab_size=VOCAB_SIZE,
@@ -117,7 +114,8 @@ def configure_tokenizer(tokenizer):
117
  "eos_token": "</s>",
118
  "unk_token": "<unk>",
119
  "pad_token": "<pad>",
120
- "mask_token": "<mask>"
 
121
  }
122
  if INSTRUCT_FINETUNE_BOOL:
123
  special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]
 
40
  space = Space()
41
 
42
def load_data():
    """Load one shard of the training corpus as an in-memory Dataset.

    Streams either the instruct-finetuning corpus or the "cosmopedia-v2"
    pretraining corpus (selected by INSTRUCT_FINETUNE_BOOL), slices out
    shard number INIT of SHARD_SIZE examples, and returns the raw text
    wrapped in a Dataset with a single 'text' column.
    """
    # Streaming mode avoids downloading the full corpus; only the
    # examples up to the end of this shard are actually pulled.
    if INSTRUCT_FINETUNE_BOOL:
        stream = load_dataset(INSTRUCT_DATASET, split="train", streaming=True)
    else:
        stream = load_dataset(INPUT_DATASET, "cosmopedia-v2", split="train", streaming=True)

    # Shard INIT covers the half-open range [INIT*SHARD_SIZE, (INIT+1)*SHARD_SIZE).
    shard_start = INIT * SHARD_SIZE
    shard = islice(stream, shard_start, shard_start + SHARD_SIZE)
    texts = [example['text'] for example in shard]

    return Dataset.from_dict({'text': texts})
53
 
54
  def create_tokenizer(training_corpus):
55
  tokenizer = ByteLevelBPETokenizer()
56
  special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
 
 
57
  tokenizer.train_from_iterator(
58
  training_corpus,
59
  vocab_size=VOCAB_SIZE,
 
114
  "eos_token": "</s>",
115
  "unk_token": "<unk>",
116
  "pad_token": "<pad>",
117
+ "mask_token": "<mask>",
118
+ "additional_special_tokens": []
119
  }
120
  if INSTRUCT_FINETUNE_BOOL:
121
  special_tokens["additional_special_tokens"] = ["<|user|>", "<|bot|>", "<|end|>"]