nroggendorff committed on
Commit
f5014ce
·
verified ·
1 Parent(s): 85c4894

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +0 -8
train.py CHANGED
@@ -49,14 +49,6 @@ def create_tokenizer(training_corpus):
49
 
50
  def load_tokenizer(training_corpus):
51
  tokenizer = AutoTokenizer.from_pretrained(OUTPUT_REPO)
52
- special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
53
- special_tokens.append(["<|user|>", "<|bot|>", "<|end|>"])
54
- tokenizer.train_from_iterator(
55
- training_corpus,
56
- vocab_size=VOCAB_SIZE,
57
- min_frequency=2,
58
- special_tokens=special_tokens
59
- )
60
  return tokenizer
61
 
62
  def get_training_corpus(dataset):
 
49
 
50
  def load_tokenizer(training_corpus):
51
  tokenizer = AutoTokenizer.from_pretrained(OUTPUT_REPO)
 
 
 
 
 
 
 
 
52
  return tokenizer
53
 
54
  def get_training_corpus(dataset):