Chris4K committed on
Commit
fb5fef9
·
verified ·
1 Parent(s): 027365f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -1
app.py CHANGED
@@ -357,7 +357,7 @@ def optimize_vocabulary(texts, vocab_size=10000, min_frequency=2):
357
 
358
  # Train BPE tokenizer
359
  # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
360
- trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
361
  tokenizer.train_from_iterator(optimized_texts, trainer)
362
 
363
  return tokenizer, optimized_texts
 
357
 
358
  # Train BPE tokenizer
359
  # tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
360
+ trainer = models.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
361
  tokenizer.train_from_iterator(optimized_texts, trainer)
362
 
363
  return tokenizer, optimized_texts