nroggendorff committed on
Commit
da2f127
·
verified ·
1 Parent(s): 3647db8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -1
app.py CHANGED
@@ -13,6 +13,8 @@ EPOCHS = 3
13
  LEARNING_RATE = 1e-4
14
  FP16 = True
15
  FACTOR = 8
 
 
16
 
17
  def load_data():
18
  dataset = load_dataset("nroggendorff/elephant", split="train")
@@ -22,7 +24,7 @@ def create_tokenizer():
22
  tokenizer = ByteLevelBPETokenizer()
23
  tokenizer.train_from_iterator(
24
  training_corpus,
25
- vocab_size=3200,
26
  min_frequency=2,
27
  special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
28
  )
 
13
  LEARNING_RATE = 1e-4
14
  FP16 = True
15
  FACTOR = 8
16
+ VOCAB_SIZE = 3200
17
+ DATASET = "nroggendorff/elephant"
18
 
19
  def load_data():
20
  dataset = load_dataset("nroggendorff/elephant", split="train")
 
24
  tokenizer = ByteLevelBPETokenizer()
25
  tokenizer.train_from_iterator(
26
  training_corpus,
27
+ vocab_size=VOCAB_SIZE,
28
  min_frequency=2,
29
  special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
30
  )