Spaces:
Starting
on
L40S
Starting
on
L40S
Update app.py
Browse files
app.py
CHANGED
@@ -13,6 +13,8 @@ EPOCHS = 3
|
|
13 |
LEARNING_RATE = 1e-4
|
14 |
FP16 = True
|
15 |
FACTOR = 8
|
|
|
|
|
16 |
|
17 |
def load_data():
|
18 |
dataset = load_dataset("nroggendorff/elephant", split="train")
|
@@ -22,7 +24,7 @@ def create_tokenizer():
|
|
22 |
tokenizer = ByteLevelBPETokenizer()
|
23 |
tokenizer.train_from_iterator(
|
24 |
training_corpus,
|
25 |
-
vocab_size=
|
26 |
min_frequency=2,
|
27 |
special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
|
28 |
)
|
|
|
13 |
LEARNING_RATE = 1e-4
|
14 |
FP16 = True
|
15 |
FACTOR = 8
|
16 |
+
VOCAB_SIZE = 3200
|
17 |
+
DATASET = "nroggendorff/elephant"
|
18 |
|
19 |
def load_data():
|
20 |
dataset = load_dataset("nroggendorff/elephant", split="train")
|
|
|
24 |
tokenizer = ByteLevelBPETokenizer()
|
25 |
tokenizer.train_from_iterator(
|
26 |
training_corpus,
|
27 |
+
vocab_size=VOCAB_SIZE,
|
28 |
min_frequency=2,
|
29 |
special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>", "<|user|>", "<|bot|>", "<|end|>"]
|
30 |
)
|