pretrain core 4
Browse files
scripts/pretrain_core_model_4.yaml
CHANGED
@@ -60,7 +60,7 @@ train:
|
|
60 |
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
|
61 |
# global_batch_size: 512
|
62 |
# global_batch_size: 256
|
63 |
-
global_batch_size:
|
64 |
|
65 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
66 |
micro_batch_size: 1
|
@@ -78,8 +78,7 @@ train:
|
|
78 |
max_steps:
|
79 |
|
80 |
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
81 |
-
|
82 |
-
max_seq_length:
|
83 |
|
84 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
85 |
tie_embeddings: true
|
|
|
60 |
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
|
61 |
# global_batch_size: 512
|
62 |
# global_batch_size: 256
|
63 |
+
global_batch_size: 4
|
64 |
|
65 |
# Number of samples per data-parallel rank (type: int, default: 4)
|
66 |
micro_batch_size: 1
|
|
|
78 |
max_steps:
|
79 |
|
80 |
# Limits the length of samples. Off by default (type: Optional[int], default: null)
|
81 |
+
max_seq_length: 16384
|
|
|
82 |
|
83 |
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
|
84 |
tie_embeddings: true
|