mtasic85 committed on
Commit
df5288a
·
1 Parent(s): b7123ab

pretrain core 4

Browse files
Files changed (1) hide show
  1. scripts/pretrain_core_model_4.yaml +2 -3
scripts/pretrain_core_model_4.yaml CHANGED
@@ -60,7 +60,7 @@ train:
60
  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
61
  # global_batch_size: 512
62
  # global_batch_size: 256
63
- global_batch_size: 8
64
 
65
  # Number of samples per data-parallel rank (type: int, default: 4)
66
  micro_batch_size: 1
@@ -78,8 +78,7 @@ train:
78
  max_steps:
79
 
80
  # Limits the length of samples. Off by default (type: Optional[int], default: null)
81
- # max_seq_length: 16384
82
- max_seq_length:
83
 
84
  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
85
  tie_embeddings: true
 
60
  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
61
  # global_batch_size: 512
62
  # global_batch_size: 256
63
+ global_batch_size: 4
64
 
65
  # Number of samples per data-parallel rank (type: int, default: 4)
66
  micro_batch_size: 1
 
78
  max_steps:
79
 
80
  # Limits the length of samples. Off by default (type: Optional[int], default: null)
81
+ max_seq_length: 16384
 
82
 
83
  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
84
  tie_embeddings: true