tangledgroup
/

tangled-alpha-0.9-core

@@ -60,7 +60,7 @@ train:
   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
   # global_batch_size: 512
   # global_batch_size: 256
-  global_batch_size: 8
   # Number of samples per data-parallel rank (type: int, default: 4)
   micro_batch_size: 1
@@ -78,8 +78,7 @@ train:
   max_steps:
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
-  # max_seq_length: 16384
-  max_seq_length:
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings: true

   # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
   # global_batch_size: 512
   # global_batch_size: 256
+  global_batch_size: 4
   # Number of samples per data-parallel rank (type: int, default: 4)
   micro_batch_size: 1
   max_steps:
   # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 16384
   # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
   tie_embeddings: true