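# Model section of a NeMo-style Megatron GPT pretraining config (YAML).
# The sizes below (24 layers, hidden 4096, 32 heads, FFN 16384) correspond to a
# roughly 5B-parameter GPT-3-style model trained at 2048-token sequence length.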
cfg:
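  # Batching: per-GPU micro-batch and the effective global batch
  # (micro_batch_size x data-parallel size x gradient-accumulation steps).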
  micro_batch_size: 4
  global_batch_size: 32
  rampup_batch_size: null
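  # Parallelism: all degrees are 1, i.e. pure data parallelism
  # (no context, tensor, or pipeline model parallelism).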
  context_parallel_size: 1
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  resume_from_checkpoint: null
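  # Transformer architecture: 24 layers, hidden 4096, FFN 16384 (4x hidden),
  # 32 attention heads, 2048-token context.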
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 24
  hidden_size: 4096
  ffn_hidden_size: 16384
  num_attention_heads: 32
  init_method_std: 0.01
  hidden_dropout: 0.1
  attention_dropout: 0.1
  kv_channels: null
  apply_query_key_layer_scaling: true
  layernorm_epsilon: 1.0e-05
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
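  # Kernel-fusion and memory optimizations (fused bias+activation,
  # bias+dropout+add, masked softmax, gradient accumulation/all-reduce).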
  persist_layer_norm: true
  gradient_as_bucket_view: true
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: true
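  # Activation (gradient) checkpointing: disabled (all options left null).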
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
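  # FSDP is disabled; the sharding/reduce settings below apply only when fsdp is true.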
  fsdp: false
  fsdp_sharding_strategy: full
  fsdp_grad_reduce_dtype: 32
  fsdp_sharded_checkpoint: false
  sequence_parallel: false
  overlap_p2p_comm: false
  batch_p2p_comm: true
  num_query_groups: null
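  # Tokenizer: Megatron GPT-2 BPE using the vocab and merges files below.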
  tokenizer:
    library: megatron
    type: GPT2BPETokenizer
    model: null
    delimiter: null
    vocab_file: /gpt3_dataset//bpe/vocab.json
    merge_file: /gpt3_dataset//bpe/merges.txt
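  # Mixed-precision settings: loss-scaler parameters (relevant mainly for fp16)
  # and O2-style AMP (half-precision weights with fp32 master copies in the optimizer).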
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 125
  sharp: false
  mcore_gpt: true
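  # Transformer Engine / FP8 are disabled; the fp8_* options below take effect
  # only when transformer_engine and fp8 are enabled.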
  transformer_engine: false
  fp8: false
  fp8_e4m3: false
  fp8_hybrid: true
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  fp8_wgrad: true
  ub_tp_comm_overlap: false
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  seed: 1234
  sync_batch_comm: false
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
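  # Nsight Systems profiling hooks (disabled here).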
  nsys_profile:
    enabled: false
    trace:
    - nvtx
    - cuda
    start_step: 10
    end_step: 10
    ranks:
    - 0
    gen_shape: false
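  # Optimizer: distributed fused Adam (optimizer state sharded across
  # data-parallel ranks) with grad/param communication overlapped with compute.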
  optim:
    name: distributed_fused_adam
    bucket_cap_mb: 400
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    lr: 0.00016
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
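    # LR schedule: cosine annealing to min_lr after a short warmup.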
    sched:
      name: CosineAnnealing
      warmup_steps: 115
      constant_steps: 12500
      min_lr: 1.6e-05
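  # Data: mmap-indexed dataset with a 99990/8/2 train/val/test split;
  # data_prefix is a (weight, path) list for dataset blending.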
  data:
    data_impl: mmap
    splits_string: 99990,8,2
    seq_length: 2048
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix:
    - 0.0333
    - /gpt3_dataset/wiki_text_document
  precision: bf16-mixed