# File size: 3,499 bytes (commit 94f4faf)
---
micro_batch_size: 24
global_batch_size: 1920
tensor_model_parallel_size: 2
pipeline_model_parallel_size: 1
resume_from_checkpoint: null
pipeline_model_parallel_split_rank: 0
make_vocab_size_divisible_by: 128
pre_process: true
post_process: true
megatron_amp_O2: true
seq_length: 512
max_position_embeddings: 512
num_layers: 24
hidden_size: 2048
ffn_hidden_size: 5120
num_attention_heads: 32
init_method_std: 0.015
hidden_dropout: 0.1
attention_dropout: 0.1
kv_channels: 64
apply_query_key_layer_scaling: true
layernorm_epsilon: 1.0e-05
persist_layer_norm: true
gradient_as_bucket_view: true
bias_gelu_fusion: false
masked_softmax_fusion: true
encoder_arch: transformer
decoder_arch: transformer
activation: geglu
tokenizer:
library: sentencepiece
type: null
model: nemo:d55283aced7944109f3cf68d9452e73b_mt5_tokenizer.model
vocab_file: null
merge_file: null
num_sentinel_tokens: 100
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp32_residual_connection: false
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
onnx_safe: false
apex_transformer_log_level: 30
activations_checkpoint_method: null
activations_checkpoint_num_layers: 1
data:
data_prefix:
- 0.056224
- /preproc_data/mc4_ja_mt5_tokenizer_text_document
- 0.064717
- /preproc_data/mc4_en_mt5_tokenizer_text_document
- 0.055394
- /preproc_data/mc4_it_mt5_tokenizer_text_document
- 0.006129
- /preproc_data/mc4_lv_mt5_tokenizer_text_document
- 0.156199
- /preproc_data/mc4_ru_mt5_tokenizer_text_document
- 0.02047
- /preproc_data/mc4_hu_mt5_tokenizer_text_document
- 0.020264
- /preproc_data/mc4_zh_mt5_tokenizer_text_document
- 0.047618
- /preproc_data/mc4_pl_mt5_tokenizer_text_document
- 0.021716
- /preproc_data/mc4_el_mt5_tokenizer_text_document
- 0.094469
- /preproc_data/mc4_de_mt5_tokenizer_text_document
- 0.028565
- /preproc_data/mc4_cs_mt5_tokenizer_text_document
- 0.015286
- /preproc_data/mc4_ko_mt5_tokenizer_text_document
- 0.014667
- /preproc_data/mc4_hi_mt5_tokenizer_text_document
- 0.015717
- /preproc_data/mc4_no_mt5_tokenizer_text_document
- 0.016761
- /preproc_data/mc4_da_mt5_tokenizer_text_document
- 0.011884
- /preproc_data/mc4_sk_mt5_tokenizer_text_document
- 0.088899
- /preproc_data/mc4_fr_mt5_tokenizer_text_document
- 0.051519
- /preproc_data/mc4_pt_mt5_tokenizer_text_document
- 0.008662
- /preproc_data/mc4_lt_mt5_tokenizer_text_document
- 0.110217
- /preproc_data/mc4_es_mt5_tokenizer_text_document
- 0.031769
- /preproc_data/mc4_nl_mt5_tokenizer_text_document
- 0.022698
- /preproc_data/mc4_sv_mt5_tokenizer_text_document
- 0.025119
- /preproc_data/mc4_ro_mt5_tokenizer_text_document
- 0.015036
- /preproc_data/mc4_fi_mt5_tokenizer_text_document
index_mapping_dir: null
data_impl: mmap
splits_string: "99892,99,9"
seq_length: 512
seq_length_dec: 128
skip_warmup: true
num_workers: 8
dataloader_type: single
masked_lm_prob: 0.15
dataset_type: t5
short_seq_prob: 0.0
max_ngram_size: 10
mean_ngram_size: null
geometric_dist: true
permutation: false
whole_word_masking: false
favor_longer_ngrams: false
optim:
name: fused_adam
lr: 0.0001
betas:
- 0.9
- 0.999
eps: 1.0e-08
weight_decay: 0.01
sched:
name: WarmupAnnealing
min_lr: 1.0e-05
last_epoch: -1
warmup_ratio: 0.01
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.9.0rc0