model_name: tangled-alpha-0.11-core
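# Model architecture (litgpt Config): 32 transformer layers, 512-dim embeddings,
# 8 attention heads of size 64 (n_query_groups: 8, i.e. standard multi-head attention),
# RMSNorm, LLaMA-style gated MLP, RoPE with base 16000, 131072-entry vocabulary.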
model_config:
  name: tangled-alpha-0.11-core
  hf_config: {}
  block_size: 131072
  n_layer: 32
  n_embd: 512
  vocab_size: 131072
  padding_multiple: 512
  padded_vocab_size: 131072
  norm_class_name: RMSNorm
  norm_eps: 1.0e-05
  norm_qk: false
  post_attention_norm: false
  post_mlp_norm: false
  parallel_residual: false
  shared_attention_norm: false
  n_head: 8
  head_size: 64
  n_query_groups: 8
  attn_bias: false
  rope_base: 16000
  rotary_percentage: 1.0
  rope_condense_ratio: 1
  intermediate_size: 1365
  bias: false
  mlp_class_name: LLaMAMLP
  gelu_approximate: none
  n_expert: 0
  n_expert_per_token: 0
  scale_embeddings: false
  lm_head_bias: false
out_dir: ../out/pretrain-core-1
precision: bf16-true
initial_checkpoint_dir: ../out/pretrain-core-0/checkpoint
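# Data: LitData streaming dataset read from data_path; seed and num_workers control
# shuffling and data-loader parallelism. Initial weights come from the previous
# stage's checkpoint (pretrain-core-0) above.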
data:
  class_path: litgpt.data.LitData
  init_args:
    data_path: ../core-data-1-1025-2049-2049-8000/
    seed: 42
    num_workers: 32
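# Training: effective batch of 512 sequences (micro-batches of 4 with gradient
# accumulation), sequences up to 2049 tokens, ~1.83B tokens total, gradient-norm
# clipping at 1.0, no learning-rate warmup.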
train:
  save_interval: 50
  log_interval: 1
  global_batch_size: 512
  micro_batch_size: 4
  lr_warmup_steps: 0
  max_tokens: 1830709785
  max_seq_length: 2049
  tie_embeddings: false
  max_norm: 1.0
  min_lr: 1.0e-05
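# Evaluation: validate every 50 steps for up to 100 iterations, with an additional
# validation pass at the start and at the end of training.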
eval:
  interval: 50
  max_iters: 100
  initial_validation: true
  final_validation: true
  evaluate_example: first
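# Optimizer: SophiaG from the external sophia_opt package (not bundled with litgpt).
# With lr equal to min_lr and zero warmup steps, the learning rate is effectively
# constant at 1e-5 throughout this stage.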
optimizer:
  class_path: sophia_opt.SophiaG
  init_args:
    lr: 1.0e-05
    betas:
    - 0.965
    - 0.99
    rho: 0.04
    weight_decay: 0.1
devices: auto
num_nodes: 1
tokenizer_dir: ../tokenizer
logger_name: wandb
seed: 23
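# A typical launch for a config like this might be (assuming the litgpt CLI and a
# hypothetical file name for this config; exact flags vary by litgpt version):
#   litgpt pretrain --config pretrain-core-1.yaml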