---
# Training run configuration.
#
# NOTE(review): the original text had lost all indentation and carried stray
# table-pipe artifacts. The nesting below was reconstructed from the key names
# and their order; every key and value is unchanged. Confirm the nesting
# against the tool that produced this file before relying on it.

# Experiment loggers. Entries with `_target_` are presumably instantiated
# Hydra-style from the dotted class path -- verify against the consumer.
loggers:
  tensorboard:
    _target_: src.loggers.TensorBoardLogger
    save_dir: ./
    name: ''
    version: ./

# Training callbacks, keyed by a short name.
callbacks:
  lr_monitor:
    _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor

  grad_norm:
    _target_: src.callbacks.grad_norm.GradNorm
    norm_type: 2
    group_separator: /
    histogram_freq: null
    check_clipping: false
    log_weight_distribution: false
    only_total: true

  speed_monitor:
    _target_: src.callbacks.speed_monitor.SpeedMonitor

  grad_accum:
    _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
    # The key is the integer 0, not a string; keep it unquoted.
    # Presumably maps a starting point to an accumulation factor -- TODO confirm.
    scheduling:
      0: 4

  model_checkpoint:
    _target_: src.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    filename: '{step}'
    enable_version_counter: false
    # Same interval as trainer.val_check_interval (2000).
    every_n_train_steps: 2000
    save_top_k: -1
    save_last: link
    verbose: true
    # NOTE(review): assumed to belong to this callback (a key with the same
    # name also exists at the top level) -- confirm.
    save_initial_checkpoint: true

# Run-level settings. The paths are absolute and machine-specific.
tok_path: /home/pl487/rdd/outputs/tokenizers/wordpiece32000minipile
# Combines the dataset, model and tokenizer names with a timestamp.
run_folder: minipile/smol_llama-81M-tied_wordpiece32000minipile_2025-02-03T17-14-42
out_parent_folder: model_train
tok_name: wordpiece32000minipile
dataset: minipile
train_data_path: /home/pl487/rdd/data/minipile/wordpiece32000minipile/train
val_data_path: /home/pl487/rdd/data/minipile/wordpiece32000minipile/validation
model: smol_llama-81M-tied
resume_from_checkpoint: null
save_initial_checkpoint: true
seed: 42
torch_compile: true

# Data-loading options.
data:
  batch_size: 16
  eval_batch_size: 64
  shuffle: true
  drop_last: false
  num_workers: 8
  pin_memory: true
  persistent_workers: false
  prefetch_factor: 2
  multiprocessing_context: null

# Optimiser and learning-rate schedule.
optim:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.1
  # NOTE(review): fused/eps/betas are assumed to be nested under optim_kwargs
  # -- confirm.
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
      - 0.9
      - 0.95
  scheduler_name: warmup_stable_decay
  # num_warmup_steps + num_stable_steps + num_decay_steps
  # = 2000 + 46000 + 2000 = 50000, the same value as trainer.max_steps.
  num_warmup_steps: 2000
  # NOTE(review): the three keys below are assumed to be nested under
  # scheduler_kwargs -- confirm.
  scheduler_kwargs:
    num_stable_steps: 46000
    num_decay_steps: 2000
    min_lr_ratio: 0.01

# Trainer options.
trainer:
  accelerator: gpu
  precision: bf16-true
  deterministic: false
  log_every_n_steps: 1
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_val: 1.0
  gradient_clip_algorithm: norm
  val_check_interval: 2000
  max_steps: 50000
  limit_val_batches: 500
|