## Experiment Configuration

```yaml
callbacks:
  grad_accum:
    _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
    scheduling:
      0: 16
  grad_norm:
    _target_: src.callbacks.grad_norm.GradNorm
    check_clipping: false
    group_separator: /
    histogram_freq: null
    log_weight_distribution: false
    norm_type: 2
    only_total: true
  lr_monitor:
    _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
  model_checkpoint:
    _target_: src.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    enable_version_counter: false
    every_n_train_steps: 2000
    filename: '{step}'
    save_initial_checkpoint: true
    save_last: link
    save_top_k: -1
    verbose: true
  speed_monitor:
    _target_: src.callbacks.speed_monitor.SpeedMonitor
data:
  batch_size: 8
  drop_last: false
  eval_batch_size: 16
  multiprocessing_context: null
  num_workers: 8
  persistent_workers: false
  pin_memory: true
  prefetch_factor: 2
  shuffle: true
dataset: minipile
loggers:
  tensorboard:
    _target_: src.loggers.TensorBoardLogger
    name: ''
    save_dir: ./
    version: ./
model: smol_llama-81M-tied
optim:
  lr: 0.0006
  num_warmup_steps: 2000
  optim_kwargs:
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    fused: true
  optim_name: adamw
  scheduler_kwargs:
    min_lr_ratio: 0.01
    num_decay_steps: 2000
    num_stable_steps: 46000
  scheduler_name: warmup_stable_decay
  weight_decay: 0.1
out_parent_folder: model_train
resume_from_checkpoint: null
run_folder: minipile/smol_llama-81M-tied_bpe128000minipile_2024-11-01T13-04-36
save_initial_checkpoint: true
seed: 42
tok_name: bpe128000minipile
tok_path: /home/pl487/rdd/outputs/tokenizers/bpe128000minipile
torch_compile: true
train_data_path: /home/pl487/rdd/data/minipile/bpe128000minipile/train
trainer:
  accelerator: gpu
  deterministic: false
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_algorithm: norm
  gradient_clip_val: 1.0
  limit_val_batches: 500
  log_every_n_steps: 1
  max_steps: 50000
  precision: bf16-true
  val_check_interval: 2000
val_data_path: /home/pl487/rdd/data/minipile/bpe128000minipile/validation
```
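The `_target_` entries suggest a Hydra/OmegaConf-style config, where each node names the class to build. Below is a minimal sketch of how such a config could be loaded and its callbacks instantiated; it assumes the YAML above is saved to a hypothetical `config.yaml`, that `hydra` and `omegaconf` are installed, and that the project's `src` package is importable.

```python
# Sketch only: load the experiment config and build the callback objects.
# Assumes config.yaml (hypothetical filename) holds the YAML shown above
# and that src.callbacks.* from the project repository is on the path.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")

# Each callback node carries a _target_ class path, so hydra.utils.instantiate
# can construct the object directly from the config node.
callbacks = [instantiate(cb_cfg) for cb_cfg in cfg.callbacks.values()]

print(cfg.model)      # smol_llama-81M-tied
print(cfg.optim.lr)   # 0.0006
print(len(callbacks)) # 5 (grad_accum, grad_norm, lr_monitor, model_checkpoint, speed_monitor)
```

Scalar fields such as `model`, `optim.lr`, or `trainer.max_steps` can likewise be read straight off the loaded config and passed to the training entry point.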