---
# Training configuration (Hydra-style: `_target_` keys name classes to instantiate).
# NOTE(review): this file was flattened onto one line; the nesting below is
# reconstructed from key semantics — confirm against the generating config.

loggers:
  tensorboard:
    _target_: src.loggers.TensorBoardLogger
    save_dir: ./
    name: ''
    version: ./

callbacks:
  lr_monitor:
    _target_: src.callbacks.lr_monitor.SimpleLearningRateMonitor
  grad_norm:
    _target_: src.callbacks.grad_norm.GradNorm
    norm_type: 2
    group_separator: /
    histogram_freq: null
    check_clipping: false
    log_weight_distribution: false
    only_total: true
  speed_monitor:
    _target_: src.callbacks.speed_monitor.SpeedMonitor
  grad_accum:
    _target_: src.callbacks.gradient_accumulation.GradientAccumulationScheduler
    # Mapping of epoch -> accumulation factor: accumulate 4 batches from epoch 0.
    scheduling:
      0: 4
  model_checkpoint:
    _target_: src.callbacks.model_checkpoint.ModelCheckpoint
    dirpath: .checkpoints
    filename: '{step}'
    enable_version_counter: false
    every_n_train_steps: 2000
    # -1 keeps every checkpoint written.
    save_top_k: -1
    save_last: link
    verbose: true
    save_initial_checkpoint: true

# Run / experiment identification and data locations.
tok_path: /home/pl487/rdd/outputs/tokenizers/bpe8064minipile
run_folder: minipile/smol_llama-81M-tied_bpe8064minipile_2024-10-31T17-34-52
out_parent_folder: model_train
tok_name: bpe8064minipile
dataset: minipile
train_data_path: /home/pl487/rdd/data/minipile/bpe8064minipile/train
val_data_path: /home/pl487/rdd/data/minipile/bpe8064minipile/validation
model: smol_llama-81M-tied
resume_from_checkpoint: null
save_initial_checkpoint: true
seed: 42
torch_compile: true

data:
  batch_size: 32
  eval_batch_size: 64
  shuffle: true
  drop_last: false
  num_workers: 8
  pin_memory: true
  persistent_workers: false
  prefetch_factor: 2
  multiprocessing_context: null

optim:
  optim_name: adamw
  lr: 0.0006
  weight_decay: 0.1
  # Extra keyword arguments forwarded to the optimizer constructor.
  optim_kwargs:
    fused: true
    eps: 1.0e-08
    betas:
      - 0.9
      - 0.95
  scheduler_name: warmup_stable_decay
  num_warmup_steps: 2000
  # warmup (2000) + stable (46000) + decay (2000) = max_steps (50000).
  scheduler_kwargs:
    num_stable_steps: 46000
    num_decay_steps: 2000
    min_lr_ratio: 0.01

trainer:
  accelerator: gpu
  precision: bf16-true
  deterministic: false
  log_every_n_steps: 1
  enable_progress_bar: true
  fast_dev_run: false
  gradient_clip_val: 1.0
  gradient_clip_algorithm: norm
  val_check_interval: 2000
  max_steps: 50000
  limit_val_batches: 500