---
# Fine-tuning run configuration.
#
# NOTE(review): this file was previously a logger dump of a Python config dict —
# every value was a quoted repr ('False', '4', None, "['...']") and each line
# carried a trailing '|' artifact that made the file unparseable by strict YAML
# loaders. Values are normalized here to native YAML types (booleans, ints,
# floats, null, real lists/maps); all keys and their order are unchanged.

batch_size_training: 4
batching_strategy: padding
# Serialized Python enum names kept as plain strings — presumably re-parsed
# by the training framework; confirm against the config loader.
checkpoint_type: StateDictType.SHARDED_STATE_DICT
context_length: 8192
curriculum_learning: false
curriculum_phases: 3
dataset:
  - OpenCoderSFTStage2
ddp_timeout: 36000
debug: false
decay_steps: null
dist_checkpoint_folder: fine-tuned
drop_last: true
dynamic_batch_size: false
enable_deepspeed: false
enable_fsdp: true
enable_memory_profiling: false
enable_memory_trace: false
enable_mixed_precision: true
enable_tensorboard: true
# Scientific notation needs a dot before the exponent (1.0e-05, not 1e-05),
# otherwise PyYAML's YAML-1.1 resolver reads it as a string.
eta_min: 1.0e-05
eval_epoch: 1
eval_in_memory: false
eval_steps: 1000
evaluation_strategy: steps
flop_counter: false
flop_counter_start: 3
fp16: false
freeze_layers: false
from_peft_checkpoint: ''
fsdp_activation_checkpointing: true
fsdp_cpu_offload: false
fsdp_cpu_ram_efficient_loading: false
gamma: 0.85
gradient_accumulation_steps: 8
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
gradient_clipping: false
gradient_clipping_threshold: 1.0
handle_long_sequences: true
hf_hub_metrics_cache_dir: /shared/public/data/controlllm/metrics/
hsdp: true
learning_rate: 5.0e-05
load_best_model_at_end: false
logging_steps: 500
long_sequence_threshold: 16384
low_cpu_fsdp: false
lr: 0.0001
lr_scheduler_per_iter: true
max_eval_step: 500
max_grad_norm: 1.0
max_step: 0
max_tokens_per_batch: -1
max_train_step: -1
memory_per_token: -1
mixed_precision: true
# Placeholder — must be replaced with a real model path before use.
model_name: PATH/to/Model
no_cuda: false
num_epochs: 3
num_freeze_layers: 1
num_train_epochs: 20
num_unfrozen_layers: 8
num_workers_dataloader: 0
one_gpu: false
optimizer: AdamW
overwrite_output_dir: false
peft_method: lora
per_device_eval_batch_size: 1
per_device_train_batch_size: 12
precompute_batches: null
pure_bf16: false
quantization: false
replica_group_size: 1
resume_checkpoint_folder: null
resume_from_latest: true
run_validation: true
save_epoch: 1
save_metrics: false
save_model: true
save_optimizer: false
save_steps: 1000
seed: 42
sharding_group_size: 8
sharding_strategy: ShardingStrategy.HYBRID_SHARD
step_size: 1
tokenizer_name: null
trainer: native
unfrozen_strategy: interweave
use_fast_kernels: false
use_fp16: false
use_peft: false
use_profiler: false
use_wandb: false
val_batch_size: 1
warmup_steps: 1000
weight_decay: 0.01
weight_decay_ratio: 0.1