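# QLoRA fine-tuning config for Llama 3.1 405B Instruct (supervised
# fine-tuning via the TRL SFT trainer, with FSDP sharding).
# The /path/to/... dataset paths below are placeholders; even with 4-bit
# quantization, a model of this size is expected to need a multi-GPU
# (typically multi-node) cluster.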
model:
  model_name: "meta-llama/Llama-3.1-405B-Instruct"
  model_max_length: 4096
  torch_dtype_str: "bfloat16"
  attn_implementation: "sdpa"
  load_pretrained_weights: True
  trust_remote_code: True
  tokenizer_pad_token: "<|finetune_right_pad_id|>"
  enable_liger_kernel: True
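
# Training and validation data: chat-style "messages" records read from
# JSONL files. The completions-only collator masks prompt tokens so that
# loss is computed only on assistant responses.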
data:
  train:
    datasets:
      - dataset_name: "text_sft_jsonl"
        dataset_path: "/path/to/training/dataset.jsonl"
        shuffle: True
        seed: 42
    collator_name: "text_completions_only_with_padding"
    target_col: "messages"
    use_async_dataset: True
    seed: 42
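
  # Held-out split, evaluated every `eval_steps` (see the training section).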
  validation:
    datasets:
      - dataset_name: "text_sft_jsonl"
        dataset_path: "/path/to/validation/dataset.jsonl"
    collator_name: "text_completions_only_with_padding"
    target_col: "messages"
    use_async_dataset: True
    seed: 42
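
# Trainer settings: TRL's SFT trainer with PEFT enabled, so only the LoRA
# adapters (configured under `peft:` below) receive gradient updates.
# Effective batch size = per_device_train_batch_size (2)
#   x gradient_accumulation_steps (1) x number of GPUs.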
training:
  trainer_type: "TRL_SFT"
  use_peft: True
  save_steps: 500
  num_train_epochs: 1
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 1
  eval_strategy: "steps"
  eval_steps: 500
  per_device_eval_batch_size: 1
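
  # Memory and optimizer settings. Non-reentrant gradient checkpointing
  # recomputes activations during the backward pass to cut activation
  # memory, and the fused AdamW kernel reduces per-step optimizer overhead.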
  enable_gradient_checkpointing: True
  gradient_checkpointing_kwargs:
    use_reentrant: False
  ddp_find_unused_parameters: False
  optimizer: "adamw_torch_fused"
  learning_rate: 1.0e-04
  warmup_ratio: 0.1
  weight_decay: 0.01
  max_grad_norm: 10
  compile: False
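
  # Dataloader tuning; "auto" lets the framework pick the worker count
  # (typically based on the number of available CPUs).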
  dataloader_num_workers: "auto"
  dataloader_prefetch_factor: 32
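
  # Logging and checkpoint output. `empty_device_cache_steps: 1` empties
  # the accelerator cache every step, trading a little speed for lower
  # peak memory on a model this large.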
  logging_steps: 50
  log_model_summary: False
  empty_device_cache_steps: 1
  include_performance_metrics: True
  output_dir: "output/llama405b.qlora"
  enable_wandb: True
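
# QLoRA: the base model is loaded as frozen 4-bit NF4 weights, with
# trainable LoRA adapters on top.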
peft:
  q_lora: True
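
  # 4-bit quantization (bitsandbytes) settings.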
  bnb_4bit_quant_type: "nf4"
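
  # Storing the packed 4-bit weights as bfloat16 keeps them in the same
  # dtype as the rest of the model, which FSDP needs in order to flatten
  # parameters uniformly.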
  bnb_4bit_quant_storage: "bfloat16"
  bnb_4bit_compute_dtype: "bfloat16"
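
  # LoRA adapter hyperparameters: rank 16 with alpha 32 gives a scaling
  # factor of alpha / r = 2. Adapters are attached to every attention and
  # MLP projection in each decoder layer.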
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.0
  lora_target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
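
# Fully Sharded Data Parallel: shards parameters, gradients, and optimizer
# state across GPUs, wrapping one FSDP unit per LlamaDecoderLayer.
# `cpu_offload: True` additionally offloads sharded state to host memory.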
fsdp:
  enable_fsdp: True
  forward_prefetch: True
  backward_prefetch: "BACKWARD_POST"
  use_orig_params: True
  cpu_offload: True
  auto_wrap_policy: "TRANSFORMER_BASED_WRAP"
  transformer_layer_cls: "LlamaDecoderLayer"