data:
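  # ~30k SusGen instruction-tuning samples rendered with the llama3_formal
  # prompt template; no separate validation file, so 0.5% of the training set
  # is (presumably) held out via val_split_ratio.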
  prompt: llama3_formal
  train: ../data/susgen/FINAL/PER_3500/FINAL_PER3500_30k.json
  val: null
  val_split_ratio: 0.005
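# Runtime settings. instruct_mask: true presumably masks prompt/instruction
# tokens out of the loss so that only response tokens are supervised.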
device: cuda
instruct_mask: true
local_rank: 0
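# Base model and parameter-efficient fine-tuning setup: Llama-3-8B-Instruct
# loaded in 4-bit (quantization: int4) with LoRA adapters on top, i.e. a
# QLoRA-style configuration.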
model:
  acceleration: null
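  # bitsandbytes 4-bit settings: NF4 quantization with double (nested)
  # quantization and bfloat16 compute, matching the standard QLoRA recipe.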
  int4_config:
    bnb_4bit_compute_dtype: bfloat16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
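  # 8-bit alternative; inactive here because quantization is set to int4 below.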
  int8_config:
    load_in_4bit: false
    load_in_8bit: true
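  # LoRA hyperparameters (PEFT LoraConfig fields): rank r=16 with lora_alpha=32
  # gives an adapter scaling factor of alpha/r = 2; adapters are attached to
  # every attention and MLP projection as well as lm_head.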
  lora:
    bias: none
    inference_mode: false
    lora_alpha: 32
    lora_dropout: 0.1
    r: 16
    target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
    - lm_head
    task_type: CAUSAL_LM
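  # lora_path: false presumably means no existing adapter checkpoint is loaded;
  # training starts from freshly initialized LoRA weights.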
  lora_path: false
  model_path: ../ckpts/Meta-Llama-3-8B-Instruct
  quantization: int4
  seed: 2024
  show_config: false
  use_lora: true
name: 30k-Llama3-8B-Instruct
output_dir: ../results/
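# Tokenizer mirrors the base model checkpoint. Left padding is the usual
# choice for batched decoder-only models; truncation_side: right keeps the
# beginning of over-length (>2048-token) sequences.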
tokenizer:
  add_bos_token: true
  add_eos_token: false
  add_prefix_space: false
  encode:
    max_length: 2048
    return_tensors: pt
    truncation: true
  model_max_length: 2048
  padding_side: left
  pretrained_model_name_or_path: ../ckpts/Meta-Llama-3-8B-Instruct
  truncation_side: right
  use_fast: true
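# NewTrainer is presumably a project-specific subclass of the Hugging Face
# Trainer.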
trainer: NewTrainer
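# Hugging Face TrainingArguments. Effective batch size is 16 (per device) x 16
# (gradient accumulation) = 256 sequences per optimizer step per GPU, trained
# in bfloat16 under DeepSpeed ZeRO stage 2 with a cosine schedule and 100
# warmup steps.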
training:
  bf16: true
  deepspeed: ./configs/ds_configs/ds_config_stage_2.json
  gradient_accumulation_steps: 16
  learning_rate: 2.0e-05
  logging_steps: 1
  lr_scheduler_type: cosine
  num_train_epochs: 3
  optim: paged_adamw_32bit
  per_device_train_batch_size: 16
  remove_unused_columns: false
  report_to: wandb
  resume_from_checkpoint: true
  save_steps: 20
  save_strategy: steps
  warmup_steps: 100
  weight_decay: 0.01
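# Checkpointing: save_strategy: steps with save_steps: 20 writes a checkpoint
# every 20 optimizer steps, and resume_from_checkpoint: true presumably resumes
# from the latest checkpoint under output_dir when present.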