QLoRA adapter for athirdpath/BigMistral-13b, trained with Axolotl using the config below (launched with, e.g., `accelerate launch -m axolotl.cli.train config.yml`):

```yaml
base_model: athirdpath/BigMistral-13b
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true

load_in_4bit: true

datasets:
  - path: glueLORA2.jsonl
    type: alpaca
val_set_size: 0.07

adapter: qlora
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

lora_r: 512
lora_alpha: 32
lora_dropout: 0.04
lora_target_linear: true

gradient_accumulation_steps: 6
micro_batch_size: 3
eval_batch_size: 3
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.00005

bf16: true
gradient_checkpointing: true
flash_attention: true

warmup_steps: 10
weight_decay: 0.00001
```
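
For reference, a minimal inference sketch, not from the original card. The adapter repo id (`athirdpath/BigMistral-13b-GLUE_LORA`) is this page's own id; the 4-bit and bf16 settings mirror the training config above, and the Alpaca-style prompt with a GLUE-like instruction (implied by `type: alpaca` and the dataset filename) is an illustrative assumption:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "athirdpath/BigMistral-13b"
adapter_id = "athirdpath/BigMistral-13b-GLUE_LORA"  # this adapter repo

# Load the base model in 4-bit, matching load_in_4bit: true / bf16: true above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb_config, device_map="auto"
)

# Apply the LoRA adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, adapter_id)

# Alpaca-style prompt; the GLUE-flavored instruction is a hypothetical example.
prompt = (
    "Below is an instruction that describes a task.\n\n"
    "### Instruction:\nIs the following sentence grammatically acceptable? "
    "'The cat sat on the mat.'\n\n### Response:\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

Note the effective batch size implied by the config: micro_batch_size 3 × gradient_accumulation_steps 6 = 18 sequences per optimizer step per device.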
