Added comments
Training-Configs/AxolotlConfig.yml
CHANGED
@@ -1,18 +1,18 @@
-base_model: Crystalcareai/Qwen-1.5-8x7B
-model_type: Qwen2ForCausalLM
-tokenizer_type: Qwen2Tokenizer
+base_model: Crystalcareai/Qwen-1.5-8x7B # this is the raw (randomly gated) model straight out of mergekit. Change this to "Crystalcareai/Qwen1.5-8x7b" to train the SFT'd model.
+model_type: Qwen2ForCausalLM # don't use the HF auto config
+tokenizer_type: Qwen2Tokenizer # don't use the HF auto config
 trust_remote_code: true
 
 
 load_in_8bit: false
-load_in_4bit: true
+load_in_4bit: true # Mixtral-style models still chug VRAM in axolotl, so QLoRA is required at the moment.
 strict: false
 
 
 datasets:
   - path: Crystalcareai/MoD
     type: sharegpt
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path: last_run_prepared # preprocess your dataset to save VRAM: "python -m axolotl.cli.preprocess examples/Qwen/YOURCONFIG.yml"
 val_set_size: 0.0
 output_dir: ./qlora-out
 
@@ -23,8 +23,6 @@ model_config:
 
 adapter: qlora
 lora_model_dir:
-
-
 sequence_len: 32768
 sample_packing: true
 pad_to_sequence_len: true
@@ -42,7 +40,7 @@ micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
-learning_rate: 0.0002
+learning_rate: 0.0002 # anything from 2e-4 to 5e-4 is acceptable
 
 
 train_on_inputs: false
@@ -53,7 +51,7 @@ tf32: false
 
 
 gradient_checkpointing: true
-early_stopping_patience:
+early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
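For reference, a minimal run sketch for this config, assuming a stock Axolotl checkout: the preprocess line is the one quoted in the dataset_prepared_path comment above, running axolotl.cli.train through accelerate is Axolotl's standard training entry point, and YOURCONFIG.yml is the same placeholder used in that comment.

# tokenize and pack the dataset once up front, so the training run itself needs less VRAM
python -m axolotl.cli.preprocess examples/Qwen/YOURCONFIG.yml

# then launch QLoRA training against the same config
accelerate launch -m axolotl.cli.train examples/Qwen/YOURCONFIG.yml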