error577 committed
Commit 6966040 · verified · 1 Parent(s): 0c1541a

End of training

README.md CHANGED
@@ -17,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
-adapter: qlora
+adapter: lora
 base_model: katuni4ka/tiny-random-qwen1.5-moe
 bf16: auto
 chat_template: llama3
@@ -41,7 +41,7 @@ early_stopping_patience: null
 eval_max_new_tokens: 128
 eval_table_size: null
 evals_per_epoch: 1
-flash_attention: false
+flash_attention: true
 fp16: null
 fsdp: null
 fsdp_config: null
@@ -53,7 +53,7 @@ hub_repo: null
 hub_strategy: end
 hub_token: null
 learning_rate: 0.0001
-load_in_4bit: true
+load_in_4bit: false
 load_in_8bit: false
 local_rank: null
 logging_steps: 1
@@ -64,11 +64,11 @@ lora_model_dir: null
 lora_r: 32
 lora_target_linear: true
 lr_scheduler: cosine
-max_steps: 100
+max_steps: 1000
 micro_batch_size: 1
 mlflow_experiment_name: /tmp/2b8375abdf26554a_train_data.json
 model_type: AutoModelForCausalLM
-num_epochs: 4
+num_epochs: 8
 optimizer: adamw_bnb_8bit
 output_dir: miner_id_24
 pad_to_sequence_len: true
@@ -76,7 +76,7 @@ resume_from_checkpoint: null
 s2_attention: null
 sample_packing: false
 saves_per_epoch: 1
-sequence_len: 512
+sequence_len: 2048
 strict: false
 tf32: false
 tokenizer_type: AutoTokenizer
@@ -101,7 +101,7 @@ xformers_attention: null
 
 This model is a fine-tuned version of [katuni4ka/tiny-random-qwen1.5-moe](https://huggingface.co/katuni4ka/tiny-random-qwen1.5-moe) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 11.9239
+- Loss: 11.8547
 
 ## Model description
 
@@ -129,17 +129,21 @@ The following hyperparameters were used during training:
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- training_steps: 100
+- training_steps: 1000
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 11.9457 | 0.0008 | 1 | 11.9393 |
-| 11.9385 | 0.0212 | 25 | 11.9341 |
-| 11.9232 | 0.0424 | 50 | 11.9279 |
-| 11.9167 | 0.0636 | 75 | 11.9244 |
-| 11.9557 | 0.0848 | 100 | 11.9239 |
+| 11.9466 | 0.0008 | 1 | 11.9395 |
+| 11.9117 | 0.1060 | 125 | 11.8985 |
+| 11.8802 | 0.2121 | 250 | 11.8872 |
+| 11.8782 | 0.3181 | 375 | 11.8751 |
+| 11.8794 | 0.4242 | 500 | 11.8600 |
+| 11.8716 | 0.5302 | 625 | 11.8564 |
+| 11.8596 | 0.6363 | 750 | 11.8553 |
+| 11.8706 | 0.7423 | 875 | 11.8548 |
+| 11.8513 | 0.8484 | 1000 | 11.8547 |
 
 
 ### Framework versions
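After this change the adapter is plain LoRA (`adapter: lora`, `load_in_4bit: false`), so the checkpoint attaches to the unquantized base model. The snippet below is a minimal sketch of how loading might look with `transformers` and `peft`; the adapter repo id is a placeholder, since this commit does not name the repository.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "katuni4ka/tiny-random-qwen1.5-moe"
adapter_id = "error577/<adapter-repo>"  # placeholder: this commit does not state the repo id

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype="auto")  # no 4-bit config needed after this change
model = PeftModel.from_pretrained(base, adapter_id)  # attaches adapter_model.safetensors on top of the base
model.eval()
```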
adapter_config.json CHANGED
@@ -20,15 +20,15 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
-    "shared_expert_gate",
     "o_proj",
-    "up_proj",
-    "down_proj",
-    "k_proj",
     "gate",
-    "q_proj"
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "shared_expert_gate",
+    "gate_proj",
+    "q_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87fa5161451e91cdc546edb9eb904dff6858c39afba74b98bacef94af029191a
+oid sha256:2f12be5ef04a053c67b95380acd55b89af25af55306be8bdf60190bfbe142221
 size 1265130
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9c5e03ed8e28bbaac8b3a4213afadd5528ddf7b77685ee80865719d96ac0939c
+oid sha256:e172d6fea7af81f45f881ef2a1d2fd5983609af1043d4bdd89ebb406fba4d06d
 size 1204200
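Both adapter files are stored as Git LFS pointers (`version`, `oid sha256:`, `size`). Once the real artifacts are downloaded, the pointer's oid can be checked locally; the snippet below is a small self-contained example, and the local file path is an assumption.

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex SHA-256 digest."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid from the new adapter_model.safetensors pointer above
expected = "e172d6fea7af81f45f881ef2a1d2fd5983609af1043d4bdd89ebb406fba4d06d"
assert sha256_of("adapter_model.safetensors") == expected
```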
config.json CHANGED
@@ -11,7 +11,7 @@
   "hidden_size": 32,
   "initializer_range": 0.02,
   "intermediate_size": 22,
-  "max_position_embeddings": 512,
+  "max_position_embeddings": 2048,
   "max_window_layers": 2,
   "mlp_only_layers": [],
   "model_type": "qwen2_moe",
@@ -23,21 +23,6 @@
   "num_hidden_layers": 4,
   "num_key_value_heads": 2,
   "output_router_logits": false,
-  "quantization_config": {
-    "_load_in_4bit": true,
-    "_load_in_8bit": false,
-    "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "float32",
-    "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
-    "llm_int8_enable_fp32_cpu_offload": false,
-    "llm_int8_has_fp16_weight": false,
-    "llm_int8_skip_modules": null,
-    "llm_int8_threshold": 6.0,
-    "load_in_4bit": true,
-    "load_in_8bit": false,
-    "quant_method": "bitsandbytes"
-  },
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000.0,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7b7b99c2a3eeb003823b295ecde38550d0c43d5d370de60127d11355c6dfa8ad
+oid sha256:4a1a4bf91134d51cf10b3455a9337aef717da260b938001879d17eecb31311f6
 size 6776
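`training_args.bin` is typically the pickled `TrainingArguments` object that the `transformers` Trainer saves next to a checkpoint. Assuming that holds here, it can be inspected as below; `weights_only=False` is required on recent PyTorch because the file is a pickled Python object, so only load files you trust.

```python
import torch

# Assumption: training_args.bin is a pickled TrainingArguments saved by Trainer.
args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)           # expected: TrainingArguments
print(args.learning_rate, args.max_steps)
```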