End of training
Changed files:
- README.md +17 -13
- adapter_config.json +7 -7
- adapter_model.bin +1 -1
- adapter_model.safetensors +1 -1
- config.json +1 -16
- training_args.bin +1 -1
README.md
CHANGED
@@ -17,7 +17,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
-adapter:
+adapter: lora
 base_model: katuni4ka/tiny-random-qwen1.5-moe
 bf16: auto
 chat_template: llama3
@@ -41,7 +41,7 @@ early_stopping_patience: null
 eval_max_new_tokens: 128
 eval_table_size: null
 evals_per_epoch: 1
-flash_attention:
+flash_attention: true
 fp16: null
 fsdp: null
 fsdp_config: null
@@ -53,7 +53,7 @@ hub_repo: null
 hub_strategy: end
 hub_token: null
 learning_rate: 0.0001
-load_in_4bit:
+load_in_4bit: false
 load_in_8bit: false
 local_rank: null
 logging_steps: 1
@@ -64,11 +64,11 @@ lora_model_dir: null
 lora_r: 32
 lora_target_linear: true
 lr_scheduler: cosine
-max_steps:
+max_steps: 1000
 micro_batch_size: 1
 mlflow_experiment_name: /tmp/2b8375abdf26554a_train_data.json
 model_type: AutoModelForCausalLM
-num_epochs:
+num_epochs: 8
 optimizer: adamw_bnb_8bit
 output_dir: miner_id_24
 pad_to_sequence_len: true
@@ -76,7 +76,7 @@ resume_from_checkpoint: null
 s2_attention: null
 sample_packing: false
 saves_per_epoch: 1
-sequence_len:
+sequence_len: 2048
 strict: false
 tf32: false
 tokenizer_type: AutoTokenizer
@@ -101,7 +101,7 @@ xformers_attention: null
 
 This model is a fine-tuned version of [katuni4ka/tiny-random-qwen1.5-moe](https://huggingface.co/katuni4ka/tiny-random-qwen1.5-moe) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 11.
+- Loss: 11.8547
 
 ## Model description
 
@@ -129,17 +129,21 @@ The following hyperparameters were used during training:
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- training_steps:
+- training_steps: 1000
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-| 11.
-| 11.
-| 11.
-| 11.
-| 11.
+| 11.9466 | 0.0008 | 1 | 11.9395 |
+| 11.9117 | 0.1060 | 125 | 11.8985 |
+| 11.8802 | 0.2121 | 250 | 11.8872 |
+| 11.8782 | 0.3181 | 375 | 11.8751 |
+| 11.8794 | 0.4242 | 500 | 11.8600 |
+| 11.8716 | 0.5302 | 625 | 11.8564 |
+| 11.8596 | 0.6363 | 750 | 11.8553 |
+| 11.8706 | 0.7423 | 875 | 11.8548 |
+| 11.8513 | 0.8484 | 1000 | 11.8547 |
 
 
 ### Framework versions
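The README above documents a LoRA adapter trained with axolotl 0.4.1 on top of katuni4ka/tiny-random-qwen1.5-moe. A minimal sketch of how such an adapter is typically loaded for inference; the adapter repo id below is a hypothetical placeholder, not confirmed by this commit:

```python
# Sketch: load the base model, then attach the LoRA adapter with PEFT.
# "your-username/miner_id_24" is a hypothetical repo id for illustration.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "katuni4ka/tiny-random-qwen1.5-moe"
base = AutoModelForCausalLM.from_pretrained(base_id)
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = PeftModel.from_pretrained(base, "your-username/miner_id_24")
```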
adapter_config.json
CHANGED
@@ -20,15 +20,15 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
-    "shared_expert_gate",
     "o_proj",
-    "up_proj",
-    "down_proj",
-    "k_proj",
     "gate",
-    "q_proj"
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "shared_expert_gate",
+    "gate_proj",
+    "q_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
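The target_modules change above is only a reordering; PEFT treats target_modules as an unordered set of module-name suffixes, so both versions target the same layers. A minimal sketch of an equivalent LoraConfig, using lora_r from the README and the module list from this file:

```python
from peft import LoraConfig

# The order of target_modules is irrelevant; PEFT matches them as a set.
lora_config = LoraConfig(
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate",
                    "gate_proj", "up_proj", "down_proj", "shared_expert_gate"],
    task_type="CAUSAL_LM",
)
```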
adapter_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2f12be5ef04a053c67b95380acd55b89af25af55306be8bdf60190bfbe142221
 size 1265130
adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e172d6fea7af81f45f881ef2a1d2fd5983609af1043d4bdd89ebb406fba4d06d
 size 1204200
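adapter_model.bin and adapter_model.safetensors are stored with Git LFS, so the diff shows only the pointer file: the oid is the SHA-256 of the file contents and the size is its byte count. Since both sizes are unchanged, this commit replaced the weights in place. A downloaded copy can be checked against the pointer like this (a sketch; the local path is assumed):

```python
import hashlib

def lfs_oid(path: str) -> str:
    # SHA-256 of the file contents, as recorded in a Git LFS pointer.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

# Compare against the oid in the new pointer above.
assert lfs_oid("adapter_model.safetensors") == (
    "e172d6fea7af81f45f881ef2a1d2fd5983609af1043d4bdd89ebb406fba4d06d"
)
```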
config.json
CHANGED
@@ -11,7 +11,7 @@
   "hidden_size": 32,
   "initializer_range": 0.02,
   "intermediate_size": 22,
-  "max_position_embeddings":
+  "max_position_embeddings": 2048,
   "max_window_layers": 2,
   "mlp_only_layers": [],
   "model_type": "qwen2_moe",
@@ -23,21 +23,6 @@
   "num_hidden_layers": 4,
   "num_key_value_heads": 2,
   "output_router_logits": false,
-  "quantization_config": {
-    "_load_in_4bit": true,
-    "_load_in_8bit": false,
-    "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "float32",
-    "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
-    "llm_int8_enable_fp32_cpu_offload": false,
-    "llm_int8_has_fp16_weight": false,
-    "llm_int8_skip_modules": null,
-    "llm_int8_threshold": 6.0,
-    "load_in_4bit": true,
-    "load_in_8bit": false,
-    "quant_method": "bitsandbytes"
-  },
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 1000000.0,
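Dropping the quantization_config block means the saved config no longer forces bitsandbytes 4-bit loading; the same behavior can still be requested at load time. A sketch of the equivalent settings via transformers' BitsAndBytesConfig, mirroring the removed block rather than quoting code from this repo:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirrors the removed quantization_config: 4-bit NF4 with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "katuni4ka/tiny-random-qwen1.5-moe",
    quantization_config=bnb_config,
)
```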
training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:4a1a4bf91134d51cf10b3455a9337aef717da260b938001879d17eecb31311f6
 size 6776
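training_args.bin is the TrainingArguments object pickled by the transformers Trainer, so it is inspected rather than diffed. A sketch of reading a local copy; weights_only=False is required on recent PyTorch because the file is a pickle, so only load files you trust:

```python
import torch

# Loads the pickled TrainingArguments saved alongside the checkpoint.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.lr_scheduler_type, args.max_steps)
```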