elichen3051 committed on
Commit 324ec26 · verified · 1 Parent(s): 9a3ba0e

Model save

Files changed (2)
  1. README.md +188 -0
  2. generation_config.json +9 -0
README.md ADDED
@@ -0,0 +1,188 @@
---
library_name: transformers
license: llama3.1
base_model: meta-llama/Llama-3.1-8B
tags:
- axolotl
- generated_from_trainer
model-index:
- name: Llama3.1-8B-v0.1-dolma-skymizer-method-0.6
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.5.2`
```yaml
base_model: meta-llama/Llama-3.1-8B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
tokenizer_use_fast: false
resize_token_embeddings_to_32x: false

flash_attention: true
xformers_attention:

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: skymizer/Llama3.1-base-tokenized-dolma-v1_7-50B
    train_on_split: train
    type: completion

test_datasets:
  - path: skymizer/Llama3.1-tokenized-dolma-v1_7-test
    split: test
    type: completion

is_preprocess: true
skip_prepare_dataset: true

dataset_prepared_path: /mnt/home/model-team/datasets/pretokenized/Llama3.1-8B-base-tokenized-dolma-v1_7_50B-4096

hf_use_auth_token: true
output_dir: /mnt/home/model-team/models/Llama3.1-8B-v0.1-STE-0.6
resume_from_checkpoint:
auto_resume_from_checkpoints: true

sequence_len: 4096
sample_packing: true
sample_packing_group_size: 100000
sample_packing_bin_size: 200
pad_to_sequence_len: true

eval_sample_packing: false
# eval_causal_lm_metrics: ["perplexity"]

wandb_project: "sparse-tuning-cpt"
wandb_entity:
wandb_watch:
wandb_name: "Llama3.1-8B-v0.1-dolma-STE-0.6"
wandb_log_model:

# global batch size = 2 * 8 * 8 GPUs * 8 Nodes * 4096 = 4M
gradient_accumulation_steps: 8
micro_batch_size: 2
eval_batch_size: 1
max_steps: 10000
optimizer: adamw_torch
learning_rate: 0.00005
lr_scheduler: cosine
cosine_min_lr_ratio: 0.2
weight_decay: 0.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 0.000001
max_grad_norm: 1.0

train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: false

hub_model_id: "skymizer/Llama3.1-8B-v0.1-dolma-skymizer-method-0.6"

save_strategy: "steps"
save_steps: 500

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1

warmup_steps: 375
eval_steps: 500
eval_table_size:
debug:
deepspeed: /root/train/axolotl/deepspeed_configs/zero3_bf16.json
fsdp:
fsdp_config:
seed: 42

special_tokens:
  pad_token: "<|end_of_text|>"

```

</details><br>

# Llama3.1-8B-v0.1-dolma-skymizer-method-0.6

This model is a fine-tuned (continued-pretraining) version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the `skymizer/Llama3.1-base-tokenized-dolma-v1_7-50B` dataset (pre-tokenized Dolma v1.7; see the axolotl config above).
It achieves the following results on the evaluation set:
- Loss: 2.3883
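
As a base-model (completion-style) checkpoint, it can be used directly with `transformers`. The snippet below is a minimal usage sketch only: the repository id is taken from the `hub_model_id` in the axolotl config above, and a bf16-capable GPU is assumed.

```python
# Minimal usage sketch: load the checkpoint published under hub_model_id above
# and continue a plain-text prompt (base model, no chat template).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "skymizer/Llama3.1-8B-v0.1-dolma-skymizer-method-0.6"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # training ran in bf16
    device_map="auto",
)

inputs = tokenizer("Dolma is a pretraining corpus that", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```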

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

Training used the pre-tokenized `skymizer/Llama3.1-base-tokenized-dolma-v1_7-50B` dataset (`train` split, completion format), and evaluation used `skymizer/Llama3.1-tokenized-dolma-v1_7-test` (`test` split), as listed in the axolotl config above.

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (the effective global batch size is worked out in the sketch after this list):
- learning_rate: 5e-05
- train_batch_size: 2
- eval_batch_size: 1
- seed: 42
- distributed_type: multi-GPU
- num_devices: 64
- gradient_accumulation_steps: 8
- total_train_batch_size: 1024
- total_eval_batch_size: 64
- optimizer: AdamW (`adamw_torch`) with betas=(0.9, 0.95) and epsilon=1e-08; no additional optimizer arguments
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 375
- training_steps: 10000
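
The batch-size figures above follow directly from the axolotl config: the per-device micro-batch, gradient accumulation, and the 64 devices (8 GPUs × 8 nodes) multiply out to 1024 sequences per optimizer step, or roughly 4M tokens at `sequence_len: 4096`. A quick arithmetic check:

```python
# Sanity check of the effective batch size reported above (values from the config).
micro_batch_size = 2      # train_batch_size per device
grad_accum_steps = 8      # gradient_accumulation_steps
num_devices = 64          # 8 GPUs x 8 nodes
sequence_len = 4096       # tokens per packed sample

total_train_batch_size = micro_batch_size * grad_accum_steps * num_devices
tokens_per_step = total_train_batch_size * sequence_len

print(total_train_batch_size)  # 1024 -> matches total_train_batch_size
print(tokens_per_step)         # 4194304 -> ~4M tokens, matching the config comment
```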

### Training results

| Training Loss | Epoch  | Step  | Validation Loss |
|:-------------:|:------:|:-----:|:---------------:|
| 2.2837        | 0.0001 | 1     | 2.5425          |
| 2.2557        | 0.0414 | 500   | 2.4568          |
| 2.2641        | 0.0829 | 1000  | 2.4520          |
| 2.2207        | 0.1243 | 1500  | 2.4477          |
| 2.3003        | 0.1657 | 2000  | 2.4432          |
| 2.2382        | 0.2072 | 2500  | 2.4388          |
| 2.2339        | 0.2486 | 3000  | 2.4349          |
| 2.2517        | 0.2901 | 3500  | 2.4303          |
| 2.2483        | 0.3315 | 4000  | 2.4246          |
| 2.2067        | 0.3729 | 4500  | 2.4207          |
| 2.2485        | 0.4144 | 5000  | 2.4163          |
| 2.2541        | 0.4558 | 5500  | 2.4123          |
| 2.2192        | 0.4972 | 6000  | 2.4084          |
| 2.2346        | 0.5387 | 6500  | 2.4041          |
| 2.2106        | 0.5801 | 7000  | 2.4010          |
| 2.2112        | 0.6215 | 7500  | 2.3982          |
| 2.2215        | 0.6630 | 8000  | 2.3951          |
| 2.2118        | 0.7044 | 8500  | 2.3924          |
| 2.1933        | 0.7458 | 9000  | 2.3905          |
| 2.1813        | 0.7873 | 9500  | 2.3893          |
| 2.1969        | 0.8287 | 10000 | 2.3883          |


### Framework versions

- Transformers 4.46.3
- Pytorch 2.5.1+cu124
- Datasets 3.1.0
- Tokenizers 0.20.3
generation_config.json ADDED
@@ -0,0 +1,9 @@
{
  "_from_model_config": true,
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": 128001,
  "temperature": 0.6,
  "top_p": 0.9,
  "transformers_version": "4.46.3"
}
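
These defaults (nucleus sampling with temperature 0.6 and top-p 0.9, plus the Llama 3.1 BOS/EOS ids) are what `generate()` picks up automatically when no overrides are passed. A minimal sketch, assuming the same repository id as in the model card above:

```python
# Minimal sketch: inspect the saved generation defaults and override them per call.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

repo_id = "skymizer/Llama3.1-8B-v0.1-dolma-skymizer-method-0.6"  # from the card above

# The generation_config.json shown above is what GenerationConfig.from_pretrained() reads.
gen_config = GenerationConfig.from_pretrained(repo_id)
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p)  # True 0.6 0.9

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto")

inputs = tokenizer("Continued pretraining on Dolma", return_tensors="pt").to(model.device)

# With no overrides, generate() samples using the stored defaults ...
sampled = model.generate(**inputs, max_new_tokens=32)
# ... while keyword arguments take precedence, e.g. greedy decoding:
greedy = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(greedy[0], skip_special_tokens=True))
```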