chitanda committed
Commit 2214470 · verified · 1 Parent(s): 218ebc0

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-100/config.json ADDED
@@ -0,0 +1,28 @@
+{
+  "_name_or_path": "experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/",
+  "architectures": [
+    "GemmaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "head_dim": 256,
+  "hidden_act": "gelu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "max_position_embeddings": 8192,
+  "model_type": "gemma",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 1,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "vocab_size": 256000
+}
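
The config above is a stock 2B Gemma decoder (18 layers, hidden size 2048, multi-query attention with a single KV head, 256k vocabulary), fine-tuned from the listed SFT checkpoint, so it loads with plain transformers. A minimal loading sketch, assuming the repo has been downloaded so that checkpoint-100/ is a local folder:

```python
# Minimal loading sketch; requires transformers >= 4.38 (see transformers_version).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "checkpoint-100"  # local path to this checkpoint folder
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
)
print(model.config.num_hidden_layers, model.config.hidden_size)  # 18 2048
```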
checkpoint-100/generation_config.json ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.38.2"
+}
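
generation_config.json only pins the special-token ids used at generation time; it can also be inspected on its own, e.g.:

```python
from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("checkpoint-100")
print(gen.bos_token_id, gen.eos_token_id, gen.pad_token_id)  # 2 1 0
```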
checkpoint-100/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aedc40cb3a34638de2c2cb8109771e907dfe679c95f65e5b5fdb30aa3869fe7d
+size 5012367854
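
What the diff stores here is a Git LFS pointer, not the ~5 GB weight file itself: `oid` is the SHA-256 of the actual payload and `size` its length in bytes. A short sketch for verifying a downloaded copy against the pointer (local path assumed):

```python
# Check a downloaded LFS object against the pointer's oid (streamed to limit memory).
import hashlib

expected = "aedc40cb3a34638de2c2cb8109771e907dfe679c95f65e5b5fdb30aa3869fe7d"

h = hashlib.sha256()
with open("checkpoint-100/pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # 1 MiB chunks
        h.update(chunk)

assert h.hexdigest() == expected, "checksum mismatch: incomplete or corrupted download"
```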
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+{
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
+size 17477929
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+size 4241003
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<start_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<end_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": "<bos>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "legacy": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
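
The `chat_template` above is the usual Gemma turn format: strictly alternating user/model turns wrapped in `<start_of_turn>`/`<end_of_turn>`, the assistant role renamed to `model`, and no system role allowed. With the tokenizer loaded as in the earlier sketch, transformers renders it via `apply_chat_template`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("checkpoint-100")
messages = [{"role": "user", "content": "What is 2 + 2?"}]

# add_generation_prompt=True appends the trailing "<start_of_turn>model\n".
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(text)
# <start_of_turn>user
# What is 2 + 2?<end_of_turn>
# <start_of_turn>model
```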
checkpoint-100/training_config.yaml ADDED
@@ -0,0 +1,164 @@
+ds_cfg:
+  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
+  gradient_accumulation_steps: ${gradient_accumulation_steps}
+  scheduler:
+    type: WarmupDecayLR
+    params:
+      total_num_steps: 3671
+      warmup_max_lr: ${learning_rate}
+      warmup_num_steps: 220
+      warmup_type: linear
+  optimizer:
+    type: AdamW
+    params:
+      lr: ${learning_rate}
+      betas:
+      - 0.9
+      - 0.95
+      eps: 1.0e-06
+      weight_decay: ${weight_decay}
+  bf16:
+    enabled: true
+  zero_optimization:
+    stage: 1
+    stage3_param_persistence_threshold: 100000.0
+    stage3_max_live_parameters: 100000000.0
+    stage3_prefetch_bucket_size: 100000000.0
+    memory_efficient_linear: false
+  steps_per_print: 25
+  gradient_clipping: 1.0
+  prescale_gradients: false
+sft_model_dir: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
+train_file: ${sft_model_dir}/meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
+dev_file: null
+test_file: null
+torch_dtype:
+  _target_: general_util.training_utils.return_torch_dtype
+  dtype: bfloat16
+tokenizer_init:
+  _target_: general_util.tokenization_utils.init_tokenizer
+  tokenizer_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
+  padding_side: left
+device_map:
+  _target_: models.utils.return_single_device_map
+model:
+  _target_: models.gemma.GemmaForCausalLMDPO.from_pretrained_with_ref_model
+  beta: 0.5
+  gradient_checkpointing: false
+  attn_implementation: flash_attention_2
+  torch_dtype: ${torch_dtype}
+  device_map: ${device_map}
+ref_model:
+  _target_: models.gemma.GemmaForCausalLM.from_pretrained
+  pretrained_model_name_or_path: ${model_name_or_path}
+  torch_dtype: ${torch_dtype}
+  attn_implementation: flash_attention_2
+  device_map: ${device_map}
+read_tensor:
+  _target_: data.logic_combine.MultiMappingDataset
+  aligner:
+    _target_: data.input_aligner.concat_aligner
+    aligners:
+    - _target_: data.input_aligner.dpo_pair_aligner_cleaned
+      response_field: response
+      id_field: id
+  do_sample: false
+  template:
+    chosen: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1: {pos}<eos>'
+    reject: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1: {neg}<eos>'
+    prompt: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1:'
+  instruction: 'Given a question, please decompose it into sub-questions. For each
+    sub-question, please answer it in a complete sentence, ending with "The answer
+    is". When the original question is answerable, please start the sub-question with
+    "Now we can answer the question: ".'
+  kv_mapping:
+    chosen: chosen
+    reject: reject
+    id: index
+    prompt: prompt
+dist_load_data_barrier: false
+extended_vocab: null
+collator:
+  _target_: data.dpo.DPOCollator
+  tokenizer: ${tokenizer_init}
+  max_seq_length: 1024
+num_workers: 8
+prefetch_factor: 2
+model_name_or_path: ${sft_model_dir}
+pretrain: null
+dp_size: 2
+tp_size: 1
+pp_size: 1
+exp_name: gemma.2b.it.meta_math_rap.dpo.H100.w2.v1.1.fix.s${seed}
+exp_notes: null
+output_dir: experiments/${exp_name}
+do_train: true
+evaluate_during_training: false
+do_eval: false
+eval_sub_path: checkpoint-100
+per_gpu_train_batch_size: 1
+per_gpu_eval_batch_size: 4
+learning_rate: 1.0e-06
+gradient_accumulation_steps: 32
+weight_decay: 0.1
+adam_epsilon: 1.0e-06
+adam_betas: (0.9, 0.98)
+total_dataset_len: 234960
+max_grad_norm: 1.0
+num_train_epochs: 1
+max_steps: 0
+warmup_proportion: 0.06
+warmup_steps: 0
+optimizer: null
+use_nvlamb: null
+bit_training: null
+logging_steps: 5
+save_ds_state: false
+save_steps: 100
+save_best: false
+eval_steps: 400
+ddp_eval: true
+no_cuda: false
+seed: 42
+local_rank: 0
+fp16: true
+fp16_opt_level: O1
+fp16_bfloat16: true
+prediction_cfg:
+  metric: loss
+  measure: -1
+  best_checkpoint: null
+  best_result: null
+eval_forward_fn:
+  _target_: general_util.evaluator.DefaultForwardFn
+post_process:
+  _target_: post_processors.dpo.DPOEvalPostProcessor
+summary_helper:
+  _target_: general_util.tensorboard_helper.WandbWriter
+  batch_index_or_keys: null
+  outputs_index_or_keys:
+    train/chosen_reward: chosen_reward
+    train/rejected_reward: rejected_reward
+n_gpu: 1
+device: cuda:0
+train_batch_size: 1
+eval_batch_size: null
+world_size: 2
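
The scheduler constants are consistent with the batch settings lower in the file: the effective batch is per_gpu_train_batch_size × gradient_accumulation_steps × world_size = 1 × 32 × 2 = 64 pairs, so one epoch over total_dataset_len = 234,960 gives the 3671 total steps, and warmup_proportion = 0.06 of that gives the 220 warmup steps. A quick sanity check:

```python
# Re-derive ds_cfg.scheduler.params from the trainer-level settings in this file.
per_gpu_train_batch_size = 1
gradient_accumulation_steps = 32
world_size = 2
total_dataset_len = 234960
warmup_proportion = 0.06

effective_batch = per_gpu_train_batch_size * gradient_accumulation_steps * world_size
total_num_steps = total_dataset_len // effective_batch       # 3671
warmup_num_steps = int(total_num_steps * warmup_proportion)  # 220
print(effective_batch, total_num_steps, warmup_num_steps)    # 64 3671 220
```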
checkpoint-200/config.json ADDED
@@ -0,0 +1,28 @@
+{
+  "_name_or_path": "experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/",
+  "architectures": [
+    "GemmaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "head_dim": 256,
+  "hidden_act": "gelu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "max_position_embeddings": 8192,
+  "model_type": "gemma",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 18,
+  "num_key_value_heads": 1,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "use_cache": true,
+  "vocab_size": 256000
+}
checkpoint-200/generation_config.json ADDED
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 2,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.38.2"
+}
checkpoint-200/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b8e2c8955695753ec7efbb5dee17112b12e2d2549f5ae4f03c843a815244dfb5
+size 5012367854
checkpoint-200/special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+{
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
checkpoint-200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05e97791a5e007260de1db7e1692e53150e08cea481e2bf25435553380c147ee
+size 17477929
checkpoint-200/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2
+size 4241003
checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<eos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<bos>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "106": {
+      "content": "<start_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "107": {
+      "content": "<end_of_turn>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<start_of_turn>",
+    "<end_of_turn>"
+  ],
+  "bos_token": "<bos>",
+  "chat_template": "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<eos>",
+  "legacy": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "GemmaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
checkpoint-200/training_config.yaml ADDED
@@ -0,0 +1,164 @@
+ds_cfg:
+  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
+  gradient_accumulation_steps: ${gradient_accumulation_steps}
+  scheduler:
+    type: WarmupDecayLR
+    params:
+      total_num_steps: 3671
+      warmup_max_lr: ${learning_rate}
+      warmup_num_steps: 220
+      warmup_type: linear
+  optimizer:
+    type: AdamW
+    params:
+      lr: ${learning_rate}
+      betas:
+      - 0.9
+      - 0.95
+      eps: 1.0e-06
+      weight_decay: ${weight_decay}
+  bf16:
+    enabled: true
+  zero_optimization:
+    stage: 1
+    stage3_param_persistence_threshold: 100000.0
+    stage3_max_live_parameters: 100000000.0
+    stage3_prefetch_bucket_size: 100000000.0
+    memory_efficient_linear: false
+  steps_per_print: 25
+  gradient_clipping: 1.0
+  prescale_gradients: false
+sft_model_dir: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
+train_file: ${sft_model_dir}/meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
+dev_file: null
+test_file: null
+torch_dtype:
+  _target_: general_util.training_utils.return_torch_dtype
+  dtype: bfloat16
+tokenizer_init:
+  _target_: general_util.tokenization_utils.init_tokenizer
+  tokenizer_path: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
+  padding_side: left
+device_map:
+  _target_: models.utils.return_single_device_map
+model:
+  _target_: models.gemma.GemmaForCausalLMDPO.from_pretrained_with_ref_model
+  beta: 0.5
+  gradient_checkpointing: false
+  attn_implementation: flash_attention_2
+  torch_dtype: ${torch_dtype}
+  device_map: ${device_map}
+ref_model:
+  _target_: models.gemma.GemmaForCausalLM.from_pretrained
+  pretrained_model_name_or_path: ${model_name_or_path}
+  torch_dtype: ${torch_dtype}
+  attn_implementation: flash_attention_2
+  device_map: ${device_map}
+read_tensor:
+  _target_: data.logic_combine.MultiMappingDataset
+  aligner:
+    _target_: data.input_aligner.concat_aligner
+    aligners:
+    - _target_: data.input_aligner.dpo_pair_aligner_cleaned
+      response_field: response
+      id_field: id
+  do_sample: false
+  template:
+    chosen: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1: {pos}<eos>'
+    reject: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1: {neg}<eos>'
+    prompt: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1:'
+  instruction: 'Given a question, please decompose it into sub-questions. For each
+    sub-question, please answer it in a complete sentence, ending with "The answer
+    is". When the original question is answerable, please start the sub-question with
+    "Now we can answer the question: ".'
+  kv_mapping:
+    chosen: chosen
+    reject: reject
+    id: index
+    prompt: prompt
+dist_load_data_barrier: false
+extended_vocab: null
+collator:
+  _target_: data.dpo.DPOCollator
+  tokenizer: ${tokenizer_init}
+  max_seq_length: 1024
+num_workers: 8
+prefetch_factor: 2
+model_name_or_path: ${sft_model_dir}
+pretrain: null
+dp_size: 2
+tp_size: 1
+pp_size: 1
+exp_name: gemma.2b.it.meta_math_rap.dpo.H100.w2.v1.1.fix.s${seed}
+exp_notes: null
+output_dir: experiments/${exp_name}
+do_train: true
+evaluate_during_training: false
+do_eval: false
+eval_sub_path: checkpoint-100
+per_gpu_train_batch_size: 1
+per_gpu_eval_batch_size: 4
+learning_rate: 1.0e-06
+gradient_accumulation_steps: 32
+weight_decay: 0.1
+adam_epsilon: 1.0e-06
+adam_betas: (0.9, 0.98)
+total_dataset_len: 234960
+max_grad_norm: 1.0
+num_train_epochs: 1
+max_steps: 0
+warmup_proportion: 0.06
+warmup_steps: 0
+optimizer: null
+use_nvlamb: null
+bit_training: null
+logging_steps: 5
+save_ds_state: false
+save_steps: 100
+save_best: false
+eval_steps: 400
+ddp_eval: true
+no_cuda: false
+seed: 42
+local_rank: 0
+fp16: true
+fp16_opt_level: O1
+fp16_bfloat16: true
+prediction_cfg:
+  metric: loss
+  measure: -1
+  best_checkpoint: null
+  best_result: null
+eval_forward_fn:
+  _target_: general_util.evaluator.DefaultForwardFn
+post_process:
+  _target_: post_processors.dpo.DPOEvalPostProcessor
+summary_helper:
+  _target_: general_util.tensorboard_helper.WandbWriter
+  batch_index_or_keys: null
+  outputs_index_or_keys:
+    train/chosen_reward: chosen_reward
+    train/rejected_reward: rejected_reward
+n_gpu: 1
+device: cuda:0
+train_batch_size: 1
+eval_batch_size: null
+world_size: 2
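
This is the same configuration saved again at step 200. The `model`/`ref_model` pair with `beta: 0.5` corresponds to the standard DPO objective; as a sketch (the exact implementation in models.gemma.GemmaForCausalLMDPO is not part of this commit), for a prompt x with chosen/rejected completions (y_w, y_l) built by the templates above:

```latex
\mathcal{L}_{\mathrm{DPO}}(\theta)
  = -\log \sigma\Big(
      \beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)}
    - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}
    \Big),
  \qquad \beta = 0.5,
```

where the frozen reference policy π_ref is the SFT checkpoint loaded as `ref_model`. The `train/chosen_reward` and `train/rejected_reward` keys forwarded to the WandbWriter are, in typical DPO implementations, the two β-scaled log-ratio terms inside the sigmoid.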
training_config.yaml ADDED
@@ -0,0 +1,164 @@
+ds_cfg:
+  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
+  gradient_accumulation_steps: ${gradient_accumulation_steps}
+  scheduler:
+    type: WarmupDecayLR
+    params:
+      total_num_steps: null
+      warmup_max_lr: ${learning_rate}
+      warmup_num_steps: null
+      warmup_type: linear
+  optimizer:
+    type: AdamW
+    params:
+      lr: ${learning_rate}
+      betas:
+      - 0.9
+      - 0.95
+      eps: 1.0e-06
+      weight_decay: ${weight_decay}
+  bf16:
+    enabled: true
+  zero_optimization:
+    stage: 1
+    stage3_param_persistence_threshold: 100000.0
+    stage3_max_live_parameters: 100000000.0
+    stage3_prefetch_bucket_size: 100000000.0
+    memory_efficient_linear: false
+  steps_per_print: 25
+  gradient_clipping: 1.0
+  prescale_gradients: false
+sft_model_dir: experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
+train_file: ${sft_model_dir}/meta_math_sub.25k.rap.train.0shot.n10.tem1.0.p0.7.v1.0_clean_fix.json
+dev_file: null
+test_file: null
+torch_dtype:
+  _target_: general_util.training_utils.return_torch_dtype
+  dtype: bfloat16
+tokenizer_init:
+  _target_: general_util.tokenization_utils.init_tokenizer
+  tokenizer_path: ${model_name_or_path}
+  padding_side: left
+device_map:
+  _target_: models.utils.return_single_device_map
+model:
+  _target_: models.gemma.GemmaForCausalLMDPO.from_pretrained_with_ref_model
+  beta: 0.5
+  gradient_checkpointing: false
+  attn_implementation: flash_attention_2
+  torch_dtype: ${torch_dtype}
+  device_map: ${device_map}
+ref_model:
+  _target_: models.gemma.GemmaForCausalLM.from_pretrained
+  pretrained_model_name_or_path: ${model_name_or_path}
+  torch_dtype: ${torch_dtype}
+  attn_implementation: flash_attention_2
+  device_map: ${device_map}
+read_tensor:
+  _target_: data.logic_combine.MultiMappingDataset
+  aligner:
+    _target_: data.input_aligner.concat_aligner
+    aligners:
+    - _target_: data.input_aligner.dpo_pair_aligner_cleaned
+      response_field: response
+      id_field: id
+  do_sample: false
+  template:
+    chosen: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1: {pos}<eos>'
+    reject: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1: {neg}<eos>'
+    prompt: '{instruction}
+
+
+      ### Question: {query}
+
+
+      SubQuestion 1:'
+  instruction: 'Given a question, please decompose it into sub-questions. For each
+    sub-question, please answer it in a complete sentence, ending with "The answer
+    is". When the original question is answerable, please start the sub-question with
+    "Now we can answer the question: ".'
+  kv_mapping:
+    chosen: chosen
+    reject: reject
+    id: index
+    prompt: prompt
+dist_load_data_barrier: false
+extended_vocab: null
+collator:
+  _target_: data.dpo.DPOCollator
+  tokenizer: ${tokenizer_init}
+  max_seq_length: 1024
+num_workers: 8
+prefetch_factor: 2
+model_name_or_path: ${sft_model_dir}
+pretrain: null
+dp_size: 2
+tp_size: 1
+pp_size: 1
+exp_name: gemma.2b.it.meta_math_rap.dpo.H100.w2.v1.1.fix.s${seed}
+exp_notes: null
+output_dir: experiments/${exp_name}
+do_train: true
+evaluate_during_training: false
+do_eval: false
+eval_sub_path: checkpoint-100
+per_gpu_train_batch_size: 1
+per_gpu_eval_batch_size: 4
+learning_rate: 1.0e-06
+gradient_accumulation_steps: 32
+weight_decay: 0.1
+adam_epsilon: 1.0e-06
+adam_betas: (0.9, 0.98)
+total_dataset_len: -1
+max_grad_norm: 1.0
+num_train_epochs: 1
+max_steps: 0
+warmup_proportion: 0.06
+warmup_steps: 0
+optimizer: null
+use_nvlamb: null
+bit_training: null
+logging_steps: 5
+save_ds_state: false
+save_steps: 100
+save_best: false
+eval_steps: 400
+ddp_eval: true
+no_cuda: false
+seed: 42
+local_rank: 0
+fp16: true
+fp16_opt_level: O1
+fp16_bfloat16: true
+prediction_cfg:
+  metric: loss
+  measure: -1
+  best_checkpoint: null
+  best_result: null
+eval_forward_fn:
+  _target_: general_util.evaluator.DefaultForwardFn
+post_process:
+  _target_: post_processors.dpo.DPOEvalPostProcessor
+summary_helper:
+  _target_: general_util.tensorboard_helper.WandbWriter
+  batch_index_or_keys: null
+  outputs_index_or_keys:
+    train/chosen_reward: chosen_reward
+    train/rejected_reward: rejected_reward
+n_gpu: 1
+device: cuda:0
+train_batch_size: null
+eval_batch_size: null
+world_size: 2
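
Unlike the per-checkpoint copies, this root-level config still has `total_num_steps`, `warmup_num_steps`, `total_dataset_len`, and `train_batch_size` unset (null or -1): it is the launch-time configuration, with those fields filled in once the dataset has been sized. The `${...}` references are OmegaConf-style interpolations (the `_target_` keys likewise suggest a Hydra-style instantiation scheme); assuming the omegaconf package is available, they resolve on access:

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("training_config.yaml")
# Interpolations such as ${learning_rate} and ${seed} resolve lazily on access:
print(cfg.ds_cfg.optimizer.params.lr)  # 1e-06
print(cfg.exp_name)                    # gemma.2b.it.meta_math_rap.dpo.H100.w2.v1.1.fix.s42
print(cfg.model_name_or_path)          # experiments/gemma.2b.it.meta_math_distil.H100.w4.v1.0/checkpoint-2000/
```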