nhxnnz committed on
Commit
9cdd971
·
verified ·
1 Parent(s): cf7b425

End of training

Browse files
README.md CHANGED
@@ -37,15 +37,15 @@ More information needed
37
  ### Training hyperparameters
38
 
39
  The following hyperparameters were used during training:
40
- - learning_rate: 2e-05
41
- - train_batch_size: 2
42
  - eval_batch_size: 2
43
  - seed: 42
44
- - gradient_accumulation_steps: 8
45
- - total_train_batch_size: 16
46
  - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
  - lr_scheduler_type: linear
48
- - num_epochs: 5
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
 
37
  ### Training hyperparameters
38
 
39
  The following hyperparameters were used during training:
40
+ - learning_rate: 0.0002
41
+ - train_batch_size: 4
42
  - eval_batch_size: 2
43
  - seed: 42
44
+ - gradient_accumulation_steps: 2
45
+ - total_train_batch_size: 8
46
  - optimizer: Use OptimizerNames.PAGED_ADAMW_8BIT with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
  - lr_scheduler_type: linear
48
+ - num_epochs: 3
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ec026e0c170dfb09ec06149ee60a5ef42da92251d006bbd0f8353a446a171af
3
  size 7098064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df88e0c996327c515e25de6a58c119ceadd953b0f82a9129e4a7a2897cc8ea64
3
  size 7098064
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.974765681326604,
3
- "total_flos": 4.01647693824e+18,
4
- "train_loss": 2.9910258519167154,
5
- "train_runtime": 3782.4986,
6
- "train_samples_per_second": 3.666,
7
- "train_steps_per_second": 0.229
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "total_flos": 2.42193908736e+18,
4
+ "train_loss": 0.3701638735680484,
5
+ "train_runtime": 4045.1211,
6
+ "train_samples_per_second": 2.057,
7
+ "train_steps_per_second": 0.257
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.974765681326604,
3
- "total_flos": 4.01647693824e+18,
4
- "train_loss": 2.9910258519167154,
5
- "train_runtime": 3782.4986,
6
- "train_samples_per_second": 3.666,
7
- "train_steps_per_second": 0.229
8
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "total_flos": 2.42193908736e+18,
4
+ "train_loss": 0.3701638735680484,
5
+ "train_runtime": 4045.1211,
6
+ "train_samples_per_second": 2.057,
7
+ "train_steps_per_second": 0.257
8
  }
trainer_state.json CHANGED
@@ -1,154 +1,238 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.974765681326604,
5
- "eval_steps": 500,
6
- "global_step": 865,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.2883922134102379,
13
- "grad_norm": 5.0078020095825195,
14
- "learning_rate": 1.8843930635838153e-05,
15
- "loss": 6.6486,
16
  "step": 50
17
  },
18
  {
19
- "epoch": 0.5767844268204758,
20
- "grad_norm": 2.4647130966186523,
21
- "learning_rate": 1.76878612716763e-05,
22
- "loss": 5.1264,
23
  "step": 100
24
  },
25
  {
26
- "epoch": 0.8651766402307137,
27
- "grad_norm": 3.123357057571411,
28
- "learning_rate": 1.6531791907514452e-05,
29
- "loss": 4.0859,
 
 
 
 
 
 
 
30
  "step": 150
31
  },
32
  {
33
- "epoch": 1.1499639509733237,
34
- "grad_norm": 1.291628122329712,
35
- "learning_rate": 1.5375722543352604e-05,
36
- "loss": 3.061,
 
 
 
 
 
 
 
37
  "step": 200
38
  },
39
  {
40
- "epoch": 1.4383561643835616,
41
- "grad_norm": 1.2552602291107178,
42
- "learning_rate": 1.4242774566473989e-05,
43
- "loss": 2.8565,
44
  "step": 250
45
  },
46
  {
47
- "epoch": 1.7267483777937995,
48
- "grad_norm": 1.0198160409927368,
49
- "learning_rate": 1.3132947976878613e-05,
50
- "loss": 2.6909,
 
 
 
 
 
 
 
51
  "step": 300
52
  },
53
  {
54
- "epoch": 2.0115356885364095,
55
- "grad_norm": 1.449593424797058,
56
- "learning_rate": 1.1976878612716765e-05,
57
- "loss": 2.6208,
58
  "step": 350
59
  },
60
  {
61
- "epoch": 2.2999279019466474,
62
- "grad_norm": 1.5548640489578247,
63
- "learning_rate": 1.0820809248554913e-05,
64
- "loss": 2.6048,
 
 
 
 
 
 
 
65
  "step": 400
66
  },
67
  {
68
- "epoch": 2.5883201153568853,
69
- "grad_norm": 1.121537446975708,
70
- "learning_rate": 9.664739884393064e-06,
71
- "loss": 2.494,
72
  "step": 450
73
  },
74
  {
75
- "epoch": 2.8767123287671232,
76
- "grad_norm": 1.1632635593414307,
77
- "learning_rate": 8.508670520231216e-06,
78
- "loss": 2.4738,
79
  "step": 500
80
  },
81
  {
82
- "epoch": 2.8767123287671232,
83
- "eval_runtime": 229.0364,
84
- "eval_samples_per_second": 5.401,
85
- "eval_steps_per_second": 2.703,
86
  "step": 500
87
  },
88
  {
89
- "epoch": 3.1614996395097332,
90
- "grad_norm": 1.4202187061309814,
91
- "learning_rate": 7.3526011560693645e-06,
92
- "loss": 2.4292,
93
  "step": 550
94
  },
95
  {
96
- "epoch": 3.449891852919971,
97
- "grad_norm": 9.334039688110352,
98
- "learning_rate": 6.196531791907515e-06,
99
- "loss": 2.3906,
100
  "step": 600
101
  },
102
  {
103
- "epoch": 3.738284066330209,
104
- "grad_norm": 1.7713243961334229,
105
- "learning_rate": 5.040462427745665e-06,
106
- "loss": 2.4202,
 
 
 
 
 
 
 
107
  "step": 650
108
  },
109
  {
110
- "epoch": 4.023071377072819,
111
- "grad_norm": 1.497362732887268,
112
- "learning_rate": 3.884393063583815e-06,
113
- "loss": 2.2646,
114
  "step": 700
115
  },
116
  {
117
- "epoch": 4.311463590483057,
118
- "grad_norm": 1.148555040359497,
119
- "learning_rate": 2.7283236994219654e-06,
120
- "loss": 2.2977,
 
 
 
 
 
 
 
121
  "step": 750
122
  },
123
  {
124
- "epoch": 4.599855803893295,
125
- "grad_norm": 1.0419560670852661,
126
- "learning_rate": 1.5722543352601158e-06,
127
- "loss": 2.2711,
 
 
 
 
 
 
 
128
  "step": 800
129
  },
130
  {
131
- "epoch": 4.888248017303533,
132
- "grad_norm": 1.8127518892288208,
133
- "learning_rate": 4.161849710982659e-07,
134
- "loss": 2.3197,
135
  "step": 850
136
  },
137
  {
138
- "epoch": 4.974765681326604,
139
- "step": 865,
140
- "total_flos": 4.01647693824e+18,
141
- "train_loss": 2.9910258519167154,
142
- "train_runtime": 3782.4986,
143
- "train_samples_per_second": 3.666,
144
- "train_steps_per_second": 0.229
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  }
146
  ],
147
  "logging_steps": 50,
148
- "max_steps": 865,
149
  "num_input_tokens_seen": 0,
150
- "num_train_epochs": 5,
151
- "save_steps": 500,
152
  "stateful_callbacks": {
153
  "TrainerControl": {
154
  "args": {
@@ -161,8 +245,8 @@
161
  "attributes": {}
162
  }
163
  },
164
- "total_flos": 4.01647693824e+18,
165
- "train_batch_size": 2,
166
  "trial_name": null,
167
  "trial_params": null
168
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1041,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.1440922190201729,
13
+ "grad_norm": 2.598850727081299,
14
+ "learning_rate": 0.0001903938520653218,
15
+ "loss": 0.6467,
16
  "step": 50
17
  },
18
  {
19
+ "epoch": 0.2881844380403458,
20
+ "grad_norm": 2.249300956726074,
21
+ "learning_rate": 0.00018078770413064362,
22
+ "loss": 0.4668,
23
  "step": 100
24
  },
25
  {
26
+ "epoch": 0.2881844380403458,
27
+ "eval_runtime": 232.7234,
28
+ "eval_samples_per_second": 5.315,
29
+ "eval_steps_per_second": 2.66,
30
+ "step": 100
31
+ },
32
+ {
33
+ "epoch": 0.4322766570605187,
34
+ "grad_norm": 1.964786410331726,
35
+ "learning_rate": 0.00017118155619596544,
36
+ "loss": 0.4482,
37
  "step": 150
38
  },
39
  {
40
+ "epoch": 0.5763688760806917,
41
+ "grad_norm": 1.8695533275604248,
42
+ "learning_rate": 0.00016157540826128723,
43
+ "loss": 0.4197,
44
+ "step": 200
45
+ },
46
+ {
47
+ "epoch": 0.5763688760806917,
48
+ "eval_runtime": 227.4078,
49
+ "eval_samples_per_second": 5.44,
50
+ "eval_steps_per_second": 2.722,
51
  "step": 200
52
  },
53
  {
54
+ "epoch": 0.7204610951008645,
55
+ "grad_norm": 1.8768256902694702,
56
+ "learning_rate": 0.00015196926032660902,
57
+ "loss": 0.4265,
58
  "step": 250
59
  },
60
  {
61
+ "epoch": 0.8645533141210374,
62
+ "grad_norm": 1.8593772649765015,
63
+ "learning_rate": 0.00014236311239193086,
64
+ "loss": 0.4039,
65
+ "step": 300
66
+ },
67
+ {
68
+ "epoch": 0.8645533141210374,
69
+ "eval_runtime": 225.9543,
70
+ "eval_samples_per_second": 5.475,
71
+ "eval_steps_per_second": 2.739,
72
  "step": 300
73
  },
74
  {
75
+ "epoch": 1.0086455331412103,
76
+ "grad_norm": 1.2640736103057861,
77
+ "learning_rate": 0.00013275696445725266,
78
+ "loss": 0.3906,
79
  "step": 350
80
  },
81
  {
82
+ "epoch": 1.1527377521613833,
83
+ "grad_norm": 1.3723323345184326,
84
+ "learning_rate": 0.00012315081652257445,
85
+ "loss": 0.3547,
86
+ "step": 400
87
+ },
88
+ {
89
+ "epoch": 1.1527377521613833,
90
+ "eval_runtime": 226.5998,
91
+ "eval_samples_per_second": 5.459,
92
+ "eval_steps_per_second": 2.732,
93
  "step": 400
94
  },
95
  {
96
+ "epoch": 1.2968299711815563,
97
+ "grad_norm": 1.395857810974121,
98
+ "learning_rate": 0.00011354466858789625,
99
+ "loss": 0.3402,
100
  "step": 450
101
  },
102
  {
103
+ "epoch": 1.440922190201729,
104
+ "grad_norm": 1.6172202825546265,
105
+ "learning_rate": 0.00010393852065321807,
106
+ "loss": 0.34,
107
  "step": 500
108
  },
109
  {
110
+ "epoch": 1.440922190201729,
111
+ "eval_runtime": 227.4832,
112
+ "eval_samples_per_second": 5.438,
113
+ "eval_steps_per_second": 2.721,
114
  "step": 500
115
  },
116
  {
117
+ "epoch": 1.585014409221902,
118
+ "grad_norm": 1.5597540140151978,
119
+ "learning_rate": 9.433237271853987e-05,
120
+ "loss": 0.337,
121
  "step": 550
122
  },
123
  {
124
+ "epoch": 1.729106628242075,
125
+ "grad_norm": 1.3169169425964355,
126
+ "learning_rate": 8.472622478386168e-05,
127
+ "loss": 0.3597,
128
  "step": 600
129
  },
130
  {
131
+ "epoch": 1.729106628242075,
132
+ "eval_runtime": 224.0401,
133
+ "eval_samples_per_second": 5.521,
134
+ "eval_steps_per_second": 2.763,
135
+ "step": 600
136
+ },
137
+ {
138
+ "epoch": 1.8731988472622478,
139
+ "grad_norm": 1.5286619663238525,
140
+ "learning_rate": 7.512007684918348e-05,
141
+ "loss": 0.3541,
142
  "step": 650
143
  },
144
  {
145
+ "epoch": 2.0172910662824206,
146
+ "grad_norm": 1.4207804203033447,
147
+ "learning_rate": 6.551392891450529e-05,
148
+ "loss": 0.3503,
149
  "step": 700
150
  },
151
  {
152
+ "epoch": 2.0172910662824206,
153
+ "eval_runtime": 229.2463,
154
+ "eval_samples_per_second": 5.396,
155
+ "eval_steps_per_second": 2.7,
156
+ "step": 700
157
+ },
158
+ {
159
+ "epoch": 2.161383285302594,
160
+ "grad_norm": 1.2903691530227661,
161
+ "learning_rate": 5.59077809798271e-05,
162
+ "loss": 0.3175,
163
  "step": 750
164
  },
165
  {
166
+ "epoch": 2.3054755043227666,
167
+ "grad_norm": 1.656386375427246,
168
+ "learning_rate": 4.63016330451489e-05,
169
+ "loss": 0.3074,
170
+ "step": 800
171
+ },
172
+ {
173
+ "epoch": 2.3054755043227666,
174
+ "eval_runtime": 228.7555,
175
+ "eval_samples_per_second": 5.408,
176
+ "eval_steps_per_second": 2.706,
177
  "step": 800
178
  },
179
  {
180
+ "epoch": 2.4495677233429394,
181
+ "grad_norm": 1.3060048818588257,
182
+ "learning_rate": 3.66954851104707e-05,
183
+ "loss": 0.3095,
184
  "step": 850
185
  },
186
  {
187
+ "epoch": 2.5936599423631126,
188
+ "grad_norm": 1.136078953742981,
189
+ "learning_rate": 2.7089337175792506e-05,
190
+ "loss": 0.2877,
191
+ "step": 900
192
+ },
193
+ {
194
+ "epoch": 2.5936599423631126,
195
+ "eval_runtime": 228.5101,
196
+ "eval_samples_per_second": 5.413,
197
+ "eval_steps_per_second": 2.709,
198
+ "step": 900
199
+ },
200
+ {
201
+ "epoch": 2.7377521613832854,
202
+ "grad_norm": 1.0104094743728638,
203
+ "learning_rate": 1.7483189241114314e-05,
204
+ "loss": 0.2997,
205
+ "step": 950
206
+ },
207
+ {
208
+ "epoch": 2.881844380403458,
209
+ "grad_norm": 1.434010624885559,
210
+ "learning_rate": 7.87704130643612e-06,
211
+ "loss": 0.2896,
212
+ "step": 1000
213
+ },
214
+ {
215
+ "epoch": 2.881844380403458,
216
+ "eval_runtime": 226.8836,
217
+ "eval_samples_per_second": 5.452,
218
+ "eval_steps_per_second": 2.728,
219
+ "step": 1000
220
+ },
221
+ {
222
+ "epoch": 3.0,
223
+ "step": 1041,
224
+ "total_flos": 2.42193908736e+18,
225
+ "train_loss": 0.3701638735680484,
226
+ "train_runtime": 4045.1211,
227
+ "train_samples_per_second": 2.057,
228
+ "train_steps_per_second": 0.257
229
  }
230
  ],
231
  "logging_steps": 50,
232
+ "max_steps": 1041,
233
  "num_input_tokens_seen": 0,
234
+ "num_train_epochs": 3,
235
+ "save_steps": 100,
236
  "stateful_callbacks": {
237
  "TrainerControl": {
238
  "args": {
 
245
  "attributes": {}
246
  }
247
  },
248
+ "total_flos": 2.42193908736e+18,
249
+ "train_batch_size": 4,
250
  "trial_name": null,
251
  "trial_params": null
252
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67cec95dc49a757b9d826940fcd3043ff38ff7b88b59ae2cb25c21210f2da126
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ecfa4925538d5d694197138cbcc95396fedc580f5bf53b61a2b5623a82235f7
3
  size 5304