zalim0zalima committed · verified
Commit 02dd971 · 1 Parent(s): f97ae1d

End of training

all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "eval_accuracy": 0.3036231884057971,
- "eval_loss": 1.1199791431427002,
- "eval_runtime": 998.4286,
- "eval_samples_per_second": 2.764,
- "eval_steps_per_second": 0.23
+ "epoch": 9.096017699115045,
+ "eval_accuracy": 0.7652173913043478,
+ "eval_loss": 0.6633358597755432,
+ "eval_runtime": 903.9019,
+ "eval_samples_per_second": 3.053,
+ "eval_steps_per_second": 0.096
  }
runs/Feb13_14-20-30_DESKTOP-T04IOFP/events.out.tfevents.1739484113.DESKTOP-T04IOFP.18948.1 CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:97d06f627fc897366d84267d4315593eb20adf44ed9b17eaab13baf6f5470c60
- size 411
+ oid sha256:7b2e7e3e087679270790296c73aa48b295a4bed2330633c7469e2f885d7654aa
+ size 734
test_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "epoch": 1.0,
- "eval_accuracy": 0.3036231884057971,
- "eval_loss": 1.1199791431427002,
- "eval_runtime": 998.4286,
- "eval_samples_per_second": 2.764,
- "eval_steps_per_second": 0.23
+ "epoch": 9.096017699115045,
+ "eval_accuracy": 0.7652173913043478,
+ "eval_loss": 0.6633358597755432,
+ "eval_runtime": 903.9019,
+ "eval_samples_per_second": 3.053,
+ "eval_steps_per_second": 0.096
  }
trainer_state.json CHANGED
@@ -1,50 +1,1715 @@
1
  {
2
- "best_metric": 0.3359073359073359,
3
- "best_model_checkpoint": "videomae-small-finetuned-kinetics-finetuned-2\\checkpoint-10",
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 10,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "grad_norm": 2.350801944732666,
14
- "learning_rate": 0.0,
15
- "loss": 1.1384,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_accuracy": 0.3359073359073359,
21
- "eval_loss": 1.1075096130371094,
22
- "eval_runtime": 1097.4829,
23
- "eval_samples_per_second": 2.596,
24
- "eval_steps_per_second": 0.217,
25
- "step": 10
26
  },
27
  {
28
- "epoch": 1.0,
29
- "step": 10,
30
- "total_flos": 3.794361829687296e+16,
31
- "train_loss": 1.138401412963867,
32
- "train_runtime": 1152.3511,
33
- "train_samples_per_second": 0.104,
34
- "train_steps_per_second": 0.009
35
  },
36
  {
37
- "epoch": 1.0,
38
- "eval_accuracy": 0.3036231884057971,
39
- "eval_loss": 1.1199791431427002,
40
- "eval_runtime": 998.4286,
41
- "eval_samples_per_second": 2.764,
42
- "eval_steps_per_second": 0.23,
43
- "step": 10
44
  }
45
  ],
46
  "logging_steps": 10,
47
- "max_steps": 10,
48
  "num_input_tokens_seen": 0,
49
  "num_train_epochs": 9223372036854775807,
50
  "save_steps": 500,
@@ -60,8 +1725,8 @@
60
  "attributes": {}
61
  }
62
  },
63
- "total_flos": 3.794361829687296e+16,
64
- "train_batch_size": 12,
65
  "trial_name": null,
66
  "trial_params": null
67
  }
 
1
  {
2
+ "best_metric": 0.7665847665847666,
3
+ "best_model_checkpoint": "videomae-small-finetuned-kinetics-finetuned-2\\checkpoint-2260",
4
+ "epoch": 9.096017699115045,
5
  "eval_steps": 500,
6
+ "global_step": 2260,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.004424778761061947,
13
+ "grad_norm": 2.1747984886169434,
14
+ "learning_rate": 2.2123893805309734e-06,
15
+ "loss": 1.0543,
16
  "step": 10
17
  },
18
  {
19
+ "epoch": 0.008849557522123894,
20
+ "grad_norm": 1.4438425302505493,
21
+ "learning_rate": 4.424778761061947e-06,
22
+ "loss": 1.0544,
23
+ "step": 20
24
  },
25
  {
26
+ "epoch": 0.01327433628318584,
27
+ "grad_norm": 1.5373220443725586,
28
+ "learning_rate": 6.6371681415929215e-06,
29
+ "loss": 1.0551,
30
+ "step": 30
31
  },
32
  {
33
+ "epoch": 0.017699115044247787,
34
+ "grad_norm": 1.477241039276123,
35
+ "learning_rate": 8.849557522123894e-06,
36
+ "loss": 1.0596,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.022123893805309734,
41
+ "grad_norm": 1.7869963645935059,
42
+ "learning_rate": 1.1061946902654869e-05,
43
+ "loss": 1.0579,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.02654867256637168,
48
+ "grad_norm": 1.406799554824829,
49
+ "learning_rate": 1.3274336283185843e-05,
50
+ "loss": 1.064,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.030973451327433628,
55
+ "grad_norm": 1.6087795495986938,
56
+ "learning_rate": 1.5486725663716813e-05,
57
+ "loss": 1.0334,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.035398230088495575,
62
+ "grad_norm": 1.539732575416565,
63
+ "learning_rate": 1.7699115044247787e-05,
64
+ "loss": 1.0502,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.03982300884955752,
69
+ "grad_norm": 1.7329670190811157,
70
+ "learning_rate": 1.991150442477876e-05,
71
+ "loss": 1.0408,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.04424778761061947,
76
+ "grad_norm": 1.849195957183838,
77
+ "learning_rate": 2.2123893805309738e-05,
78
+ "loss": 1.0403,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.048672566371681415,
83
+ "grad_norm": 1.7182987928390503,
84
+ "learning_rate": 2.433628318584071e-05,
85
+ "loss": 1.0406,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.05309734513274336,
90
+ "grad_norm": 1.8291207551956177,
91
+ "learning_rate": 2.6548672566371686e-05,
92
+ "loss": 1.0188,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.05752212389380531,
97
+ "grad_norm": 1.8635715246200562,
98
+ "learning_rate": 2.8761061946902656e-05,
99
+ "loss": 1.0132,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.061946902654867256,
104
+ "grad_norm": 2.1121060848236084,
105
+ "learning_rate": 3.097345132743363e-05,
106
+ "loss": 1.0194,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.06637168141592921,
111
+ "grad_norm": 1.605365514755249,
112
+ "learning_rate": 3.3185840707964604e-05,
113
+ "loss": 1.0276,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.07079646017699115,
118
+ "grad_norm": 1.4162675142288208,
119
+ "learning_rate": 3.5398230088495574e-05,
120
+ "loss": 1.0155,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.0752212389380531,
125
+ "grad_norm": 2.037018060684204,
126
+ "learning_rate": 3.7610619469026545e-05,
127
+ "loss": 0.9921,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.07964601769911504,
132
+ "grad_norm": 1.6612162590026855,
133
+ "learning_rate": 3.982300884955752e-05,
134
+ "loss": 0.9814,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.084070796460177,
139
+ "grad_norm": 1.4722100496292114,
140
+ "learning_rate": 4.20353982300885e-05,
141
+ "loss": 1.0092,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.08849557522123894,
146
+ "grad_norm": 1.9917899370193481,
147
+ "learning_rate": 4.4247787610619477e-05,
148
+ "loss": 0.9742,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.09292035398230089,
153
+ "grad_norm": 1.5193358659744263,
154
+ "learning_rate": 4.646017699115045e-05,
155
+ "loss": 0.9647,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.09734513274336283,
160
+ "grad_norm": 1.4704391956329346,
161
+ "learning_rate": 4.867256637168142e-05,
162
+ "loss": 0.9644,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.10044247787610619,
167
+ "eval_accuracy": 0.5777465777465778,
168
+ "eval_loss": 0.9732658863067627,
169
+ "eval_runtime": 1124.1661,
170
+ "eval_samples_per_second": 2.534,
171
+ "eval_steps_per_second": 0.08,
172
+ "step": 227
173
+ },
174
+ {
175
+ "epoch": 1.0013274336283187,
176
+ "grad_norm": 1.666140079498291,
177
+ "learning_rate": 4.990167158308752e-05,
178
+ "loss": 0.967,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 1.0057522123893805,
183
+ "grad_norm": 1.6128636598587036,
184
+ "learning_rate": 4.9655850540806295e-05,
185
+ "loss": 0.9457,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 1.0101769911504426,
190
+ "grad_norm": 1.2806826829910278,
191
+ "learning_rate": 4.941002949852507e-05,
192
+ "loss": 0.9482,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 1.0146017699115044,
197
+ "grad_norm": 1.5180962085723877,
198
+ "learning_rate": 4.9164208456243856e-05,
199
+ "loss": 0.9656,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 1.0190265486725665,
204
+ "grad_norm": 1.306519627571106,
205
+ "learning_rate": 4.891838741396263e-05,
206
+ "loss": 0.9615,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 1.0234513274336283,
211
+ "grad_norm": 1.5663535594940186,
212
+ "learning_rate": 4.867256637168142e-05,
213
+ "loss": 0.9306,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 1.0278761061946902,
218
+ "grad_norm": 1.3962496519088745,
219
+ "learning_rate": 4.8426745329400195e-05,
220
+ "loss": 0.9359,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 1.0323008849557522,
225
+ "grad_norm": 1.3210822343826294,
226
+ "learning_rate": 4.818092428711898e-05,
227
+ "loss": 0.8929,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 1.036725663716814,
232
+ "grad_norm": 1.5808290243148804,
233
+ "learning_rate": 4.7935103244837756e-05,
234
+ "loss": 0.8904,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 1.0411504424778761,
239
+ "grad_norm": 1.4897513389587402,
240
+ "learning_rate": 4.768928220255654e-05,
241
+ "loss": 0.9128,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.045575221238938,
246
+ "grad_norm": 1.4278603792190552,
247
+ "learning_rate": 4.7443461160275324e-05,
248
+ "loss": 0.9177,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.05,
253
+ "grad_norm": 2.067490577697754,
254
+ "learning_rate": 4.71976401179941e-05,
255
+ "loss": 0.8882,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.0544247787610619,
260
+ "grad_norm": 1.3919837474822998,
261
+ "learning_rate": 4.6951819075712886e-05,
262
+ "loss": 0.8824,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.058849557522124,
267
+ "grad_norm": 1.398645281791687,
268
+ "learning_rate": 4.670599803343166e-05,
269
+ "loss": 0.905,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.0632743362831858,
274
+ "grad_norm": 1.4657193422317505,
275
+ "learning_rate": 4.646017699115045e-05,
276
+ "loss": 0.8687,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.0676991150442479,
281
+ "grad_norm": 1.2234872579574585,
282
+ "learning_rate": 4.6214355948869224e-05,
283
+ "loss": 0.8918,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.0721238938053097,
288
+ "grad_norm": 1.321500539779663,
289
+ "learning_rate": 4.596853490658801e-05,
290
+ "loss": 0.8582,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.0765486725663718,
295
+ "grad_norm": 1.3090804815292358,
296
+ "learning_rate": 4.5722713864306786e-05,
297
+ "loss": 0.868,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.0809734513274336,
302
+ "grad_norm": 1.2257106304168701,
303
+ "learning_rate": 4.547689282202557e-05,
304
+ "loss": 0.8837,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.0853982300884957,
309
+ "grad_norm": 1.2699074745178223,
310
+ "learning_rate": 4.523107177974435e-05,
311
+ "loss": 0.8863,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.0898230088495575,
316
+ "grad_norm": 1.791901707649231,
317
+ "learning_rate": 4.498525073746313e-05,
318
+ "loss": 0.8883,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.0942477876106196,
323
+ "grad_norm": 1.2082570791244507,
324
+ "learning_rate": 4.473942969518191e-05,
325
+ "loss": 0.8544,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.0986725663716814,
330
+ "grad_norm": 1.3323192596435547,
331
+ "learning_rate": 4.449360865290069e-05,
332
+ "loss": 0.8823,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.1004424778761062,
337
+ "eval_accuracy": 0.6676026676026676,
338
+ "eval_loss": 0.869060754776001,
339
+ "eval_runtime": 1028.2799,
340
+ "eval_samples_per_second": 2.771,
341
+ "eval_steps_per_second": 0.088,
342
+ "step": 454
343
+ },
344
+ {
345
+ "epoch": 2.0026548672566373,
346
+ "grad_norm": 1.2703664302825928,
347
+ "learning_rate": 4.4247787610619477e-05,
348
+ "loss": 0.846,
349
+ "step": 460
350
+ },
351
+ {
352
+ "epoch": 2.007079646017699,
353
+ "grad_norm": 1.2533842325210571,
354
+ "learning_rate": 4.4001966568338254e-05,
355
+ "loss": 0.8445,
356
+ "step": 470
357
+ },
358
+ {
359
+ "epoch": 2.011504424778761,
360
+ "grad_norm": 1.2655799388885498,
361
+ "learning_rate": 4.375614552605704e-05,
362
+ "loss": 0.8526,
363
+ "step": 480
364
+ },
365
+ {
366
+ "epoch": 2.015929203539823,
367
+ "grad_norm": 1.3053220510482788,
368
+ "learning_rate": 4.351032448377581e-05,
369
+ "loss": 0.8576,
370
+ "step": 490
371
+ },
372
+ {
373
+ "epoch": 2.020353982300885,
374
+ "grad_norm": 1.160129189491272,
375
+ "learning_rate": 4.326450344149459e-05,
376
+ "loss": 0.8595,
377
+ "step": 500
378
+ },
379
+ {
380
+ "epoch": 2.0247787610619468,
381
+ "grad_norm": 1.4154675006866455,
382
+ "learning_rate": 4.301868239921337e-05,
383
+ "loss": 0.8563,
384
+ "step": 510
385
+ },
386
+ {
387
+ "epoch": 2.029203539823009,
388
+ "grad_norm": 1.166509747505188,
389
+ "learning_rate": 4.2772861356932154e-05,
390
+ "loss": 0.8431,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 2.033628318584071,
395
+ "grad_norm": 1.4635576009750366,
396
+ "learning_rate": 4.252704031465093e-05,
397
+ "loss": 0.8099,
398
+ "step": 530
399
+ },
400
+ {
401
+ "epoch": 2.038053097345133,
402
+ "grad_norm": 1.205285906791687,
403
+ "learning_rate": 4.2281219272369715e-05,
404
+ "loss": 0.8308,
405
+ "step": 540
406
+ },
407
+ {
408
+ "epoch": 2.0424778761061946,
409
+ "grad_norm": 1.521293044090271,
410
+ "learning_rate": 4.20353982300885e-05,
411
+ "loss": 0.8257,
412
+ "step": 550
413
+ },
414
+ {
415
+ "epoch": 2.0469026548672566,
416
+ "grad_norm": 1.212964415550232,
417
+ "learning_rate": 4.178957718780728e-05,
418
+ "loss": 0.8587,
419
+ "step": 560
420
+ },
421
+ {
422
+ "epoch": 2.0513274336283187,
423
+ "grad_norm": 1.156423807144165,
424
+ "learning_rate": 4.154375614552606e-05,
425
+ "loss": 0.812,
426
+ "step": 570
427
+ },
428
+ {
429
+ "epoch": 2.0557522123893803,
430
+ "grad_norm": 1.274183988571167,
431
+ "learning_rate": 4.129793510324484e-05,
432
+ "loss": 0.838,
433
+ "step": 580
434
+ },
435
+ {
436
+ "epoch": 2.0601769911504424,
437
+ "grad_norm": 1.8873164653778076,
438
+ "learning_rate": 4.105211406096362e-05,
439
+ "loss": 0.8067,
440
+ "step": 590
441
+ },
442
+ {
443
+ "epoch": 2.0646017699115045,
444
+ "grad_norm": 1.1822903156280518,
445
+ "learning_rate": 4.08062930186824e-05,
446
+ "loss": 0.7949,
447
+ "step": 600
448
+ },
449
+ {
450
+ "epoch": 2.0690265486725665,
451
+ "grad_norm": 1.2289406061172485,
452
+ "learning_rate": 4.0560471976401183e-05,
453
+ "loss": 0.8064,
454
+ "step": 610
455
+ },
456
+ {
457
+ "epoch": 2.073451327433628,
458
+ "grad_norm": 1.1396745443344116,
459
+ "learning_rate": 4.031465093411996e-05,
460
+ "loss": 0.8183,
461
+ "step": 620
462
+ },
463
+ {
464
+ "epoch": 2.07787610619469,
465
+ "grad_norm": 1.041188359260559,
466
+ "learning_rate": 4.0068829891838745e-05,
467
+ "loss": 0.8024,
468
+ "step": 630
469
+ },
470
+ {
471
+ "epoch": 2.0823008849557523,
472
+ "grad_norm": 1.2631102800369263,
473
+ "learning_rate": 3.982300884955752e-05,
474
+ "loss": 0.8168,
475
+ "step": 640
476
+ },
477
+ {
478
+ "epoch": 2.0867256637168143,
479
+ "grad_norm": 1.06856107711792,
480
+ "learning_rate": 3.9577187807276306e-05,
481
+ "loss": 0.7869,
482
+ "step": 650
483
+ },
484
+ {
485
+ "epoch": 2.091150442477876,
486
+ "grad_norm": 1.4375030994415283,
487
+ "learning_rate": 3.9331366764995083e-05,
488
+ "loss": 0.8004,
489
+ "step": 660
490
+ },
491
+ {
492
+ "epoch": 2.095575221238938,
493
+ "grad_norm": 1.1425267457962036,
494
+ "learning_rate": 3.908554572271387e-05,
495
+ "loss": 0.7626,
496
+ "step": 670
497
+ },
498
+ {
499
+ "epoch": 2.1,
500
+ "grad_norm": 1.9019882678985596,
501
+ "learning_rate": 3.883972468043265e-05,
502
+ "loss": 0.826,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.1004424778761064,
507
+ "eval_accuracy": 0.7093717093717093,
508
+ "eval_loss": 0.8009993433952332,
509
+ "eval_runtime": 986.4291,
510
+ "eval_samples_per_second": 2.888,
511
+ "eval_steps_per_second": 0.091,
512
+ "step": 681
513
+ },
514
+ {
515
+ "epoch": 3.0039823008849558,
516
+ "grad_norm": 1.6283575296401978,
517
+ "learning_rate": 3.859390363815143e-05,
518
+ "loss": 0.7981,
519
+ "step": 690
520
+ },
521
+ {
522
+ "epoch": 3.008407079646018,
523
+ "grad_norm": 1.1492269039154053,
524
+ "learning_rate": 3.834808259587021e-05,
525
+ "loss": 0.8298,
526
+ "step": 700
527
+ },
528
+ {
529
+ "epoch": 3.0128318584070795,
530
+ "grad_norm": 1.1298381090164185,
531
+ "learning_rate": 3.810226155358899e-05,
532
+ "loss": 0.7982,
533
+ "step": 710
534
+ },
535
+ {
536
+ "epoch": 3.0172566371681415,
537
+ "grad_norm": 1.1305561065673828,
538
+ "learning_rate": 3.7856440511307774e-05,
539
+ "loss": 0.7666,
540
+ "step": 720
541
+ },
542
+ {
543
+ "epoch": 3.0216814159292036,
544
+ "grad_norm": 1.1142364740371704,
545
+ "learning_rate": 3.7610619469026545e-05,
546
+ "loss": 0.7775,
547
+ "step": 730
548
+ },
549
+ {
550
+ "epoch": 3.0261061946902656,
551
+ "grad_norm": 1.7376995086669922,
552
+ "learning_rate": 3.736479842674533e-05,
553
+ "loss": 0.7879,
554
+ "step": 740
555
+ },
556
+ {
557
+ "epoch": 3.0305309734513273,
558
+ "grad_norm": 1.1918601989746094,
559
+ "learning_rate": 3.711897738446411e-05,
560
+ "loss": 0.7587,
561
+ "step": 750
562
+ },
563
+ {
564
+ "epoch": 3.0349557522123893,
565
+ "grad_norm": 1.3769606351852417,
566
+ "learning_rate": 3.687315634218289e-05,
567
+ "loss": 0.7916,
568
+ "step": 760
569
+ },
570
+ {
571
+ "epoch": 3.0393805309734514,
572
+ "grad_norm": 1.269805669784546,
573
+ "learning_rate": 3.6627335299901674e-05,
574
+ "loss": 0.8069,
575
+ "step": 770
576
+ },
577
+ {
578
+ "epoch": 3.0438053097345135,
579
+ "grad_norm": 1.045404314994812,
580
+ "learning_rate": 3.638151425762045e-05,
581
+ "loss": 0.7894,
582
+ "step": 780
583
+ },
584
+ {
585
+ "epoch": 3.048230088495575,
586
+ "grad_norm": 1.1676949262619019,
587
+ "learning_rate": 3.6135693215339236e-05,
588
+ "loss": 0.8034,
589
+ "step": 790
590
+ },
591
+ {
592
+ "epoch": 3.052654867256637,
593
+ "grad_norm": 1.8563295602798462,
594
+ "learning_rate": 3.588987217305801e-05,
595
+ "loss": 0.7366,
596
+ "step": 800
597
+ },
598
+ {
599
+ "epoch": 3.057079646017699,
600
+ "grad_norm": 1.0208163261413574,
601
+ "learning_rate": 3.56440511307768e-05,
602
+ "loss": 0.7332,
603
+ "step": 810
604
+ },
605
+ {
606
+ "epoch": 3.0615044247787613,
607
+ "grad_norm": 1.3837121725082397,
608
+ "learning_rate": 3.5398230088495574e-05,
609
+ "loss": 0.7421,
610
+ "step": 820
611
+ },
612
+ {
613
+ "epoch": 3.065929203539823,
614
+ "grad_norm": 1.1950623989105225,
615
+ "learning_rate": 3.515240904621436e-05,
616
+ "loss": 0.7696,
617
+ "step": 830
618
+ },
619
+ {
620
+ "epoch": 3.070353982300885,
621
+ "grad_norm": 1.3049529790878296,
622
+ "learning_rate": 3.4906588003933136e-05,
623
+ "loss": 0.7897,
624
+ "step": 840
625
+ },
626
+ {
627
+ "epoch": 3.074778761061947,
628
+ "grad_norm": 1.3270063400268555,
629
+ "learning_rate": 3.466076696165192e-05,
630
+ "loss": 0.7884,
631
+ "step": 850
632
+ },
633
+ {
634
+ "epoch": 3.0792035398230087,
635
+ "grad_norm": 1.4187606573104858,
636
+ "learning_rate": 3.44149459193707e-05,
637
+ "loss": 0.7526,
638
+ "step": 860
639
+ },
640
+ {
641
+ "epoch": 3.0836283185840707,
642
+ "grad_norm": 1.0789133310317993,
643
+ "learning_rate": 3.416912487708948e-05,
644
+ "loss": 0.7535,
645
+ "step": 870
646
+ },
647
+ {
648
+ "epoch": 3.088053097345133,
649
+ "grad_norm": 1.1631813049316406,
650
+ "learning_rate": 3.3923303834808265e-05,
651
+ "loss": 0.7579,
652
+ "step": 880
653
+ },
654
+ {
655
+ "epoch": 3.092477876106195,
656
+ "grad_norm": 1.1757570505142212,
657
+ "learning_rate": 3.367748279252704e-05,
658
+ "loss": 0.79,
659
+ "step": 890
660
+ },
661
+ {
662
+ "epoch": 3.0969026548672565,
663
+ "grad_norm": 1.0601412057876587,
664
+ "learning_rate": 3.343166175024583e-05,
665
+ "loss": 0.7422,
666
+ "step": 900
667
+ },
668
+ {
669
+ "epoch": 3.1004424778761064,
670
+ "eval_accuracy": 0.7371007371007371,
671
+ "eval_loss": 0.7514493465423584,
672
+ "eval_runtime": 1046.1945,
673
+ "eval_samples_per_second": 2.723,
674
+ "eval_steps_per_second": 0.086,
675
+ "step": 908
676
+ },
677
+ {
678
+ "epoch": 4.000884955752213,
679
+ "grad_norm": 1.757256269454956,
680
+ "learning_rate": 3.3185840707964604e-05,
681
+ "loss": 0.7465,
682
+ "step": 910
683
+ },
684
+ {
685
+ "epoch": 4.005309734513275,
686
+ "grad_norm": 1.1318788528442383,
687
+ "learning_rate": 3.294001966568339e-05,
688
+ "loss": 0.7494,
689
+ "step": 920
690
+ },
691
+ {
692
+ "epoch": 4.009734513274337,
693
+ "grad_norm": 1.104137659072876,
694
+ "learning_rate": 3.2694198623402165e-05,
695
+ "loss": 0.7696,
696
+ "step": 930
697
+ },
698
+ {
699
+ "epoch": 4.014159292035398,
700
+ "grad_norm": 1.0731704235076904,
701
+ "learning_rate": 3.244837758112095e-05,
702
+ "loss": 0.748,
703
+ "step": 940
704
+ },
705
+ {
706
+ "epoch": 4.01858407079646,
707
+ "grad_norm": 1.1554673910140991,
708
+ "learning_rate": 3.220255653883973e-05,
709
+ "loss": 0.79,
710
+ "step": 950
711
+ },
712
+ {
713
+ "epoch": 4.023008849557522,
714
+ "grad_norm": 1.2223293781280518,
715
+ "learning_rate": 3.1956735496558504e-05,
716
+ "loss": 0.7281,
717
+ "step": 960
718
+ },
719
+ {
720
+ "epoch": 4.027433628318584,
721
+ "grad_norm": 1.1225980520248413,
722
+ "learning_rate": 3.171091445427729e-05,
723
+ "loss": 0.7219,
724
+ "step": 970
725
+ },
726
+ {
727
+ "epoch": 4.031858407079646,
728
+ "grad_norm": 1.0847655534744263,
729
+ "learning_rate": 3.1465093411996065e-05,
730
+ "loss": 0.7481,
731
+ "step": 980
732
+ },
733
+ {
734
+ "epoch": 4.036283185840708,
735
+ "grad_norm": 1.3014447689056396,
736
+ "learning_rate": 3.121927236971485e-05,
737
+ "loss": 0.7305,
738
+ "step": 990
739
+ },
740
+ {
741
+ "epoch": 4.04070796460177,
742
+ "grad_norm": 1.1104828119277954,
743
+ "learning_rate": 3.097345132743363e-05,
744
+ "loss": 0.7211,
745
+ "step": 1000
746
+ },
747
+ {
748
+ "epoch": 4.0451327433628315,
749
+ "grad_norm": 1.0301592350006104,
750
+ "learning_rate": 3.072763028515241e-05,
751
+ "loss": 0.7676,
752
+ "step": 1010
753
+ },
754
+ {
755
+ "epoch": 4.0495575221238935,
756
+ "grad_norm": 1.1068618297576904,
757
+ "learning_rate": 3.048180924287119e-05,
758
+ "loss": 0.7078,
759
+ "step": 1020
760
+ },
761
+ {
762
+ "epoch": 4.053982300884956,
763
+ "grad_norm": 1.4518685340881348,
764
+ "learning_rate": 3.0235988200589972e-05,
765
+ "loss": 0.7706,
766
+ "step": 1030
767
+ },
768
+ {
769
+ "epoch": 4.058407079646018,
770
+ "grad_norm": 0.9889740347862244,
771
+ "learning_rate": 2.9990167158308753e-05,
772
+ "loss": 0.7207,
773
+ "step": 1040
774
+ },
775
+ {
776
+ "epoch": 4.06283185840708,
777
+ "grad_norm": 1.1339548826217651,
778
+ "learning_rate": 2.9744346116027534e-05,
779
+ "loss": 0.7423,
780
+ "step": 1050
781
+ },
782
+ {
783
+ "epoch": 4.067256637168142,
784
+ "grad_norm": 1.3356149196624756,
785
+ "learning_rate": 2.9498525073746314e-05,
786
+ "loss": 0.7114,
787
+ "step": 1060
788
+ },
789
+ {
790
+ "epoch": 4.071681415929204,
791
+ "grad_norm": 1.0467292070388794,
792
+ "learning_rate": 2.9252704031465095e-05,
793
+ "loss": 0.6998,
794
+ "step": 1070
795
+ },
796
+ {
797
+ "epoch": 4.076106194690266,
798
+ "grad_norm": 1.0783181190490723,
799
+ "learning_rate": 2.9006882989183876e-05,
800
+ "loss": 0.7544,
801
+ "step": 1080
802
+ },
803
+ {
804
+ "epoch": 4.080530973451327,
805
+ "grad_norm": 1.0915840864181519,
806
+ "learning_rate": 2.8761061946902656e-05,
807
+ "loss": 0.7145,
808
+ "step": 1090
809
+ },
810
+ {
811
+ "epoch": 4.084955752212389,
812
+ "grad_norm": 1.646979570388794,
813
+ "learning_rate": 2.8515240904621437e-05,
814
+ "loss": 0.7145,
815
+ "step": 1100
816
+ },
817
+ {
818
+ "epoch": 4.089380530973451,
819
+ "grad_norm": 1.3354220390319824,
820
+ "learning_rate": 2.8269419862340218e-05,
821
+ "loss": 0.7329,
822
+ "step": 1110
823
+ },
824
+ {
825
+ "epoch": 4.093805309734513,
826
+ "grad_norm": 1.1131007671356201,
827
+ "learning_rate": 2.8023598820059e-05,
828
+ "loss": 0.6812,
829
+ "step": 1120
830
+ },
831
+ {
832
+ "epoch": 4.098230088495575,
833
+ "grad_norm": 1.260852575302124,
834
+ "learning_rate": 2.777777777777778e-05,
835
+ "loss": 0.7206,
836
+ "step": 1130
837
+ },
838
+ {
839
+ "epoch": 4.100442477876106,
840
+ "eval_accuracy": 0.7507897507897507,
841
+ "eval_loss": 0.7169636487960815,
842
+ "eval_runtime": 1087.6772,
843
+ "eval_samples_per_second": 2.619,
844
+ "eval_steps_per_second": 0.083,
845
+ "step": 1135
846
+ },
847
+ {
848
+ "epoch": 5.002212389380531,
849
+ "grad_norm": 1.1113187074661255,
850
+ "learning_rate": 2.753195673549656e-05,
851
+ "loss": 0.6946,
852
+ "step": 1140
853
+ },
854
+ {
855
+ "epoch": 5.006637168141593,
856
+ "grad_norm": 1.0982720851898193,
857
+ "learning_rate": 2.7286135693215344e-05,
858
+ "loss": 0.7624,
859
+ "step": 1150
860
+ },
861
+ {
862
+ "epoch": 5.011061946902655,
863
+ "grad_norm": 1.3066320419311523,
864
+ "learning_rate": 2.7040314650934125e-05,
865
+ "loss": 0.7023,
866
+ "step": 1160
867
+ },
868
+ {
869
+ "epoch": 5.015486725663717,
870
+ "grad_norm": 0.9856134057044983,
871
+ "learning_rate": 2.6794493608652905e-05,
872
+ "loss": 0.6863,
873
+ "step": 1170
874
+ },
875
+ {
876
+ "epoch": 5.019911504424779,
877
+ "grad_norm": 1.2251431941986084,
878
+ "learning_rate": 2.6548672566371686e-05,
879
+ "loss": 0.7327,
880
+ "step": 1180
881
+ },
882
+ {
883
+ "epoch": 5.024336283185841,
884
+ "grad_norm": 0.9257007837295532,
885
+ "learning_rate": 2.6302851524090467e-05,
886
+ "loss": 0.7002,
887
+ "step": 1190
888
+ },
889
+ {
890
+ "epoch": 5.028761061946903,
891
+ "grad_norm": 0.9643080830574036,
892
+ "learning_rate": 2.605703048180924e-05,
893
+ "loss": 0.6906,
894
+ "step": 1200
895
+ },
896
+ {
897
+ "epoch": 5.033185840707965,
898
+ "grad_norm": 1.1749197244644165,
899
+ "learning_rate": 2.581120943952802e-05,
900
+ "loss": 0.7327,
901
+ "step": 1210
902
+ },
903
+ {
904
+ "epoch": 5.037610619469026,
905
+ "grad_norm": 1.1557971239089966,
906
+ "learning_rate": 2.5565388397246802e-05,
907
+ "loss": 0.6845,
908
+ "step": 1220
909
+ },
910
+ {
911
+ "epoch": 5.042035398230088,
912
+ "grad_norm": 1.2948167324066162,
913
+ "learning_rate": 2.5319567354965586e-05,
914
+ "loss": 0.7152,
915
+ "step": 1230
916
+ },
917
+ {
918
+ "epoch": 5.04646017699115,
919
+ "grad_norm": 0.9071692228317261,
920
+ "learning_rate": 2.5073746312684367e-05,
921
+ "loss": 0.701,
922
+ "step": 1240
923
+ },
924
+ {
925
+ "epoch": 5.050884955752212,
926
+ "grad_norm": 1.0012332201004028,
927
+ "learning_rate": 2.4827925270403147e-05,
928
+ "loss": 0.7141,
929
+ "step": 1250
930
+ },
931
+ {
932
+ "epoch": 5.0553097345132745,
933
+ "grad_norm": 1.4386842250823975,
934
+ "learning_rate": 2.4582104228121928e-05,
935
+ "loss": 0.7405,
936
+ "step": 1260
937
+ },
938
+ {
939
+ "epoch": 5.0597345132743365,
940
+ "grad_norm": 1.1797550916671753,
941
+ "learning_rate": 2.433628318584071e-05,
942
+ "loss": 0.6717,
943
+ "step": 1270
944
+ },
945
+ {
946
+ "epoch": 5.064159292035399,
947
+ "grad_norm": 1.0993843078613281,
948
+ "learning_rate": 2.409046214355949e-05,
949
+ "loss": 0.6832,
950
+ "step": 1280
951
+ },
952
+ {
953
+ "epoch": 5.06858407079646,
954
+ "grad_norm": 0.9947831630706787,
955
+ "learning_rate": 2.384464110127827e-05,
956
+ "loss": 0.7164,
957
+ "step": 1290
958
+ },
959
+ {
960
+ "epoch": 5.073008849557522,
961
+ "grad_norm": 1.335589051246643,
962
+ "learning_rate": 2.359882005899705e-05,
963
+ "loss": 0.7541,
964
+ "step": 1300
965
+ },
966
+ {
967
+ "epoch": 5.077433628318584,
968
+ "grad_norm": 1.3028713464736938,
969
+ "learning_rate": 2.335299901671583e-05,
970
+ "loss": 0.7167,
971
+ "step": 1310
972
+ },
973
+ {
974
+ "epoch": 5.081858407079646,
975
+ "grad_norm": 1.1588274240493774,
976
+ "learning_rate": 2.3107177974434612e-05,
977
+ "loss": 0.7194,
978
+ "step": 1320
979
+ },
980
+ {
981
+ "epoch": 5.086283185840708,
982
+ "grad_norm": 0.9976550340652466,
983
+ "learning_rate": 2.2861356932153393e-05,
984
+ "loss": 0.7196,
985
+ "step": 1330
986
+ },
987
+ {
988
+ "epoch": 5.09070796460177,
989
+ "grad_norm": 0.8966324925422668,
990
+ "learning_rate": 2.2615535889872174e-05,
991
+ "loss": 0.7268,
992
+ "step": 1340
993
+ },
994
+ {
995
+ "epoch": 5.095132743362832,
996
+ "grad_norm": 1.0999760627746582,
997
+ "learning_rate": 2.2369714847590954e-05,
998
+ "loss": 0.7284,
999
+ "step": 1350
1000
+ },
1001
+ {
1002
+ "epoch": 5.099557522123894,
1003
+ "grad_norm": 1.032329797744751,
1004
+ "learning_rate": 2.2123893805309738e-05,
1005
+ "loss": 0.6806,
1006
+ "step": 1360
1007
+ },
1008
+ {
1009
+ "epoch": 5.100442477876106,
1010
+ "eval_accuracy": 0.7542997542997543,
1011
+ "eval_loss": 0.6924355626106262,
1012
+ "eval_runtime": 1080.3798,
1013
+ "eval_samples_per_second": 2.637,
1014
+ "eval_steps_per_second": 0.083,
1015
+ "step": 1362
1016
+ },
1017
+ {
1018
+ "epoch": 6.0035398230088495,
1019
+ "grad_norm": 1.2827861309051514,
1020
+ "learning_rate": 2.187807276302852e-05,
1021
+ "loss": 0.768,
1022
+ "step": 1370
1023
+ },
1024
+ {
1025
+ "epoch": 6.0079646017699115,
1026
+ "grad_norm": 1.1958467960357666,
1027
+ "learning_rate": 2.1632251720747296e-05,
1028
+ "loss": 0.7209,
1029
+ "step": 1380
1030
+ },
1031
+ {
1032
+ "epoch": 6.012389380530974,
1033
+ "grad_norm": 1.0269005298614502,
1034
+ "learning_rate": 2.1386430678466077e-05,
1035
+ "loss": 0.7095,
1036
+ "step": 1390
1037
+ },
1038
+ {
1039
+ "epoch": 6.016814159292036,
1040
+ "grad_norm": 0.9638125896453857,
1041
+ "learning_rate": 2.1140609636184858e-05,
1042
+ "loss": 0.6557,
1043
+ "step": 1400
1044
+ },
1045
+ {
1046
+ "epoch": 6.021238938053098,
1047
+ "grad_norm": 1.3249033689498901,
1048
+ "learning_rate": 2.089478859390364e-05,
1049
+ "loss": 0.7129,
1050
+ "step": 1410
1051
+ },
1052
+ {
1053
+ "epoch": 6.025663716814159,
1054
+ "grad_norm": 1.1330633163452148,
1055
+ "learning_rate": 2.064896755162242e-05,
1056
+ "loss": 0.6472,
1057
+ "step": 1420
1058
+ },
1059
+ {
1060
+ "epoch": 6.030088495575221,
1061
+ "grad_norm": 1.1240506172180176,
1062
+ "learning_rate": 2.04031465093412e-05,
1063
+ "loss": 0.6748,
1064
+ "step": 1430
1065
+ },
1066
+ {
1067
+ "epoch": 6.034513274336283,
1068
+ "grad_norm": 1.091794729232788,
1069
+ "learning_rate": 2.015732546705998e-05,
1070
+ "loss": 0.7125,
1071
+ "step": 1440
1072
+ },
1073
+ {
1074
+ "epoch": 6.038938053097345,
1075
+ "grad_norm": 0.892193078994751,
1076
+ "learning_rate": 1.991150442477876e-05,
1077
+ "loss": 0.7045,
1078
+ "step": 1450
1079
+ },
1080
+ {
1081
+ "epoch": 6.043362831858407,
1082
+ "grad_norm": 1.1631165742874146,
1083
+ "learning_rate": 1.9665683382497542e-05,
1084
+ "loss": 0.7432,
1085
+ "step": 1460
1086
+ },
1087
+ {
1088
+ "epoch": 6.047787610619469,
1089
+ "grad_norm": 0.9747660160064697,
1090
+ "learning_rate": 1.9419862340216326e-05,
1091
+ "loss": 0.6965,
1092
+ "step": 1470
1093
+ },
1094
+ {
1095
+ "epoch": 6.052212389380531,
1096
+ "grad_norm": 0.9529617428779602,
1097
+ "learning_rate": 1.9174041297935107e-05,
1098
+ "loss": 0.6725,
1099
+ "step": 1480
1100
+ },
1101
+ {
1102
+ "epoch": 6.056637168141593,
1103
+ "grad_norm": 1.0859458446502686,
1104
+ "learning_rate": 1.8928220255653887e-05,
1105
+ "loss": 0.6744,
1106
+ "step": 1490
1107
+ },
1108
+ {
1109
+ "epoch": 6.0610619469026545,
1110
+ "grad_norm": 1.4146475791931152,
1111
+ "learning_rate": 1.8682399213372664e-05,
1112
+ "loss": 0.6668,
1113
+ "step": 1500
1114
+ },
1115
+ {
1116
+ "epoch": 6.065486725663717,
1117
+ "grad_norm": 1.5994105339050293,
1118
+ "learning_rate": 1.8436578171091445e-05,
1119
+ "loss": 0.6941,
1120
+ "step": 1510
1121
+ },
1122
+ {
1123
+ "epoch": 6.069911504424779,
1124
+ "grad_norm": 1.22159743309021,
1125
+ "learning_rate": 1.8190757128810226e-05,
1126
+ "loss": 0.7201,
1127
+ "step": 1520
1128
+ },
1129
+ {
1130
+ "epoch": 6.074336283185841,
1131
+ "grad_norm": 1.196835994720459,
1132
+ "learning_rate": 1.7944936086529007e-05,
1133
+ "loss": 0.6506,
1134
+ "step": 1530
1135
+ },
1136
+ {
1137
+ "epoch": 6.078761061946903,
1138
+ "grad_norm": 0.9212129712104797,
1139
+ "learning_rate": 1.7699115044247787e-05,
1140
+ "loss": 0.6931,
1141
+ "step": 1540
1142
+ },
1143
+ {
1144
+ "epoch": 6.083185840707965,
1145
+ "grad_norm": 1.0447068214416504,
1146
+ "learning_rate": 1.7453294001966568e-05,
1147
+ "loss": 0.7107,
1148
+ "step": 1550
1149
+ },
1150
+ {
1151
+ "epoch": 6.087610619469027,
1152
+ "grad_norm": 1.2515738010406494,
1153
+ "learning_rate": 1.720747295968535e-05,
1154
+ "loss": 0.7178,
1155
+ "step": 1560
1156
+ },
1157
+ {
1158
+ "epoch": 6.092035398230088,
1159
+ "grad_norm": 1.4285340309143066,
1160
+ "learning_rate": 1.6961651917404133e-05,
1161
+ "loss": 0.6929,
1162
+ "step": 1570
1163
+ },
1164
+ {
1165
+ "epoch": 6.09646017699115,
1166
+ "grad_norm": 1.3406482934951782,
1167
+ "learning_rate": 1.6715830875122913e-05,
1168
+ "loss": 0.6826,
1169
+ "step": 1580
1170
+ },
1171
+ {
1172
+ "epoch": 6.100442477876106,
1173
+ "eval_accuracy": 0.7585117585117586,
1174
+ "eval_loss": 0.675682008266449,
1175
+ "eval_runtime": 1089.8885,
1176
+ "eval_samples_per_second": 2.614,
1177
+ "eval_steps_per_second": 0.083,
1178
+ "step": 1589
1179
+ },
1180
+ {
1181
+ "epoch": 7.000442477876106,
1182
+ "grad_norm": 1.920408844947815,
1183
+ "learning_rate": 1.6470009832841694e-05,
1184
+ "loss": 0.6845,
1185
+ "step": 1590
1186
+ },
1187
+ {
1188
+ "epoch": 7.004867256637168,
1189
+ "grad_norm": 1.2930206060409546,
1190
+ "learning_rate": 1.6224188790560475e-05,
1191
+ "loss": 0.6906,
1192
+ "step": 1600
1193
+ },
1194
+ {
1195
+ "epoch": 7.00929203539823,
1196
+ "grad_norm": 1.010541319847107,
1197
+ "learning_rate": 1.5978367748279252e-05,
1198
+ "loss": 0.7002,
1199
+ "step": 1610
1200
+ },
1201
+ {
1202
+ "epoch": 7.013716814159292,
1203
+ "grad_norm": 1.3391730785369873,
1204
+ "learning_rate": 1.5732546705998033e-05,
1205
+ "loss": 0.6841,
1206
+ "step": 1620
1207
+ },
1208
+ {
1209
+ "epoch": 7.018141592920354,
1210
+ "grad_norm": 1.4052796363830566,
1211
+ "learning_rate": 1.5486725663716813e-05,
1212
+ "loss": 0.6611,
1213
+ "step": 1630
1214
+ },
1215
+ {
1216
+ "epoch": 7.022566371681416,
1217
+ "grad_norm": 1.2141647338867188,
1218
+ "learning_rate": 1.5240904621435596e-05,
1219
+ "loss": 0.6856,
1220
+ "step": 1640
1221
+ },
1222
+ {
1223
+ "epoch": 7.026991150442478,
1224
+ "grad_norm": 1.3713358640670776,
1225
+ "learning_rate": 1.4995083579154376e-05,
1226
+ "loss": 0.6978,
1227
+ "step": 1650
1228
+ },
1229
+ {
1230
+ "epoch": 7.03141592920354,
1231
+ "grad_norm": 0.9116381406784058,
1232
+ "learning_rate": 1.4749262536873157e-05,
1233
+ "loss": 0.696,
1234
+ "step": 1660
1235
+ },
1236
+ {
1237
+ "epoch": 7.035840707964602,
1238
+ "grad_norm": 1.1704223155975342,
1239
+ "learning_rate": 1.4503441494591938e-05,
1240
+ "loss": 0.6695,
1241
+ "step": 1670
1242
+ },
1243
+ {
1244
+ "epoch": 7.040265486725664,
1245
+ "grad_norm": 1.1459695100784302,
1246
+ "learning_rate": 1.4257620452310719e-05,
1247
+ "loss": 0.6575,
1248
+ "step": 1680
1249
+ },
1250
+ {
1251
+ "epoch": 7.044690265486726,
1252
+ "grad_norm": 1.098761796951294,
1253
+ "learning_rate": 1.40117994100295e-05,
1254
+ "loss": 0.6905,
1255
+ "step": 1690
1256
+ },
1257
+ {
1258
+ "epoch": 7.049115044247787,
1259
+ "grad_norm": 1.2097493410110474,
1260
+ "learning_rate": 1.376597836774828e-05,
1261
+ "loss": 0.7086,
1262
+ "step": 1700
1263
+ },
1264
+ {
1265
+ "epoch": 7.053539823008849,
1266
+ "grad_norm": 1.3922789096832275,
1267
+ "learning_rate": 1.3520157325467062e-05,
1268
+ "loss": 0.6473,
1269
+ "step": 1710
1270
+ },
1271
+ {
1272
+ "epoch": 7.057964601769911,
1273
+ "grad_norm": 1.064712405204773,
1274
+ "learning_rate": 1.3274336283185843e-05,
1275
+ "loss": 0.6987,
1276
+ "step": 1720
1277
+ },
1278
+ {
1279
+ "epoch": 7.062389380530973,
1280
+ "grad_norm": 1.1013967990875244,
1281
+ "learning_rate": 1.302851524090462e-05,
1282
+ "loss": 0.6593,
1283
+ "step": 1730
1284
+ },
1285
+ {
1286
+ "epoch": 7.0668141592920355,
1287
+ "grad_norm": 0.9073940515518188,
1288
+ "learning_rate": 1.2782694198623401e-05,
1289
+ "loss": 0.6873,
1290
+ "step": 1740
1291
+ },
1292
+ {
1293
+ "epoch": 7.0712389380530976,
1294
+ "grad_norm": 1.1259492635726929,
1295
+ "learning_rate": 1.2536873156342183e-05,
1296
+ "loss": 0.6954,
1297
+ "step": 1750
1298
+ },
1299
+ {
1300
+ "epoch": 7.07566371681416,
1301
+ "grad_norm": 0.8687026500701904,
1302
+ "learning_rate": 1.2291052114060964e-05,
1303
+ "loss": 0.6854,
1304
+ "step": 1760
1305
+ },
1306
+ {
1307
+ "epoch": 7.080088495575222,
1308
+ "grad_norm": 1.687637448310852,
1309
+ "learning_rate": 1.2045231071779745e-05,
1310
+ "loss": 0.6781,
1311
+ "step": 1770
1312
+ },
1313
+ {
1314
+ "epoch": 7.084513274336283,
1315
+ "grad_norm": 1.0679877996444702,
1316
+ "learning_rate": 1.1799410029498525e-05,
1317
+ "loss": 0.7,
1318
+ "step": 1780
1319
+ },
1320
+ {
1321
+ "epoch": 7.088938053097345,
1322
+ "grad_norm": 1.1482867002487183,
1323
+ "learning_rate": 1.1553588987217306e-05,
1324
+ "loss": 0.6494,
1325
+ "step": 1790
1326
+ },
1327
+ {
1328
+ "epoch": 7.093362831858407,
1329
+ "grad_norm": 1.042781949043274,
1330
+ "learning_rate": 1.1307767944936087e-05,
1331
+ "loss": 0.6776,
1332
+ "step": 1800
1333
+ },
1334
+ {
1335
+ "epoch": 7.097787610619469,
1336
+ "grad_norm": 0.8880053162574768,
1337
+ "learning_rate": 1.1061946902654869e-05,
1338
+ "loss": 0.6756,
1339
+ "step": 1810
1340
+ },
1341
+ {
1342
+ "epoch": 7.100442477876106,
1343
+ "eval_accuracy": 0.7630747630747631,
1344
+ "eval_loss": 0.6651633381843567,
1345
+ "eval_runtime": 1032.3877,
1346
+ "eval_samples_per_second": 2.76,
1347
+ "eval_steps_per_second": 0.087,
1348
+ "step": 1816
1349
+ },
1350
+ {
1351
+ "epoch": 8.001769911504425,
1352
+ "grad_norm": 0.9909548163414001,
1353
+ "learning_rate": 1.0816125860373648e-05,
1354
+ "loss": 0.6837,
1355
+ "step": 1820
1356
+ },
1357
+ {
1358
+ "epoch": 8.006194690265486,
1359
+ "grad_norm": 0.943515419960022,
1360
+ "learning_rate": 1.0570304818092429e-05,
1361
+ "loss": 0.665,
1362
+ "step": 1830
1363
+ },
1364
+ {
1365
+ "epoch": 8.01061946902655,
1366
+ "grad_norm": 0.9931963682174683,
1367
+ "learning_rate": 1.032448377581121e-05,
1368
+ "loss": 0.6664,
1369
+ "step": 1840
1370
+ },
1371
+ {
1372
+ "epoch": 8.01504424778761,
1373
+ "grad_norm": 0.8919005393981934,
1374
+ "learning_rate": 1.007866273352999e-05,
1375
+ "loss": 0.7089,
1376
+ "step": 1850
1377
+ },
1378
+ {
1379
+ "epoch": 8.019469026548673,
1380
+ "grad_norm": 1.1344377994537354,
1381
+ "learning_rate": 9.832841691248771e-06,
1382
+ "loss": 0.6589,
1383
+ "step": 1860
1384
+ },
1385
+ {
1386
+ "epoch": 8.023893805309735,
1387
+ "grad_norm": 1.4625028371810913,
1388
+ "learning_rate": 9.587020648967553e-06,
1389
+ "loss": 0.6771,
1390
+ "step": 1870
1391
+ },
1392
+ {
1393
+ "epoch": 8.028318584070796,
1394
+ "grad_norm": 0.8750028610229492,
1395
+ "learning_rate": 9.341199606686332e-06,
1396
+ "loss": 0.6856,
1397
+ "step": 1880
1398
+ },
1399
+ {
1400
+ "epoch": 8.032743362831859,
1401
+ "grad_norm": 1.1741547584533691,
1402
+ "learning_rate": 9.095378564405113e-06,
1403
+ "loss": 0.693,
1404
+ "step": 1890
1405
+ },
1406
+ {
1407
+ "epoch": 8.03716814159292,
1408
+ "grad_norm": 1.0879665613174438,
1409
+ "learning_rate": 8.849557522123894e-06,
1410
+ "loss": 0.7053,
1411
+ "step": 1900
1412
+ },
1413
+ {
1414
+ "epoch": 8.041592920353983,
1415
+ "grad_norm": 0.9571520686149597,
1416
+ "learning_rate": 8.603736479842674e-06,
1417
+ "loss": 0.6415,
1418
+ "step": 1910
1419
+ },
1420
+ {
1421
+ "epoch": 8.046017699115044,
1422
+ "grad_norm": 1.060584545135498,
1423
+ "learning_rate": 8.357915437561457e-06,
1424
+ "loss": 0.6809,
1425
+ "step": 1920
1426
+ },
1427
+ {
1428
+ "epoch": 8.050442477876107,
1429
+ "grad_norm": 1.0861510038375854,
1430
+ "learning_rate": 8.112094395280237e-06,
1431
+ "loss": 0.6771,
1432
+ "step": 1930
1433
+ },
1434
+ {
1435
+ "epoch": 8.054867256637168,
1436
+ "grad_norm": 1.0085254907608032,
1437
+ "learning_rate": 7.866273352999016e-06,
1438
+ "loss": 0.6639,
1439
+ "step": 1940
1440
+ },
1441
+ {
1442
+ "epoch": 8.05929203539823,
1443
+ "grad_norm": 1.0106779336929321,
1444
+ "learning_rate": 7.620452310717798e-06,
1445
+ "loss": 0.6665,
1446
+ "step": 1950
1447
+ },
1448
+ {
1449
+ "epoch": 8.063716814159292,
1450
+ "grad_norm": 1.03801691532135,
1451
+ "learning_rate": 7.374631268436579e-06,
1452
+ "loss": 0.6768,
1453
+ "step": 1960
1454
+ },
1455
+ {
1456
+ "epoch": 8.068141592920353,
1457
+ "grad_norm": 1.242561936378479,
1458
+ "learning_rate": 7.128810226155359e-06,
1459
+ "loss": 0.6154,
1460
+ "step": 1970
1461
+ },
1462
+ {
1463
+ "epoch": 8.072566371681416,
1464
+ "grad_norm": 0.9356180429458618,
1465
+ "learning_rate": 6.88298918387414e-06,
1466
+ "loss": 0.6564,
1467
+ "step": 1980
1468
+ },
1469
+ {
1470
+ "epoch": 8.076991150442478,
1471
+ "grad_norm": 1.3207311630249023,
1472
+ "learning_rate": 6.6371681415929215e-06,
1473
+ "loss": 0.7182,
1474
+ "step": 1990
1475
+ },
1476
+ {
1477
+ "epoch": 8.08141592920354,
1478
+ "grad_norm": 1.6134599447250366,
1479
+ "learning_rate": 6.3913470993117005e-06,
1480
+ "loss": 0.6538,
1481
+ "step": 2000
1482
+ },
1483
+ {
1484
+ "epoch": 8.085840707964602,
1485
+ "grad_norm": 1.317514419555664,
1486
+ "learning_rate": 6.145526057030482e-06,
1487
+ "loss": 0.6756,
1488
+ "step": 2010
1489
+ },
1490
+ {
1491
+ "epoch": 8.090265486725663,
1492
+ "grad_norm": 1.5709384679794312,
1493
+ "learning_rate": 5.899705014749263e-06,
1494
+ "loss": 0.646,
1495
+ "step": 2020
1496
+ },
1497
+ {
1498
+ "epoch": 8.094690265486726,
1499
+ "grad_norm": 0.9243668913841248,
1500
+ "learning_rate": 5.653883972468043e-06,
1501
+ "loss": 0.6908,
1502
+ "step": 2030
1503
+ },
1504
+ {
1505
+ "epoch": 8.099115044247787,
1506
+ "grad_norm": 0.859581708908081,
1507
+ "learning_rate": 5.408062930186824e-06,
1508
+ "loss": 0.6964,
1509
+ "step": 2040
1510
+ },
1511
+ {
1512
+ "epoch": 8.100442477876106,
1513
+ "eval_accuracy": 0.7655317655317655,
1514
+ "eval_loss": 0.6591010093688965,
1515
+ "eval_runtime": 997.7811,
1516
+ "eval_samples_per_second": 2.855,
1517
+ "eval_steps_per_second": 0.09,
1518
+ "step": 2043
1519
+ },
1520
+ {
1521
+ "epoch": 9.003097345132744,
1522
+ "grad_norm": 1.7191717624664307,
1523
+ "learning_rate": 5.162241887905605e-06,
1524
+ "loss": 0.6788,
1525
+ "step": 2050
1526
+ },
1527
+ {
1528
+ "epoch": 9.007522123893805,
1529
+ "grad_norm": 1.2770448923110962,
1530
+ "learning_rate": 4.9164208456243854e-06,
1531
+ "loss": 0.6437,
1532
+ "step": 2060
1533
+ },
1534
+ {
1535
+ "epoch": 9.011946902654866,
1536
+ "grad_norm": 1.0872125625610352,
1537
+ "learning_rate": 4.670599803343166e-06,
1538
+ "loss": 0.6984,
1539
+ "step": 2070
1540
+ },
1541
+ {
1542
+ "epoch": 9.01637168141593,
1543
+ "grad_norm": 0.8692576289176941,
1544
+ "learning_rate": 4.424778761061947e-06,
1545
+ "loss": 0.6377,
1546
+ "step": 2080
1547
+ },
1548
+ {
1549
+ "epoch": 9.02079646017699,
1550
+ "grad_norm": 0.9820857048034668,
1551
+ "learning_rate": 4.178957718780728e-06,
1552
+ "loss": 0.6671,
1553
+ "step": 2090
1554
+ },
1555
+ {
1556
+ "epoch": 9.025221238938054,
1557
+ "grad_norm": 1.6207181215286255,
1558
+ "learning_rate": 3.933136676499508e-06,
1559
+ "loss": 0.683,
1560
+ "step": 2100
1561
+ },
1562
+ {
1563
+ "epoch": 9.029646017699115,
1564
+ "grad_norm": 1.0268634557724,
1565
+ "learning_rate": 3.6873156342182893e-06,
1566
+ "loss": 0.6381,
1567
+ "step": 2110
1568
+ },
1569
+ {
1570
+ "epoch": 9.034070796460178,
1571
+ "grad_norm": 1.0367904901504517,
1572
+ "learning_rate": 3.44149459193707e-06,
1573
+ "loss": 0.6969,
1574
+ "step": 2120
1575
+ },
1576
+ {
1577
+ "epoch": 9.038495575221239,
1578
+ "grad_norm": 1.4884945154190063,
1579
+ "learning_rate": 3.1956735496558502e-06,
1580
+ "loss": 0.6342,
1581
+ "step": 2130
1582
+ },
1583
+ {
1584
+ "epoch": 9.042920353982302,
1585
+ "grad_norm": 1.086195707321167,
1586
+ "learning_rate": 2.9498525073746313e-06,
1587
+ "loss": 0.6649,
1588
+ "step": 2140
1589
+ },
1590
+ {
1591
+ "epoch": 9.047345132743363,
1592
+ "grad_norm": 1.0393646955490112,
1593
+ "learning_rate": 2.704031465093412e-06,
1594
+ "loss": 0.6646,
1595
+ "step": 2150
1596
+ },
1597
+ {
1598
+ "epoch": 9.051769911504424,
1599
+ "grad_norm": 1.188118815422058,
1600
+ "learning_rate": 2.4582104228121927e-06,
1601
+ "loss": 0.6681,
1602
+ "step": 2160
1603
+ },
1604
+ {
1605
+ "epoch": 9.056194690265487,
1606
+ "grad_norm": 1.3695433139801025,
1607
+ "learning_rate": 2.2123893805309734e-06,
1608
+ "loss": 0.667,
1609
+ "step": 2170
1610
+ },
1611
+ {
1612
+ "epoch": 9.060619469026548,
1613
+ "grad_norm": 1.166283130645752,
1614
+ "learning_rate": 1.966568338249754e-06,
1615
+ "loss": 0.679,
1616
+ "step": 2180
1617
+ },
1618
+ {
1619
+ "epoch": 9.065044247787611,
1620
+ "grad_norm": 0.956119179725647,
1621
+ "learning_rate": 1.720747295968535e-06,
1622
+ "loss": 0.6528,
1623
+ "step": 2190
1624
+ },
1625
+ {
1626
+ "epoch": 9.069469026548672,
1627
+ "grad_norm": 1.2216787338256836,
1628
+ "learning_rate": 1.4749262536873157e-06,
1629
+ "loss": 0.6857,
1630
+ "step": 2200
1631
+ },
1632
+ {
1633
+ "epoch": 9.073893805309735,
1634
+ "grad_norm": 1.2176066637039185,
1635
+ "learning_rate": 1.2291052114060964e-06,
1636
+ "loss": 0.6849,
1637
+ "step": 2210
1638
+ },
1639
+ {
1640
+ "epoch": 9.078318584070797,
1641
+ "grad_norm": 0.9874552488327026,
1642
+ "learning_rate": 9.83284169124877e-07,
1643
+ "loss": 0.6788,
1644
+ "step": 2220
1645
+ },
1646
+ {
1647
+ "epoch": 9.082743362831858,
1648
+ "grad_norm": 1.053183913230896,
1649
+ "learning_rate": 7.374631268436578e-07,
1650
+ "loss": 0.6791,
1651
+ "step": 2230
1652
+ },
1653
+ {
1654
+ "epoch": 9.08716814159292,
1655
+ "grad_norm": 1.2449108362197876,
1656
+ "learning_rate": 4.916420845624385e-07,
1657
+ "loss": 0.663,
1658
+ "step": 2240
1659
+ },
1660
+ {
1661
+ "epoch": 9.091592920353982,
1662
+ "grad_norm": 0.9063498377799988,
1663
+ "learning_rate": 2.4582104228121926e-07,
1664
+ "loss": 0.6794,
1665
+ "step": 2250
1666
+ },
1667
+ {
1668
+ "epoch": 9.096017699115045,
1669
+ "grad_norm": 1.1721782684326172,
1670
+ "learning_rate": 0.0,
1671
+ "loss": 0.6943,
1672
+ "step": 2260
1673
+ },
1674
+ {
1675
+ "epoch": 9.096017699115045,
1676
+ "eval_accuracy": 0.7665847665847666,
1677
+ "eval_loss": 0.6571447253227234,
1678
+ "eval_runtime": 1006.2992,
1679
+ "eval_samples_per_second": 2.831,
1680
+ "eval_steps_per_second": 0.089,
1681
+ "step": 2260
1682
+ },
1683
+ {
1684
+ "epoch": 9.096017699115045,
1685
+ "step": 2260,
1686
+ "total_flos": 2.277913504770854e+19,
1687
+ "train_loss": 0.7694057713567684,
1688
+ "train_runtime": 44318.5607,
1689
+ "train_samples_per_second": 1.632,
1690
+ "train_steps_per_second": 0.051
1691
+ },
1692
+ {
1693
+ "epoch": 9.096017699115045,
1694
+ "eval_accuracy": 0.7652173913043478,
1695
+ "eval_loss": 0.663335919380188,
1696
+ "eval_runtime": 925.6485,
1697
+ "eval_samples_per_second": 2.982,
1698
+ "eval_steps_per_second": 0.094,
1699
+ "step": 2260
1700
+ },
1701
+ {
1702
+ "epoch": 9.096017699115045,
1703
+ "eval_accuracy": 0.7652173913043478,
1704
+ "eval_loss": 0.6633358597755432,
1705
+ "eval_runtime": 903.9019,
1706
+ "eval_samples_per_second": 3.053,
1707
+ "eval_steps_per_second": 0.096,
1708
+ "step": 2260
1709
  }
1710
  ],
1711
  "logging_steps": 10,
1712
+ "max_steps": 2260,
1713
  "num_input_tokens_seen": 0,
1714
  "num_train_epochs": 9223372036854775807,
1715
  "save_steps": 500,
 
1725
  "attributes": {}
1726
  }
1727
  },
1728
+ "total_flos": 2.277913504770854e+19,
1729
+ "train_batch_size": 32,
1730
  "trial_name": null,
1731
  "trial_params": null
1732
  }