liusq19 committed on
Commit
ce0c2a3
·
verified ·
1 Parent(s): cb135e3

Model save

Browse files
README.md CHANGED
@@ -1,19 +1,17 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
3
- datasets: HuggingFaceH4/Bespoke-Stratos-17k
4
  library_name: transformers
5
- model_name: Qwen/Qwen2.5-1.5B-Instruct
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
12
  ---
13
 
14
- # Model Card for Qwen/Qwen2.5-1.5B-Instruct
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [HuggingFaceH4/Bespoke-Stratos-17k](https://huggingface.co/datasets/HuggingFaceH4/Bespoke-Stratos-17k) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shiqi_1/huggingface/runs/2klnrejm)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
  base_model: Qwen/Qwen2.5-1.5B-Instruct
 
3
  library_name: transformers
4
+ model_name: Qwen2.5-1.5B-Open-R1-Distill
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
10
  ---
11
 
12
+ # Model Card for Qwen2.5-1.5B-Open-R1-Distill
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/shiqi_1/huggingface/runs/ak49qf9r)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 0.9994447529150472,
3
  "eval_loss": 0.7437080144882202,
4
  "eval_runtime": 13.7531,
5
  "eval_samples": 100,
6
  "eval_samples_per_second": 9.307,
7
  "eval_steps_per_second": 2.327,
8
- "total_flos": 76902580617216.0,
9
- "train_loss": 0.7594085027553417,
10
- "train_runtime": 8931.1449,
11
  "train_samples": 16610,
12
- "train_samples_per_second": 2.42,
13
- "train_steps_per_second": 0.151
14
  }
 
1
  {
2
+ "epoch": 0.999259807549963,
3
  "eval_loss": 0.7437080144882202,
4
  "eval_runtime": 13.7531,
5
  "eval_samples": 100,
6
  "eval_samples_per_second": 9.307,
7
  "eval_steps_per_second": 2.327,
8
+ "total_flos": 76888336760832.0,
9
+ "train_loss": 0.7675936229140671,
10
+ "train_runtime": 4627.4844,
11
  "train_samples": 16610,
12
+ "train_samples_per_second": 4.67,
13
+ "train_steps_per_second": 0.146
14
  }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0.dev0",
26
- "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0.dev0",
26
+ "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b31468cf55d32779e4c1fd9e1d076cf687a4ebdc81989daf2bba4471cc9f355e
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af43e3f1e581c8d60a24df84d884fb99613a2dff51fa2c3605e5a1fc0cbe43d2
3
  size 3087467144
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 0.9994447529150472,
3
- "total_flos": 76902580617216.0,
4
- "train_loss": 0.7594085027553417,
5
- "train_runtime": 8931.1449,
6
  "train_samples": 16610,
7
- "train_samples_per_second": 2.42,
8
- "train_steps_per_second": 0.151
9
  }
 
1
  {
2
+ "epoch": 0.999259807549963,
3
+ "total_flos": 76888336760832.0,
4
+ "train_loss": 0.7675936229140671,
5
+ "train_runtime": 4627.4844,
6
  "train_samples": 16610,
7
+ "train_samples_per_second": 4.67,
8
+ "train_steps_per_second": 0.146
9
  }
trainer_state.json CHANGED
@@ -1,2019 +1,1018 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9994447529150472,
5
  "eval_steps": 100,
6
- "global_step": 1350,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0037016472330186935,
13
- "grad_norm": 2.6187397645490957,
14
- "learning_rate": 7.407407407407407e-07,
15
- "loss": 1.1051,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 0.007403294466037387,
20
- "grad_norm": 2.626149854731043,
21
- "learning_rate": 1.4814814814814815e-06,
22
- "loss": 1.0488,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.01110494169905608,
27
- "grad_norm": 2.1918725080390606,
28
- "learning_rate": 2.222222222222222e-06,
29
- "loss": 1.0739,
30
  "step": 15
31
  },
32
  {
33
- "epoch": 0.014806588932074774,
34
- "grad_norm": 1.664203092963064,
35
- "learning_rate": 2.962962962962963e-06,
36
- "loss": 1.0591,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.018508236165093468,
41
- "grad_norm": 1.6572531314700638,
42
- "learning_rate": 3.7037037037037037e-06,
43
- "loss": 1.0203,
44
  "step": 25
45
  },
46
  {
47
- "epoch": 0.02220988339811216,
48
- "grad_norm": 1.2520837580448105,
49
- "learning_rate": 4.444444444444444e-06,
50
- "loss": 1.0001,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.025911530631130855,
55
- "grad_norm": 1.1838717775548426,
56
- "learning_rate": 5.185185185185185e-06,
57
- "loss": 0.937,
58
  "step": 35
59
  },
60
  {
61
- "epoch": 0.029613177864149548,
62
- "grad_norm": 1.0877804851125874,
63
- "learning_rate": 5.925925925925926e-06,
64
- "loss": 0.9428,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.03331482509716824,
69
- "grad_norm": 1.0468818723804836,
70
- "learning_rate": 6.666666666666667e-06,
71
- "loss": 0.9189,
72
  "step": 45
73
  },
74
  {
75
- "epoch": 0.037016472330186935,
76
- "grad_norm": 0.9963468132619961,
77
- "learning_rate": 7.4074074074074075e-06,
78
- "loss": 0.8983,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.040718119563205625,
83
- "grad_norm": 0.9587759497954821,
84
- "learning_rate": 8.148148148148148e-06,
85
- "loss": 0.8727,
86
  "step": 55
87
  },
88
  {
89
- "epoch": 0.04441976679622432,
90
- "grad_norm": 0.9190997603177266,
91
- "learning_rate": 8.888888888888888e-06,
92
- "loss": 0.8744,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.04812141402924301,
97
- "grad_norm": 0.9023307139402018,
98
- "learning_rate": 9.62962962962963e-06,
99
- "loss": 0.8342,
100
  "step": 65
101
  },
102
  {
103
- "epoch": 0.05182306126226171,
104
- "grad_norm": 1.0636801138652445,
105
- "learning_rate": 1.037037037037037e-05,
106
- "loss": 0.8617,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.0555247084952804,
111
- "grad_norm": 0.921690264948927,
112
- "learning_rate": 1.1111111111111113e-05,
113
- "loss": 0.8508,
114
  "step": 75
115
  },
116
  {
117
- "epoch": 0.059226355728299096,
118
- "grad_norm": 0.9474476132241261,
119
- "learning_rate": 1.1851851851851852e-05,
120
- "loss": 0.8151,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.06292800296131779,
125
- "grad_norm": 0.8997909905563047,
126
- "learning_rate": 1.2592592592592593e-05,
127
- "loss": 0.8292,
128
  "step": 85
129
  },
130
  {
131
- "epoch": 0.06662965019433648,
132
- "grad_norm": 0.9441138820892802,
133
- "learning_rate": 1.3333333333333333e-05,
134
- "loss": 0.8196,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.07033129742735518,
139
- "grad_norm": 0.928380523664138,
140
- "learning_rate": 1.4074074074074075e-05,
141
- "loss": 0.8431,
142
  "step": 95
143
  },
144
  {
145
- "epoch": 0.07403294466037387,
146
- "grad_norm": 0.913846755266702,
147
- "learning_rate": 1.4814814814814815e-05,
148
- "loss": 0.8401,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.07403294466037387,
153
- "eval_loss": 0.8453992605209351,
154
- "eval_runtime": 13.9552,
155
- "eval_samples_per_second": 9.172,
156
- "eval_steps_per_second": 2.293,
157
  "step": 100
158
  },
159
  {
160
- "epoch": 0.07773459189339256,
161
- "grad_norm": 1.0066526224903312,
162
- "learning_rate": 1.555555555555556e-05,
163
- "loss": 0.8371,
164
  "step": 105
165
  },
166
  {
167
- "epoch": 0.08143623912641125,
168
- "grad_norm": 0.9627584187646963,
169
- "learning_rate": 1.6296296296296297e-05,
170
- "loss": 0.8401,
171
  "step": 110
172
  },
173
  {
174
- "epoch": 0.08513788635942994,
175
- "grad_norm": 1.043835378412796,
176
- "learning_rate": 1.7037037037037038e-05,
177
- "loss": 0.8056,
178
  "step": 115
179
  },
180
  {
181
- "epoch": 0.08883953359244864,
182
- "grad_norm": 1.0717871960568617,
183
- "learning_rate": 1.7777777777777777e-05,
184
- "loss": 0.8279,
185
  "step": 120
186
  },
187
  {
188
- "epoch": 0.09254118082546733,
189
- "grad_norm": 0.9708258016301404,
190
- "learning_rate": 1.851851851851852e-05,
191
- "loss": 0.7864,
192
  "step": 125
193
  },
194
  {
195
- "epoch": 0.09624282805848602,
196
- "grad_norm": 0.9639104638720157,
197
- "learning_rate": 1.925925925925926e-05,
198
- "loss": 0.8442,
199
  "step": 130
200
  },
201
  {
202
- "epoch": 0.09994447529150471,
203
- "grad_norm": 1.062054926130498,
204
- "learning_rate": 2e-05,
205
- "loss": 0.7993,
206
  "step": 135
207
  },
208
  {
209
- "epoch": 0.10364612252452342,
210
- "grad_norm": 1.0677672852122793,
211
- "learning_rate": 1.9999164298554375e-05,
212
- "loss": 0.8285,
213
  "step": 140
214
  },
215
  {
216
- "epoch": 0.10734776975754211,
217
- "grad_norm": 1.0144467644285207,
218
- "learning_rate": 1.9996657333896875e-05,
219
- "loss": 0.8313,
220
  "step": 145
221
  },
222
  {
223
- "epoch": 0.1110494169905608,
224
- "grad_norm": 1.179093283782147,
225
- "learning_rate": 1.9992479525042305e-05,
226
- "loss": 0.788,
227
  "step": 150
228
  },
229
  {
230
- "epoch": 0.11475106422357949,
231
- "grad_norm": 1.0894219697825935,
232
- "learning_rate": 1.9986631570270835e-05,
233
- "loss": 0.8223,
234
  "step": 155
235
  },
236
  {
237
- "epoch": 0.11845271145659819,
238
- "grad_norm": 0.9611330729714898,
239
- "learning_rate": 1.9979114447011323e-05,
240
- "loss": 0.7773,
241
  "step": 160
242
  },
243
  {
244
- "epoch": 0.12215435868961688,
245
- "grad_norm": 0.8836942084290859,
246
- "learning_rate": 1.996992941167792e-05,
247
- "loss": 0.7919,
248
  "step": 165
249
  },
250
  {
251
- "epoch": 0.12585600592263557,
252
- "grad_norm": 0.9325794007240239,
253
- "learning_rate": 1.9959077999460094e-05,
254
- "loss": 0.7888,
255
  "step": 170
256
  },
257
  {
258
- "epoch": 0.12955765315565426,
259
- "grad_norm": 0.9951559311754377,
260
- "learning_rate": 1.9946562024066018e-05,
261
- "loss": 0.7817,
262
  "step": 175
263
  },
264
  {
265
- "epoch": 0.13325930038867295,
266
- "grad_norm": 0.9681713465677686,
267
- "learning_rate": 1.9932383577419432e-05,
268
- "loss": 0.8005,
269
  "step": 180
270
  },
271
  {
272
- "epoch": 0.13696094762169164,
273
- "grad_norm": 1.0088921380567866,
274
- "learning_rate": 1.991654502931001e-05,
275
- "loss": 0.7749,
276
  "step": 185
277
  },
278
  {
279
- "epoch": 0.14066259485471036,
280
- "grad_norm": 1.0845994864602486,
281
- "learning_rate": 1.9899049026997272e-05,
282
- "loss": 0.7994,
283
  "step": 190
284
  },
285
  {
286
- "epoch": 0.14436424208772905,
287
- "grad_norm": 0.8934316580644518,
288
- "learning_rate": 1.9879898494768093e-05,
289
- "loss": 0.8145,
290
  "step": 195
291
  },
292
  {
293
- "epoch": 0.14806588932074774,
294
- "grad_norm": 1.0410789338035273,
295
- "learning_rate": 1.9859096633447965e-05,
296
- "loss": 0.8111,
297
  "step": 200
298
  },
299
  {
300
- "epoch": 0.14806588932074774,
301
- "eval_loss": 0.8161085844039917,
302
- "eval_runtime": 13.9613,
303
- "eval_samples_per_second": 9.168,
304
- "eval_steps_per_second": 2.292,
305
  "step": 200
306
  },
307
  {
308
- "epoch": 0.15176753655376643,
309
- "grad_norm": 0.9827709342036304,
310
- "learning_rate": 1.9836646919866012e-05,
311
- "loss": 0.7804,
312
  "step": 205
313
  },
314
  {
315
- "epoch": 0.15546918378678512,
316
- "grad_norm": 1.003474849989548,
317
- "learning_rate": 1.9812553106273848e-05,
318
- "loss": 0.7971,
319
  "step": 210
320
  },
321
  {
322
- "epoch": 0.1591708310198038,
323
- "grad_norm": 0.9703054353773543,
324
- "learning_rate": 1.9786819219718443e-05,
325
- "loss": 0.7735,
326
  "step": 215
327
  },
328
  {
329
- "epoch": 0.1628724782528225,
330
- "grad_norm": 0.8887456196929027,
331
- "learning_rate": 1.9759449561369036e-05,
332
- "loss": 0.7913,
333
  "step": 220
334
  },
335
  {
336
- "epoch": 0.1665741254858412,
337
- "grad_norm": 0.9432643025255308,
338
- "learning_rate": 1.973044870579824e-05,
339
- "loss": 0.7996,
340
  "step": 225
341
  },
342
  {
343
- "epoch": 0.17027577271885988,
344
- "grad_norm": 0.839226549249778,
345
- "learning_rate": 1.9699821500217436e-05,
346
- "loss": 0.7226,
347
  "step": 230
348
  },
349
  {
350
- "epoch": 0.1739774199518786,
351
- "grad_norm": 1.091867770357073,
352
- "learning_rate": 1.9667573063666622e-05,
353
- "loss": 0.7977,
354
  "step": 235
355
  },
356
  {
357
- "epoch": 0.1776790671848973,
358
- "grad_norm": 0.8957970876878486,
359
- "learning_rate": 1.9633708786158803e-05,
360
- "loss": 0.7824,
361
  "step": 240
362
  },
363
  {
364
- "epoch": 0.18138071441791598,
365
- "grad_norm": 0.9277895788218994,
366
- "learning_rate": 1.959823432777912e-05,
367
- "loss": 0.8142,
368
  "step": 245
369
  },
370
  {
371
- "epoch": 0.18508236165093467,
372
- "grad_norm": 0.9204433681878583,
373
- "learning_rate": 1.95611556177388e-05,
374
- "loss": 0.763,
375
  "step": 250
376
  },
377
  {
378
- "epoch": 0.18878400888395336,
379
- "grad_norm": 0.8426481109707195,
380
- "learning_rate": 1.9522478853384154e-05,
381
- "loss": 0.7579,
382
  "step": 255
383
  },
384
  {
385
- "epoch": 0.19248565611697205,
386
- "grad_norm": 0.9563840160596006,
387
- "learning_rate": 1.9482210499160767e-05,
388
- "loss": 0.8039,
389
  "step": 260
390
  },
391
  {
392
- "epoch": 0.19618730334999074,
393
- "grad_norm": 0.876265917540201,
394
- "learning_rate": 1.9440357285533e-05,
395
- "loss": 0.7426,
396
  "step": 265
397
  },
398
  {
399
- "epoch": 0.19988895058300943,
400
- "grad_norm": 0.8616754726225433,
401
- "learning_rate": 1.9396926207859085e-05,
402
- "loss": 0.7934,
403
  "step": 270
404
  },
405
  {
406
- "epoch": 0.20359059781602815,
407
- "grad_norm": 0.9982907243202536,
408
- "learning_rate": 1.93519245252219e-05,
409
- "loss": 0.7933,
410
  "step": 275
411
  },
412
  {
413
- "epoch": 0.20729224504904684,
414
- "grad_norm": 0.9048851945663438,
415
- "learning_rate": 1.9305359759215686e-05,
416
- "loss": 0.7904,
417
  "step": 280
418
  },
419
  {
420
- "epoch": 0.21099389228206553,
421
- "grad_norm": 0.9101761515606357,
422
- "learning_rate": 1.9257239692688907e-05,
423
- "loss": 0.7603,
424
  "step": 285
425
  },
426
  {
427
- "epoch": 0.21469553951508422,
428
- "grad_norm": 0.9117583933872578,
429
- "learning_rate": 1.9207572368443386e-05,
430
- "loss": 0.7861,
431
  "step": 290
432
  },
433
  {
434
- "epoch": 0.2183971867481029,
435
- "grad_norm": 1.0650351555940032,
436
- "learning_rate": 1.9156366087890062e-05,
437
- "loss": 0.788,
438
  "step": 295
439
  },
440
  {
441
- "epoch": 0.2220988339811216,
442
- "grad_norm": 0.969249362194927,
443
- "learning_rate": 1.9103629409661468e-05,
444
- "loss": 0.7513,
445
  "step": 300
446
  },
447
  {
448
- "epoch": 0.2220988339811216,
449
- "eval_loss": 0.799350380897522,
450
- "eval_runtime": 13.9587,
451
- "eval_samples_per_second": 9.17,
452
- "eval_steps_per_second": 2.292,
453
  "step": 300
454
  },
455
  {
456
- "epoch": 0.2258004812141403,
457
- "grad_norm": 0.9004354980972079,
458
- "learning_rate": 1.9049371148181253e-05,
459
- "loss": 0.7873,
460
  "step": 305
461
  },
462
  {
463
- "epoch": 0.22950212844715898,
464
- "grad_norm": 0.9646842722396891,
465
- "learning_rate": 1.8993600372190933e-05,
466
- "loss": 0.811,
467
  "step": 310
468
  },
469
  {
470
- "epoch": 0.23320377568017767,
471
- "grad_norm": 0.9534884432642204,
472
- "learning_rate": 1.8936326403234125e-05,
473
- "loss": 0.8029,
474
  "step": 315
475
  },
476
  {
477
- "epoch": 0.23690542291319638,
478
- "grad_norm": 1.0197381583907712,
479
- "learning_rate": 1.8877558814098564e-05,
480
- "loss": 0.8012,
481
  "step": 320
482
  },
483
  {
484
- "epoch": 0.24060707014621507,
485
- "grad_norm": 0.8801464820705825,
486
- "learning_rate": 1.881730742721608e-05,
487
- "loss": 0.7725,
488
  "step": 325
489
  },
490
  {
491
- "epoch": 0.24430871737923376,
492
- "grad_norm": 0.9133382986967378,
493
- "learning_rate": 1.8755582313020912e-05,
494
- "loss": 0.7792,
495
  "step": 330
496
  },
497
  {
498
- "epoch": 0.24801036461225245,
499
- "grad_norm": 0.9131840239823398,
500
- "learning_rate": 1.8692393788266477e-05,
501
- "loss": 0.7902,
502
  "step": 335
503
  },
504
  {
505
- "epoch": 0.25171201184527114,
506
- "grad_norm": 0.9107021211247343,
507
- "learning_rate": 1.8627752414301087e-05,
508
- "loss": 0.7952,
509
  "step": 340
510
  },
511
  {
512
- "epoch": 0.25541365907828983,
513
- "grad_norm": 0.8995888119431379,
514
- "learning_rate": 1.8561668995302668e-05,
515
- "loss": 0.7901,
516
  "step": 345
517
  },
518
  {
519
- "epoch": 0.2591153063113085,
520
- "grad_norm": 0.852186358263588,
521
- "learning_rate": 1.8494154576472976e-05,
522
- "loss": 0.7695,
523
  "step": 350
524
  },
525
  {
526
- "epoch": 0.2628169535443272,
527
- "grad_norm": 0.8396394207930936,
528
- "learning_rate": 1.8425220442191496e-05,
529
- "loss": 0.7635,
530
  "step": 355
531
  },
532
  {
533
- "epoch": 0.2665186007773459,
534
- "grad_norm": 0.8979065139575588,
535
- "learning_rate": 1.8354878114129368e-05,
536
- "loss": 0.7397,
537
  "step": 360
538
  },
539
  {
540
- "epoch": 0.2702202480103646,
541
- "grad_norm": 0.9830492467939624,
542
- "learning_rate": 1.8283139349323632e-05,
543
- "loss": 0.8034,
544
  "step": 365
545
  },
546
  {
547
- "epoch": 0.2739218952433833,
548
- "grad_norm": 0.8322915324627383,
549
- "learning_rate": 1.8210016138212186e-05,
550
- "loss": 0.7463,
551
  "step": 370
552
  },
553
  {
554
- "epoch": 0.277623542476402,
555
- "grad_norm": 0.758748760459231,
556
- "learning_rate": 1.8135520702629677e-05,
557
- "loss": 0.7175,
558
  "step": 375
559
  },
560
  {
561
- "epoch": 0.2813251897094207,
562
- "grad_norm": 0.9061675038903559,
563
- "learning_rate": 1.8059665493764745e-05,
564
- "loss": 0.7905,
565
  "step": 380
566
  },
567
  {
568
- "epoch": 0.2850268369424394,
569
- "grad_norm": 0.9120525610140781,
570
- "learning_rate": 1.7982463190078928e-05,
571
- "loss": 0.7422,
572
  "step": 385
573
  },
574
  {
575
- "epoch": 0.2887284841754581,
576
- "grad_norm": 0.8683320457225452,
577
- "learning_rate": 1.7903926695187595e-05,
578
- "loss": 0.7975,
579
  "step": 390
580
  },
581
  {
582
- "epoch": 0.2924301314084768,
583
- "grad_norm": 0.8975271488943286,
584
- "learning_rate": 1.78240691357032e-05,
585
- "loss": 0.766,
586
  "step": 395
587
  },
588
  {
589
- "epoch": 0.2961317786414955,
590
- "grad_norm": 0.8416591840774865,
591
- "learning_rate": 1.7742903859041324e-05,
592
- "loss": 0.7762,
593
  "step": 400
594
  },
595
  {
596
- "epoch": 0.2961317786414955,
597
- "eval_loss": 0.7873815298080444,
598
- "eval_runtime": 13.9696,
599
- "eval_samples_per_second": 9.163,
600
- "eval_steps_per_second": 2.291,
601
  "step": 400
602
  },
603
  {
604
- "epoch": 0.29983342587451417,
605
- "grad_norm": 0.8046330563391425,
606
- "learning_rate": 1.766044443118978e-05,
607
- "loss": 0.7349,
608
  "step": 405
609
  },
610
  {
611
- "epoch": 0.30353507310753286,
612
- "grad_norm": 0.9009575122141973,
613
- "learning_rate": 1.757670463444118e-05,
614
- "loss": 0.8051,
615
  "step": 410
616
  },
617
  {
618
- "epoch": 0.30723672034055155,
619
- "grad_norm": 0.8753276872380578,
620
- "learning_rate": 1.749169846508936e-05,
621
- "loss": 0.7377,
622
  "step": 415
623
  },
624
  {
625
- "epoch": 0.31093836757357024,
626
- "grad_norm": 0.9131143743513088,
627
- "learning_rate": 1.740544013109005e-05,
628
- "loss": 0.7686,
629
  "step": 420
630
  },
631
  {
632
- "epoch": 0.31464001480658893,
633
- "grad_norm": 0.961063073180003,
634
- "learning_rate": 1.7317944049686125e-05,
635
- "loss": 0.7572,
636
  "step": 425
637
  },
638
  {
639
- "epoch": 0.3183416620396076,
640
- "grad_norm": 0.9242168018337253,
641
- "learning_rate": 1.722922484499793e-05,
642
- "loss": 0.7541,
643
  "step": 430
644
  },
645
  {
646
- "epoch": 0.3220433092726263,
647
- "grad_norm": 0.8102879065641984,
648
- "learning_rate": 1.7139297345578992e-05,
649
- "loss": 0.7306,
650
  "step": 435
651
  },
652
  {
653
- "epoch": 0.325744956505645,
654
- "grad_norm": 0.8772119423935111,
655
- "learning_rate": 1.7048176581937562e-05,
656
- "loss": 0.7861,
657
  "step": 440
658
  },
659
  {
660
- "epoch": 0.3294466037386637,
661
- "grad_norm": 0.9421126607604352,
662
- "learning_rate": 1.6955877784024418e-05,
663
- "loss": 0.7536,
664
  "step": 445
665
  },
666
  {
667
- "epoch": 0.3331482509716824,
668
- "grad_norm": 0.855630117564788,
669
- "learning_rate": 1.686241637868734e-05,
670
- "loss": 0.7909,
671
  "step": 450
672
  },
673
  {
674
- "epoch": 0.33684989820470107,
675
- "grad_norm": 0.9554030247369313,
676
- "learning_rate": 1.676780798709262e-05,
677
- "loss": 0.7732,
678
  "step": 455
679
  },
680
  {
681
- "epoch": 0.34055154543771976,
682
- "grad_norm": 0.8311286009688098,
683
- "learning_rate": 1.6672068422114195e-05,
684
- "loss": 0.8165,
685
  "step": 460
686
  },
687
  {
688
- "epoch": 0.3442531926707385,
689
- "grad_norm": 0.9484301771196552,
690
- "learning_rate": 1.657521368569064e-05,
691
- "loss": 0.7728,
692
  "step": 465
693
  },
694
  {
695
- "epoch": 0.3479548399037572,
696
- "grad_norm": 0.9544835313642219,
697
- "learning_rate": 1.647725996615059e-05,
698
- "loss": 0.7393,
699
  "step": 470
700
  },
701
  {
702
- "epoch": 0.3516564871367759,
703
- "grad_norm": 0.8209462524094394,
704
- "learning_rate": 1.637822363550706e-05,
705
- "loss": 0.7568,
706
  "step": 475
707
  },
708
  {
709
- "epoch": 0.3553581343697946,
710
- "grad_norm": 0.8358271111736775,
711
- "learning_rate": 1.627812124672099e-05,
712
- "loss": 0.7617,
713
  "step": 480
714
  },
715
  {
716
- "epoch": 0.35905978160281327,
717
- "grad_norm": 0.9023899903905284,
718
- "learning_rate": 1.6176969530934573e-05,
719
- "loss": 0.7367,
720
  "step": 485
721
  },
722
  {
723
- "epoch": 0.36276142883583196,
724
- "grad_norm": 0.9096525737441111,
725
- "learning_rate": 1.6074785394674835e-05,
726
- "loss": 0.7623,
727
  "step": 490
728
  },
729
  {
730
- "epoch": 0.36646307606885065,
731
- "grad_norm": 0.8825159281951512,
732
- "learning_rate": 1.5971585917027864e-05,
733
- "loss": 0.7428,
734
  "step": 495
735
  },
736
  {
737
- "epoch": 0.37016472330186934,
738
- "grad_norm": 0.8614364489136819,
739
- "learning_rate": 1.586738834678418e-05,
740
- "loss": 0.7879,
741
  "step": 500
742
  },
743
  {
744
- "epoch": 0.37016472330186934,
745
- "eval_loss": 0.7775759100914001,
746
- "eval_runtime": 13.9703,
747
- "eval_samples_per_second": 9.162,
748
- "eval_steps_per_second": 2.291,
749
  "step": 500
750
  },
751
  {
752
- "epoch": 0.373866370534888,
753
- "grad_norm": 0.825428849040592,
754
- "learning_rate": 1.5762210099555804e-05,
755
- "loss": 0.7282,
756
  "step": 505
757
  },
758
  {
759
- "epoch": 0.3775680177679067,
760
- "grad_norm": 0.8915281904864577,
761
- "learning_rate": 1.5656068754865388e-05,
762
- "loss": 0.7478,
763
  "step": 510
764
  },
765
  {
766
- "epoch": 0.3812696650009254,
767
- "grad_norm": 0.8495286520615039,
768
- "learning_rate": 1.554898205320797e-05,
769
- "loss": 0.7189,
770
  "step": 515
771
  },
772
  {
773
- "epoch": 0.3849713122339441,
774
- "grad_norm": 0.8346171153567965,
775
- "learning_rate": 1.5440967893085827e-05,
776
- "loss": 0.7418,
777
  "step": 520
778
  },
779
  {
780
- "epoch": 0.3886729594669628,
781
- "grad_norm": 0.8454426246332264,
782
- "learning_rate": 1.5332044328016916e-05,
783
- "loss": 0.7427,
784
  "step": 525
785
  },
786
  {
787
- "epoch": 0.3923746066999815,
788
- "grad_norm": 0.8172271580093257,
789
- "learning_rate": 1.5222229563517385e-05,
790
- "loss": 0.766,
791
  "step": 530
792
  },
793
  {
794
- "epoch": 0.39607625393300017,
795
- "grad_norm": 0.756080108149318,
796
- "learning_rate": 1.5111541954058733e-05,
797
- "loss": 0.7552,
798
  "step": 535
799
  },
800
  {
801
- "epoch": 0.39977790116601886,
802
- "grad_norm": 0.9286111422412047,
803
- "learning_rate": 1.5000000000000002e-05,
804
- "loss": 0.7139,
805
  "step": 540
806
  },
807
  {
808
- "epoch": 0.40347954839903755,
809
- "grad_norm": 0.901270025112976,
810
- "learning_rate": 1.4887622344495643e-05,
811
- "loss": 0.766,
812
  "step": 545
813
  },
814
  {
815
- "epoch": 0.4071811956320563,
816
- "grad_norm": 0.8518885453623255,
817
- "learning_rate": 1.4774427770379492e-05,
818
- "loss": 0.7524,
819
  "step": 550
820
  },
821
  {
822
- "epoch": 0.410882842865075,
823
- "grad_norm": 0.7850923823705602,
824
- "learning_rate": 1.4660435197025391e-05,
825
- "loss": 0.7303,
826
  "step": 555
827
  },
828
  {
829
- "epoch": 0.41458449009809367,
830
- "grad_norm": 0.8809118291053211,
831
- "learning_rate": 1.4545663677185007e-05,
832
- "loss": 0.7508,
833
  "step": 560
834
  },
835
  {
836
- "epoch": 0.41828613733111236,
837
- "grad_norm": 0.9204090822346441,
838
- "learning_rate": 1.4430132393803353e-05,
839
- "loss": 0.7577,
840
  "step": 565
841
  },
842
  {
843
- "epoch": 0.42198778456413105,
844
- "grad_norm": 0.88540873125467,
845
- "learning_rate": 1.4313860656812537e-05,
846
- "loss": 0.7451,
847
  "step": 570
848
  },
849
  {
850
- "epoch": 0.42568943179714974,
851
- "grad_norm": 0.9404522480973243,
852
- "learning_rate": 1.4196867899904292e-05,
853
- "loss": 0.7628,
854
  "step": 575
855
  },
856
  {
857
- "epoch": 0.42939107903016843,
858
- "grad_norm": 0.9134791919932267,
859
- "learning_rate": 1.4079173677281836e-05,
860
- "loss": 0.7958,
861
  "step": 580
862
  },
863
  {
864
- "epoch": 0.4330927262631871,
865
- "grad_norm": 0.8547063134065127,
866
- "learning_rate": 1.396079766039157e-05,
867
- "loss": 0.7423,
868
  "step": 585
869
  },
870
  {
871
- "epoch": 0.4367943734962058,
872
- "grad_norm": 0.8271676690566563,
873
- "learning_rate": 1.3841759634635177e-05,
874
- "loss": 0.7353,
875
  "step": 590
876
  },
877
  {
878
- "epoch": 0.4404960207292245,
879
- "grad_norm": 0.8262219798664319,
880
- "learning_rate": 1.3722079496062702e-05,
881
- "loss": 0.7601,
882
  "step": 595
883
  },
884
  {
885
- "epoch": 0.4441976679622432,
886
- "grad_norm": 0.9475599443388361,
887
- "learning_rate": 1.3601777248047105e-05,
888
- "loss": 0.7595,
889
  "step": 600
890
  },
891
  {
892
- "epoch": 0.4441976679622432,
893
- "eval_loss": 0.772025465965271,
894
- "eval_runtime": 13.9495,
895
- "eval_samples_per_second": 9.176,
896
- "eval_steps_per_second": 2.294,
897
  "step": 600
898
  },
899
  {
900
- "epoch": 0.4478993151952619,
901
- "grad_norm": 0.9511942323230904,
902
- "learning_rate": 1.3480872997940906e-05,
903
- "loss": 0.7392,
904
  "step": 605
905
  },
906
  {
907
- "epoch": 0.4516009624282806,
908
- "grad_norm": 0.8807518033426519,
909
- "learning_rate": 1.3359386953715423e-05,
910
- "loss": 0.7627,
911
  "step": 610
912
  },
913
  {
914
- "epoch": 0.45530260966129926,
915
- "grad_norm": 0.8362844884857521,
916
- "learning_rate": 1.3237339420583213e-05,
917
- "loss": 0.7117,
918
  "step": 615
919
  },
920
  {
921
- "epoch": 0.45900425689431795,
922
- "grad_norm": 0.8150528468524839,
923
- "learning_rate": 1.3114750797604248e-05,
924
- "loss": 0.747,
925
  "step": 620
926
  },
927
  {
928
- "epoch": 0.46270590412733664,
929
- "grad_norm": 0.8362983698312987,
930
- "learning_rate": 1.2991641574276419e-05,
931
- "loss": 0.7419,
932
  "step": 625
933
  },
934
  {
935
- "epoch": 0.46640755136035533,
936
- "grad_norm": 0.9050332105138127,
937
- "learning_rate": 1.2868032327110904e-05,
938
- "loss": 0.752,
939
  "step": 630
940
  },
941
  {
942
- "epoch": 0.4701091985933741,
943
- "grad_norm": 0.9133538944566808,
944
- "learning_rate": 1.2743943716193017e-05,
945
- "loss": 0.7402,
946
  "step": 635
947
  },
948
  {
949
- "epoch": 0.47381084582639277,
950
- "grad_norm": 0.8619981266901199,
951
- "learning_rate": 1.261939648172906e-05,
952
- "loss": 0.7226,
953
  "step": 640
954
  },
955
  {
956
- "epoch": 0.47751249305941146,
957
- "grad_norm": 0.7987558090786966,
958
- "learning_rate": 1.2494411440579814e-05,
959
- "loss": 0.7342,
960
  "step": 645
961
  },
962
  {
963
- "epoch": 0.48121414029243015,
964
- "grad_norm": 0.8486334284082256,
965
- "learning_rate": 1.2369009482781191e-05,
966
- "loss": 0.7366,
967
  "step": 650
968
  },
969
  {
970
- "epoch": 0.48491578752544884,
971
- "grad_norm": 0.8017574485719782,
972
- "learning_rate": 1.2243211568052678e-05,
973
- "loss": 0.7557,
974
  "step": 655
975
  },
976
  {
977
- "epoch": 0.48861743475846753,
978
- "grad_norm": 0.7936854819266611,
979
- "learning_rate": 1.211703872229411e-05,
980
- "loss": 0.7365,
981
  "step": 660
982
  },
983
  {
984
- "epoch": 0.4923190819914862,
985
- "grad_norm": 0.8228422615712129,
986
- "learning_rate": 1.1990512034071407e-05,
987
- "loss": 0.7357,
988
  "step": 665
989
  },
990
  {
991
- "epoch": 0.4960207292245049,
992
- "grad_norm": 0.8799075927771709,
993
- "learning_rate": 1.1863652651091824e-05,
994
- "loss": 0.7483,
995
  "step": 670
996
  },
997
  {
998
- "epoch": 0.4997223764575236,
999
- "grad_norm": 0.7966461968332786,
1000
- "learning_rate": 1.1736481776669307e-05,
1001
- "loss": 0.7624,
1002
- "step": 675
1003
- },
1004
- {
1005
- "epoch": 0.5034240236905423,
1006
- "grad_norm": 0.8566725582276603,
1007
- "learning_rate": 1.1609020666180574e-05,
1008
- "loss": 0.7285,
1009
- "step": 680
1010
- },
1011
- {
1012
- "epoch": 0.507125670923561,
1013
- "grad_norm": 0.8886455444981616,
1014
- "learning_rate": 1.1481290623512491e-05,
1015
- "loss": 0.7679,
1016
- "step": 685
1017
- },
1018
- {
1019
- "epoch": 0.5108273181565797,
1020
- "grad_norm": 0.9012557873417587,
1021
- "learning_rate": 1.1353312997501313e-05,
1022
- "loss": 0.7078,
1023
- "step": 690
1024
- },
1025
- {
1026
- "epoch": 0.5145289653895984,
1027
- "grad_norm": 0.8446186262555797,
1028
- "learning_rate": 1.1225109178364456e-05,
1029
- "loss": 0.7482,
1030
- "step": 695
1031
- },
1032
- {
1033
- "epoch": 0.518230612622617,
1034
- "grad_norm": 0.8295700413779421,
1035
- "learning_rate": 1.1096700594125318e-05,
1036
- "loss": 0.7021,
1037
- "step": 700
1038
- },
1039
- {
1040
- "epoch": 0.518230612622617,
1041
- "eval_loss": 0.7634979486465454,
1042
- "eval_runtime": 13.9953,
1043
- "eval_samples_per_second": 9.146,
1044
- "eval_steps_per_second": 2.286,
1045
- "step": 700
1046
- },
1047
- {
1048
- "epoch": 0.5219322598556357,
1049
- "grad_norm": 0.8742356168350276,
1050
- "learning_rate": 1.0968108707031792e-05,
1051
- "loss": 0.7329,
1052
- "step": 705
1053
- },
1054
- {
1055
- "epoch": 0.5256339070886544,
1056
- "grad_norm": 0.8314714738177891,
1057
- "learning_rate": 1.0839355009969068e-05,
1058
- "loss": 0.7513,
1059
- "step": 710
1060
- },
1061
- {
1062
- "epoch": 0.5293355543216731,
1063
- "grad_norm": 0.8420640531277049,
1064
- "learning_rate": 1.0710461022867303e-05,
1065
- "loss": 0.7794,
1066
- "step": 715
1067
- },
1068
- {
1069
- "epoch": 0.5330372015546918,
1070
- "grad_norm": 0.8414308108330245,
1071
- "learning_rate": 1.0581448289104759e-05,
1072
- "loss": 0.7408,
1073
- "step": 720
1074
- },
1075
- {
1076
- "epoch": 0.5367388487877105,
1077
- "grad_norm": 0.907927579037995,
1078
- "learning_rate": 1.0452338371907065e-05,
1079
- "loss": 0.752,
1080
- "step": 725
1081
- },
1082
- {
1083
- "epoch": 0.5404404960207292,
1084
- "grad_norm": 0.9077406887678768,
1085
- "learning_rate": 1.0323152850743107e-05,
1086
- "loss": 0.7479,
1087
- "step": 730
1088
- },
1089
- {
1090
- "epoch": 0.5441421432537479,
1091
- "grad_norm": 0.8374984177588956,
1092
- "learning_rate": 1.0193913317718245e-05,
1093
- "loss": 0.7382,
1094
- "step": 735
1095
- },
1096
- {
1097
- "epoch": 0.5478437904867666,
1098
- "grad_norm": 0.7995233932837665,
1099
- "learning_rate": 1.0064641373965394e-05,
1100
- "loss": 0.7367,
1101
- "step": 740
1102
- },
1103
- {
1104
- "epoch": 0.5515454377197853,
1105
- "grad_norm": 0.9526825161626277,
1106
- "learning_rate": 9.935358626034607e-06,
1107
- "loss": 0.7558,
1108
- "step": 745
1109
- },
1110
- {
1111
- "epoch": 0.555247084952804,
1112
- "grad_norm": 0.8708097624074678,
1113
- "learning_rate": 9.806086682281759e-06,
1114
- "loss": 0.7544,
1115
- "step": 750
1116
- },
1117
- {
1118
- "epoch": 0.5589487321858226,
1119
- "grad_norm": 0.7982114810095368,
1120
- "learning_rate": 9.676847149256894e-06,
1121
- "loss": 0.7529,
1122
- "step": 755
1123
- },
1124
- {
1125
- "epoch": 0.5626503794188414,
1126
- "grad_norm": 0.8524546420192315,
1127
- "learning_rate": 9.547661628092938e-06,
1128
- "loss": 0.7387,
1129
- "step": 760
1130
- },
1131
- {
1132
- "epoch": 0.5663520266518601,
1133
- "grad_norm": 0.8546073655934241,
1134
- "learning_rate": 9.418551710895243e-06,
1135
- "loss": 0.7317,
1136
- "step": 765
1137
- },
1138
- {
1139
- "epoch": 0.5700536738848788,
1140
- "grad_norm": 0.8807333695737656,
1141
- "learning_rate": 9.289538977132702e-06,
1142
- "loss": 0.686,
1143
- "step": 770
1144
- },
1145
- {
1146
- "epoch": 0.5737553211178975,
1147
- "grad_norm": 0.844968358054644,
1148
- "learning_rate": 9.160644990030932e-06,
1149
- "loss": 0.7556,
1150
- "step": 775
1151
- },
1152
- {
1153
- "epoch": 0.5774569683509162,
1154
- "grad_norm": 0.7662463262387722,
1155
- "learning_rate": 9.03189129296821e-06,
1156
- "loss": 0.7265,
1157
- "step": 780
1158
- },
1159
- {
1160
- "epoch": 0.5811586155839349,
1161
- "grad_norm": 0.8068107490785559,
1162
- "learning_rate": 8.903299405874685e-06,
1163
- "loss": 0.7244,
1164
- "step": 785
1165
- },
1166
- {
1167
- "epoch": 0.5848602628169536,
1168
- "grad_norm": 0.9017752536168412,
1169
- "learning_rate": 8.774890821635548e-06,
1170
- "loss": 0.7151,
1171
- "step": 790
1172
- },
1173
- {
1174
- "epoch": 0.5885619100499723,
1175
- "grad_norm": 0.7797811553984105,
1176
- "learning_rate": 8.646687002498692e-06,
1177
- "loss": 0.7145,
1178
- "step": 795
1179
- },
1180
- {
1181
- "epoch": 0.592263557282991,
1182
- "grad_norm": 0.8674556616777656,
1183
- "learning_rate": 8.518709376487515e-06,
1184
- "loss": 0.7357,
1185
- "step": 800
1186
- },
1187
- {
1188
- "epoch": 0.592263557282991,
1189
- "eval_loss": 0.7576203346252441,
1190
- "eval_runtime": 13.9497,
1191
- "eval_samples_per_second": 9.176,
1192
- "eval_steps_per_second": 2.294,
1193
- "step": 800
1194
- },
1195
- {
1196
- "epoch": 0.5959652045160097,
1197
- "grad_norm": 0.8467559903107525,
1198
- "learning_rate": 8.390979333819427e-06,
1199
- "loss": 0.7085,
1200
- "step": 805
1201
- },
1202
- {
1203
- "epoch": 0.5996668517490283,
1204
- "grad_norm": 0.8240879183306341,
1205
- "learning_rate": 8.263518223330698e-06,
1206
- "loss": 0.7154,
1207
- "step": 810
1208
- },
1209
- {
1210
- "epoch": 0.603368498982047,
1211
- "grad_norm": 0.8813244564095002,
1212
- "learning_rate": 8.13634734890818e-06,
1213
- "loss": 0.7269,
1214
- "step": 815
1215
- },
1216
- {
1217
- "epoch": 0.6070701462150657,
1218
- "grad_norm": 0.8912483807816779,
1219
- "learning_rate": 8.009487965928597e-06,
1220
- "loss": 0.7554,
1221
- "step": 820
1222
- },
1223
- {
1224
- "epoch": 0.6107717934480844,
1225
- "grad_norm": 0.8571355410783688,
1226
- "learning_rate": 7.882961277705897e-06,
1227
- "loss": 0.7447,
1228
- "step": 825
1229
- },
1230
- {
1231
- "epoch": 0.6144734406811031,
1232
- "grad_norm": 0.7573684977008671,
1233
- "learning_rate": 7.756788431947327e-06,
1234
- "loss": 0.6973,
1235
- "step": 830
1236
- },
1237
- {
1238
- "epoch": 0.6181750879141218,
1239
- "grad_norm": 0.8624302347661563,
1240
- "learning_rate": 7.630990517218809e-06,
1241
- "loss": 0.7371,
1242
- "step": 835
1243
- },
1244
- {
1245
- "epoch": 0.6218767351471405,
1246
- "grad_norm": 0.8516524205337687,
1247
- "learning_rate": 7.505588559420188e-06,
1248
- "loss": 0.7341,
1249
- "step": 840
1250
- },
1251
- {
1252
- "epoch": 0.6255783823801592,
1253
- "grad_norm": 0.9276498179412384,
1254
- "learning_rate": 7.380603518270942e-06,
1255
- "loss": 0.7141,
1256
- "step": 845
1257
- },
1258
- {
1259
- "epoch": 0.6292800296131779,
1260
- "grad_norm": 0.8975411826897715,
1261
- "learning_rate": 7.256056283806987e-06,
1262
- "loss": 0.7436,
1263
- "step": 850
1264
- },
1265
- {
1266
- "epoch": 0.6329816768461966,
1267
- "grad_norm": 0.8474590565828304,
1268
- "learning_rate": 7.131967672889101e-06,
1269
- "loss": 0.7493,
1270
- "step": 855
1271
- },
1272
- {
1273
- "epoch": 0.6366833240792152,
1274
- "grad_norm": 0.8185387261304637,
1275
- "learning_rate": 7.008358425723586e-06,
1276
- "loss": 0.735,
1277
- "step": 860
1278
- },
1279
- {
1280
- "epoch": 0.6403849713122339,
1281
- "grad_norm": 0.8128716130836567,
1282
- "learning_rate": 6.885249202395754e-06,
1283
- "loss": 0.7442,
1284
- "step": 865
1285
- },
1286
- {
1287
- "epoch": 0.6440866185452526,
1288
- "grad_norm": 0.8314128496312047,
1289
- "learning_rate": 6.762660579416791e-06,
1290
- "loss": 0.735,
1291
- "step": 870
1292
- },
1293
- {
1294
- "epoch": 0.6477882657782713,
1295
- "grad_norm": 0.8396057619913906,
1296
- "learning_rate": 6.640613046284581e-06,
1297
- "loss": 0.7277,
1298
- "step": 875
1299
- },
1300
- {
1301
- "epoch": 0.65148991301129,
1302
- "grad_norm": 0.8254782849090412,
1303
- "learning_rate": 6.519127002059096e-06,
1304
- "loss": 0.7447,
1305
- "step": 880
1306
- },
1307
- {
1308
- "epoch": 0.6551915602443087,
1309
- "grad_norm": 0.8091364242123348,
1310
- "learning_rate": 6.3982227519528986e-06,
1311
- "loss": 0.7355,
1312
- "step": 885
1313
- },
1314
- {
1315
- "epoch": 0.6588932074773274,
1316
- "grad_norm": 0.834856655149733,
1317
- "learning_rate": 6.277920503937303e-06,
1318
- "loss": 0.7467,
1319
- "step": 890
1320
- },
1321
- {
1322
- "epoch": 0.6625948547103461,
1323
- "grad_norm": 0.8144437357073777,
1324
- "learning_rate": 6.158240365364823e-06,
1325
- "loss": 0.7144,
1326
- "step": 895
1327
- },
1328
- {
1329
- "epoch": 0.6662965019433648,
1330
- "grad_norm": 0.7827054414778657,
1331
- "learning_rate": 6.039202339608432e-06,
1332
- "loss": 0.7261,
1333
- "step": 900
1334
- },
1335
- {
1336
- "epoch": 0.6662965019433648,
1337
- "eval_loss": 0.7522028684616089,
1338
- "eval_runtime": 13.95,
1339
- "eval_samples_per_second": 9.176,
1340
- "eval_steps_per_second": 2.294,
1341
- "step": 900
1342
- },
1343
- {
1344
- "epoch": 0.6699981491763835,
1345
- "grad_norm": 0.9041340180526984,
1346
- "learning_rate": 5.920826322718165e-06,
1347
- "loss": 0.7662,
1348
- "step": 905
1349
- },
1350
- {
1351
- "epoch": 0.6736997964094021,
1352
- "grad_norm": 0.8350839367682599,
1353
- "learning_rate": 5.80313210009571e-06,
1354
- "loss": 0.7186,
1355
- "step": 910
1356
- },
1357
- {
1358
- "epoch": 0.6774014436424208,
1359
- "grad_norm": 0.7780590810310966,
1360
- "learning_rate": 5.686139343187468e-06,
1361
- "loss": 0.6972,
1362
- "step": 915
1363
- },
1364
- {
1365
- "epoch": 0.6811030908754395,
1366
- "grad_norm": 0.8000697324325545,
1367
- "learning_rate": 5.569867606196652e-06,
1368
- "loss": 0.728,
1369
- "step": 920
1370
- },
1371
- {
1372
- "epoch": 0.6848047381084582,
1373
- "grad_norm": 0.7467762690487865,
1374
- "learning_rate": 5.454336322814995e-06,
1375
- "loss": 0.7037,
1376
- "step": 925
1377
- },
1378
- {
1379
- "epoch": 0.688506385341477,
1380
- "grad_norm": 0.777234399717618,
1381
- "learning_rate": 5.339564802974615e-06,
1382
- "loss": 0.7157,
1383
- "step": 930
1384
- },
1385
- {
1386
- "epoch": 0.6922080325744957,
1387
- "grad_norm": 0.8221362929506051,
1388
- "learning_rate": 5.2255722296205104e-06,
1389
- "loss": 0.7155,
1390
- "step": 935
1391
- },
1392
- {
1393
- "epoch": 0.6959096798075144,
1394
- "grad_norm": 0.8187388357604898,
1395
- "learning_rate": 5.112377655504359e-06,
1396
- "loss": 0.7258,
1397
- "step": 940
1398
- },
1399
- {
1400
- "epoch": 0.6996113270405331,
1401
- "grad_norm": 0.837214991569854,
1402
- "learning_rate": 5.000000000000003e-06,
1403
- "loss": 0.7379,
1404
- "step": 945
1405
- },
1406
- {
1407
- "epoch": 0.7033129742735518,
1408
- "grad_norm": 0.7919335607892027,
1409
- "learning_rate": 4.888458045941269e-06,
1410
- "loss": 0.7182,
1411
- "step": 950
1412
- },
1413
- {
1414
- "epoch": 0.7070146215065705,
1415
- "grad_norm": 0.745708705122289,
1416
- "learning_rate": 4.7777704364826175e-06,
1417
- "loss": 0.7281,
1418
- "step": 955
1419
- },
1420
- {
1421
- "epoch": 0.7107162687395892,
1422
- "grad_norm": 0.7855917301440053,
1423
- "learning_rate": 4.66795567198309e-06,
1424
- "loss": 0.6976,
1425
- "step": 960
1426
- },
1427
- {
1428
- "epoch": 0.7144179159726078,
1429
- "grad_norm": 0.8039341408241852,
1430
- "learning_rate": 4.559032106914173e-06,
1431
- "loss": 0.6941,
1432
- "step": 965
1433
- },
1434
- {
1435
- "epoch": 0.7181195632056265,
1436
- "grad_norm": 0.7989086472348528,
1437
- "learning_rate": 4.4510179467920325e-06,
1438
- "loss": 0.7212,
1439
- "step": 970
1440
- },
1441
- {
1442
- "epoch": 0.7218212104386452,
1443
- "grad_norm": 0.7552035261702617,
1444
- "learning_rate": 4.343931245134616e-06,
1445
- "loss": 0.7024,
1446
- "step": 975
1447
- },
1448
- {
1449
- "epoch": 0.7255228576716639,
1450
- "grad_norm": 0.7357103146927452,
1451
- "learning_rate": 4.237789900444197e-06,
1452
- "loss": 0.7508,
1453
- "step": 980
1454
- },
1455
- {
1456
- "epoch": 0.7292245049046826,
1457
- "grad_norm": 0.8059912605188725,
1458
- "learning_rate": 4.132611653215822e-06,
1459
- "loss": 0.7212,
1460
- "step": 985
1461
- },
1462
- {
1463
- "epoch": 0.7329261521377013,
1464
- "grad_norm": 0.8301874562284118,
1465
- "learning_rate": 4.028414082972141e-06,
1466
- "loss": 0.7169,
1467
- "step": 990
1468
- },
1469
- {
1470
- "epoch": 0.73662779937072,
1471
- "grad_norm": 0.7945303825545997,
1472
- "learning_rate": 3.925214605325164e-06,
1473
- "loss": 0.7322,
1474
- "step": 995
1475
- },
1476
- {
1477
- "epoch": 0.7403294466037387,
1478
- "grad_norm": 0.8561858117102767,
1479
- "learning_rate": 3.823030469065431e-06,
1480
- "loss": 0.7239,
1481
- "step": 1000
1482
- },
1483
- {
1484
- "epoch": 0.7403294466037387,
1485
- "eval_loss": 0.7484843730926514,
1486
- "eval_runtime": 13.9605,
1487
- "eval_samples_per_second": 9.169,
1488
- "eval_steps_per_second": 2.292,
1489
- "step": 1000
1490
- },
1491
- {
1492
- "epoch": 0.7440310938367574,
1493
- "grad_norm": 0.763331720710852,
1494
- "learning_rate": 3.7218787532790167e-06,
1495
- "loss": 0.7545,
1496
- "step": 1005
1497
- },
1498
- {
1499
- "epoch": 0.747732741069776,
1500
- "grad_norm": 0.8079982386291987,
1501
- "learning_rate": 3.6217763644929393e-06,
1502
- "loss": 0.6701,
1503
- "step": 1010
1504
- },
1505
- {
1506
- "epoch": 0.7514343883027947,
1507
- "grad_norm": 0.9128451316649459,
1508
- "learning_rate": 3.522740033849411e-06,
1509
- "loss": 0.7272,
1510
- "step": 1015
1511
- },
1512
- {
1513
- "epoch": 0.7551360355358134,
1514
- "grad_norm": 0.7762845069890241,
1515
- "learning_rate": 3.424786314309365e-06,
1516
- "loss": 0.7038,
1517
- "step": 1020
1518
- },
1519
- {
1520
- "epoch": 0.7588376827688321,
1521
- "grad_norm": 0.8035944499948804,
1522
- "learning_rate": 3.3279315778858034e-06,
1523
- "loss": 0.7571,
1524
- "step": 1025
1525
- },
1526
- {
1527
- "epoch": 0.7625393300018508,
1528
- "grad_norm": 0.7994108178325574,
1529
- "learning_rate": 3.2321920129073815e-06,
1530
- "loss": 0.7438,
1531
- "step": 1030
1532
- },
1533
- {
1534
- "epoch": 0.7662409772348695,
1535
- "grad_norm": 0.7747601431769363,
1536
- "learning_rate": 3.1375836213126653e-06,
1537
- "loss": 0.7063,
1538
- "step": 1035
1539
- },
1540
- {
1541
- "epoch": 0.7699426244678882,
1542
- "grad_norm": 0.7952549748289679,
1543
- "learning_rate": 3.04412221597558e-06,
1544
- "loss": 0.7088,
1545
- "step": 1040
1546
- },
1547
- {
1548
- "epoch": 0.7736442717009069,
1549
- "grad_norm": 0.7542488837628736,
1550
- "learning_rate": 2.9518234180624393e-06,
1551
- "loss": 0.7006,
1552
- "step": 1045
1553
- },
1554
- {
1555
- "epoch": 0.7773459189339256,
1556
- "grad_norm": 0.7848275252191611,
1557
- "learning_rate": 2.8607026544210115e-06,
1558
- "loss": 0.7121,
1559
- "step": 1050
1560
- },
1561
- {
1562
- "epoch": 0.7810475661669443,
1563
- "grad_norm": 0.8279489678198366,
1564
- "learning_rate": 2.770775155002071e-06,
1565
- "loss": 0.7188,
1566
- "step": 1055
1567
- },
1568
- {
1569
- "epoch": 0.784749213399963,
1570
- "grad_norm": 0.7944390505914574,
1571
- "learning_rate": 2.6820559503138797e-06,
1572
- "loss": 0.7394,
1573
- "step": 1060
1574
- },
1575
- {
1576
- "epoch": 0.7884508606329816,
1577
- "grad_norm": 0.775645480566711,
1578
- "learning_rate": 2.594559868909956e-06,
1579
- "loss": 0.763,
1580
- "step": 1065
1581
- },
1582
- {
1583
- "epoch": 0.7921525078660003,
1584
- "grad_norm": 0.8055614791757847,
1585
- "learning_rate": 2.50830153491064e-06,
1586
- "loss": 0.7058,
1587
- "step": 1070
1588
- },
1589
- {
1590
- "epoch": 0.795854155099019,
1591
- "grad_norm": 0.781453907704552,
1592
- "learning_rate": 2.423295365558821e-06,
1593
- "loss": 0.7208,
1594
- "step": 1075
1595
- },
1596
- {
1597
- "epoch": 0.7995558023320377,
1598
- "grad_norm": 0.8993073016418077,
1599
- "learning_rate": 2.339555568810221e-06,
1600
- "loss": 0.7286,
1601
- "step": 1080
1602
- },
1603
- {
1604
- "epoch": 0.8032574495650564,
1605
- "grad_norm": 0.8088055370829833,
1606
- "learning_rate": 2.2570961409586756e-06,
1607
- "loss": 0.7158,
1608
- "step": 1085
1609
- },
1610
- {
1611
- "epoch": 0.8069590967980751,
1612
- "grad_norm": 0.8400279823862996,
1613
- "learning_rate": 2.1759308642968024e-06,
1614
- "loss": 0.7358,
1615
- "step": 1090
1616
- },
1617
- {
1618
- "epoch": 0.8106607440310938,
1619
- "grad_norm": 0.7707002767536119,
1620
- "learning_rate": 2.0960733048124082e-06,
1621
- "loss": 0.7104,
1622
- "step": 1095
1623
- },
1624
- {
1625
- "epoch": 0.8143623912641126,
1626
- "grad_norm": 0.7239863902825704,
1627
- "learning_rate": 2.01753680992107e-06,
1628
- "loss": 0.7248,
1629
- "step": 1100
1630
- },
1631
- {
1632
- "epoch": 0.8143623912641126,
1633
- "eval_loss": 0.7456310987472534,
1634
- "eval_runtime": 13.9636,
1635
- "eval_samples_per_second": 9.167,
1636
- "eval_steps_per_second": 2.292,
1637
- "step": 1100
1638
- },
1639
- {
1640
- "epoch": 0.8180640384971313,
1641
- "grad_norm": 0.7951679059745718,
1642
- "learning_rate": 1.9403345062352574e-06,
1643
- "loss": 0.7133,
1644
- "step": 1105
1645
- },
1646
- {
1647
- "epoch": 0.82176568573015,
1648
- "grad_norm": 0.786912782956472,
1649
- "learning_rate": 1.8644792973703252e-06,
1650
- "loss": 0.7349,
1651
- "step": 1110
1652
- },
1653
- {
1654
- "epoch": 0.8254673329631687,
1655
- "grad_norm": 0.8115220548838313,
1656
- "learning_rate": 1.7899838617878163e-06,
1657
- "loss": 0.673,
1658
- "step": 1115
1659
- },
1660
- {
1661
- "epoch": 0.8291689801961873,
1662
- "grad_norm": 0.817121289855384,
1663
- "learning_rate": 1.7168606506763696e-06,
1664
- "loss": 0.7105,
1665
- "step": 1120
1666
- },
1667
- {
1668
- "epoch": 0.832870627429206,
1669
- "grad_norm": 0.8002215656530822,
1670
- "learning_rate": 1.6451218858706374e-06,
1671
- "loss": 0.7079,
1672
- "step": 1125
1673
- },
1674
- {
1675
- "epoch": 0.8365722746622247,
1676
- "grad_norm": 0.7834004843053641,
1677
- "learning_rate": 1.5747795578085046e-06,
1678
- "loss": 0.7341,
1679
- "step": 1130
1680
- },
1681
- {
1682
- "epoch": 0.8402739218952434,
1683
- "grad_norm": 0.853886763855502,
1684
- "learning_rate": 1.505845423527027e-06,
1685
- "loss": 0.7526,
1686
- "step": 1135
1687
- },
1688
- {
1689
- "epoch": 0.8439755691282621,
1690
- "grad_norm": 0.7515718806456418,
1691
- "learning_rate": 1.4383310046973365e-06,
1692
- "loss": 0.7297,
1693
- "step": 1140
1694
- },
1695
- {
1696
- "epoch": 0.8476772163612808,
1697
- "grad_norm": 0.739784359909608,
1698
- "learning_rate": 1.372247585698916e-06,
1699
- "loss": 0.7156,
1700
- "step": 1145
1701
- },
1702
- {
1703
- "epoch": 0.8513788635942995,
1704
- "grad_norm": 0.8843273402374225,
1705
- "learning_rate": 1.307606211733522e-06,
1706
- "loss": 0.7076,
1707
- "step": 1150
1708
- },
1709
- {
1710
- "epoch": 0.8550805108273182,
1711
- "grad_norm": 0.7519899075455724,
1712
- "learning_rate": 1.2444176869790925e-06,
1713
- "loss": 0.6877,
1714
- "step": 1155
1715
- },
1716
- {
1717
- "epoch": 0.8587821580603369,
1718
- "grad_norm": 0.7640709305599573,
1719
- "learning_rate": 1.18269257278392e-06,
1720
- "loss": 0.7266,
1721
- "step": 1160
1722
- },
1723
- {
1724
- "epoch": 0.8624838052933556,
1725
- "grad_norm": 0.7757193067058663,
1726
- "learning_rate": 1.1224411859014417e-06,
1727
- "loss": 0.7493,
1728
- "step": 1165
1729
- },
1730
- {
1731
- "epoch": 0.8661854525263742,
1732
- "grad_norm": 0.8326144754357965,
1733
- "learning_rate": 1.0636735967658785e-06,
1734
- "loss": 0.7016,
1735
- "step": 1170
1736
- },
1737
- {
1738
- "epoch": 0.8698870997593929,
1739
- "grad_norm": 0.7980950743505824,
1740
- "learning_rate": 1.0063996278090704e-06,
1741
- "loss": 0.7473,
1742
- "step": 1175
1743
- },
1744
- {
1745
- "epoch": 0.8735887469924116,
1746
- "grad_norm": 0.7448070961300409,
1747
- "learning_rate": 9.506288518187468e-07,
1748
- "loss": 0.7417,
1749
- "step": 1180
1750
- },
1751
- {
1752
- "epoch": 0.8772903942254303,
1753
- "grad_norm": 0.8323564393063527,
1754
- "learning_rate": 8.963705903385344e-07,
1755
- "loss": 0.73,
1756
- "step": 1185
1757
- },
1758
- {
1759
- "epoch": 0.880992041458449,
1760
- "grad_norm": 0.7257509455039137,
1761
- "learning_rate": 8.436339121099413e-07,
1762
- "loss": 0.6955,
1763
- "step": 1190
1764
- },
1765
- {
1766
- "epoch": 0.8846936886914677,
1767
- "grad_norm": 0.7668542997983315,
1768
- "learning_rate": 7.924276315566171e-07,
1769
- "loss": 0.7203,
1770
- "step": 1195
1771
- },
1772
- {
1773
- "epoch": 0.8883953359244864,
1774
- "grad_norm": 0.7462196723418639,
1775
- "learning_rate": 7.427603073110967e-07,
1776
- "loss": 0.7494,
1777
- "step": 1200
1778
- },
1779
- {
1780
- "epoch": 0.8883953359244864,
1781
- "eval_loss": 0.7442336082458496,
1782
- "eval_runtime": 13.9594,
1783
- "eval_samples_per_second": 9.169,
1784
- "eval_steps_per_second": 2.292,
1785
- "step": 1200
1786
- },
1787
- {
1788
- "epoch": 0.8920969831575051,
1789
- "grad_norm": 0.7740354232470301,
1790
- "learning_rate": 6.946402407843156e-07,
1791
- "loss": 0.7271,
1792
- "step": 1205
1793
- },
1794
- {
1795
- "epoch": 0.8957986303905238,
1796
- "grad_norm": 0.8223246853277537,
1797
- "learning_rate": 6.480754747781037e-07,
1798
- "loss": 0.7145,
1799
- "step": 1210
1800
- },
1801
- {
1802
- "epoch": 0.8995002776235425,
1803
- "grad_norm": 0.7709082515472453,
1804
- "learning_rate": 6.030737921409169e-07,
1805
- "loss": 0.7183,
1806
- "step": 1215
1807
- },
1808
- {
1809
- "epoch": 0.9032019248565611,
1810
- "grad_norm": 0.737942021515047,
1811
- "learning_rate": 5.596427144670002e-07,
1812
- "loss": 0.6767,
1813
- "step": 1220
1814
- },
1815
- {
1816
- "epoch": 0.9069035720895798,
1817
- "grad_norm": 0.7921338048969848,
1818
- "learning_rate": 5.177895008392353e-07,
1819
- "loss": 0.7339,
1820
- "step": 1225
1821
- },
1822
- {
1823
- "epoch": 0.9106052193225985,
1824
- "grad_norm": 0.7788313346093989,
1825
- "learning_rate": 4.775211466158469e-07,
1826
- "loss": 0.7584,
1827
- "step": 1230
1828
- },
1829
- {
1830
- "epoch": 0.9143068665556172,
1831
- "grad_norm": 0.7977888893081708,
1832
- "learning_rate": 4.388443822612043e-07,
1833
- "loss": 0.7331,
1834
- "step": 1235
1835
- },
1836
- {
1837
- "epoch": 0.9180085137886359,
1838
- "grad_norm": 0.8205134317010626,
1839
- "learning_rate": 4.017656722208807e-07,
1840
- "loss": 0.7366,
1841
- "step": 1240
1842
- },
1843
- {
1844
- "epoch": 0.9217101610216546,
1845
- "grad_norm": 0.9195048703105001,
1846
- "learning_rate": 3.662912138411967e-07,
1847
- "loss": 0.7397,
1848
- "step": 1245
1849
- },
1850
- {
1851
- "epoch": 0.9254118082546733,
1852
- "grad_norm": 0.7329045805585533,
1853
- "learning_rate": 3.3242693633337986e-07,
1854
- "loss": 0.7227,
1855
- "step": 1250
1856
- },
1857
- {
1858
- "epoch": 0.929113455487692,
1859
- "grad_norm": 0.7813246174244928,
1860
- "learning_rate": 3.001784997825652e-07,
1861
- "loss": 0.7034,
1862
- "step": 1255
1863
- },
1864
- {
1865
- "epoch": 0.9328151027207107,
1866
- "grad_norm": 0.719576735930033,
1867
- "learning_rate": 2.6955129420176193e-07,
1868
- "loss": 0.7266,
1869
- "step": 1260
1870
- },
1871
- {
1872
- "epoch": 0.9365167499537294,
1873
- "grad_norm": 0.7662877318795512,
1874
- "learning_rate": 2.405504386309643e-07,
1875
- "loss": 0.7363,
1876
- "step": 1265
1877
- },
1878
- {
1879
- "epoch": 0.9402183971867482,
1880
- "grad_norm": 0.8271100323795003,
1881
- "learning_rate": 2.1318078028155886e-07,
1882
- "loss": 0.7237,
1883
- "step": 1270
1884
- },
1885
- {
1886
- "epoch": 0.9439200444197668,
1887
- "grad_norm": 0.8329748294999934,
1888
- "learning_rate": 1.874468937261531e-07,
1889
- "loss": 0.7304,
1890
- "step": 1275
1891
- },
1892
- {
1893
- "epoch": 0.9476216916527855,
1894
- "grad_norm": 0.7890535324896384,
1895
- "learning_rate": 1.6335308013398888e-07,
1896
- "loss": 0.7094,
1897
- "step": 1280
1898
- },
1899
- {
1900
- "epoch": 0.9513233388858042,
1901
- "grad_norm": 0.8431357622327923,
1902
- "learning_rate": 1.409033665520354e-07,
1903
- "loss": 0.7018,
1904
- "step": 1285
1905
- },
1906
- {
1907
- "epoch": 0.9550249861188229,
1908
- "grad_norm": 0.7151442012235928,
1909
- "learning_rate": 1.201015052319099e-07,
1910
- "loss": 0.7046,
1911
- "step": 1290
1912
- },
1913
- {
1914
- "epoch": 0.9587266333518416,
1915
- "grad_norm": 0.7393727883496317,
1916
- "learning_rate": 1.0095097300273026e-07,
1917
- "loss": 0.6796,
1918
- "step": 1295
1919
- },
1920
- {
1921
- "epoch": 0.9624282805848603,
1922
- "grad_norm": 0.751310291423036,
1923
- "learning_rate": 8.345497068998897e-08,
1924
- "loss": 0.7353,
1925
- "step": 1300
1926
- },
1927
- {
1928
- "epoch": 0.9624282805848603,
1929
- "eval_loss": 0.7436981201171875,
1930
- "eval_runtime": 13.9653,
1931
- "eval_samples_per_second": 9.166,
1932
- "eval_steps_per_second": 2.291,
1933
- "step": 1300
1934
- },
1935
- {
1936
- "epoch": 0.966129927817879,
1937
- "grad_norm": 0.7374172211879654,
1938
- "learning_rate": 6.761642258056977e-08,
1939
- "loss": 0.7366,
1940
- "step": 1305
1941
- },
1942
- {
1943
- "epoch": 0.9698315750508977,
1944
- "grad_norm": 0.7708672190680163,
1945
- "learning_rate": 5.3437975933985366e-08,
1946
- "loss": 0.7092,
1947
- "step": 1310
1948
- },
1949
- {
1950
- "epoch": 0.9735332222839164,
1951
- "grad_norm": 0.7902779138167899,
1952
- "learning_rate": 4.0922000539906914e-08,
1953
- "loss": 0.6746,
1954
- "step": 1315
1955
- },
1956
- {
1957
- "epoch": 0.9772348695169351,
1958
- "grad_norm": 0.7632071329843058,
1959
- "learning_rate": 3.0070588322079765e-08,
1960
- "loss": 0.7196,
1961
- "step": 1320
1962
- },
1963
- {
1964
- "epoch": 0.9809365167499537,
1965
- "grad_norm": 0.7570175036997868,
1966
- "learning_rate": 2.088555298867978e-08,
1967
- "loss": 0.7255,
1968
- "step": 1325
1969
- },
1970
- {
1971
- "epoch": 0.9846381639829724,
1972
- "grad_norm": 0.758872580261703,
1973
- "learning_rate": 1.3368429729168075e-08,
1974
- "loss": 0.7287,
1975
- "step": 1330
1976
- },
1977
- {
1978
- "epoch": 0.9883398112159911,
1979
- "grad_norm": 0.779302395482425,
1980
- "learning_rate": 7.520474957699586e-09,
1981
- "loss": 0.7452,
1982
- "step": 1335
1983
- },
1984
- {
1985
- "epoch": 0.9920414584490098,
1986
- "grad_norm": 0.8910501338394342,
1987
- "learning_rate": 3.3426661031255024e-09,
1988
- "loss": 0.7288,
1989
- "step": 1340
1990
- },
1991
- {
1992
- "epoch": 0.9957431056820285,
1993
- "grad_norm": 0.8406066173627439,
1994
- "learning_rate": 8.357014456272794e-10,
1995
- "loss": 0.6923,
1996
- "step": 1345
1997
- },
1998
- {
1999
- "epoch": 0.9994447529150472,
2000
- "grad_norm": 0.759781227954175,
2001
  "learning_rate": 0.0,
2002
- "loss": 0.7214,
2003
- "step": 1350
2004
  },
2005
  {
2006
- "epoch": 0.9994447529150472,
2007
- "step": 1350,
2008
- "total_flos": 76902580617216.0,
2009
- "train_loss": 0.7594085027553417,
2010
- "train_runtime": 8931.1449,
2011
- "train_samples_per_second": 2.42,
2012
- "train_steps_per_second": 0.151
2013
  }
2014
  ],
2015
  "logging_steps": 5,
2016
- "max_steps": 1350,
2017
  "num_input_tokens_seen": 0,
2018
  "num_train_epochs": 1,
2019
  "save_steps": 500,
@@ -2029,7 +1028,7 @@
2029
  "attributes": {}
2030
  }
2031
  },
2032
- "total_flos": 76902580617216.0,
2033
  "train_batch_size": 4,
2034
  "trial_name": null,
2035
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.999259807549963,
5
  "eval_steps": 100,
6
+ "global_step": 675,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.007401924500370096,
13
+ "grad_norm": 2.674959598727342,
14
+ "learning_rate": 1.4705882352941177e-06,
15
+ "loss": 1.0787,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.014803849000740192,
20
+ "grad_norm": 2.2849930335009767,
21
+ "learning_rate": 2.9411764705882355e-06,
22
+ "loss": 1.0901,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.02220577350111029,
27
+ "grad_norm": 1.4169102513042604,
28
+ "learning_rate": 4.411764705882353e-06,
29
+ "loss": 1.0619,
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.029607698001480384,
34
+ "grad_norm": 1.3754280634869183,
35
+ "learning_rate": 5.882352941176471e-06,
36
+ "loss": 1.0087,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.037009622501850484,
41
+ "grad_norm": 1.1141154652271672,
42
+ "learning_rate": 7.352941176470589e-06,
43
+ "loss": 0.9684,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.04441154700222058,
48
+ "grad_norm": 0.9534792105007532,
49
+ "learning_rate": 8.823529411764707e-06,
50
+ "loss": 0.9217,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.05181347150259067,
55
+ "grad_norm": 0.7606504248144849,
56
+ "learning_rate": 1.0294117647058823e-05,
57
+ "loss": 0.8859,
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.05921539600296077,
62
+ "grad_norm": 0.68529790584477,
63
+ "learning_rate": 1.1764705882352942e-05,
64
+ "loss": 0.8631,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.06661732050333087,
69
+ "grad_norm": 0.7482947060538522,
70
+ "learning_rate": 1.323529411764706e-05,
71
+ "loss": 0.8485,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.07401924500370097,
76
+ "grad_norm": 0.814176151345203,
77
+ "learning_rate": 1.4705882352941179e-05,
78
+ "loss": 0.861,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.08142116950407106,
83
+ "grad_norm": 0.6269959293106316,
84
+ "learning_rate": 1.6176470588235296e-05,
85
+ "loss": 0.8545,
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.08882309400444116,
90
+ "grad_norm": 0.7352759272340602,
91
+ "learning_rate": 1.7647058823529414e-05,
92
+ "loss": 0.8293,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.09622501850481126,
97
+ "grad_norm": 0.6918084490038217,
98
+ "learning_rate": 1.911764705882353e-05,
99
+ "loss": 0.8259,
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.10362694300518134,
104
+ "grad_norm": 0.7718482933587625,
105
+ "learning_rate": 1.9999464266898485e-05,
106
+ "loss": 0.8211,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.11102886750555144,
111
+ "grad_norm": 0.9788064725128405,
112
+ "learning_rate": 1.9993437928712977e-05,
113
+ "loss": 0.8164,
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 0.11843079200592153,
118
+ "grad_norm": 0.837998184141708,
119
+ "learning_rate": 1.998071963486563e-05,
120
+ "loss": 0.8062,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.12583271650629163,
125
+ "grad_norm": 0.6629895054599079,
126
+ "learning_rate": 1.9961317901970953e-05,
127
+ "loss": 0.7945,
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 0.13323464100666174,
132
+ "grad_norm": 0.7557912436308644,
133
+ "learning_rate": 1.993524572210807e-05,
134
+ "loss": 0.7947,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.14063656550703182,
139
+ "grad_norm": 0.797960903546945,
140
+ "learning_rate": 1.990252055412077e-05,
141
+ "loss": 0.7906,
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 0.14803849000740193,
146
+ "grad_norm": 0.7886433512486097,
147
+ "learning_rate": 1.9863164311926433e-05,
148
+ "loss": 0.8171,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.14803849000740193,
153
+ "eval_loss": 0.8197493553161621,
154
+ "eval_runtime": 7.2314,
155
+ "eval_samples_per_second": 17.701,
156
+ "eval_steps_per_second": 2.213,
157
  "step": 100
158
  },
159
  {
160
+ "epoch": 0.15544041450777202,
161
+ "grad_norm": 0.7194704926083675,
162
+ "learning_rate": 1.981720334984174e-05,
163
+ "loss": 0.792,
164
  "step": 105
165
  },
166
  {
167
+ "epoch": 0.16284233900814213,
168
+ "grad_norm": 0.7153212599875873,
169
+ "learning_rate": 1.9764668444934853e-05,
170
+ "loss": 0.7859,
171
  "step": 110
172
  },
173
  {
174
+ "epoch": 0.1702442635085122,
175
+ "grad_norm": 0.7209996155683963,
176
+ "learning_rate": 1.970559477641606e-05,
177
+ "loss": 0.7631,
178
  "step": 115
179
  },
180
  {
181
+ "epoch": 0.17764618800888232,
182
+ "grad_norm": 0.7333599419087882,
183
+ "learning_rate": 1.9640021902080523e-05,
184
+ "loss": 0.793,
185
  "step": 120
186
  },
187
  {
188
+ "epoch": 0.1850481125092524,
189
+ "grad_norm": 0.6289153412562695,
190
+ "learning_rate": 1.9567993731818988e-05,
191
+ "loss": 0.7916,
192
  "step": 125
193
  },
194
  {
195
+ "epoch": 0.19245003700962252,
196
+ "grad_norm": 0.7521612039190329,
197
+ "learning_rate": 1.9489558498214197e-05,
198
+ "loss": 0.7843,
199
  "step": 130
200
  },
201
  {
202
+ "epoch": 0.1998519615099926,
203
+ "grad_norm": 0.6568997079081034,
204
+ "learning_rate": 1.9404768724242667e-05,
205
+ "loss": 0.7703,
206
  "step": 135
207
  },
208
  {
209
+ "epoch": 0.20725388601036268,
210
+ "grad_norm": 0.7360339897713676,
211
+ "learning_rate": 1.931368118810346e-05,
212
+ "loss": 0.7947,
213
  "step": 140
214
  },
215
  {
216
+ "epoch": 0.2146558105107328,
217
+ "grad_norm": 0.7557749076828488,
218
+ "learning_rate": 1.92163568851975e-05,
219
+ "loss": 0.7757,
220
  "step": 145
221
  },
222
  {
223
+ "epoch": 0.22205773501110287,
224
+ "grad_norm": 0.8218802431960855,
225
+ "learning_rate": 1.911286098728296e-05,
226
+ "loss": 0.772,
227
  "step": 150
228
  },
229
  {
230
+ "epoch": 0.22945965951147299,
231
+ "grad_norm": 0.7041654789385663,
232
+ "learning_rate": 1.900326279883392e-05,
233
+ "loss": 0.8017,
234
  "step": 155
235
  },
236
  {
237
+ "epoch": 0.23686158401184307,
238
+ "grad_norm": 0.7107270494061172,
239
+ "learning_rate": 1.8887635710631716e-05,
240
+ "loss": 0.8045,
241
  "step": 160
242
  },
243
  {
244
+ "epoch": 0.24426350851221318,
245
+ "grad_norm": 0.7042955521495632,
246
+ "learning_rate": 1.8766057150619865e-05,
247
+ "loss": 0.7775,
248
  "step": 165
249
  },
250
  {
251
+ "epoch": 0.25166543301258326,
252
+ "grad_norm": 0.7141479489682149,
253
+ "learning_rate": 1.8638608532055635e-05,
254
+ "loss": 0.7947,
255
  "step": 170
256
  },
257
  {
258
+ "epoch": 0.25906735751295334,
259
+ "grad_norm": 0.6682818577909502,
260
+ "learning_rate": 1.8505375198992856e-05,
261
+ "loss": 0.7831,
262
  "step": 175
263
  },
264
  {
265
+ "epoch": 0.2664692820133235,
266
+ "grad_norm": 0.7193249750447441,
267
+ "learning_rate": 1.836644636913258e-05,
268
+ "loss": 0.7542,
269
  "step": 180
270
  },
271
  {
272
+ "epoch": 0.27387120651369357,
273
+ "grad_norm": 0.7847188441908851,
274
+ "learning_rate": 1.8221915074079764e-05,
275
+ "loss": 0.7778,
276
  "step": 185
277
  },
278
  {
279
+ "epoch": 0.28127313101406365,
280
+ "grad_norm": 0.8828987676403609,
281
+ "learning_rate": 1.8071878097046064e-05,
282
+ "loss": 0.7564,
283
  "step": 190
284
  },
285
  {
286
+ "epoch": 0.28867505551443373,
287
+ "grad_norm": 0.6213320600286455,
288
+ "learning_rate": 1.7916435908040413e-05,
289
+ "loss": 0.7723,
290
  "step": 195
291
  },
292
  {
293
+ "epoch": 0.29607698001480387,
294
+ "grad_norm": 0.6595102663479724,
295
+ "learning_rate": 1.7755692596590778e-05,
296
+ "loss": 0.7747,
297
  "step": 200
298
  },
299
  {
300
+ "epoch": 0.29607698001480387,
301
+ "eval_loss": 0.7899559736251831,
302
+ "eval_runtime": 7.2104,
303
+ "eval_samples_per_second": 17.752,
304
+ "eval_steps_per_second": 2.219,
305
  "step": 200
306
  },
307
  {
308
+ "epoch": 0.30347890451517395,
309
+ "grad_norm": 0.706296557877635,
310
+ "learning_rate": 1.7589755802042188e-05,
311
+ "loss": 0.773,
312
  "step": 205
313
  },
314
  {
315
+ "epoch": 0.31088082901554404,
316
+ "grad_norm": 0.6937734870068385,
317
+ "learning_rate": 1.7418736641477636e-05,
318
+ "loss": 0.7563,
319
  "step": 210
320
  },
321
  {
322
+ "epoch": 0.3182827535159141,
323
+ "grad_norm": 0.6327617109092492,
324
+ "learning_rate": 1.7242749635310222e-05,
325
+ "loss": 0.758,
326
  "step": 215
327
  },
328
  {
329
+ "epoch": 0.32568467801628426,
330
+ "grad_norm": 0.6635934294750666,
331
+ "learning_rate": 1.7061912630596252e-05,
332
+ "loss": 0.7605,
333
  "step": 220
334
  },
335
  {
336
+ "epoch": 0.33308660251665434,
337
+ "grad_norm": 0.7228160092157478,
338
+ "learning_rate": 1.6876346722120747e-05,
339
+ "loss": 0.7754,
340
  "step": 225
341
  },
342
  {
343
+ "epoch": 0.3404885270170244,
344
+ "grad_norm": 0.6952975644500169,
345
+ "learning_rate": 1.6686176171308125e-05,
346
+ "loss": 0.7977,
347
  "step": 230
348
  },
349
  {
350
+ "epoch": 0.3478904515173945,
351
+ "grad_norm": 0.6717058633626165,
352
+ "learning_rate": 1.6491528323012412e-05,
353
+ "loss": 0.7594,
354
  "step": 235
355
  },
356
  {
357
+ "epoch": 0.35529237601776464,
358
+ "grad_norm": 0.6596693045521963,
359
+ "learning_rate": 1.6292533520242663e-05,
360
+ "loss": 0.7623,
361
  "step": 240
362
  },
363
  {
364
+ "epoch": 0.3626943005181347,
365
+ "grad_norm": 0.6470637566854384,
366
+ "learning_rate": 1.6089325016880737e-05,
367
+ "loss": 0.7526,
368
  "step": 245
369
  },
370
  {
371
+ "epoch": 0.3700962250185048,
372
+ "grad_norm": 0.6877498215548267,
373
+ "learning_rate": 1.588203888844982e-05,
374
+ "loss": 0.7681,
375
  "step": 250
376
  },
377
  {
378
+ "epoch": 0.3774981495188749,
379
+ "grad_norm": 0.6358323626672553,
380
+ "learning_rate": 1.5670813940993504e-05,
381
+ "loss": 0.741,
382
  "step": 255
383
  },
384
  {
385
+ "epoch": 0.38490007401924503,
386
+ "grad_norm": 0.600848318475503,
387
+ "learning_rate": 1.5455791618126407e-05,
388
+ "loss": 0.7334,
389
  "step": 260
390
  },
391
  {
392
+ "epoch": 0.3923019985196151,
393
+ "grad_norm": 0.6314609013122284,
394
+ "learning_rate": 1.5237115906318565e-05,
395
+ "loss": 0.7572,
396
  "step": 265
397
  },
398
  {
399
+ "epoch": 0.3997039230199852,
400
+ "grad_norm": 0.6546980627619242,
401
+ "learning_rate": 1.5014933238477069e-05,
402
+ "loss": 0.7378,
403
  "step": 270
404
  },
405
  {
406
+ "epoch": 0.4071058475203553,
407
+ "grad_norm": 0.6975545683818176,
408
+ "learning_rate": 1.4789392395889468e-05,
409
+ "loss": 0.7632,
410
  "step": 275
411
  },
412
  {
413
+ "epoch": 0.41450777202072536,
414
+ "grad_norm": 0.6503686028697638,
415
+ "learning_rate": 1.4560644408594602e-05,
416
+ "loss": 0.744,
417
  "step": 280
418
  },
419
  {
420
+ "epoch": 0.4219096965210955,
421
+ "grad_norm": 0.6602116319307001,
422
+ "learning_rate": 1.432884245424761e-05,
423
+ "loss": 0.7556,
424
  "step": 285
425
  },
426
  {
427
+ "epoch": 0.4293116210214656,
428
+ "grad_norm": 0.6672698383287922,
429
+ "learning_rate": 1.4094141755546816e-05,
430
+ "loss": 0.7831,
431
  "step": 290
432
  },
433
  {
434
+ "epoch": 0.43671354552183567,
435
+ "grad_norm": 0.6305307497798698,
436
+ "learning_rate": 1.3856699476291176e-05,
437
+ "loss": 0.7426,
438
  "step": 295
439
  },
440
  {
441
+ "epoch": 0.44411547002220575,
442
+ "grad_norm": 0.6859073767100461,
443
+ "learning_rate": 1.3616674616137902e-05,
444
+ "loss": 0.7645,
445
  "step": 300
446
  },
447
  {
448
+ "epoch": 0.44411547002220575,
449
+ "eval_loss": 0.7755689024925232,
450
+ "eval_runtime": 7.2066,
451
+ "eval_samples_per_second": 17.762,
452
+ "eval_steps_per_second": 2.22,
453
  "step": 300
454
  },
455
  {
456
+ "epoch": 0.4515173945225759,
457
+ "grad_norm": 0.6626556377379951,
458
+ "learning_rate": 1.3374227904130724e-05,
459
+ "loss": 0.7549,
460
  "step": 305
461
  },
462
  {
463
+ "epoch": 0.45891931902294597,
464
+ "grad_norm": 0.6499839877597537,
465
+ "learning_rate": 1.3129521691070108e-05,
466
+ "loss": 0.7328,
467
  "step": 310
468
  },
469
  {
470
+ "epoch": 0.46632124352331605,
471
+ "grad_norm": 0.722140222976433,
472
+ "learning_rate": 1.2882719840797473e-05,
473
+ "loss": 0.7514,
474
  "step": 315
475
  },
476
  {
477
+ "epoch": 0.47372316802368614,
478
+ "grad_norm": 0.6900675221213151,
479
+ "learning_rate": 1.2633987620466229e-05,
480
+ "loss": 0.7353,
481
  "step": 320
482
  },
483
  {
484
+ "epoch": 0.4811250925240563,
485
+ "grad_norm": 0.6297341225224966,
486
+ "learning_rate": 1.2383491589873122e-05,
487
+ "loss": 0.7407,
488
  "step": 325
489
  },
490
  {
491
+ "epoch": 0.48852701702442636,
492
+ "grad_norm": 0.6139804357167142,
493
+ "learning_rate": 1.213139948992394e-05,
494
+ "loss": 0.7497,
495
  "step": 330
496
  },
497
  {
498
+ "epoch": 0.49592894152479644,
499
+ "grad_norm": 0.7120439739230976,
500
+ "learning_rate": 1.187788013030837e-05,
501
+ "loss": 0.7468,
502
  "step": 335
503
  },
504
  {
505
+ "epoch": 0.5033308660251665,
506
+ "grad_norm": 0.6179256601206382,
507
+ "learning_rate": 1.1623103276459086e-05,
508
+ "loss": 0.7507,
509
  "step": 340
510
  },
511
  {
512
+ "epoch": 0.5107327905255367,
513
+ "grad_norm": 0.6483835976434715,
514
+ "learning_rate": 1.1367239535870913e-05,
515
+ "loss": 0.7425,
516
  "step": 345
517
  },
518
  {
519
+ "epoch": 0.5181347150259067,
520
+ "grad_norm": 0.6928461738197682,
521
+ "learning_rate": 1.1110460243856051e-05,
522
+ "loss": 0.7302,
523
  "step": 350
524
  },
525
  {
526
+ "epoch": 0.5255366395262768,
527
+ "grad_norm": 0.6706880141545486,
528
+ "learning_rate": 1.085293734881197e-05,
529
+ "loss": 0.7468,
530
  "step": 355
531
  },
532
  {
533
+ "epoch": 0.532938564026647,
534
+ "grad_norm": 0.6042342171269331,
535
+ "learning_rate": 1.0594843297078736e-05,
536
+ "loss": 0.766,
537
  "step": 360
538
  },
539
  {
540
+ "epoch": 0.540340488527017,
541
+ "grad_norm": 0.693508088289296,
542
+ "learning_rate": 1.0336350917462925e-05,
543
+ "loss": 0.7558,
544
  "step": 365
545
  },
546
  {
547
+ "epoch": 0.5477424130273871,
548
+ "grad_norm": 0.6083705213800933,
549
+ "learning_rate": 1.0077633305505402e-05,
550
+ "loss": 0.7433,
551
  "step": 370
552
  },
553
  {
554
+ "epoch": 0.5551443375277573,
555
+ "grad_norm": 0.6396792431151416,
556
+ "learning_rate": 9.818863707570476e-06,
557
+ "loss": 0.7608,
558
  "step": 375
559
  },
560
  {
561
+ "epoch": 0.5625462620281273,
562
+ "grad_norm": 0.6663076065375303,
563
+ "learning_rate": 9.560215404834094e-06,
564
+ "loss": 0.7515,
565
  "step": 380
566
  },
567
  {
568
+ "epoch": 0.5699481865284974,
569
+ "grad_norm": 0.641428537274285,
570
+ "learning_rate": 9.30186159724869e-06,
571
+ "loss": 0.7146,
572
  "step": 385
573
  },
574
  {
575
+ "epoch": 0.5773501110288675,
576
+ "grad_norm": 0.6138036144437788,
577
+ "learning_rate": 9.043975287562443e-06,
578
+ "loss": 0.747,
579
  "step": 390
580
  },
581
  {
582
+ "epoch": 0.5847520355292376,
583
+ "grad_norm": 0.6807331921757377,
584
+ "learning_rate": 8.786729165470584e-06,
585
+ "loss": 0.7253,
586
  "step": 395
587
  },
588
  {
589
+ "epoch": 0.5921539600296077,
590
+ "grad_norm": 0.6952002905984943,
591
+ "learning_rate": 8.530295491976338e-06,
592
+ "loss": 0.7307,
593
  "step": 400
594
  },
595
  {
596
+ "epoch": 0.5921539600296077,
597
+ "eval_loss": 0.7637839317321777,
598
+ "eval_runtime": 7.2078,
599
+ "eval_samples_per_second": 17.759,
600
+ "eval_steps_per_second": 2.22,
601
  "step": 400
602
  },
603
  {
604
+ "epoch": 0.5995558845299778,
605
+ "grad_norm": 0.5939454843322792,
606
+ "learning_rate": 8.274845984038916e-06,
607
+ "loss": 0.7174,
608
  "step": 405
609
  },
610
  {
611
+ "epoch": 0.6069578090303479,
612
+ "grad_norm": 0.6621271866381216,
613
+ "learning_rate": 8.020551699585843e-06,
614
+ "loss": 0.7469,
615
  "step": 410
616
  },
617
  {
618
+ "epoch": 0.6143597335307179,
619
+ "grad_norm": 0.6106430449913639,
620
+ "learning_rate": 7.76758292296659e-06,
621
+ "loss": 0.7264,
622
  "step": 415
623
  },
624
  {
625
+ "epoch": 0.6217616580310881,
626
+ "grad_norm": 0.6584389038177016,
627
+ "learning_rate": 7.5161090509242005e-06,
628
+ "loss": 0.7418,
629
  "step": 420
630
  },
631
  {
632
+ "epoch": 0.6291635825314582,
633
+ "grad_norm": 0.6508063180682058,
634
+ "learning_rate": 7.2662984791613186e-06,
635
+ "loss": 0.7345,
636
  "step": 425
637
  },
638
  {
639
+ "epoch": 0.6365655070318282,
640
+ "grad_norm": 0.654746417724555,
641
+ "learning_rate": 7.01831848957653e-06,
642
+ "loss": 0.7488,
643
  "step": 430
644
  },
645
  {
646
+ "epoch": 0.6439674315321984,
647
+ "grad_norm": 0.6038759794228741,
648
+ "learning_rate": 6.772335138246548e-06,
649
+ "loss": 0.747,
650
  "step": 435
651
  },
652
  {
653
+ "epoch": 0.6513693560325685,
654
+ "grad_norm": 0.6254763438931118,
655
+ "learning_rate": 6.528513144229256e-06,
656
+ "loss": 0.7427,
657
  "step": 440
658
  },
659
  {
660
+ "epoch": 0.6587712805329385,
661
+ "grad_norm": 0.6195437763354315,
662
+ "learning_rate": 6.287015779262064e-06,
663
+ "loss": 0.7489,
664
  "step": 445
665
  },
666
  {
667
+ "epoch": 0.6661732050333087,
668
+ "grad_norm": 0.6629664159964251,
669
+ "learning_rate": 6.048004758429451e-06,
670
+ "loss": 0.7274,
671
  "step": 450
672
  },
673
  {
674
+ "epoch": 0.6735751295336787,
675
+ "grad_norm": 0.6058164232925908,
676
+ "learning_rate": 5.811640131872867e-06,
677
+ "loss": 0.7496,
678
  "step": 455
679
  },
680
  {
681
+ "epoch": 0.6809770540340488,
682
+ "grad_norm": 0.6082658380867586,
683
+ "learning_rate": 5.578080177615575e-06,
684
+ "loss": 0.7201,
685
  "step": 460
686
  },
687
  {
688
+ "epoch": 0.688378978534419,
689
+ "grad_norm": 0.6242205120975641,
690
+ "learning_rate": 5.347481295574141e-06,
691
+ "loss": 0.7172,
692
  "step": 465
693
  },
694
  {
695
+ "epoch": 0.695780903034789,
696
+ "grad_norm": 0.6109755979913201,
697
+ "learning_rate": 5.119997902827584e-06,
698
+ "loss": 0.7286,
699
  "step": 470
700
  },
701
  {
702
+ "epoch": 0.7031828275351591,
703
+ "grad_norm": 0.6087033956225949,
704
+ "learning_rate": 4.8957823302142916e-06,
705
+ "loss": 0.7354,
706
  "step": 475
707
  },
708
  {
709
+ "epoch": 0.7105847520355293,
710
+ "grad_norm": 0.5865522874345606,
711
+ "learning_rate": 4.674984720325961e-06,
712
+ "loss": 0.7212,
713
  "step": 480
714
  },
715
  {
716
+ "epoch": 0.7179866765358993,
717
+ "grad_norm": 0.5900008473027598,
718
+ "learning_rate": 4.457752926966888e-06,
719
+ "loss": 0.715,
720
  "step": 485
721
  },
722
  {
723
+ "epoch": 0.7253886010362695,
724
+ "grad_norm": 0.5840665816418219,
725
+ "learning_rate": 4.244232416145839e-06,
726
+ "loss": 0.7337,
727
  "step": 490
728
  },
729
  {
730
+ "epoch": 0.7327905255366395,
731
+ "grad_norm": 0.5914947024608387,
732
+ "learning_rate": 4.0345661686669745e-06,
733
+ "loss": 0.7271,
734
  "step": 495
735
  },
736
  {
737
+ "epoch": 0.7401924500370096,
738
+ "grad_norm": 0.6196202183056477,
739
+ "learning_rate": 3.828894584384867e-06,
740
+ "loss": 0.7355,
741
  "step": 500
742
  },
743
  {
744
+ "epoch": 0.7401924500370096,
745
+ "eval_loss": 0.7563655972480774,
746
+ "eval_runtime": 7.2181,
747
+ "eval_samples_per_second": 17.733,
748
+ "eval_steps_per_second": 2.217,
749
  "step": 500
750
  },
751
  {
752
+ "epoch": 0.7475943745373798,
753
+ "grad_norm": 0.5586852736192075,
754
+ "learning_rate": 3.62735538818787e-06,
755
+ "loss": 0.7197,
756
  "step": 505
757
  },
758
  {
759
+ "epoch": 0.7549962990377498,
760
+ "grad_norm": 0.6337625854919152,
761
+ "learning_rate": 3.4300835377726904e-06,
762
+ "loss": 0.7233,
763
  "step": 510
764
  },
765
  {
766
+ "epoch": 0.7623982235381199,
767
+ "grad_norm": 0.6205123290247885,
768
+ "learning_rate": 3.2372111332720045e-06,
769
+ "loss": 0.7587,
770
  "step": 515
771
  },
772
  {
773
+ "epoch": 0.7698001480384901,
774
+ "grad_norm": 0.6153129450053498,
775
+ "learning_rate": 3.048867328795588e-06,
776
+ "loss": 0.7156,
777
  "step": 520
778
  },
779
  {
780
+ "epoch": 0.7772020725388601,
781
+ "grad_norm": 0.6026709629344417,
782
+ "learning_rate": 2.865178245944218e-06,
783
+ "loss": 0.7144,
784
  "step": 525
785
  },
786
  {
787
+ "epoch": 0.7846039970392302,
788
+ "grad_norm": 0.5724937932245526,
789
+ "learning_rate": 2.686266889354211e-06,
790
+ "loss": 0.7375,
791
  "step": 530
792
  },
793
  {
794
+ "epoch": 0.7920059215396003,
795
+ "grad_norm": 0.5925644676097567,
796
+ "learning_rate": 2.5122530643292274e-06,
797
+ "loss": 0.7429,
798
  "step": 535
799
  },
800
  {
801
+ "epoch": 0.7994078460399704,
802
+ "grad_norm": 0.6326300634198754,
803
+ "learning_rate": 2.3432532966144526e-06,
804
+ "loss": 0.7323,
805
  "step": 540
806
  },
807
  {
808
+ "epoch": 0.8068097705403405,
809
+ "grad_norm": 0.5849976467168821,
810
+ "learning_rate": 2.1793807543668857e-06,
811
+ "loss": 0.7338,
812
  "step": 545
813
  },
814
  {
815
+ "epoch": 0.8142116950407106,
816
+ "grad_norm": 0.5500210584534766,
817
+ "learning_rate": 2.0207451723739633e-06,
818
+ "loss": 0.7257,
819
  "step": 550
820
  },
821
  {
822
+ "epoch": 0.8216136195410807,
823
+ "grad_norm": 0.5676680461413595,
824
+ "learning_rate": 1.8674527785713247e-06,
825
+ "loss": 0.7325,
826
  "step": 555
827
  },
828
  {
829
+ "epoch": 0.8290155440414507,
830
+ "grad_norm": 0.6239434546631168,
831
+ "learning_rate": 1.7196062229088606e-06,
832
+ "loss": 0.6996,
833
  "step": 560
834
  },
835
  {
836
+ "epoch": 0.8364174685418209,
837
+ "grad_norm": 0.6254391500900318,
838
+ "learning_rate": 1.577304508612717e-06,
839
+ "loss": 0.7298,
840
  "step": 565
841
  },
842
  {
843
+ "epoch": 0.843819393042191,
844
+ "grad_norm": 0.5238189690989516,
845
+ "learning_rate": 1.4406429258892762e-06,
846
+ "loss": 0.7503,
847
  "step": 570
848
  },
849
  {
850
+ "epoch": 0.851221317542561,
851
+ "grad_norm": 0.6133649327761147,
852
+ "learning_rate": 1.3097129881154936e-06,
853
+ "loss": 0.7199,
854
  "step": 575
855
  },
856
  {
857
+ "epoch": 0.8586232420429312,
858
+ "grad_norm": 0.5832243304649319,
859
+ "learning_rate": 1.1846023705583442e-06,
860
+ "loss": 0.7164,
861
  "step": 580
862
  },
863
  {
864
+ "epoch": 0.8660251665433013,
865
+ "grad_norm": 0.581421408776636,
866
+ "learning_rate": 1.065394851664394e-06,
867
+ "loss": 0.7345,
868
  "step": 585
869
  },
870
  {
871
+ "epoch": 0.8734270910436713,
872
+ "grad_norm": 0.5486795664712047,
873
+ "learning_rate": 9.521702569588199e-07,
874
+ "loss": 0.7537,
875
  "step": 590
876
  },
877
  {
878
+ "epoch": 0.8808290155440415,
879
+ "grad_norm": 0.5762089839170463,
880
+ "learning_rate": 8.450044055914497e-07,
881
+ "loss": 0.7221,
882
  "step": 595
883
  },
884
  {
885
+ "epoch": 0.8882309400444115,
886
+ "grad_norm": 0.5637562364478066,
887
+ "learning_rate": 7.439690595656013e-07,
888
+ "loss": 0.7445,
889
  "step": 600
890
  },
891
  {
892
+ "epoch": 0.8882309400444115,
893
+ "eval_loss": 0.7531630992889404,
894
+ "eval_runtime": 6.3295,
895
+ "eval_samples_per_second": 20.223,
896
+ "eval_steps_per_second": 2.528,
897
  "step": 600
898
  },
899
  {
900
+ "epoch": 0.8956328645447816,
901
+ "grad_norm": 0.6333025954598674,
902
+ "learning_rate": 6.491318756837417e-07,
903
+ "loss": 0.7298,
904
  "step": 605
905
  },
906
  {
907
+ "epoch": 0.9030347890451518,
908
+ "grad_norm": 0.5151052584290405,
909
+ "learning_rate": 5.605563602421149e-07,
910
+ "loss": 0.7058,
911
  "step": 610
912
  },
913
  {
914
+ "epoch": 0.9104367135455218,
915
+ "grad_norm": 0.558083301486103,
916
+ "learning_rate": 4.783018265047179e-07,
917
+ "loss": 0.7557,
918
  "step": 615
919
  },
920
  {
921
+ "epoch": 0.9178386380458919,
922
+ "grad_norm": 0.5838054593517799,
923
+ "learning_rate": 4.024233549850509e-07,
924
+ "loss": 0.7436,
925
  "step": 620
926
  },
927
  {
928
+ "epoch": 0.9252405625462621,
929
+ "grad_norm": 0.5532527164872905,
930
+ "learning_rate": 3.329717565622825e-07,
931
+ "loss": 0.7404,
932
  "step": 625
933
  },
934
  {
935
+ "epoch": 0.9326424870466321,
936
+ "grad_norm": 0.5531839239223881,
937
+ "learning_rate": 2.6999353845651113e-07,
938
+ "loss": 0.724,
939
  "step": 630
940
  },
941
  {
942
+ "epoch": 0.9400444115470022,
943
+ "grad_norm": 0.5908996580668381,
944
+ "learning_rate": 2.1353087308590314e-07,
945
+ "loss": 0.7391,
946
  "step": 635
947
  },
948
  {
949
+ "epoch": 0.9474463360473723,
950
+ "grad_norm": 0.5583503930295213,
951
+ "learning_rate": 1.6362156982656085e-07,
952
+ "loss": 0.7292,
953
  "step": 640
954
  },
955
  {
956
+ "epoch": 0.9548482605477424,
957
+ "grad_norm": 0.5273929642155748,
958
+ "learning_rate": 1.2029904969404482e-07,
959
+ "loss": 0.7127,
960
  "step": 645
961
  },
962
  {
963
+ "epoch": 0.9622501850481125,
964
+ "grad_norm": 0.6043817732068986,
965
+ "learning_rate": 8.359232296349163e-08,
966
+ "loss": 0.7163,
967
  "step": 650
968
  },
969
  {
970
+ "epoch": 0.9696521095484826,
971
+ "grad_norm": 0.5680817655810946,
972
+ "learning_rate": 5.3525969743324356e-08,
973
+ "loss": 0.7322,
974
  "step": 655
975
  },
976
  {
977
+ "epoch": 0.9770540340488527,
978
+ "grad_norm": 0.5484921768212552,
979
+ "learning_rate": 3.012012351554017e-08,
980
+ "loss": 0.7064,
981
  "step": 660
982
  },
983
  {
984
+ "epoch": 0.9844559585492227,
985
+ "grad_norm": 0.5841736663763849,
986
+ "learning_rate": 1.3390457653639221e-08,
987
+ "loss": 0.7353,
988
  "step": 665
989
  },
990
  {
991
+ "epoch": 0.9918578830495929,
992
+ "grad_norm": 0.6439690756031937,
993
+ "learning_rate": 3.3481749271768726e-09,
994
+ "loss": 0.7463,
995
  "step": 670
996
  },
997
  {
998
+ "epoch": 0.999259807549963,
999
+ "grad_norm": 0.5777335037865771,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
  "learning_rate": 0.0,
1001
+ "loss": 0.7158,
1002
+ "step": 675
1003
  },
1004
  {
1005
+ "epoch": 0.999259807549963,
1006
+ "step": 675,
1007
+ "total_flos": 76888336760832.0,
1008
+ "train_loss": 0.7675936229140671,
1009
+ "train_runtime": 4627.4844,
1010
+ "train_samples_per_second": 4.67,
1011
+ "train_steps_per_second": 0.146
1012
  }
1013
  ],
1014
  "logging_steps": 5,
1015
+ "max_steps": 675,
1016
  "num_input_tokens_seen": 0,
1017
  "num_train_epochs": 1,
1018
  "save_steps": 500,
 
1028
  "attributes": {}
1029
  }
1030
  },
1031
+ "total_flos": 76888336760832.0,
1032
  "train_batch_size": 4,
1033
  "trial_name": null,
1034
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2d1db9b31890b71d03fe2d7aef1a05bea0ecf23be7567023d332facbc04d44a
3
  size 7416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b874f4c00970d0c1ca0bdeb1229662c5f353e103ad59da3dc823860bf66099a1
3
  size 7416