SummerSigh commited on
Commit
3afb0a7
·
verified ·
1 Parent(s): b6a4d04

Upload 8 files

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +964 -4
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ebc653e18935ef6e6d593d943659324be81a89c736dc6b33141d9a72bc9696c
3
  size 18494040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fe830f215593241d5a06be7b7382d26df2df95fa5877818c55602cc0aaaec7c
3
  size 18494040
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c16f2324ca710c73fffdab8ac0b51ca0937adca9d3ea12a4fcf7b6b75c642b3
3
  size 37035002
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ffc9e9ba9737e7047e65caa20e5526ed8da4c213c4ce3f2cca848b1ac8ecdbd
3
  size 37035002
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:386fcc8cc1089aade9450d86fb239ea3483f455fd2d78d8378645feecfec9d69
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff264f99d31b522cc7e2a4eac9d38606d0c58a34c0adc74d71e0ca8b371dc36
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:caeb79da12fade882c795419ac73c6806820c6ccef19831ac9e9b66b6ca1212b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c4c29cdd3791b07f27eccf7f1e479362eae64a2df7b8ed21d32b1b0f2e78f0d
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8981260572266088,
5
  "eval_steps": 500,
6
- "global_step": 81500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4351,11 +4351,971 @@
4351
  "loss": 4.4091,
4352
  "num_input_tokens_seen": 475882624,
4353
  "step": 81450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4354
  }
4355
  ],
4356
  "logging_steps": 150,
4357
  "max_steps": 272232,
4358
- "num_input_tokens_seen": 476177504,
4359
  "num_train_epochs": 3,
4360
  "save_steps": 500,
4361
  "stateful_callbacks": {
@@ -4370,7 +5330,7 @@
4370
  "attributes": {}
4371
  }
4372
  },
4373
- "total_flos": 7352485415362560.0,
4374
  "train_batch_size": 32,
4375
  "trial_name": null,
4376
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0964851864300316,
5
  "eval_steps": 500,
6
+ "global_step": 99500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4351
  "loss": 4.4091,
4352
  "num_input_tokens_seen": 475882624,
4353
  "step": 81450
4354
+ },
4355
+ {
4356
+ "epoch": 0.8992280523888501,
4357
+ "grad_norm": 1.8798983097076416,
4358
+ "learning_rate": 0.00011205900935980135,
4359
+ "loss": 4.4113,
4360
+ "num_input_tokens_seen": 476749664,
4361
+ "step": 81600
4362
+ },
4363
+ {
4364
+ "epoch": 0.9008810451322119,
4365
+ "grad_norm": 1.8249197006225586,
4366
+ "learning_rate": 0.00011197084796567584,
4367
+ "loss": 4.4059,
4368
+ "num_input_tokens_seen": 477650784,
4369
+ "step": 81750
4370
+ },
4371
+ {
4372
+ "epoch": 0.9025340378755737,
4373
+ "grad_norm": 1.9157739877700806,
4374
+ "learning_rate": 0.00011188268657155033,
4375
+ "loss": 4.4109,
4376
+ "num_input_tokens_seen": 478523840,
4377
+ "step": 81900
4378
+ },
4379
+ {
4380
+ "epoch": 0.9041870306189356,
4381
+ "grad_norm": 1.9503858089447021,
4382
+ "learning_rate": 0.00011179452517742482,
4383
+ "loss": 4.4139,
4384
+ "num_input_tokens_seen": 479399296,
4385
+ "step": 82050
4386
+ },
4387
+ {
4388
+ "epoch": 0.9058400233622974,
4389
+ "grad_norm": 1.8298823833465576,
4390
+ "learning_rate": 0.0001117063637832993,
4391
+ "loss": 4.4123,
4392
+ "num_input_tokens_seen": 480262240,
4393
+ "step": 82200
4394
+ },
4395
+ {
4396
+ "epoch": 0.9074930161056592,
4397
+ "grad_norm": 1.9161386489868164,
4398
+ "learning_rate": 0.0001116182023891738,
4399
+ "loss": 4.4058,
4400
+ "num_input_tokens_seen": 481141056,
4401
+ "step": 82350
4402
+ },
4403
+ {
4404
+ "epoch": 0.9091460088490212,
4405
+ "grad_norm": 1.872722864151001,
4406
+ "learning_rate": 0.00011153004099504828,
4407
+ "loss": 4.4279,
4408
+ "num_input_tokens_seen": 482014112,
4409
+ "step": 82500
4410
+ },
4411
+ {
4412
+ "epoch": 0.910799001592383,
4413
+ "grad_norm": 1.8831090927124023,
4414
+ "learning_rate": 0.00011144187960092277,
4415
+ "loss": 4.4121,
4416
+ "num_input_tokens_seen": 482898336,
4417
+ "step": 82650
4418
+ },
4419
+ {
4420
+ "epoch": 0.9124519943357449,
4421
+ "grad_norm": 1.8128923177719116,
4422
+ "learning_rate": 0.00011135371820679725,
4423
+ "loss": 4.4105,
4424
+ "num_input_tokens_seen": 483773760,
4425
+ "step": 82800
4426
+ },
4427
+ {
4428
+ "epoch": 0.9141049870791067,
4429
+ "grad_norm": 1.8982397317886353,
4430
+ "learning_rate": 0.00011126555681267175,
4431
+ "loss": 4.4234,
4432
+ "num_input_tokens_seen": 484671008,
4433
+ "step": 82950
4434
+ },
4435
+ {
4436
+ "epoch": 0.9157579798224685,
4437
+ "grad_norm": 1.8295831680297852,
4438
+ "learning_rate": 0.00011117739541854622,
4439
+ "loss": 4.4227,
4440
+ "num_input_tokens_seen": 485553984,
4441
+ "step": 83100
4442
+ },
4443
+ {
4444
+ "epoch": 0.9174109725658305,
4445
+ "grad_norm": 1.8975720405578613,
4446
+ "learning_rate": 0.00011108923402442071,
4447
+ "loss": 4.4176,
4448
+ "num_input_tokens_seen": 486416672,
4449
+ "step": 83250
4450
+ },
4451
+ {
4452
+ "epoch": 0.9190639653091923,
4453
+ "grad_norm": 1.8207321166992188,
4454
+ "learning_rate": 0.0001110010726302952,
4455
+ "loss": 4.424,
4456
+ "num_input_tokens_seen": 487286816,
4457
+ "step": 83400
4458
+ },
4459
+ {
4460
+ "epoch": 0.9207169580525542,
4461
+ "grad_norm": 1.9241523742675781,
4462
+ "learning_rate": 0.00011091291123616969,
4463
+ "loss": 4.4129,
4464
+ "num_input_tokens_seen": 488157088,
4465
+ "step": 83550
4466
+ },
4467
+ {
4468
+ "epoch": 0.922369950795916,
4469
+ "grad_norm": 1.8391443490982056,
4470
+ "learning_rate": 0.00011082474984204418,
4471
+ "loss": 4.4036,
4472
+ "num_input_tokens_seen": 489008320,
4473
+ "step": 83700
4474
+ },
4475
+ {
4476
+ "epoch": 0.9240229435392778,
4477
+ "grad_norm": 1.9244701862335205,
4478
+ "learning_rate": 0.00011073658844791865,
4479
+ "loss": 4.4215,
4480
+ "num_input_tokens_seen": 489887328,
4481
+ "step": 83850
4482
+ },
4483
+ {
4484
+ "epoch": 0.9256759362826397,
4485
+ "grad_norm": 1.8949611186981201,
4486
+ "learning_rate": 0.00011064842705379315,
4487
+ "loss": 4.4205,
4488
+ "num_input_tokens_seen": 490765504,
4489
+ "step": 84000
4490
+ },
4491
+ {
4492
+ "epoch": 0.9273289290260016,
4493
+ "grad_norm": 1.810594081878662,
4494
+ "learning_rate": 0.00011056026565966763,
4495
+ "loss": 4.4149,
4496
+ "num_input_tokens_seen": 491650144,
4497
+ "step": 84150
4498
+ },
4499
+ {
4500
+ "epoch": 0.9289819217693635,
4501
+ "grad_norm": 1.8556066751480103,
4502
+ "learning_rate": 0.00011047210426554213,
4503
+ "loss": 4.4102,
4504
+ "num_input_tokens_seen": 492539968,
4505
+ "step": 84300
4506
+ },
4507
+ {
4508
+ "epoch": 0.9306349145127253,
4509
+ "grad_norm": 1.8486409187316895,
4510
+ "learning_rate": 0.00011038394287141661,
4511
+ "loss": 4.4231,
4512
+ "num_input_tokens_seen": 493424352,
4513
+ "step": 84450
4514
+ },
4515
+ {
4516
+ "epoch": 0.9322879072560871,
4517
+ "grad_norm": 1.8193395137786865,
4518
+ "learning_rate": 0.00011029578147729111,
4519
+ "loss": 4.4195,
4520
+ "num_input_tokens_seen": 494301152,
4521
+ "step": 84600
4522
+ },
4523
+ {
4524
+ "epoch": 0.933940899999449,
4525
+ "grad_norm": 1.8344619274139404,
4526
+ "learning_rate": 0.00011020762008316558,
4527
+ "loss": 4.4075,
4528
+ "num_input_tokens_seen": 495177600,
4529
+ "step": 84750
4530
+ },
4531
+ {
4532
+ "epoch": 0.9355938927428109,
4533
+ "grad_norm": 1.781654953956604,
4534
+ "learning_rate": 0.00011012004643166758,
4535
+ "loss": 4.4075,
4536
+ "num_input_tokens_seen": 496047680,
4537
+ "step": 84900
4538
+ },
4539
+ {
4540
+ "epoch": 0.9372468854861727,
4541
+ "grad_norm": 1.935810923576355,
4542
+ "learning_rate": 0.00011003188503754206,
4543
+ "loss": 4.408,
4544
+ "num_input_tokens_seen": 496919488,
4545
+ "step": 85050
4546
+ },
4547
+ {
4548
+ "epoch": 0.9388998782295346,
4549
+ "grad_norm": 1.8130574226379395,
4550
+ "learning_rate": 0.00010994372364341655,
4551
+ "loss": 4.4152,
4552
+ "num_input_tokens_seen": 497821280,
4553
+ "step": 85200
4554
+ },
4555
+ {
4556
+ "epoch": 0.9405528709728964,
4557
+ "grad_norm": 1.9481176137924194,
4558
+ "learning_rate": 0.00010985556224929104,
4559
+ "loss": 4.4115,
4560
+ "num_input_tokens_seen": 498694560,
4561
+ "step": 85350
4562
+ },
4563
+ {
4564
+ "epoch": 0.9422058637162583,
4565
+ "grad_norm": 1.8938475847244263,
4566
+ "learning_rate": 0.00010976740085516553,
4567
+ "loss": 4.4077,
4568
+ "num_input_tokens_seen": 499570016,
4569
+ "step": 85500
4570
+ },
4571
+ {
4572
+ "epoch": 0.9438588564596201,
4573
+ "grad_norm": 1.8449296951293945,
4574
+ "learning_rate": 0.00010967923946104002,
4575
+ "loss": 4.4043,
4576
+ "num_input_tokens_seen": 500436288,
4577
+ "step": 85650
4578
+ },
4579
+ {
4580
+ "epoch": 0.945511849202982,
4581
+ "grad_norm": 1.775891661643982,
4582
+ "learning_rate": 0.00010959166580954201,
4583
+ "loss": 4.4112,
4584
+ "num_input_tokens_seen": 501298944,
4585
+ "step": 85800
4586
+ },
4587
+ {
4588
+ "epoch": 0.9471648419463439,
4589
+ "grad_norm": 1.827708125114441,
4590
+ "learning_rate": 0.00010950350441541651,
4591
+ "loss": 4.4088,
4592
+ "num_input_tokens_seen": 502206976,
4593
+ "step": 85950
4594
+ },
4595
+ {
4596
+ "epoch": 0.9488178346897057,
4597
+ "grad_norm": 1.8833259344100952,
4598
+ "learning_rate": 0.00010941534302129099,
4599
+ "loss": 4.4107,
4600
+ "num_input_tokens_seen": 503083488,
4601
+ "step": 86100
4602
+ },
4603
+ {
4604
+ "epoch": 0.9504708274330675,
4605
+ "grad_norm": 1.8116602897644043,
4606
+ "learning_rate": 0.00010932718162716549,
4607
+ "loss": 4.4109,
4608
+ "num_input_tokens_seen": 503978240,
4609
+ "step": 86250
4610
+ },
4611
+ {
4612
+ "epoch": 0.9521238201764294,
4613
+ "grad_norm": 1.8248368501663208,
4614
+ "learning_rate": 0.00010923902023303996,
4615
+ "loss": 4.4041,
4616
+ "num_input_tokens_seen": 504859744,
4617
+ "step": 86400
4618
+ },
4619
+ {
4620
+ "epoch": 0.9537768129197913,
4621
+ "grad_norm": 1.862371802330017,
4622
+ "learning_rate": 0.00010915085883891444,
4623
+ "loss": 4.4221,
4624
+ "num_input_tokens_seen": 505740576,
4625
+ "step": 86550
4626
+ },
4627
+ {
4628
+ "epoch": 0.9554298056631532,
4629
+ "grad_norm": 1.8358848094940186,
4630
+ "learning_rate": 0.00010906269744478894,
4631
+ "loss": 4.4051,
4632
+ "num_input_tokens_seen": 506615680,
4633
+ "step": 86700
4634
+ },
4635
+ {
4636
+ "epoch": 0.957082798406515,
4637
+ "grad_norm": 1.8686786890029907,
4638
+ "learning_rate": 0.00010897453605066342,
4639
+ "loss": 4.41,
4640
+ "num_input_tokens_seen": 507477824,
4641
+ "step": 86850
4642
+ },
4643
+ {
4644
+ "epoch": 0.9587357911498768,
4645
+ "grad_norm": 1.833525538444519,
4646
+ "learning_rate": 0.00010888637465653792,
4647
+ "loss": 4.4188,
4648
+ "num_input_tokens_seen": 508371904,
4649
+ "step": 87000
4650
+ },
4651
+ {
4652
+ "epoch": 0.9603887838932387,
4653
+ "grad_norm": 1.9611468315124512,
4654
+ "learning_rate": 0.00010879821326241239,
4655
+ "loss": 4.4034,
4656
+ "num_input_tokens_seen": 509250272,
4657
+ "step": 87150
4658
+ },
4659
+ {
4660
+ "epoch": 0.9620417766366006,
4661
+ "grad_norm": 1.6934946775436401,
4662
+ "learning_rate": 0.00010871005186828688,
4663
+ "loss": 4.4067,
4664
+ "num_input_tokens_seen": 510129568,
4665
+ "step": 87300
4666
+ },
4667
+ {
4668
+ "epoch": 0.9636947693799625,
4669
+ "grad_norm": 1.8400757312774658,
4670
+ "learning_rate": 0.00010862189047416137,
4671
+ "loss": 4.4,
4672
+ "num_input_tokens_seen": 510995328,
4673
+ "step": 87450
4674
+ },
4675
+ {
4676
+ "epoch": 0.9653477621233243,
4677
+ "grad_norm": 1.8682547807693481,
4678
+ "learning_rate": 0.00010853372908003586,
4679
+ "loss": 4.4026,
4680
+ "num_input_tokens_seen": 511880416,
4681
+ "step": 87600
4682
+ },
4683
+ {
4684
+ "epoch": 0.9670007548666861,
4685
+ "grad_norm": 1.8408825397491455,
4686
+ "learning_rate": 0.00010844556768591035,
4687
+ "loss": 4.4019,
4688
+ "num_input_tokens_seen": 512759072,
4689
+ "step": 87750
4690
+ },
4691
+ {
4692
+ "epoch": 0.968653747610048,
4693
+ "grad_norm": 1.9082870483398438,
4694
+ "learning_rate": 0.00010835740629178484,
4695
+ "loss": 4.4076,
4696
+ "num_input_tokens_seen": 513640896,
4697
+ "step": 87900
4698
+ },
4699
+ {
4700
+ "epoch": 0.9703067403534098,
4701
+ "grad_norm": 1.9512287378311157,
4702
+ "learning_rate": 0.00010826983264028683,
4703
+ "loss": 4.4217,
4704
+ "num_input_tokens_seen": 514532256,
4705
+ "step": 88050
4706
+ },
4707
+ {
4708
+ "epoch": 0.9719597330967717,
4709
+ "grad_norm": 1.9278032779693604,
4710
+ "learning_rate": 0.0001081816712461613,
4711
+ "loss": 4.4217,
4712
+ "num_input_tokens_seen": 515412864,
4713
+ "step": 88200
4714
+ },
4715
+ {
4716
+ "epoch": 0.9736127258401336,
4717
+ "grad_norm": 1.814817190170288,
4718
+ "learning_rate": 0.0001080935098520358,
4719
+ "loss": 4.4178,
4720
+ "num_input_tokens_seen": 516301408,
4721
+ "step": 88350
4722
+ },
4723
+ {
4724
+ "epoch": 0.9752657185834954,
4725
+ "grad_norm": 2.0495548248291016,
4726
+ "learning_rate": 0.00010800534845791028,
4727
+ "loss": 4.4101,
4728
+ "num_input_tokens_seen": 517179648,
4729
+ "step": 88500
4730
+ },
4731
+ {
4732
+ "epoch": 0.9769187113268573,
4733
+ "grad_norm": 1.8378854990005493,
4734
+ "learning_rate": 0.00010791718706378478,
4735
+ "loss": 4.4031,
4736
+ "num_input_tokens_seen": 518064288,
4737
+ "step": 88650
4738
+ },
4739
+ {
4740
+ "epoch": 0.9785717040702191,
4741
+ "grad_norm": 1.8407827615737915,
4742
+ "learning_rate": 0.00010782902566965926,
4743
+ "loss": 4.4135,
4744
+ "num_input_tokens_seen": 518947776,
4745
+ "step": 88800
4746
+ },
4747
+ {
4748
+ "epoch": 0.980224696813581,
4749
+ "grad_norm": 1.845199704170227,
4750
+ "learning_rate": 0.00010774086427553376,
4751
+ "loss": 4.4,
4752
+ "num_input_tokens_seen": 519822560,
4753
+ "step": 88950
4754
+ },
4755
+ {
4756
+ "epoch": 0.9818776895569429,
4757
+ "grad_norm": 1.8627071380615234,
4758
+ "learning_rate": 0.00010765270288140823,
4759
+ "loss": 4.4125,
4760
+ "num_input_tokens_seen": 520709888,
4761
+ "step": 89100
4762
+ },
4763
+ {
4764
+ "epoch": 0.9835306823003047,
4765
+ "grad_norm": 1.826648235321045,
4766
+ "learning_rate": 0.00010756454148728274,
4767
+ "loss": 4.4006,
4768
+ "num_input_tokens_seen": 521586080,
4769
+ "step": 89250
4770
+ },
4771
+ {
4772
+ "epoch": 0.9851836750436666,
4773
+ "grad_norm": 1.9315438270568848,
4774
+ "learning_rate": 0.00010747638009315721,
4775
+ "loss": 4.4104,
4776
+ "num_input_tokens_seen": 522450944,
4777
+ "step": 89400
4778
+ },
4779
+ {
4780
+ "epoch": 0.9868366677870284,
4781
+ "grad_norm": 1.8507201671600342,
4782
+ "learning_rate": 0.0001073882186990317,
4783
+ "loss": 4.412,
4784
+ "num_input_tokens_seen": 523335744,
4785
+ "step": 89550
4786
+ },
4787
+ {
4788
+ "epoch": 0.9884896605303902,
4789
+ "grad_norm": 1.8950568437576294,
4790
+ "learning_rate": 0.00010730005730490619,
4791
+ "loss": 4.4106,
4792
+ "num_input_tokens_seen": 524216960,
4793
+ "step": 89700
4794
+ },
4795
+ {
4796
+ "epoch": 0.9901426532737522,
4797
+ "grad_norm": 1.92140531539917,
4798
+ "learning_rate": 0.00010721189591078068,
4799
+ "loss": 4.4001,
4800
+ "num_input_tokens_seen": 525081088,
4801
+ "step": 89850
4802
+ },
4803
+ {
4804
+ "epoch": 0.991795646017114,
4805
+ "grad_norm": 1.9179210662841797,
4806
+ "learning_rate": 0.00010712373451665516,
4807
+ "loss": 4.4044,
4808
+ "num_input_tokens_seen": 525964384,
4809
+ "step": 90000
4810
+ },
4811
+ {
4812
+ "epoch": 0.9934486387604758,
4813
+ "grad_norm": 1.8168158531188965,
4814
+ "learning_rate": 0.00010703557312252965,
4815
+ "loss": 4.4082,
4816
+ "num_input_tokens_seen": 526852256,
4817
+ "step": 90150
4818
+ },
4819
+ {
4820
+ "epoch": 0.9951016315038377,
4821
+ "grad_norm": 2.0058753490448,
4822
+ "learning_rate": 0.00010694741172840414,
4823
+ "loss": 4.4061,
4824
+ "num_input_tokens_seen": 527721152,
4825
+ "step": 90300
4826
+ },
4827
+ {
4828
+ "epoch": 0.9967546242471995,
4829
+ "grad_norm": 2.0036473274230957,
4830
+ "learning_rate": 0.00010685925033427863,
4831
+ "loss": 4.4013,
4832
+ "num_input_tokens_seen": 528601472,
4833
+ "step": 90450
4834
+ },
4835
+ {
4836
+ "epoch": 0.9984076169905615,
4837
+ "grad_norm": 1.8912723064422607,
4838
+ "learning_rate": 0.0001067710889401531,
4839
+ "loss": 4.408,
4840
+ "num_input_tokens_seen": 529485344,
4841
+ "step": 90600
4842
+ },
4843
+ {
4844
+ "epoch": 1.0000606097339233,
4845
+ "grad_norm": 1.8539482355117798,
4846
+ "learning_rate": 0.00010668292754602761,
4847
+ "loss": 4.4072,
4848
+ "num_input_tokens_seen": 530388834,
4849
+ "step": 90750
4850
+ },
4851
+ {
4852
+ "epoch": 1.001713602477285,
4853
+ "grad_norm": 1.9648711681365967,
4854
+ "learning_rate": 0.00010659476615190208,
4855
+ "loss": 4.4023,
4856
+ "num_input_tokens_seen": 531261986,
4857
+ "step": 90900
4858
+ },
4859
+ {
4860
+ "epoch": 1.003366595220647,
4861
+ "grad_norm": 1.8683501482009888,
4862
+ "learning_rate": 0.00010650660475777658,
4863
+ "loss": 4.3953,
4864
+ "num_input_tokens_seen": 532138626,
4865
+ "step": 91050
4866
+ },
4867
+ {
4868
+ "epoch": 1.005019587964009,
4869
+ "grad_norm": 1.9149645566940308,
4870
+ "learning_rate": 0.00010641844336365106,
4871
+ "loss": 4.3907,
4872
+ "num_input_tokens_seen": 533008418,
4873
+ "step": 91200
4874
+ },
4875
+ {
4876
+ "epoch": 1.0066725807073706,
4877
+ "grad_norm": 1.804408073425293,
4878
+ "learning_rate": 0.00010633028196952556,
4879
+ "loss": 4.4053,
4880
+ "num_input_tokens_seen": 533881122,
4881
+ "step": 91350
4882
+ },
4883
+ {
4884
+ "epoch": 1.0083255734507326,
4885
+ "grad_norm": 1.8145511150360107,
4886
+ "learning_rate": 0.00010624212057540004,
4887
+ "loss": 4.3993,
4888
+ "num_input_tokens_seen": 534752738,
4889
+ "step": 91500
4890
+ },
4891
+ {
4892
+ "epoch": 1.0099785661940943,
4893
+ "grad_norm": 1.8206557035446167,
4894
+ "learning_rate": 0.00010615395918127454,
4895
+ "loss": 4.3954,
4896
+ "num_input_tokens_seen": 535622338,
4897
+ "step": 91650
4898
+ },
4899
+ {
4900
+ "epoch": 1.0116315589374563,
4901
+ "grad_norm": 1.880231261253357,
4902
+ "learning_rate": 0.00010606579778714901,
4903
+ "loss": 4.4002,
4904
+ "num_input_tokens_seen": 536488386,
4905
+ "step": 91800
4906
+ },
4907
+ {
4908
+ "epoch": 1.0132845516808182,
4909
+ "grad_norm": 1.8914505243301392,
4910
+ "learning_rate": 0.00010597763639302352,
4911
+ "loss": 4.3961,
4912
+ "num_input_tokens_seen": 537374658,
4913
+ "step": 91950
4914
+ },
4915
+ {
4916
+ "epoch": 1.01493754442418,
4917
+ "grad_norm": 1.9163919687271118,
4918
+ "learning_rate": 0.0001058900627415255,
4919
+ "loss": 4.3973,
4920
+ "num_input_tokens_seen": 538244194,
4921
+ "step": 92100
4922
+ },
4923
+ {
4924
+ "epoch": 1.0165905371675419,
4925
+ "grad_norm": 1.9003725051879883,
4926
+ "learning_rate": 0.00010580190134739998,
4927
+ "loss": 4.3957,
4928
+ "num_input_tokens_seen": 539130914,
4929
+ "step": 92250
4930
+ },
4931
+ {
4932
+ "epoch": 1.0182435299109036,
4933
+ "grad_norm": 1.838493824005127,
4934
+ "learning_rate": 0.00010571373995327447,
4935
+ "loss": 4.3878,
4936
+ "num_input_tokens_seen": 540022850,
4937
+ "step": 92400
4938
+ },
4939
+ {
4940
+ "epoch": 1.0198965226542656,
4941
+ "grad_norm": 1.9080275297164917,
4942
+ "learning_rate": 0.00010562557855914896,
4943
+ "loss": 4.3917,
4944
+ "num_input_tokens_seen": 540895874,
4945
+ "step": 92550
4946
+ },
4947
+ {
4948
+ "epoch": 1.0215495153976275,
4949
+ "grad_norm": 1.8060060739517212,
4950
+ "learning_rate": 0.00010553741716502345,
4951
+ "loss": 4.3953,
4952
+ "num_input_tokens_seen": 541762658,
4953
+ "step": 92700
4954
+ },
4955
+ {
4956
+ "epoch": 1.0232025081409892,
4957
+ "grad_norm": 1.903151273727417,
4958
+ "learning_rate": 0.00010544925577089792,
4959
+ "loss": 4.3952,
4960
+ "num_input_tokens_seen": 542643138,
4961
+ "step": 92850
4962
+ },
4963
+ {
4964
+ "epoch": 1.0248555008843512,
4965
+ "grad_norm": 1.9957008361816406,
4966
+ "learning_rate": 0.00010536109437677243,
4967
+ "loss": 4.3952,
4968
+ "num_input_tokens_seen": 543505570,
4969
+ "step": 93000
4970
+ },
4971
+ {
4972
+ "epoch": 1.026508493627713,
4973
+ "grad_norm": 1.8897976875305176,
4974
+ "learning_rate": 0.0001052729329826469,
4975
+ "loss": 4.395,
4976
+ "num_input_tokens_seen": 544378466,
4977
+ "step": 93150
4978
+ },
4979
+ {
4980
+ "epoch": 1.0281614863710749,
4981
+ "grad_norm": 1.895654320716858,
4982
+ "learning_rate": 0.0001051847715885214,
4983
+ "loss": 4.4016,
4984
+ "num_input_tokens_seen": 545256738,
4985
+ "step": 93300
4986
+ },
4987
+ {
4988
+ "epoch": 1.0298144791144366,
4989
+ "grad_norm": 1.9977262020111084,
4990
+ "learning_rate": 0.00010509661019439588,
4991
+ "loss": 4.3994,
4992
+ "num_input_tokens_seen": 546150498,
4993
+ "step": 93450
4994
+ },
4995
+ {
4996
+ "epoch": 1.0314674718577985,
4997
+ "grad_norm": 1.82341468334198,
4998
+ "learning_rate": 0.00010500844880027038,
4999
+ "loss": 4.4003,
5000
+ "num_input_tokens_seen": 547021922,
5001
+ "step": 93600
5002
+ },
5003
+ {
5004
+ "epoch": 1.0331204646011605,
5005
+ "grad_norm": 1.7573907375335693,
5006
+ "learning_rate": 0.00010492028740614485,
5007
+ "loss": 4.3936,
5008
+ "num_input_tokens_seen": 547888450,
5009
+ "step": 93750
5010
+ },
5011
+ {
5012
+ "epoch": 1.0347734573445222,
5013
+ "grad_norm": 2.011516571044922,
5014
+ "learning_rate": 0.00010483212601201936,
5015
+ "loss": 4.3861,
5016
+ "num_input_tokens_seen": 548752514,
5017
+ "step": 93900
5018
+ },
5019
+ {
5020
+ "epoch": 1.0364264500878841,
5021
+ "grad_norm": 1.8368171453475952,
5022
+ "learning_rate": 0.00010474396461789383,
5023
+ "loss": 4.3975,
5024
+ "num_input_tokens_seen": 549641218,
5025
+ "step": 94050
5026
+ },
5027
+ {
5028
+ "epoch": 1.0380794428312459,
5029
+ "grad_norm": 2.0658929347991943,
5030
+ "learning_rate": 0.00010465639096639582,
5031
+ "loss": 4.4042,
5032
+ "num_input_tokens_seen": 550521378,
5033
+ "step": 94200
5034
+ },
5035
+ {
5036
+ "epoch": 1.0397324355746078,
5037
+ "grad_norm": 1.8516744375228882,
5038
+ "learning_rate": 0.00010456822957227031,
5039
+ "loss": 4.3937,
5040
+ "num_input_tokens_seen": 551403138,
5041
+ "step": 94350
5042
+ },
5043
+ {
5044
+ "epoch": 1.0413854283179698,
5045
+ "grad_norm": 1.9704523086547852,
5046
+ "learning_rate": 0.0001044800681781448,
5047
+ "loss": 4.3892,
5048
+ "num_input_tokens_seen": 552268866,
5049
+ "step": 94500
5050
+ },
5051
+ {
5052
+ "epoch": 1.0430384210613315,
5053
+ "grad_norm": 1.8856583833694458,
5054
+ "learning_rate": 0.00010439190678401929,
5055
+ "loss": 4.3969,
5056
+ "num_input_tokens_seen": 553139522,
5057
+ "step": 94650
5058
+ },
5059
+ {
5060
+ "epoch": 1.0446914138046934,
5061
+ "grad_norm": 1.9823240041732788,
5062
+ "learning_rate": 0.00010430374538989378,
5063
+ "loss": 4.3937,
5064
+ "num_input_tokens_seen": 554009858,
5065
+ "step": 94800
5066
+ },
5067
+ {
5068
+ "epoch": 1.0463444065480552,
5069
+ "grad_norm": 1.8391404151916504,
5070
+ "learning_rate": 0.00010421558399576827,
5071
+ "loss": 4.3891,
5072
+ "num_input_tokens_seen": 554896962,
5073
+ "step": 94950
5074
+ },
5075
+ {
5076
+ "epoch": 1.047997399291417,
5077
+ "grad_norm": 1.829777717590332,
5078
+ "learning_rate": 0.00010412742260164274,
5079
+ "loss": 4.3996,
5080
+ "num_input_tokens_seen": 555778274,
5081
+ "step": 95100
5082
+ },
5083
+ {
5084
+ "epoch": 1.049650392034779,
5085
+ "grad_norm": 1.884190320968628,
5086
+ "learning_rate": 0.00010403926120751724,
5087
+ "loss": 4.3899,
5088
+ "num_input_tokens_seen": 556658210,
5089
+ "step": 95250
5090
+ },
5091
+ {
5092
+ "epoch": 1.0513033847781408,
5093
+ "grad_norm": 1.8368123769760132,
5094
+ "learning_rate": 0.00010395109981339172,
5095
+ "loss": 4.3989,
5096
+ "num_input_tokens_seen": 557549442,
5097
+ "step": 95400
5098
+ },
5099
+ {
5100
+ "epoch": 1.0529563775215027,
5101
+ "grad_norm": 1.7985849380493164,
5102
+ "learning_rate": 0.00010386293841926622,
5103
+ "loss": 4.3868,
5104
+ "num_input_tokens_seen": 558417634,
5105
+ "step": 95550
5106
+ },
5107
+ {
5108
+ "epoch": 1.0546093702648645,
5109
+ "grad_norm": 1.8913172483444214,
5110
+ "learning_rate": 0.0001037747770251407,
5111
+ "loss": 4.4031,
5112
+ "num_input_tokens_seen": 559314882,
5113
+ "step": 95700
5114
+ },
5115
+ {
5116
+ "epoch": 1.0562623630082264,
5117
+ "grad_norm": 1.9179192781448364,
5118
+ "learning_rate": 0.0001036866156310152,
5119
+ "loss": 4.3812,
5120
+ "num_input_tokens_seen": 560207298,
5121
+ "step": 95850
5122
+ },
5123
+ {
5124
+ "epoch": 1.0579153557515883,
5125
+ "grad_norm": 1.8890949487686157,
5126
+ "learning_rate": 0.00010359845423688967,
5127
+ "loss": 4.3916,
5128
+ "num_input_tokens_seen": 561097570,
5129
+ "step": 96000
5130
+ },
5131
+ {
5132
+ "epoch": 1.05956834849495,
5133
+ "grad_norm": 1.7995752096176147,
5134
+ "learning_rate": 0.00010351088058539166,
5135
+ "loss": 4.3933,
5136
+ "num_input_tokens_seen": 561973218,
5137
+ "step": 96150
5138
+ },
5139
+ {
5140
+ "epoch": 1.061221341238312,
5141
+ "grad_norm": 1.928031086921692,
5142
+ "learning_rate": 0.00010342330693389365,
5143
+ "loss": 4.3914,
5144
+ "num_input_tokens_seen": 562851074,
5145
+ "step": 96300
5146
+ },
5147
+ {
5148
+ "epoch": 1.0628743339816737,
5149
+ "grad_norm": 1.94650137424469,
5150
+ "learning_rate": 0.00010333514553976814,
5151
+ "loss": 4.3766,
5152
+ "num_input_tokens_seen": 563725506,
5153
+ "step": 96450
5154
+ },
5155
+ {
5156
+ "epoch": 1.0645273267250357,
5157
+ "grad_norm": 1.8535209894180298,
5158
+ "learning_rate": 0.00010324698414564263,
5159
+ "loss": 4.3916,
5160
+ "num_input_tokens_seen": 564576034,
5161
+ "step": 96600
5162
+ },
5163
+ {
5164
+ "epoch": 1.0661803194683976,
5165
+ "grad_norm": 1.9456048011779785,
5166
+ "learning_rate": 0.00010315882275151712,
5167
+ "loss": 4.3975,
5168
+ "num_input_tokens_seen": 565446626,
5169
+ "step": 96750
5170
+ },
5171
+ {
5172
+ "epoch": 1.0678333122117594,
5173
+ "grad_norm": 1.8319114446640015,
5174
+ "learning_rate": 0.0001030706613573916,
5175
+ "loss": 4.3977,
5176
+ "num_input_tokens_seen": 566321378,
5177
+ "step": 96900
5178
+ },
5179
+ {
5180
+ "epoch": 1.0694863049551213,
5181
+ "grad_norm": 2.1267592906951904,
5182
+ "learning_rate": 0.0001029824999632661,
5183
+ "loss": 4.3896,
5184
+ "num_input_tokens_seen": 567203778,
5185
+ "step": 97050
5186
+ },
5187
+ {
5188
+ "epoch": 1.071139297698483,
5189
+ "grad_norm": 1.8523855209350586,
5190
+ "learning_rate": 0.00010289433856914057,
5191
+ "loss": 4.3868,
5192
+ "num_input_tokens_seen": 568083906,
5193
+ "step": 97200
5194
+ },
5195
+ {
5196
+ "epoch": 1.072792290441845,
5197
+ "grad_norm": 1.882645606994629,
5198
+ "learning_rate": 0.00010280617717501507,
5199
+ "loss": 4.3976,
5200
+ "num_input_tokens_seen": 568970690,
5201
+ "step": 97350
5202
+ },
5203
+ {
5204
+ "epoch": 1.0744452831852067,
5205
+ "grad_norm": 1.9347394704818726,
5206
+ "learning_rate": 0.00010271801578088955,
5207
+ "loss": 4.3905,
5208
+ "num_input_tokens_seen": 569844482,
5209
+ "step": 97500
5210
+ },
5211
+ {
5212
+ "epoch": 1.0760982759285687,
5213
+ "grad_norm": 1.855491280555725,
5214
+ "learning_rate": 0.00010262985438676405,
5215
+ "loss": 4.3886,
5216
+ "num_input_tokens_seen": 570717602,
5217
+ "step": 97650
5218
+ },
5219
+ {
5220
+ "epoch": 1.0777512686719306,
5221
+ "grad_norm": 1.8031153678894043,
5222
+ "learning_rate": 0.00010254169299263853,
5223
+ "loss": 4.3807,
5224
+ "num_input_tokens_seen": 571591586,
5225
+ "step": 97800
5226
+ },
5227
+ {
5228
+ "epoch": 1.0794042614152923,
5229
+ "grad_norm": 1.9792248010635376,
5230
+ "learning_rate": 0.00010245353159851303,
5231
+ "loss": 4.389,
5232
+ "num_input_tokens_seen": 572476162,
5233
+ "step": 97950
5234
+ },
5235
+ {
5236
+ "epoch": 1.0810572541586543,
5237
+ "grad_norm": 1.9110472202301025,
5238
+ "learning_rate": 0.0001023653702043875,
5239
+ "loss": 4.3889,
5240
+ "num_input_tokens_seen": 573353346,
5241
+ "step": 98100
5242
+ },
5243
+ {
5244
+ "epoch": 1.082710246902016,
5245
+ "grad_norm": 1.9655945301055908,
5246
+ "learning_rate": 0.000102277208810262,
5247
+ "loss": 4.3808,
5248
+ "num_input_tokens_seen": 574237986,
5249
+ "step": 98250
5250
+ },
5251
+ {
5252
+ "epoch": 1.084363239645378,
5253
+ "grad_norm": 1.806372880935669,
5254
+ "learning_rate": 0.00010218904741613648,
5255
+ "loss": 4.389,
5256
+ "num_input_tokens_seen": 575113346,
5257
+ "step": 98400
5258
+ },
5259
+ {
5260
+ "epoch": 1.08601623238874,
5261
+ "grad_norm": 1.9266657829284668,
5262
+ "learning_rate": 0.00010210147376463847,
5263
+ "loss": 4.3957,
5264
+ "num_input_tokens_seen": 576010818,
5265
+ "step": 98550
5266
+ },
5267
+ {
5268
+ "epoch": 1.0876692251321016,
5269
+ "grad_norm": 1.8409209251403809,
5270
+ "learning_rate": 0.00010201331237051296,
5271
+ "loss": 4.3949,
5272
+ "num_input_tokens_seen": 576872610,
5273
+ "step": 98700
5274
+ },
5275
+ {
5276
+ "epoch": 1.0893222178754636,
5277
+ "grad_norm": 1.7804383039474487,
5278
+ "learning_rate": 0.00010192515097638745,
5279
+ "loss": 4.3835,
5280
+ "num_input_tokens_seen": 577737090,
5281
+ "step": 98850
5282
+ },
5283
+ {
5284
+ "epoch": 1.0909752106188253,
5285
+ "grad_norm": 1.8269861936569214,
5286
+ "learning_rate": 0.00010183698958226194,
5287
+ "loss": 4.3967,
5288
+ "num_input_tokens_seen": 578610178,
5289
+ "step": 99000
5290
+ },
5291
+ {
5292
+ "epoch": 1.0926282033621872,
5293
+ "grad_norm": 1.9065062999725342,
5294
+ "learning_rate": 0.00010174882818813641,
5295
+ "loss": 4.395,
5296
+ "num_input_tokens_seen": 579483074,
5297
+ "step": 99150
5298
+ },
5299
+ {
5300
+ "epoch": 1.0942811961055492,
5301
+ "grad_norm": 1.8511546850204468,
5302
+ "learning_rate": 0.00010166066679401092,
5303
+ "loss": 4.3847,
5304
+ "num_input_tokens_seen": 580339138,
5305
+ "step": 99300
5306
+ },
5307
+ {
5308
+ "epoch": 1.095934188848911,
5309
+ "grad_norm": 1.9003854990005493,
5310
+ "learning_rate": 0.00010157250539988539,
5311
+ "loss": 4.3878,
5312
+ "num_input_tokens_seen": 581214146,
5313
+ "step": 99450
5314
  }
5315
  ],
5316
  "logging_steps": 150,
5317
  "max_steps": 272232,
5318
+ "num_input_tokens_seen": 581503010,
5319
  "num_train_epochs": 3,
5320
  "save_steps": 500,
5321
  "stateful_callbacks": {
 
5330
  "attributes": {}
5331
  }
5332
  },
5333
+ "total_flos": 8978778636326400.0,
5334
  "train_batch_size": 32,
5335
  "trial_name": null,
5336
  "trial_params": null