romainnn commited on
Commit
de2af30
·
verified ·
1 Parent(s): 919d7f7

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8c3b7354bdafaf40f8c3750e79685a988156602c01307cad8150c8d3191850d
3
  size 289512208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc7f9f12a7d975eceff199ac96f1fb4abb58016ab42da13fe25b1845a1e0b3ce
3
  size 289512208
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:062ebcb0ddbe2ed847f0c77f0c46ff28b6fe4af7fe26bb63c3feadce3f4f1df6
3
  size 147781972
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37a24bd2239b20338e528442d83d2139315c8017ad79ac6456a27ebc2a7c4982
3
  size 147781972
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2cd9da745e01d9c9e271b256eba61ac6e1f1f956439fe5a27252af92b2c1936d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ace9fc649252ea1299cd2d9b1953184b717d1b1778bd2d51cf81f8fdd857fb
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:409f5f9e71c66aad357639b3334ffe5097e507a7b7ddeed673d739f3f24cfba8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14de197ce4fca667a77214b11d375124cfec5ed9c075fb60180e734827aaa864
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 1.2162970304489136,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-500",
4
- "epoch": 0.031782606968336576,
5
  "eval_steps": 100,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3555,6 +3555,714 @@
3555
  "eval_samples_per_second": 4.035,
3556
  "eval_steps_per_second": 1.009,
3557
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3558
  }
3559
  ],
3560
  "logging_steps": 1,
@@ -3583,7 +4291,7 @@
3583
  "attributes": {}
3584
  }
3585
  },
3586
- "total_flos": 2.59811262332928e+18,
3587
  "train_batch_size": 4,
3588
  "trial_name": null,
3589
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.2089511156082153,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-600",
4
+ "epoch": 0.03813912836200389,
5
  "eval_steps": 100,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3555
  "eval_samples_per_second": 4.035,
3556
  "eval_steps_per_second": 1.009,
3557
  "step": 500
3558
+ },
3559
+ {
3560
+ "epoch": 0.03184617218227325,
3561
+ "grad_norm": 0.26652681827545166,
3562
+ "learning_rate": 7.162871012503003e-05,
3563
+ "loss": 1.2841,
3564
+ "step": 501
3565
+ },
3566
+ {
3567
+ "epoch": 0.031909737396209926,
3568
+ "grad_norm": 0.258722722530365,
3569
+ "learning_rate": 7.126596213658488e-05,
3570
+ "loss": 1.3076,
3571
+ "step": 502
3572
+ },
3573
+ {
3574
+ "epoch": 0.031973302610146595,
3575
+ "grad_norm": 0.2519145607948303,
3576
+ "learning_rate": 7.090362580911808e-05,
3577
+ "loss": 1.3191,
3578
+ "step": 503
3579
+ },
3580
+ {
3581
+ "epoch": 0.03203686782408327,
3582
+ "grad_norm": 0.24408216774463654,
3583
+ "learning_rate": 7.05417063336761e-05,
3584
+ "loss": 1.2056,
3585
+ "step": 504
3586
+ },
3587
+ {
3588
+ "epoch": 0.032100433038019946,
3589
+ "grad_norm": 0.25859251618385315,
3590
+ "learning_rate": 7.018020889533348e-05,
3591
+ "loss": 1.2509,
3592
+ "step": 505
3593
+ },
3594
+ {
3595
+ "epoch": 0.032163998251956614,
3596
+ "grad_norm": 0.274295836687088,
3597
+ "learning_rate": 6.981913867311819e-05,
3598
+ "loss": 1.1457,
3599
+ "step": 506
3600
+ },
3601
+ {
3602
+ "epoch": 0.03222756346589329,
3603
+ "grad_norm": 0.25392991304397583,
3604
+ "learning_rate": 6.945850083993781e-05,
3605
+ "loss": 1.1389,
3606
+ "step": 507
3607
+ },
3608
+ {
3609
+ "epoch": 0.032291128679829965,
3610
+ "grad_norm": 0.2518613338470459,
3611
+ "learning_rate": 6.909830056250527e-05,
3612
+ "loss": 1.1288,
3613
+ "step": 508
3614
+ },
3615
+ {
3616
+ "epoch": 0.032354693893766634,
3617
+ "grad_norm": 0.25153687596321106,
3618
+ "learning_rate": 6.873854300126467e-05,
3619
+ "loss": 1.1106,
3620
+ "step": 509
3621
+ },
3622
+ {
3623
+ "epoch": 0.03241825910770331,
3624
+ "grad_norm": 0.2594717741012573,
3625
+ "learning_rate": 6.83792333103176e-05,
3626
+ "loss": 1.2556,
3627
+ "step": 510
3628
+ },
3629
+ {
3630
+ "epoch": 0.032481824321639985,
3631
+ "grad_norm": 0.2567812204360962,
3632
+ "learning_rate": 6.802037663734923e-05,
3633
+ "loss": 1.1773,
3634
+ "step": 511
3635
+ },
3636
+ {
3637
+ "epoch": 0.03254538953557665,
3638
+ "grad_norm": 0.2533416748046875,
3639
+ "learning_rate": 6.766197812355438e-05,
3640
+ "loss": 1.2053,
3641
+ "step": 512
3642
+ },
3643
+ {
3644
+ "epoch": 0.03260895474951333,
3645
+ "grad_norm": 0.2625131607055664,
3646
+ "learning_rate": 6.73040429035641e-05,
3647
+ "loss": 1.2467,
3648
+ "step": 513
3649
+ },
3650
+ {
3651
+ "epoch": 0.032672519963450004,
3652
+ "grad_norm": 0.2665756344795227,
3653
+ "learning_rate": 6.69465761053721e-05,
3654
+ "loss": 1.2737,
3655
+ "step": 514
3656
+ },
3657
+ {
3658
+ "epoch": 0.03273608517738667,
3659
+ "grad_norm": 0.2558302581310272,
3660
+ "learning_rate": 6.658958285026102e-05,
3661
+ "loss": 1.2684,
3662
+ "step": 515
3663
+ },
3664
+ {
3665
+ "epoch": 0.03279965039132335,
3666
+ "grad_norm": 0.24979649484157562,
3667
+ "learning_rate": 6.623306825272937e-05,
3668
+ "loss": 1.2144,
3669
+ "step": 516
3670
+ },
3671
+ {
3672
+ "epoch": 0.032863215605260024,
3673
+ "grad_norm": 0.25010988116264343,
3674
+ "learning_rate": 6.58770374204181e-05,
3675
+ "loss": 1.2679,
3676
+ "step": 517
3677
+ },
3678
+ {
3679
+ "epoch": 0.03292678081919669,
3680
+ "grad_norm": 0.2494756430387497,
3681
+ "learning_rate": 6.552149545403739e-05,
3682
+ "loss": 1.2291,
3683
+ "step": 518
3684
+ },
3685
+ {
3686
+ "epoch": 0.03299034603313337,
3687
+ "grad_norm": 0.2528434097766876,
3688
+ "learning_rate": 6.516644744729367e-05,
3689
+ "loss": 1.2519,
3690
+ "step": 519
3691
+ },
3692
+ {
3693
+ "epoch": 0.03305391124707004,
3694
+ "grad_norm": 0.2631024718284607,
3695
+ "learning_rate": 6.48118984868167e-05,
3696
+ "loss": 1.2086,
3697
+ "step": 520
3698
+ },
3699
+ {
3700
+ "epoch": 0.03311747646100671,
3701
+ "grad_norm": 0.2514025866985321,
3702
+ "learning_rate": 6.445785365208645e-05,
3703
+ "loss": 1.2098,
3704
+ "step": 521
3705
+ },
3706
+ {
3707
+ "epoch": 0.03318104167494339,
3708
+ "grad_norm": 0.25096845626831055,
3709
+ "learning_rate": 6.410431801536058e-05,
3710
+ "loss": 1.2062,
3711
+ "step": 522
3712
+ },
3713
+ {
3714
+ "epoch": 0.03324460688888006,
3715
+ "grad_norm": 0.24552412331104279,
3716
+ "learning_rate": 6.375129664160168e-05,
3717
+ "loss": 1.139,
3718
+ "step": 523
3719
+ },
3720
+ {
3721
+ "epoch": 0.03330817210281673,
3722
+ "grad_norm": 0.257235586643219,
3723
+ "learning_rate": 6.339879458840465e-05,
3724
+ "loss": 1.3102,
3725
+ "step": 524
3726
+ },
3727
+ {
3728
+ "epoch": 0.03337173731675341,
3729
+ "grad_norm": 0.24831882119178772,
3730
+ "learning_rate": 6.304681690592431e-05,
3731
+ "loss": 1.1848,
3732
+ "step": 525
3733
+ },
3734
+ {
3735
+ "epoch": 0.03343530253069008,
3736
+ "grad_norm": 0.2522595524787903,
3737
+ "learning_rate": 6.269536863680307e-05,
3738
+ "loss": 1.228,
3739
+ "step": 526
3740
+ },
3741
+ {
3742
+ "epoch": 0.03349886774462675,
3743
+ "grad_norm": 0.25006258487701416,
3744
+ "learning_rate": 6.23444548160986e-05,
3745
+ "loss": 1.1059,
3746
+ "step": 527
3747
+ },
3748
+ {
3749
+ "epoch": 0.033562432958563426,
3750
+ "grad_norm": 0.2498662769794464,
3751
+ "learning_rate": 6.199408047121174e-05,
3752
+ "loss": 1.2354,
3753
+ "step": 528
3754
+ },
3755
+ {
3756
+ "epoch": 0.0336259981725001,
3757
+ "grad_norm": 0.25638020038604736,
3758
+ "learning_rate": 6.16442506218146e-05,
3759
+ "loss": 1.2332,
3760
+ "step": 529
3761
+ },
3762
+ {
3763
+ "epoch": 0.03368956338643677,
3764
+ "grad_norm": 0.26172903180122375,
3765
+ "learning_rate": 6.129497027977829e-05,
3766
+ "loss": 1.2062,
3767
+ "step": 530
3768
+ },
3769
+ {
3770
+ "epoch": 0.033753128600373446,
3771
+ "grad_norm": 0.2451334148645401,
3772
+ "learning_rate": 6.0946244449101574e-05,
3773
+ "loss": 1.2513,
3774
+ "step": 531
3775
+ },
3776
+ {
3777
+ "epoch": 0.03381669381431012,
3778
+ "grad_norm": 0.2550620436668396,
3779
+ "learning_rate": 6.059807812583883e-05,
3780
+ "loss": 1.08,
3781
+ "step": 532
3782
+ },
3783
+ {
3784
+ "epoch": 0.03388025902824679,
3785
+ "grad_norm": 0.2467305064201355,
3786
+ "learning_rate": 6.02504762980286e-05,
3787
+ "loss": 1.1822,
3788
+ "step": 533
3789
+ },
3790
+ {
3791
+ "epoch": 0.033943824242183465,
3792
+ "grad_norm": 0.2539975345134735,
3793
+ "learning_rate": 5.990344394562226e-05,
3794
+ "loss": 1.2334,
3795
+ "step": 534
3796
+ },
3797
+ {
3798
+ "epoch": 0.03400738945612014,
3799
+ "grad_norm": 0.26874592900276184,
3800
+ "learning_rate": 5.955698604041231e-05,
3801
+ "loss": 1.2736,
3802
+ "step": 535
3803
+ },
3804
+ {
3805
+ "epoch": 0.03407095467005681,
3806
+ "grad_norm": 0.25802844762802124,
3807
+ "learning_rate": 5.92111075459616e-05,
3808
+ "loss": 1.1959,
3809
+ "step": 536
3810
+ },
3811
+ {
3812
+ "epoch": 0.034134519883993485,
3813
+ "grad_norm": 0.2507290542125702,
3814
+ "learning_rate": 5.88658134175319e-05,
3815
+ "loss": 1.2045,
3816
+ "step": 537
3817
+ },
3818
+ {
3819
+ "epoch": 0.03419808509793016,
3820
+ "grad_norm": 0.25168654322624207,
3821
+ "learning_rate": 5.852110860201294e-05,
3822
+ "loss": 1.259,
3823
+ "step": 538
3824
+ },
3825
+ {
3826
+ "epoch": 0.03426165031186683,
3827
+ "grad_norm": 0.25015729665756226,
3828
+ "learning_rate": 5.817699803785174e-05,
3829
+ "loss": 1.0865,
3830
+ "step": 539
3831
+ },
3832
+ {
3833
+ "epoch": 0.034325215525803504,
3834
+ "grad_norm": 0.24665945768356323,
3835
+ "learning_rate": 5.7833486654981606e-05,
3836
+ "loss": 1.1366,
3837
+ "step": 540
3838
+ },
3839
+ {
3840
+ "epoch": 0.03438878073974018,
3841
+ "grad_norm": 0.2521713972091675,
3842
+ "learning_rate": 5.7490579374751686e-05,
3843
+ "loss": 1.2052,
3844
+ "step": 541
3845
+ },
3846
+ {
3847
+ "epoch": 0.03445234595367685,
3848
+ "grad_norm": 0.24369041621685028,
3849
+ "learning_rate": 5.714828110985635e-05,
3850
+ "loss": 1.1564,
3851
+ "step": 542
3852
+ },
3853
+ {
3854
+ "epoch": 0.034515911167613524,
3855
+ "grad_norm": 0.2600708603858948,
3856
+ "learning_rate": 5.6806596764264874e-05,
3857
+ "loss": 1.1852,
3858
+ "step": 543
3859
+ },
3860
+ {
3861
+ "epoch": 0.0345794763815502,
3862
+ "grad_norm": 0.25693967938423157,
3863
+ "learning_rate": 5.6465531233151126e-05,
3864
+ "loss": 1.1887,
3865
+ "step": 544
3866
+ },
3867
+ {
3868
+ "epoch": 0.03464304159548687,
3869
+ "grad_norm": 0.24953435361385345,
3870
+ "learning_rate": 5.6125089402823485e-05,
3871
+ "loss": 1.0897,
3872
+ "step": 545
3873
+ },
3874
+ {
3875
+ "epoch": 0.03470660680942354,
3876
+ "grad_norm": 0.2587718665599823,
3877
+ "learning_rate": 5.578527615065492e-05,
3878
+ "loss": 1.1345,
3879
+ "step": 546
3880
+ },
3881
+ {
3882
+ "epoch": 0.03477017202336022,
3883
+ "grad_norm": 0.2567615807056427,
3884
+ "learning_rate": 5.544609634501279e-05,
3885
+ "loss": 1.277,
3886
+ "step": 547
3887
+ },
3888
+ {
3889
+ "epoch": 0.03483373723729689,
3890
+ "grad_norm": 0.25938692688941956,
3891
+ "learning_rate": 5.510755484518955e-05,
3892
+ "loss": 1.2087,
3893
+ "step": 548
3894
+ },
3895
+ {
3896
+ "epoch": 0.03489730245123356,
3897
+ "grad_norm": 0.25982430577278137,
3898
+ "learning_rate": 5.476965650133279e-05,
3899
+ "loss": 1.2554,
3900
+ "step": 549
3901
+ },
3902
+ {
3903
+ "epoch": 0.03496086766517024,
3904
+ "grad_norm": 0.25441205501556396,
3905
+ "learning_rate": 5.443240615437586e-05,
3906
+ "loss": 1.1437,
3907
+ "step": 550
3908
+ },
3909
+ {
3910
+ "epoch": 0.035024432879106907,
3911
+ "grad_norm": 0.2553006410598755,
3912
+ "learning_rate": 5.4095808635968546e-05,
3913
+ "loss": 1.2544,
3914
+ "step": 551
3915
+ },
3916
+ {
3917
+ "epoch": 0.03508799809304358,
3918
+ "grad_norm": 0.2520417273044586,
3919
+ "learning_rate": 5.375986876840784e-05,
3920
+ "loss": 1.0661,
3921
+ "step": 552
3922
+ },
3923
+ {
3924
+ "epoch": 0.03515156330698026,
3925
+ "grad_norm": 0.2597469091415405,
3926
+ "learning_rate": 5.342459136456881e-05,
3927
+ "loss": 1.1732,
3928
+ "step": 553
3929
+ },
3930
+ {
3931
+ "epoch": 0.035215128520916926,
3932
+ "grad_norm": 0.2597144544124603,
3933
+ "learning_rate": 5.30899812278356e-05,
3934
+ "loss": 1.2128,
3935
+ "step": 554
3936
+ },
3937
+ {
3938
+ "epoch": 0.0352786937348536,
3939
+ "grad_norm": 0.2565470039844513,
3940
+ "learning_rate": 5.275604315203293e-05,
3941
+ "loss": 1.1585,
3942
+ "step": 555
3943
+ },
3944
+ {
3945
+ "epoch": 0.03534225894879028,
3946
+ "grad_norm": 0.24314232170581818,
3947
+ "learning_rate": 5.2422781921356826e-05,
3948
+ "loss": 1.0955,
3949
+ "step": 556
3950
+ },
3951
+ {
3952
+ "epoch": 0.035405824162726945,
3953
+ "grad_norm": 0.24443262815475464,
3954
+ "learning_rate": 5.209020231030672e-05,
3955
+ "loss": 1.1649,
3956
+ "step": 557
3957
+ },
3958
+ {
3959
+ "epoch": 0.03546938937666362,
3960
+ "grad_norm": 0.26083680987358093,
3961
+ "learning_rate": 5.1758309083616673e-05,
3962
+ "loss": 1.2195,
3963
+ "step": 558
3964
+ },
3965
+ {
3966
+ "epoch": 0.035532954590600296,
3967
+ "grad_norm": 0.25181084871292114,
3968
+ "learning_rate": 5.142710699618701e-05,
3969
+ "loss": 1.2411,
3970
+ "step": 559
3971
+ },
3972
+ {
3973
+ "epoch": 0.035596519804536965,
3974
+ "grad_norm": 0.24913780391216278,
3975
+ "learning_rate": 5.109660079301668e-05,
3976
+ "loss": 1.2251,
3977
+ "step": 560
3978
+ },
3979
+ {
3980
+ "epoch": 0.03566008501847364,
3981
+ "grad_norm": 0.2578226625919342,
3982
+ "learning_rate": 5.076679520913479e-05,
3983
+ "loss": 1.1685,
3984
+ "step": 561
3985
+ },
3986
+ {
3987
+ "epoch": 0.035723650232410316,
3988
+ "grad_norm": 0.25115758180618286,
3989
+ "learning_rate": 5.043769496953299e-05,
3990
+ "loss": 1.0909,
3991
+ "step": 562
3992
+ },
3993
+ {
3994
+ "epoch": 0.035787215446346984,
3995
+ "grad_norm": 0.24539780616760254,
3996
+ "learning_rate": 5.010930478909779e-05,
3997
+ "loss": 1.221,
3998
+ "step": 563
3999
+ },
4000
+ {
4001
+ "epoch": 0.03585078066028366,
4002
+ "grad_norm": 0.26085364818573,
4003
+ "learning_rate": 4.9781629372542895e-05,
4004
+ "loss": 1.2141,
4005
+ "step": 564
4006
+ },
4007
+ {
4008
+ "epoch": 0.035914345874220335,
4009
+ "grad_norm": 0.25367042422294617,
4010
+ "learning_rate": 4.945467341434195e-05,
4011
+ "loss": 1.2999,
4012
+ "step": 565
4013
+ },
4014
+ {
4015
+ "epoch": 0.035977911088157004,
4016
+ "grad_norm": 0.27679672837257385,
4017
+ "learning_rate": 4.912844159866112e-05,
4018
+ "loss": 1.2494,
4019
+ "step": 566
4020
+ },
4021
+ {
4022
+ "epoch": 0.03604147630209368,
4023
+ "grad_norm": 0.2423914223909378,
4024
+ "learning_rate": 4.880293859929227e-05,
4025
+ "loss": 1.1681,
4026
+ "step": 567
4027
+ },
4028
+ {
4029
+ "epoch": 0.036105041516030355,
4030
+ "grad_norm": 0.24873730540275574,
4031
+ "learning_rate": 4.847816907958549e-05,
4032
+ "loss": 1.2964,
4033
+ "step": 568
4034
+ },
4035
+ {
4036
+ "epoch": 0.03616860672996702,
4037
+ "grad_norm": 0.2618268132209778,
4038
+ "learning_rate": 4.8154137692382907e-05,
4039
+ "loss": 1.2184,
4040
+ "step": 569
4041
+ },
4042
+ {
4043
+ "epoch": 0.0362321719439037,
4044
+ "grad_norm": 0.2559436857700348,
4045
+ "learning_rate": 4.783084907995156e-05,
4046
+ "loss": 1.216,
4047
+ "step": 570
4048
+ },
4049
+ {
4050
+ "epoch": 0.036295737157840374,
4051
+ "grad_norm": 0.2557474970817566,
4052
+ "learning_rate": 4.750830787391708e-05,
4053
+ "loss": 1.2993,
4054
+ "step": 571
4055
+ },
4056
+ {
4057
+ "epoch": 0.03635930237177704,
4058
+ "grad_norm": 0.25075972080230713,
4059
+ "learning_rate": 4.718651869519731e-05,
4060
+ "loss": 1.1178,
4061
+ "step": 572
4062
+ },
4063
+ {
4064
+ "epoch": 0.03642286758571372,
4065
+ "grad_norm": 0.2534898817539215,
4066
+ "learning_rate": 4.686548615393613e-05,
4067
+ "loss": 1.1891,
4068
+ "step": 573
4069
+ },
4070
+ {
4071
+ "epoch": 0.036486432799650394,
4072
+ "grad_norm": 0.26913774013519287,
4073
+ "learning_rate": 4.654521484943735e-05,
4074
+ "loss": 1.2552,
4075
+ "step": 574
4076
+ },
4077
+ {
4078
+ "epoch": 0.03654999801358706,
4079
+ "grad_norm": 0.2566869556903839,
4080
+ "learning_rate": 4.622570937009879e-05,
4081
+ "loss": 1.3527,
4082
+ "step": 575
4083
+ },
4084
+ {
4085
+ "epoch": 0.03661356322752374,
4086
+ "grad_norm": 0.26296791434288025,
4087
+ "learning_rate": 4.59069742933468e-05,
4088
+ "loss": 1.2464,
4089
+ "step": 576
4090
+ },
4091
+ {
4092
+ "epoch": 0.03667712844146041,
4093
+ "grad_norm": 0.261929988861084,
4094
+ "learning_rate": 4.558901418557021e-05,
4095
+ "loss": 1.2744,
4096
+ "step": 577
4097
+ },
4098
+ {
4099
+ "epoch": 0.03674069365539708,
4100
+ "grad_norm": 0.252572238445282,
4101
+ "learning_rate": 4.527183360205541e-05,
4102
+ "loss": 1.112,
4103
+ "step": 578
4104
+ },
4105
+ {
4106
+ "epoch": 0.03680425886933376,
4107
+ "grad_norm": 0.27232858538627625,
4108
+ "learning_rate": 4.495543708692075e-05,
4109
+ "loss": 1.2743,
4110
+ "step": 579
4111
+ },
4112
+ {
4113
+ "epoch": 0.03686782408327043,
4114
+ "grad_norm": 0.2609007656574249,
4115
+ "learning_rate": 4.4639829173051554e-05,
4116
+ "loss": 1.3188,
4117
+ "step": 580
4118
+ },
4119
+ {
4120
+ "epoch": 0.0369313892972071,
4121
+ "grad_norm": 0.2632231116294861,
4122
+ "learning_rate": 4.43250143820352e-05,
4123
+ "loss": 1.1243,
4124
+ "step": 581
4125
+ },
4126
+ {
4127
+ "epoch": 0.03699495451114378,
4128
+ "grad_norm": 0.2551668882369995,
4129
+ "learning_rate": 4.401099722409631e-05,
4130
+ "loss": 1.1864,
4131
+ "step": 582
4132
+ },
4133
+ {
4134
+ "epoch": 0.03705851972508045,
4135
+ "grad_norm": 0.2566946744918823,
4136
+ "learning_rate": 4.369778219803211e-05,
4137
+ "loss": 1.2117,
4138
+ "step": 583
4139
+ },
4140
+ {
4141
+ "epoch": 0.03712208493901712,
4142
+ "grad_norm": 0.2648017704486847,
4143
+ "learning_rate": 4.338537379114801e-05,
4144
+ "loss": 1.2357,
4145
+ "step": 584
4146
+ },
4147
+ {
4148
+ "epoch": 0.037185650152953796,
4149
+ "grad_norm": 0.24553461372852325,
4150
+ "learning_rate": 4.307377647919343e-05,
4151
+ "loss": 1.1774,
4152
+ "step": 585
4153
+ },
4154
+ {
4155
+ "epoch": 0.03724921536689047,
4156
+ "grad_norm": 0.24831490218639374,
4157
+ "learning_rate": 4.2762994726297346e-05,
4158
+ "loss": 1.1336,
4159
+ "step": 586
4160
+ },
4161
+ {
4162
+ "epoch": 0.03731278058082714,
4163
+ "grad_norm": 0.2644321024417877,
4164
+ "learning_rate": 4.245303298490467e-05,
4165
+ "loss": 1.1865,
4166
+ "step": 587
4167
+ },
4168
+ {
4169
+ "epoch": 0.037376345794763816,
4170
+ "grad_norm": 0.2654249370098114,
4171
+ "learning_rate": 4.2143895695712444e-05,
4172
+ "loss": 1.1872,
4173
+ "step": 588
4174
+ },
4175
+ {
4176
+ "epoch": 0.03743991100870049,
4177
+ "grad_norm": 0.24722780287265778,
4178
+ "learning_rate": 4.183558728760586e-05,
4179
+ "loss": 1.1609,
4180
+ "step": 589
4181
+ },
4182
+ {
4183
+ "epoch": 0.03750347622263716,
4184
+ "grad_norm": 0.2560960054397583,
4185
+ "learning_rate": 4.152811217759529e-05,
4186
+ "loss": 1.1897,
4187
+ "step": 590
4188
+ },
4189
+ {
4190
+ "epoch": 0.037567041436573835,
4191
+ "grad_norm": 0.24647943675518036,
4192
+ "learning_rate": 4.12214747707527e-05,
4193
+ "loss": 1.1098,
4194
+ "step": 591
4195
+ },
4196
+ {
4197
+ "epoch": 0.03763060665051051,
4198
+ "grad_norm": 0.2763408422470093,
4199
+ "learning_rate": 4.091567946014858e-05,
4200
+ "loss": 1.2313,
4201
+ "step": 592
4202
+ },
4203
+ {
4204
+ "epoch": 0.03769417186444718,
4205
+ "grad_norm": 0.254727303981781,
4206
+ "learning_rate": 4.061073062678912e-05,
4207
+ "loss": 1.2363,
4208
+ "step": 593
4209
+ },
4210
+ {
4211
+ "epoch": 0.037757737078383855,
4212
+ "grad_norm": 0.2722640037536621,
4213
+ "learning_rate": 4.0306632639553323e-05,
4214
+ "loss": 1.1633,
4215
+ "step": 594
4216
+ },
4217
+ {
4218
+ "epoch": 0.03782130229232053,
4219
+ "grad_norm": 0.2594556212425232,
4220
+ "learning_rate": 4.000338985513046e-05,
4221
+ "loss": 1.224,
4222
+ "step": 595
4223
+ },
4224
+ {
4225
+ "epoch": 0.0378848675062572,
4226
+ "grad_norm": 0.25068074464797974,
4227
+ "learning_rate": 3.970100661795766e-05,
4228
+ "loss": 1.1809,
4229
+ "step": 596
4230
+ },
4231
+ {
4232
+ "epoch": 0.037948432720193874,
4233
+ "grad_norm": 0.2550009787082672,
4234
+ "learning_rate": 3.9399487260157766e-05,
4235
+ "loss": 1.2022,
4236
+ "step": 597
4237
+ },
4238
+ {
4239
+ "epoch": 0.03801199793413055,
4240
+ "grad_norm": 0.2593897879123688,
4241
+ "learning_rate": 3.909883610147696e-05,
4242
+ "loss": 1.3124,
4243
+ "step": 598
4244
+ },
4245
+ {
4246
+ "epoch": 0.03807556314806722,
4247
+ "grad_norm": 0.2558402717113495,
4248
+ "learning_rate": 3.879905744922329e-05,
4249
+ "loss": 1.1618,
4250
+ "step": 599
4251
+ },
4252
+ {
4253
+ "epoch": 0.03813912836200389,
4254
+ "grad_norm": 0.25934746861457825,
4255
+ "learning_rate": 3.8500155598204644e-05,
4256
+ "loss": 1.0083,
4257
+ "step": 600
4258
+ },
4259
+ {
4260
+ "epoch": 0.03813912836200389,
4261
+ "eval_loss": 1.2089511156082153,
4262
+ "eval_runtime": 1239.249,
4263
+ "eval_samples_per_second": 4.035,
4264
+ "eval_steps_per_second": 1.009,
4265
+ "step": 600
4266
  }
4267
  ],
4268
  "logging_steps": 1,
 
4291
  "attributes": {}
4292
  }
4293
  },
4294
+ "total_flos": 3.117735147995136e+18,
4295
  "train_batch_size": 4,
4296
  "trial_name": null,
4297
  "trial_params": null