{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.995695839311334,
"eval_steps": 500,
"global_step": 696,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00430416068866571,
"grad_norm": 5.988248348236084,
"learning_rate": 1.4285714285714287e-07,
"loss": 0.8064,
"step": 1
},
{
"epoch": 0.00860832137733142,
"grad_norm": 5.868161678314209,
"learning_rate": 2.8571428571428575e-07,
"loss": 0.7963,
"step": 2
},
{
"epoch": 0.01291248206599713,
"grad_norm": 6.096586227416992,
"learning_rate": 4.285714285714286e-07,
"loss": 0.8413,
"step": 3
},
{
"epoch": 0.01721664275466284,
"grad_norm": 6.065792083740234,
"learning_rate": 5.714285714285715e-07,
"loss": 0.81,
"step": 4
},
{
"epoch": 0.021520803443328552,
"grad_norm": 5.93134069442749,
"learning_rate": 7.142857142857143e-07,
"loss": 0.8204,
"step": 5
},
{
"epoch": 0.02582496413199426,
"grad_norm": 5.619973182678223,
"learning_rate": 8.571428571428572e-07,
"loss": 0.7774,
"step": 6
},
{
"epoch": 0.03012912482065997,
"grad_norm": 5.30891227722168,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.7689,
"step": 7
},
{
"epoch": 0.03443328550932568,
"grad_norm": 5.255387306213379,
"learning_rate": 1.142857142857143e-06,
"loss": 0.7698,
"step": 8
},
{
"epoch": 0.03873744619799139,
"grad_norm": 4.594310283660889,
"learning_rate": 1.2857142857142856e-06,
"loss": 0.7616,
"step": 9
},
{
"epoch": 0.043041606886657105,
"grad_norm": 4.302555084228516,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.7349,
"step": 10
},
{
"epoch": 0.047345767575322814,
"grad_norm": 4.202669143676758,
"learning_rate": 1.5714285714285714e-06,
"loss": 0.7766,
"step": 11
},
{
"epoch": 0.05164992826398852,
"grad_norm": 2.575376033782959,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.7242,
"step": 12
},
{
"epoch": 0.05595408895265423,
"grad_norm": 2.4199719429016113,
"learning_rate": 1.8571428571428573e-06,
"loss": 0.7208,
"step": 13
},
{
"epoch": 0.06025824964131994,
"grad_norm": 2.2063937187194824,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.6931,
"step": 14
},
{
"epoch": 0.06456241032998565,
"grad_norm": 2.0030524730682373,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.6554,
"step": 15
},
{
"epoch": 0.06886657101865136,
"grad_norm": 1.7058610916137695,
"learning_rate": 2.285714285714286e-06,
"loss": 0.69,
"step": 16
},
{
"epoch": 0.07317073170731707,
"grad_norm": 2.8892016410827637,
"learning_rate": 2.428571428571429e-06,
"loss": 0.6773,
"step": 17
},
{
"epoch": 0.07747489239598278,
"grad_norm": 3.1138792037963867,
"learning_rate": 2.571428571428571e-06,
"loss": 0.6556,
"step": 18
},
{
"epoch": 0.08177905308464849,
"grad_norm": 3.2966556549072266,
"learning_rate": 2.7142857142857144e-06,
"loss": 0.6714,
"step": 19
},
{
"epoch": 0.08608321377331421,
"grad_norm": 3.1827149391174316,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.6798,
"step": 20
},
{
"epoch": 0.09038737446197992,
"grad_norm": 2.7425405979156494,
"learning_rate": 3e-06,
"loss": 0.6626,
"step": 21
},
{
"epoch": 0.09469153515064563,
"grad_norm": 2.32190203666687,
"learning_rate": 3.142857142857143e-06,
"loss": 0.6339,
"step": 22
},
{
"epoch": 0.09899569583931134,
"grad_norm": 1.6165233850479126,
"learning_rate": 3.285714285714286e-06,
"loss": 0.6199,
"step": 23
},
{
"epoch": 0.10329985652797705,
"grad_norm": 1.4535725116729736,
"learning_rate": 3.428571428571429e-06,
"loss": 0.6393,
"step": 24
},
{
"epoch": 0.10760401721664276,
"grad_norm": 1.3143177032470703,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.6285,
"step": 25
},
{
"epoch": 0.11190817790530846,
"grad_norm": 1.0768671035766602,
"learning_rate": 3.7142857142857146e-06,
"loss": 0.6097,
"step": 26
},
{
"epoch": 0.11621233859397417,
"grad_norm": 1.0377610921859741,
"learning_rate": 3.857142857142858e-06,
"loss": 0.6034,
"step": 27
},
{
"epoch": 0.12051649928263988,
"grad_norm": 1.0033104419708252,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5705,
"step": 28
},
{
"epoch": 0.12482065997130559,
"grad_norm": 1.0018284320831299,
"learning_rate": 4.1428571428571435e-06,
"loss": 0.5675,
"step": 29
},
{
"epoch": 0.1291248206599713,
"grad_norm": 0.8010370135307312,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.5903,
"step": 30
},
{
"epoch": 0.133428981348637,
"grad_norm": 0.6782485246658325,
"learning_rate": 4.428571428571429e-06,
"loss": 0.5644,
"step": 31
},
{
"epoch": 0.13773314203730272,
"grad_norm": 0.6411944627761841,
"learning_rate": 4.571428571428572e-06,
"loss": 0.5627,
"step": 32
},
{
"epoch": 0.14203730272596843,
"grad_norm": 0.727299153804779,
"learning_rate": 4.714285714285715e-06,
"loss": 0.5247,
"step": 33
},
{
"epoch": 0.14634146341463414,
"grad_norm": 0.8326959609985352,
"learning_rate": 4.857142857142858e-06,
"loss": 0.571,
"step": 34
},
{
"epoch": 0.15064562410329985,
"grad_norm": 0.7021209597587585,
"learning_rate": 5e-06,
"loss": 0.5347,
"step": 35
},
{
"epoch": 0.15494978479196556,
"grad_norm": 0.5911878943443298,
"learning_rate": 5.142857142857142e-06,
"loss": 0.5576,
"step": 36
},
{
"epoch": 0.15925394548063126,
"grad_norm": 0.5217288136482239,
"learning_rate": 5.285714285714286e-06,
"loss": 0.52,
"step": 37
},
{
"epoch": 0.16355810616929697,
"grad_norm": 0.6184452772140503,
"learning_rate": 5.428571428571429e-06,
"loss": 0.5418,
"step": 38
},
{
"epoch": 0.1678622668579627,
"grad_norm": 0.6969144344329834,
"learning_rate": 5.571428571428572e-06,
"loss": 0.5394,
"step": 39
},
{
"epoch": 0.17216642754662842,
"grad_norm": 0.5691121816635132,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.532,
"step": 40
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.497399240732193,
"learning_rate": 5.857142857142858e-06,
"loss": 0.5591,
"step": 41
},
{
"epoch": 0.18077474892395984,
"grad_norm": 0.5312875509262085,
"learning_rate": 6e-06,
"loss": 0.5396,
"step": 42
},
{
"epoch": 0.18507890961262555,
"grad_norm": 0.5770351886749268,
"learning_rate": 6.142857142857144e-06,
"loss": 0.511,
"step": 43
},
{
"epoch": 0.18938307030129126,
"grad_norm": 0.5931875109672546,
"learning_rate": 6.285714285714286e-06,
"loss": 0.5501,
"step": 44
},
{
"epoch": 0.19368723098995697,
"grad_norm": 0.45138663053512573,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.5348,
"step": 45
},
{
"epoch": 0.19799139167862267,
"grad_norm": 0.5075214505195618,
"learning_rate": 6.571428571428572e-06,
"loss": 0.5244,
"step": 46
},
{
"epoch": 0.20229555236728838,
"grad_norm": 0.5202081203460693,
"learning_rate": 6.714285714285714e-06,
"loss": 0.5368,
"step": 47
},
{
"epoch": 0.2065997130559541,
"grad_norm": 0.4660142660140991,
"learning_rate": 6.857142857142858e-06,
"loss": 0.5289,
"step": 48
},
{
"epoch": 0.2109038737446198,
"grad_norm": 0.4269562065601349,
"learning_rate": 7e-06,
"loss": 0.5287,
"step": 49
},
{
"epoch": 0.2152080344332855,
"grad_norm": 0.4298643469810486,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.5133,
"step": 50
},
{
"epoch": 0.21951219512195122,
"grad_norm": 0.4547773599624634,
"learning_rate": 7.285714285714286e-06,
"loss": 0.5201,
"step": 51
},
{
"epoch": 0.22381635581061693,
"grad_norm": 0.48563310503959656,
"learning_rate": 7.428571428571429e-06,
"loss": 0.5327,
"step": 52
},
{
"epoch": 0.22812051649928264,
"grad_norm": 0.4938199818134308,
"learning_rate": 7.571428571428572e-06,
"loss": 0.5184,
"step": 53
},
{
"epoch": 0.23242467718794835,
"grad_norm": 0.45804429054260254,
"learning_rate": 7.714285714285716e-06,
"loss": 0.5168,
"step": 54
},
{
"epoch": 0.23672883787661406,
"grad_norm": 0.4584444761276245,
"learning_rate": 7.857142857142858e-06,
"loss": 0.5104,
"step": 55
},
{
"epoch": 0.24103299856527977,
"grad_norm": 0.4324899911880493,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4843,
"step": 56
},
{
"epoch": 0.24533715925394547,
"grad_norm": 0.513468861579895,
"learning_rate": 8.142857142857143e-06,
"loss": 0.5426,
"step": 57
},
{
"epoch": 0.24964131994261118,
"grad_norm": 0.4595116972923279,
"learning_rate": 8.285714285714287e-06,
"loss": 0.5056,
"step": 58
},
{
"epoch": 0.2539454806312769,
"grad_norm": 0.4523639380931854,
"learning_rate": 8.428571428571429e-06,
"loss": 0.4841,
"step": 59
},
{
"epoch": 0.2582496413199426,
"grad_norm": 0.4841617941856384,
"learning_rate": 8.571428571428571e-06,
"loss": 0.5114,
"step": 60
},
{
"epoch": 0.26255380200860834,
"grad_norm": 0.47025153040885925,
"learning_rate": 8.714285714285715e-06,
"loss": 0.4994,
"step": 61
},
{
"epoch": 0.266857962697274,
"grad_norm": 0.4940146207809448,
"learning_rate": 8.857142857142858e-06,
"loss": 0.5052,
"step": 62
},
{
"epoch": 0.27116212338593976,
"grad_norm": 0.47717440128326416,
"learning_rate": 9e-06,
"loss": 0.4856,
"step": 63
},
{
"epoch": 0.27546628407460544,
"grad_norm": 0.44342952966690063,
"learning_rate": 9.142857142857144e-06,
"loss": 0.4889,
"step": 64
},
{
"epoch": 0.2797704447632712,
"grad_norm": 0.47397580742836,
"learning_rate": 9.285714285714288e-06,
"loss": 0.4723,
"step": 65
},
{
"epoch": 0.28407460545193686,
"grad_norm": 0.46283072233200073,
"learning_rate": 9.42857142857143e-06,
"loss": 0.5035,
"step": 66
},
{
"epoch": 0.2883787661406026,
"grad_norm": 0.4001065194606781,
"learning_rate": 9.571428571428573e-06,
"loss": 0.4608,
"step": 67
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.5248638987541199,
"learning_rate": 9.714285714285715e-06,
"loss": 0.5025,
"step": 68
},
{
"epoch": 0.296987087517934,
"grad_norm": 0.41630035638809204,
"learning_rate": 9.857142857142859e-06,
"loss": 0.4712,
"step": 69
},
{
"epoch": 0.3012912482065997,
"grad_norm": 0.4507600665092468,
"learning_rate": 1e-05,
"loss": 0.5068,
"step": 70
},
{
"epoch": 0.30559540889526543,
"grad_norm": 0.537525475025177,
"learning_rate": 9.999937036309402e-06,
"loss": 0.4913,
"step": 71
},
{
"epoch": 0.3098995695839311,
"grad_norm": 0.4286635220050812,
"learning_rate": 9.999748146823376e-06,
"loss": 0.488,
"step": 72
},
{
"epoch": 0.31420373027259685,
"grad_norm": 0.4658080041408539,
"learning_rate": 9.999433336299195e-06,
"loss": 0.4808,
"step": 73
},
{
"epoch": 0.31850789096126253,
"grad_norm": 0.47707945108413696,
"learning_rate": 9.99899261266551e-06,
"loss": 0.5032,
"step": 74
},
{
"epoch": 0.32281205164992827,
"grad_norm": 0.5236383080482483,
"learning_rate": 9.99842598702216e-06,
"loss": 0.4748,
"step": 75
},
{
"epoch": 0.32711621233859395,
"grad_norm": 0.43928998708724976,
"learning_rate": 9.997733473639876e-06,
"loss": 0.4755,
"step": 76
},
{
"epoch": 0.3314203730272597,
"grad_norm": 0.4532296657562256,
"learning_rate": 9.996915089959942e-06,
"loss": 0.4868,
"step": 77
},
{
"epoch": 0.3357245337159254,
"grad_norm": 0.4567766785621643,
"learning_rate": 9.995970856593739e-06,
"loss": 0.4919,
"step": 78
},
{
"epoch": 0.3400286944045911,
"grad_norm": 0.48937737941741943,
"learning_rate": 9.994900797322233e-06,
"loss": 0.495,
"step": 79
},
{
"epoch": 0.34433285509325684,
"grad_norm": 0.48139920830726624,
"learning_rate": 9.993704939095376e-06,
"loss": 0.4902,
"step": 80
},
{
"epoch": 0.3486370157819225,
"grad_norm": 0.46625345945358276,
"learning_rate": 9.99238331203143e-06,
"loss": 0.475,
"step": 81
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.455159991979599,
"learning_rate": 9.9909359494162e-06,
"loss": 0.4568,
"step": 82
},
{
"epoch": 0.35724533715925394,
"grad_norm": 0.482522189617157,
"learning_rate": 9.989362887702203e-06,
"loss": 0.5028,
"step": 83
},
{
"epoch": 0.3615494978479197,
"grad_norm": 0.5009233355522156,
"learning_rate": 9.987664166507749e-06,
"loss": 0.4727,
"step": 84
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.4649881422519684,
"learning_rate": 9.985839828615937e-06,
"loss": 0.4589,
"step": 85
},
{
"epoch": 0.3701578192252511,
"grad_norm": 0.49685046076774597,
"learning_rate": 9.983889919973586e-06,
"loss": 0.4782,
"step": 86
},
{
"epoch": 0.3744619799139168,
"grad_norm": 0.4575681984424591,
"learning_rate": 9.981814489690077e-06,
"loss": 0.4526,
"step": 87
},
{
"epoch": 0.3787661406025825,
"grad_norm": 0.4114251434803009,
"learning_rate": 9.979613590036108e-06,
"loss": 0.4472,
"step": 88
},
{
"epoch": 0.3830703012912482,
"grad_norm": 0.5058827996253967,
"learning_rate": 9.977287276442385e-06,
"loss": 0.4867,
"step": 89
},
{
"epoch": 0.38737446197991393,
"grad_norm": 0.4725145697593689,
"learning_rate": 9.974835607498224e-06,
"loss": 0.4679,
"step": 90
},
{
"epoch": 0.3916786226685796,
"grad_norm": 0.48240530490875244,
"learning_rate": 9.972258644950074e-06,
"loss": 0.4587,
"step": 91
},
{
"epoch": 0.39598278335724535,
"grad_norm": 0.515557050704956,
"learning_rate": 9.969556453699966e-06,
"loss": 0.4654,
"step": 92
},
{
"epoch": 0.40028694404591103,
"grad_norm": 0.4809204638004303,
"learning_rate": 9.966729101803872e-06,
"loss": 0.4667,
"step": 93
},
{
"epoch": 0.40459110473457677,
"grad_norm": 0.5204639434814453,
"learning_rate": 9.963776660469996e-06,
"loss": 0.4775,
"step": 94
},
{
"epoch": 0.40889526542324245,
"grad_norm": 0.47317999601364136,
"learning_rate": 9.960699204056978e-06,
"loss": 0.4365,
"step": 95
},
{
"epoch": 0.4131994261119082,
"grad_norm": 0.4386410117149353,
"learning_rate": 9.957496810072027e-06,
"loss": 0.4728,
"step": 96
},
{
"epoch": 0.41750358680057387,
"grad_norm": 0.5309630632400513,
"learning_rate": 9.954169559168958e-06,
"loss": 0.4643,
"step": 97
},
{
"epoch": 0.4218077474892396,
"grad_norm": 0.4550637900829315,
"learning_rate": 9.95071753514617e-06,
"loss": 0.4462,
"step": 98
},
{
"epoch": 0.4261119081779053,
"grad_norm": 0.46194523572921753,
"learning_rate": 9.947140824944533e-06,
"loss": 0.4656,
"step": 99
},
{
"epoch": 0.430416068866571,
"grad_norm": 0.4840027689933777,
"learning_rate": 9.943439518645193e-06,
"loss": 0.4684,
"step": 100
},
{
"epoch": 0.4347202295552367,
"grad_norm": 0.4388582706451416,
"learning_rate": 9.939613709467317e-06,
"loss": 0.4806,
"step": 101
},
{
"epoch": 0.43902439024390244,
"grad_norm": 0.44998958706855774,
"learning_rate": 9.935663493765726e-06,
"loss": 0.4464,
"step": 102
},
{
"epoch": 0.4433285509325681,
"grad_norm": 0.41248953342437744,
"learning_rate": 9.93158897102849e-06,
"loss": 0.4678,
"step": 103
},
{
"epoch": 0.44763271162123386,
"grad_norm": 0.5117968916893005,
"learning_rate": 9.9273902438744e-06,
"loss": 0.4844,
"step": 104
},
{
"epoch": 0.4519368723098996,
"grad_norm": 0.42773979902267456,
"learning_rate": 9.923067418050399e-06,
"loss": 0.4786,
"step": 105
},
{
"epoch": 0.4562410329985653,
"grad_norm": 0.4874267876148224,
"learning_rate": 9.918620602428916e-06,
"loss": 0.4718,
"step": 106
},
{
"epoch": 0.460545193687231,
"grad_norm": 0.48003584146499634,
"learning_rate": 9.91404990900512e-06,
"loss": 0.4654,
"step": 107
},
{
"epoch": 0.4648493543758967,
"grad_norm": 0.48111721873283386,
"learning_rate": 9.909355452894098e-06,
"loss": 0.4657,
"step": 108
},
{
"epoch": 0.46915351506456243,
"grad_norm": 0.5006975531578064,
"learning_rate": 9.904537352327968e-06,
"loss": 0.435,
"step": 109
},
{
"epoch": 0.4734576757532281,
"grad_norm": 0.4520009756088257,
"learning_rate": 9.899595728652883e-06,
"loss": 0.4405,
"step": 110
},
{
"epoch": 0.47776183644189385,
"grad_norm": 0.43760401010513306,
"learning_rate": 9.894530706325994e-06,
"loss": 0.4649,
"step": 111
},
{
"epoch": 0.48206599713055953,
"grad_norm": 0.4725401997566223,
"learning_rate": 9.889342412912296e-06,
"loss": 0.4803,
"step": 112
},
{
"epoch": 0.48637015781922527,
"grad_norm": 0.4176185429096222,
"learning_rate": 9.88403097908143e-06,
"loss": 0.4395,
"step": 113
},
{
"epoch": 0.49067431850789095,
"grad_norm": 0.4956033229827881,
"learning_rate": 9.878596538604388e-06,
"loss": 0.4865,
"step": 114
},
{
"epoch": 0.4949784791965567,
"grad_norm": 0.4664049744606018,
"learning_rate": 9.87303922835014e-06,
"loss": 0.481,
"step": 115
},
{
"epoch": 0.49928263988522237,
"grad_norm": 0.4784802496433258,
"learning_rate": 9.867359188282193e-06,
"loss": 0.4477,
"step": 116
},
{
"epoch": 0.503586800573888,
"grad_norm": 0.4246116280555725,
"learning_rate": 9.861556561455061e-06,
"loss": 0.4474,
"step": 117
},
{
"epoch": 0.5078909612625538,
"grad_norm": 0.4217904508113861,
"learning_rate": 9.855631494010661e-06,
"loss": 0.4519,
"step": 118
},
{
"epoch": 0.5121951219512195,
"grad_norm": 0.45506662130355835,
"learning_rate": 9.849584135174642e-06,
"loss": 0.4738,
"step": 119
},
{
"epoch": 0.5164992826398852,
"grad_norm": 0.5048141479492188,
"learning_rate": 9.843414637252615e-06,
"loss": 0.4667,
"step": 120
},
{
"epoch": 0.5208034433285509,
"grad_norm": 0.4061611294746399,
"learning_rate": 9.837123155626323e-06,
"loss": 0.4627,
"step": 121
},
{
"epoch": 0.5251076040172167,
"grad_norm": 0.5009416937828064,
"learning_rate": 9.830709848749727e-06,
"loss": 0.4492,
"step": 122
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.3940972685813904,
"learning_rate": 9.824174878145017e-06,
"loss": 0.4575,
"step": 123
},
{
"epoch": 0.533715925394548,
"grad_norm": 0.4693102240562439,
"learning_rate": 9.817518408398536e-06,
"loss": 0.476,
"step": 124
},
{
"epoch": 0.5380200860832137,
"grad_norm": 0.4415923058986664,
"learning_rate": 9.810740607156647e-06,
"loss": 0.4524,
"step": 125
},
{
"epoch": 0.5423242467718795,
"grad_norm": 0.4520181119441986,
"learning_rate": 9.803841645121505e-06,
"loss": 0.4929,
"step": 126
},
{
"epoch": 0.5466284074605452,
"grad_norm": 0.4665045142173767,
"learning_rate": 9.796821696046748e-06,
"loss": 0.4666,
"step": 127
},
{
"epoch": 0.5509325681492109,
"grad_norm": 0.48491621017456055,
"learning_rate": 9.78968093673314e-06,
"loss": 0.4667,
"step": 128
},
{
"epoch": 0.5552367288378766,
"grad_norm": 0.6082088351249695,
"learning_rate": 9.782419547024108e-06,
"loss": 0.4698,
"step": 129
},
{
"epoch": 0.5595408895265424,
"grad_norm": 0.4258078932762146,
"learning_rate": 9.775037709801206e-06,
"loss": 0.4683,
"step": 130
},
{
"epoch": 0.563845050215208,
"grad_norm": 0.45697230100631714,
"learning_rate": 9.76753561097952e-06,
"loss": 0.4294,
"step": 131
},
{
"epoch": 0.5681492109038737,
"grad_norm": 0.4770420789718628,
"learning_rate": 9.759913439502982e-06,
"loss": 0.4744,
"step": 132
},
{
"epoch": 0.5724533715925395,
"grad_norm": 0.4091593623161316,
"learning_rate": 9.752171387339612e-06,
"loss": 0.4905,
"step": 133
},
{
"epoch": 0.5767575322812052,
"grad_norm": 0.513836145401001,
"learning_rate": 9.74430964947668e-06,
"loss": 0.463,
"step": 134
},
{
"epoch": 0.5810616929698709,
"grad_norm": 0.42678093910217285,
"learning_rate": 9.736328423915797e-06,
"loss": 0.4579,
"step": 135
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.4890856146812439,
"learning_rate": 9.728227911667934e-06,
"loss": 0.4589,
"step": 136
},
{
"epoch": 0.5896700143472023,
"grad_norm": 0.4562450349330902,
"learning_rate": 9.720008316748344e-06,
"loss": 0.4649,
"step": 137
},
{
"epoch": 0.593974175035868,
"grad_norm": 0.42691197991371155,
"learning_rate": 9.711669846171443e-06,
"loss": 0.4434,
"step": 138
},
{
"epoch": 0.5982783357245337,
"grad_norm": 0.4402090609073639,
"learning_rate": 9.703212709945583e-06,
"loss": 0.4557,
"step": 139
},
{
"epoch": 0.6025824964131994,
"grad_norm": 0.47748616337776184,
"learning_rate": 9.694637121067764e-06,
"loss": 0.4711,
"step": 140
},
{
"epoch": 0.6068866571018652,
"grad_norm": 0.45207276940345764,
"learning_rate": 9.685943295518283e-06,
"loss": 0.4519,
"step": 141
},
{
"epoch": 0.6111908177905309,
"grad_norm": 0.4485815167427063,
"learning_rate": 9.677131452255272e-06,
"loss": 0.461,
"step": 142
},
{
"epoch": 0.6154949784791965,
"grad_norm": 0.47839170694351196,
"learning_rate": 9.668201813209202e-06,
"loss": 0.4747,
"step": 143
},
{
"epoch": 0.6197991391678622,
"grad_norm": 0.45290523767471313,
"learning_rate": 9.659154603277283e-06,
"loss": 0.4605,
"step": 144
},
{
"epoch": 0.624103299856528,
"grad_norm": 0.41288816928863525,
"learning_rate": 9.649990050317806e-06,
"loss": 0.4748,
"step": 145
},
{
"epoch": 0.6284074605451937,
"grad_norm": 0.3928135335445404,
"learning_rate": 9.640708385144403e-06,
"loss": 0.4435,
"step": 146
},
{
"epoch": 0.6327116212338594,
"grad_norm": 0.46720483899116516,
"learning_rate": 9.631309841520233e-06,
"loss": 0.452,
"step": 147
},
{
"epoch": 0.6370157819225251,
"grad_norm": 0.42169883847236633,
"learning_rate": 9.62179465615209e-06,
"loss": 0.4544,
"step": 148
},
{
"epoch": 0.6413199426111909,
"grad_norm": 0.5521188974380493,
"learning_rate": 9.612163068684453e-06,
"loss": 0.4507,
"step": 149
},
{
"epoch": 0.6456241032998565,
"grad_norm": 0.46388116478919983,
"learning_rate": 9.602415321693434e-06,
"loss": 0.465,
"step": 150
},
{
"epoch": 0.6499282639885222,
"grad_norm": 0.4932452440261841,
"learning_rate": 9.592551660680687e-06,
"loss": 0.4592,
"step": 151
},
{
"epoch": 0.6542324246771879,
"grad_norm": 0.42455729842185974,
"learning_rate": 9.582572334067213e-06,
"loss": 0.4617,
"step": 152
},
{
"epoch": 0.6585365853658537,
"grad_norm": 0.4208996593952179,
"learning_rate": 9.572477593187101e-06,
"loss": 0.4629,
"step": 153
},
{
"epoch": 0.6628407460545194,
"grad_norm": 0.4985635578632355,
"learning_rate": 9.562267692281212e-06,
"loss": 0.4744,
"step": 154
},
{
"epoch": 0.667144906743185,
"grad_norm": 0.402473121881485,
"learning_rate": 9.551942888490759e-06,
"loss": 0.4369,
"step": 155
},
{
"epoch": 0.6714490674318508,
"grad_norm": 0.4076909124851227,
"learning_rate": 9.541503441850844e-06,
"loss": 0.4687,
"step": 156
},
{
"epoch": 0.6757532281205165,
"grad_norm": 0.5572343468666077,
"learning_rate": 9.530949615283902e-06,
"loss": 0.4809,
"step": 157
},
{
"epoch": 0.6800573888091822,
"grad_norm": 0.416309654712677,
"learning_rate": 9.520281674593084e-06,
"loss": 0.4406,
"step": 158
},
{
"epoch": 0.6843615494978479,
"grad_norm": 0.41716885566711426,
"learning_rate": 9.509499888455554e-06,
"loss": 0.4687,
"step": 159
},
{
"epoch": 0.6886657101865137,
"grad_norm": 0.4318557679653168,
"learning_rate": 9.498604528415731e-06,
"loss": 0.4547,
"step": 160
},
{
"epoch": 0.6929698708751794,
"grad_norm": 0.46745797991752625,
"learning_rate": 9.487595868878447e-06,
"loss": 0.4477,
"step": 161
},
{
"epoch": 0.697274031563845,
"grad_norm": 0.4273889362812042,
"learning_rate": 9.476474187102033e-06,
"loss": 0.4687,
"step": 162
},
{
"epoch": 0.7015781922525107,
"grad_norm": 0.40328237414360046,
"learning_rate": 9.465239763191345e-06,
"loss": 0.4547,
"step": 163
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.40811076760292053,
"learning_rate": 9.453892880090696e-06,
"loss": 0.4593,
"step": 164
},
{
"epoch": 0.7101865136298422,
"grad_norm": 0.4333866238594055,
"learning_rate": 9.442433823576741e-06,
"loss": 0.447,
"step": 165
},
{
"epoch": 0.7144906743185079,
"grad_norm": 0.4304181933403015,
"learning_rate": 9.430862882251279e-06,
"loss": 0.4388,
"step": 166
},
{
"epoch": 0.7187948350071736,
"grad_norm": 0.4919649660587311,
"learning_rate": 9.419180347533976e-06,
"loss": 0.4457,
"step": 167
},
{
"epoch": 0.7230989956958394,
"grad_norm": 0.3899565041065216,
"learning_rate": 9.40738651365503e-06,
"loss": 0.4673,
"step": 168
},
{
"epoch": 0.727403156384505,
"grad_norm": 0.44213035702705383,
"learning_rate": 9.395481677647767e-06,
"loss": 0.4514,
"step": 169
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.4894409775733948,
"learning_rate": 9.38346613934115e-06,
"loss": 0.4603,
"step": 170
},
{
"epoch": 0.7360114777618364,
"grad_norm": 0.38324883580207825,
"learning_rate": 9.371340201352234e-06,
"loss": 0.4506,
"step": 171
},
{
"epoch": 0.7403156384505022,
"grad_norm": 0.5055360198020935,
"learning_rate": 9.359104169078541e-06,
"loss": 0.4528,
"step": 172
},
{
"epoch": 0.7446197991391679,
"grad_norm": 0.45433783531188965,
"learning_rate": 9.346758350690373e-06,
"loss": 0.4572,
"step": 173
},
{
"epoch": 0.7489239598278336,
"grad_norm": 0.45139914751052856,
"learning_rate": 9.334303057123044e-06,
"loss": 0.4399,
"step": 174
},
{
"epoch": 0.7532281205164992,
"grad_norm": 0.44593778252601624,
"learning_rate": 9.321738602069057e-06,
"loss": 0.475,
"step": 175
},
{
"epoch": 0.757532281205165,
"grad_norm": 0.5288325548171997,
"learning_rate": 9.309065301970193e-06,
"loss": 0.4577,
"step": 176
},
{
"epoch": 0.7618364418938307,
"grad_norm": 0.45763787627220154,
"learning_rate": 9.296283476009551e-06,
"loss": 0.4614,
"step": 177
},
{
"epoch": 0.7661406025824964,
"grad_norm": 0.4407108724117279,
"learning_rate": 9.283393446103506e-06,
"loss": 0.4518,
"step": 178
},
{
"epoch": 0.7704447632711621,
"grad_norm": 0.5397217869758606,
"learning_rate": 9.270395536893599e-06,
"loss": 0.4698,
"step": 179
},
{
"epoch": 0.7747489239598279,
"grad_norm": 0.4697708189487457,
"learning_rate": 9.257290075738365e-06,
"loss": 0.4505,
"step": 180
},
{
"epoch": 0.7790530846484935,
"grad_norm": 0.4129229187965393,
"learning_rate": 9.244077392705085e-06,
"loss": 0.4336,
"step": 181
},
{
"epoch": 0.7833572453371592,
"grad_norm": 0.45845144987106323,
"learning_rate": 9.23075782056147e-06,
"loss": 0.4492,
"step": 182
},
{
"epoch": 0.787661406025825,
"grad_norm": 0.4498468339443207,
"learning_rate": 9.217331694767291e-06,
"loss": 0.45,
"step": 183
},
{
"epoch": 0.7919655667144907,
"grad_norm": 0.5015835165977478,
"learning_rate": 9.20379935346592e-06,
"loss": 0.4644,
"step": 184
},
{
"epoch": 0.7962697274031564,
"grad_norm": 0.39411643147468567,
"learning_rate": 9.190161137475814e-06,
"loss": 0.4346,
"step": 185
},
{
"epoch": 0.8005738880918221,
"grad_norm": 0.44030869007110596,
"learning_rate": 9.176417390281944e-06,
"loss": 0.4266,
"step": 186
},
{
"epoch": 0.8048780487804879,
"grad_norm": 0.4082520604133606,
"learning_rate": 9.162568458027122e-06,
"loss": 0.4401,
"step": 187
},
{
"epoch": 0.8091822094691535,
"grad_norm": 0.44995787739753723,
"learning_rate": 9.148614689503307e-06,
"loss": 0.4394,
"step": 188
},
{
"epoch": 0.8134863701578192,
"grad_norm": 0.4435669183731079,
"learning_rate": 9.134556436142801e-06,
"loss": 0.4533,
"step": 189
},
{
"epoch": 0.8177905308464849,
"grad_norm": 0.4199334383010864,
"learning_rate": 9.120394052009412e-06,
"loss": 0.4515,
"step": 190
},
{
"epoch": 0.8220946915351507,
"grad_norm": 0.46623045206069946,
"learning_rate": 9.10612789378953e-06,
"loss": 0.4532,
"step": 191
},
{
"epoch": 0.8263988522238164,
"grad_norm": 0.44973841309547424,
"learning_rate": 9.091758320783139e-06,
"loss": 0.4528,
"step": 192
},
{
"epoch": 0.830703012912482,
"grad_norm": 0.433490514755249,
"learning_rate": 9.077285694894786e-06,
"loss": 0.4468,
"step": 193
},
{
"epoch": 0.8350071736011477,
"grad_norm": 0.5686512589454651,
"learning_rate": 9.062710380624439e-06,
"loss": 0.4685,
"step": 194
},
{
"epoch": 0.8393113342898135,
"grad_norm": 0.43695539236068726,
"learning_rate": 9.048032745058335e-06,
"loss": 0.4653,
"step": 195
},
{
"epoch": 0.8436154949784792,
"grad_norm": 0.45285752415657043,
"learning_rate": 9.033253157859715e-06,
"loss": 0.4362,
"step": 196
},
{
"epoch": 0.8479196556671449,
"grad_norm": 0.4667205214500427,
"learning_rate": 9.018371991259516e-06,
"loss": 0.4605,
"step": 197
},
{
"epoch": 0.8522238163558106,
"grad_norm": 0.39525923132896423,
"learning_rate": 9.003389620047012e-06,
"loss": 0.4515,
"step": 198
},
{
"epoch": 0.8565279770444764,
"grad_norm": 0.49101290106773376,
"learning_rate": 8.988306421560354e-06,
"loss": 0.4777,
"step": 199
},
{
"epoch": 0.860832137733142,
"grad_norm": 0.44344207644462585,
"learning_rate": 8.973122775677078e-06,
"loss": 0.449,
"step": 200
},
{
"epoch": 0.8651362984218077,
"grad_norm": 0.4382517337799072,
"learning_rate": 8.957839064804542e-06,
"loss": 0.4584,
"step": 201
},
{
"epoch": 0.8694404591104734,
"grad_norm": 0.4284365177154541,
"learning_rate": 8.942455673870278e-06,
"loss": 0.441,
"step": 202
},
{
"epoch": 0.8737446197991392,
"grad_norm": 0.3757075071334839,
"learning_rate": 8.926972990312314e-06,
"loss": 0.4408,
"step": 203
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.484001100063324,
"learning_rate": 8.91139140406941e-06,
"loss": 0.4446,
"step": 204
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.46470439434051514,
"learning_rate": 8.895711307571235e-06,
"loss": 0.4541,
"step": 205
},
{
"epoch": 0.8866571018651362,
"grad_norm": 0.39970001578330994,
"learning_rate": 8.879933095728485e-06,
"loss": 0.4382,
"step": 206
},
{
"epoch": 0.890961262553802,
"grad_norm": 0.5493953824043274,
"learning_rate": 8.864057165922944e-06,
"loss": 0.4737,
"step": 207
},
{
"epoch": 0.8952654232424677,
"grad_norm": 0.3999284505844116,
"learning_rate": 8.848083917997463e-06,
"loss": 0.4428,
"step": 208
},
{
"epoch": 0.8995695839311334,
"grad_norm": 0.4151836633682251,
"learning_rate": 8.832013754245895e-06,
"loss": 0.4416,
"step": 209
},
{
"epoch": 0.9038737446197992,
"grad_norm": 0.5059155225753784,
"learning_rate": 8.815847079402972e-06,
"loss": 0.4445,
"step": 210
},
{
"epoch": 0.9081779053084649,
"grad_norm": 0.39089852571487427,
"learning_rate": 8.799584300634096e-06,
"loss": 0.44,
"step": 211
},
{
"epoch": 0.9124820659971306,
"grad_norm": 0.4415070712566376,
"learning_rate": 8.783225827525098e-06,
"loss": 0.4423,
"step": 212
},
{
"epoch": 0.9167862266857962,
"grad_norm": 0.49563416838645935,
"learning_rate": 8.766772072071911e-06,
"loss": 0.492,
"step": 213
},
{
"epoch": 0.921090387374462,
"grad_norm": 0.4825840890407562,
"learning_rate": 8.750223448670204e-06,
"loss": 0.4745,
"step": 214
},
{
"epoch": 0.9253945480631277,
"grad_norm": 0.41341516375541687,
"learning_rate": 8.733580374104936e-06,
"loss": 0.4607,
"step": 215
},
{
"epoch": 0.9296987087517934,
"grad_norm": 0.4265018701553345,
"learning_rate": 8.716843267539868e-06,
"loss": 0.4277,
"step": 216
},
{
"epoch": 0.9340028694404591,
"grad_norm": 0.4438970386981964,
"learning_rate": 8.700012550507e-06,
"loss": 0.4489,
"step": 217
},
{
"epoch": 0.9383070301291249,
"grad_norm": 0.427259624004364,
"learning_rate": 8.683088646895955e-06,
"loss": 0.4715,
"step": 218
},
{
"epoch": 0.9426111908177905,
"grad_norm": 0.4407413899898529,
"learning_rate": 8.666071982943306e-06,
"loss": 0.4482,
"step": 219
},
{
"epoch": 0.9469153515064562,
"grad_norm": 0.42888280749320984,
"learning_rate": 8.648962987221837e-06,
"loss": 0.4584,
"step": 220
},
{
"epoch": 0.9512195121951219,
"grad_norm": 0.43822959065437317,
"learning_rate": 8.631762090629756e-06,
"loss": 0.4603,
"step": 221
},
{
"epoch": 0.9555236728837877,
"grad_norm": 0.4983186423778534,
"learning_rate": 8.614469726379833e-06,
"loss": 0.4278,
"step": 222
},
{
"epoch": 0.9598278335724534,
"grad_norm": 0.42594388127326965,
"learning_rate": 8.597086329988498e-06,
"loss": 0.4565,
"step": 223
},
{
"epoch": 0.9641319942611191,
"grad_norm": 0.4089045822620392,
"learning_rate": 8.579612339264867e-06,
"loss": 0.3956,
"step": 224
},
{
"epoch": 0.9684361549497847,
"grad_norm": 0.5381679534912109,
"learning_rate": 8.562048194299719e-06,
"loss": 0.469,
"step": 225
},
{
"epoch": 0.9727403156384505,
"grad_norm": 0.46227994561195374,
"learning_rate": 8.544394337454409e-06,
"loss": 0.4566,
"step": 226
},
{
"epoch": 0.9770444763271162,
"grad_norm": 0.4622877836227417,
"learning_rate": 8.52665121334973e-06,
"loss": 0.4523,
"step": 227
},
{
"epoch": 0.9813486370157819,
"grad_norm": 0.4579131603240967,
"learning_rate": 8.508819268854713e-06,
"loss": 0.4389,
"step": 228
},
{
"epoch": 0.9856527977044476,
"grad_norm": 0.47624289989471436,
"learning_rate": 8.49089895307537e-06,
"loss": 0.4619,
"step": 229
},
{
"epoch": 0.9899569583931134,
"grad_norm": 0.4958518445491791,
"learning_rate": 8.472890717343391e-06,
"loss": 0.4549,
"step": 230
},
{
"epoch": 0.994261119081779,
"grad_norm": 0.4146155118942261,
"learning_rate": 8.454795015204767e-06,
"loss": 0.4366,
"step": 231
},
{
"epoch": 0.9985652797704447,
"grad_norm": 0.5480135083198547,
"learning_rate": 8.436612302408376e-06,
"loss": 0.4537,
"step": 232
},
{
"epoch": 1.0028694404591105,
"grad_norm": 0.814457356929779,
"learning_rate": 8.418343036894497e-06,
"loss": 0.7295,
"step": 233
},
{
"epoch": 1.007173601147776,
"grad_norm": 0.4416688084602356,
"learning_rate": 8.399987678783285e-06,
"loss": 0.3523,
"step": 234
},
{
"epoch": 1.011477761836442,
"grad_norm": 0.683137059211731,
"learning_rate": 8.381546690363174e-06,
"loss": 0.4444,
"step": 235
},
{
"epoch": 1.0157819225251077,
"grad_norm": 0.5394715070724487,
"learning_rate": 8.36302053607924e-06,
"loss": 0.447,
"step": 236
},
{
"epoch": 1.0200860832137733,
"grad_norm": 0.5086341500282288,
"learning_rate": 8.344409682521499e-06,
"loss": 0.4487,
"step": 237
},
{
"epoch": 1.024390243902439,
"grad_norm": 0.6077011227607727,
"learning_rate": 8.325714598413169e-06,
"loss": 0.3777,
"step": 238
},
{
"epoch": 1.0286944045911048,
"grad_norm": 0.4829539358615875,
"learning_rate": 8.306935754598838e-06,
"loss": 0.4263,
"step": 239
},
{
"epoch": 1.0329985652797704,
"grad_norm": 0.5820494294166565,
"learning_rate": 8.288073624032634e-06,
"loss": 0.4574,
"step": 240
},
{
"epoch": 1.0373027259684362,
"grad_norm": 0.4590955376625061,
"learning_rate": 8.269128681766296e-06,
"loss": 0.3603,
"step": 241
},
{
"epoch": 1.0416068866571018,
"grad_norm": 0.5867065191268921,
"learning_rate": 8.250101404937223e-06,
"loss": 0.4503,
"step": 242
},
{
"epoch": 1.0459110473457676,
"grad_norm": 0.4794483184814453,
"learning_rate": 8.230992272756438e-06,
"loss": 0.4189,
"step": 243
},
{
"epoch": 1.0502152080344334,
"grad_norm": 0.520908534526825,
"learning_rate": 8.211801766496537e-06,
"loss": 0.448,
"step": 244
},
{
"epoch": 1.054519368723099,
"grad_norm": 0.4657638967037201,
"learning_rate": 8.192530369479562e-06,
"loss": 0.4121,
"step": 245
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.5529608130455017,
"learning_rate": 8.17317856706482e-06,
"loss": 0.4463,
"step": 246
},
{
"epoch": 1.0631276901004305,
"grad_norm": 0.5230234861373901,
"learning_rate": 8.153746846636675e-06,
"loss": 0.4075,
"step": 247
},
{
"epoch": 1.067431850789096,
"grad_norm": 0.4816892147064209,
"learning_rate": 8.13423569759226e-06,
"loss": 0.4085,
"step": 248
},
{
"epoch": 1.0717360114777619,
"grad_norm": 0.475556880235672,
"learning_rate": 8.114645611329152e-06,
"loss": 0.4149,
"step": 249
},
{
"epoch": 1.0760401721664274,
"grad_norm": 0.42703869938850403,
"learning_rate": 8.094977081233006e-06,
"loss": 0.395,
"step": 250
},
{
"epoch": 1.0803443328550932,
"grad_norm": 0.4868505895137787,
"learning_rate": 8.075230602665118e-06,
"loss": 0.399,
"step": 251
},
{
"epoch": 1.084648493543759,
"grad_norm": 0.4027485251426697,
"learning_rate": 8.055406672949957e-06,
"loss": 0.3854,
"step": 252
},
{
"epoch": 1.0889526542324246,
"grad_norm": 0.4265587627887726,
"learning_rate": 8.03550579136263e-06,
"loss": 0.4077,
"step": 253
},
{
"epoch": 1.0932568149210904,
"grad_norm": 0.43399757146835327,
"learning_rate": 8.015528459116321e-06,
"loss": 0.4204,
"step": 254
},
{
"epoch": 1.0975609756097562,
"grad_norm": 0.46295246481895447,
"learning_rate": 7.995475179349657e-06,
"loss": 0.4132,
"step": 255
},
{
"epoch": 1.1018651362984218,
"grad_norm": 0.3826509118080139,
"learning_rate": 7.975346457114034e-06,
"loss": 0.4238,
"step": 256
},
{
"epoch": 1.1061692969870875,
"grad_norm": 0.45687171816825867,
"learning_rate": 7.955142799360914e-06,
"loss": 0.4395,
"step": 257
},
{
"epoch": 1.1104734576757531,
"grad_norm": 0.41050204634666443,
"learning_rate": 7.934864714929036e-06,
"loss": 0.4315,
"step": 258
},
{
"epoch": 1.114777618364419,
"grad_norm": 0.42092767357826233,
"learning_rate": 7.914512714531612e-06,
"loss": 0.4195,
"step": 259
},
{
"epoch": 1.1190817790530847,
"grad_norm": 0.4156297445297241,
"learning_rate": 7.894087310743468e-06,
"loss": 0.4628,
"step": 260
},
{
"epoch": 1.1233859397417503,
"grad_norm": 0.415634423494339,
"learning_rate": 7.873589017988124e-06,
"loss": 0.3867,
"step": 261
},
{
"epoch": 1.127690100430416,
"grad_norm": 0.40684717893600464,
"learning_rate": 7.853018352524845e-06,
"loss": 0.4505,
"step": 262
},
{
"epoch": 1.1319942611190819,
"grad_norm": 0.40677744150161743,
"learning_rate": 7.832375832435637e-06,
"loss": 0.4158,
"step": 263
},
{
"epoch": 1.1362984218077474,
"grad_norm": 0.40581533312797546,
"learning_rate": 7.811661977612202e-06,
"loss": 0.4406,
"step": 264
},
{
"epoch": 1.1406025824964132,
"grad_norm": 0.3579067587852478,
"learning_rate": 7.790877309742833e-06,
"loss": 0.418,
"step": 265
},
{
"epoch": 1.144906743185079,
"grad_norm": 0.39150920510292053,
"learning_rate": 7.770022352299294e-06,
"loss": 0.3976,
"step": 266
},
{
"epoch": 1.1492109038737446,
"grad_norm": 0.47496986389160156,
"learning_rate": 7.749097630523618e-06,
"loss": 0.4337,
"step": 267
},
{
"epoch": 1.1535150645624104,
"grad_norm": 0.3943747878074646,
"learning_rate": 7.728103671414889e-06,
"loss": 0.3984,
"step": 268
},
{
"epoch": 1.157819225251076,
"grad_norm": 0.34318605065345764,
"learning_rate": 7.707041003715962e-06,
"loss": 0.3682,
"step": 269
},
{
"epoch": 1.1621233859397417,
"grad_norm": 0.3580489456653595,
"learning_rate": 7.685910157900158e-06,
"loss": 0.4177,
"step": 270
},
{
"epoch": 1.1664275466284075,
"grad_norm": 0.4793737232685089,
"learning_rate": 7.66471166615789e-06,
"loss": 0.4102,
"step": 271
},
{
"epoch": 1.170731707317073,
"grad_norm": 0.39403393864631653,
"learning_rate": 7.643446062383273e-06,
"loss": 0.3908,
"step": 272
},
{
"epoch": 1.175035868005739,
"grad_norm": 0.39132651686668396,
"learning_rate": 7.622113882160658e-06,
"loss": 0.4313,
"step": 273
},
{
"epoch": 1.1793400286944047,
"grad_norm": 0.4182010293006897,
"learning_rate": 7.600715662751166e-06,
"loss": 0.3992,
"step": 274
},
{
"epoch": 1.1836441893830703,
"grad_norm": 0.4303508400917053,
"learning_rate": 7.579251943079145e-06,
"loss": 0.4271,
"step": 275
},
{
"epoch": 1.187948350071736,
"grad_norm": 0.3482970893383026,
"learning_rate": 7.557723263718596e-06,
"loss": 0.3602,
"step": 276
},
{
"epoch": 1.1922525107604018,
"grad_norm": 0.45806214213371277,
"learning_rate": 7.536130166879561e-06,
"loss": 0.4503,
"step": 277
},
{
"epoch": 1.1965566714490674,
"grad_norm": 0.41952845454216003,
"learning_rate": 7.514473196394467e-06,
"loss": 0.4377,
"step": 278
},
{
"epoch": 1.2008608321377332,
"grad_norm": 0.3832322657108307,
"learning_rate": 7.492752897704432e-06,
"loss": 0.4034,
"step": 279
},
{
"epoch": 1.2051649928263988,
"grad_norm": 0.4345414340496063,
"learning_rate": 7.470969817845518e-06,
"loss": 0.4549,
"step": 280
},
{
"epoch": 1.2094691535150646,
"grad_norm": 0.4060963988304138,
"learning_rate": 7.4491245054349716e-06,
"loss": 0.3924,
"step": 281
},
{
"epoch": 1.2137733142037304,
"grad_norm": 0.3915734887123108,
"learning_rate": 7.427217510657383e-06,
"loss": 0.434,
"step": 282
},
{
"epoch": 1.218077474892396,
"grad_norm": 0.3792259097099304,
"learning_rate": 7.405249385250854e-06,
"loss": 0.4219,
"step": 283
},
{
"epoch": 1.2223816355810617,
"grad_norm": 0.39508765935897827,
"learning_rate": 7.383220682493081e-06,
"loss": 0.4042,
"step": 284
},
{
"epoch": 1.2266857962697273,
"grad_norm": 0.39781150221824646,
"learning_rate": 7.361131957187435e-06,
"loss": 0.4149,
"step": 285
},
{
"epoch": 1.230989956958393,
"grad_norm": 0.34590664505958557,
"learning_rate": 7.338983765648985e-06,
"loss": 0.3643,
"step": 286
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.39236798882484436,
"learning_rate": 7.31677666569048e-06,
"loss": 0.4362,
"step": 287
},
{
"epoch": 1.2395982783357244,
"grad_norm": 0.36852768063545227,
"learning_rate": 7.294511216608308e-06,
"loss": 0.3771,
"step": 288
},
{
"epoch": 1.2439024390243902,
"grad_norm": 0.4464952051639557,
"learning_rate": 7.272187979168408e-06,
"loss": 0.4386,
"step": 289
},
{
"epoch": 1.248206599713056,
"grad_norm": 0.382019579410553,
"learning_rate": 7.249807515592149e-06,
"loss": 0.4192,
"step": 290
},
{
"epoch": 1.2525107604017216,
"grad_norm": 0.4468196630477905,
"learning_rate": 7.227370389542161e-06,
"loss": 0.4278,
"step": 291
},
{
"epoch": 1.2568149210903874,
"grad_norm": 0.4168325662612915,
"learning_rate": 7.2048771661081515e-06,
"loss": 0.4011,
"step": 292
},
{
"epoch": 1.2611190817790532,
"grad_norm": 0.4351046085357666,
"learning_rate": 7.182328411792664e-06,
"loss": 0.4019,
"step": 293
},
{
"epoch": 1.2654232424677188,
"grad_norm": 0.41555511951446533,
"learning_rate": 7.159724694496815e-06,
"loss": 0.3819,
"step": 294
},
{
"epoch": 1.2697274031563845,
"grad_norm": 0.4706554710865021,
"learning_rate": 7.137066583505987e-06,
"loss": 0.4372,
"step": 295
},
{
"epoch": 1.2740315638450501,
"grad_norm": 0.34467652440071106,
"learning_rate": 7.114354649475499e-06,
"loss": 0.3756,
"step": 296
},
{
"epoch": 1.278335724533716,
"grad_norm": 0.4327693581581116,
"learning_rate": 7.091589464416225e-06,
"loss": 0.4385,
"step": 297
},
{
"epoch": 1.2826398852223817,
"grad_norm": 0.409885048866272,
"learning_rate": 7.068771601680191e-06,
"loss": 0.426,
"step": 298
},
{
"epoch": 1.2869440459110473,
"grad_norm": 0.4596419036388397,
"learning_rate": 7.04590163594614e-06,
"loss": 0.4109,
"step": 299
},
{
"epoch": 1.291248206599713,
"grad_norm": 0.3838300108909607,
"learning_rate": 7.022980143205046e-06,
"loss": 0.391,
"step": 300
},
{
"epoch": 1.2955523672883786,
"grad_norm": 0.37262919545173645,
"learning_rate": 7.000007700745622e-06,
"loss": 0.4314,
"step": 301
},
{
"epoch": 1.2998565279770444,
"grad_norm": 0.44159433245658875,
"learning_rate": 6.976984887139775e-06,
"loss": 0.4172,
"step": 302
},
{
"epoch": 1.3041606886657102,
"grad_norm": 0.4745006859302521,
"learning_rate": 6.9539122822280246e-06,
"loss": 0.4023,
"step": 303
},
{
"epoch": 1.308464849354376,
"grad_norm": 0.3681146204471588,
"learning_rate": 6.930790467104916e-06,
"loss": 0.3768,
"step": 304
},
{
"epoch": 1.3127690100430416,
"grad_norm": 0.4388069212436676,
"learning_rate": 6.907620024104377e-06,
"loss": 0.4551,
"step": 305
},
{
"epoch": 1.3170731707317074,
"grad_norm": 0.40693527460098267,
"learning_rate": 6.884401536785045e-06,
"loss": 0.4001,
"step": 306
},
{
"epoch": 1.321377331420373,
"grad_norm": 0.47269055247306824,
"learning_rate": 6.861135589915583e-06,
"loss": 0.4123,
"step": 307
},
{
"epoch": 1.3256814921090387,
"grad_norm": 0.34925857186317444,
"learning_rate": 6.837822769459942e-06,
"loss": 0.3817,
"step": 308
},
{
"epoch": 1.3299856527977045,
"grad_norm": 0.3714245557785034,
"learning_rate": 6.814463662562609e-06,
"loss": 0.4048,
"step": 309
},
{
"epoch": 1.33428981348637,
"grad_norm": 0.46117159724235535,
"learning_rate": 6.791058857533814e-06,
"loss": 0.4096,
"step": 310
},
{
"epoch": 1.338593974175036,
"grad_norm": 0.3898836374282837,
"learning_rate": 6.767608943834721e-06,
"loss": 0.4042,
"step": 311
},
{
"epoch": 1.3428981348637015,
"grad_norm": 0.4022074043750763,
"learning_rate": 6.744114512062571e-06,
"loss": 0.3824,
"step": 312
},
{
"epoch": 1.3472022955523673,
"grad_norm": 0.43312016129493713,
"learning_rate": 6.720576153935818e-06,
"loss": 0.4576,
"step": 313
},
{
"epoch": 1.351506456241033,
"grad_norm": 0.4213770627975464,
"learning_rate": 6.696994462279223e-06,
"loss": 0.428,
"step": 314
},
{
"epoch": 1.3558106169296988,
"grad_norm": 0.4370782971382141,
"learning_rate": 6.673370031008919e-06,
"loss": 0.4351,
"step": 315
},
{
"epoch": 1.3601147776183644,
"grad_norm": 0.3750602900981903,
"learning_rate": 6.6497034551174585e-06,
"loss": 0.423,
"step": 316
},
{
"epoch": 1.3644189383070302,
"grad_norm": 0.3736652135848999,
"learning_rate": 6.625995330658828e-06,
"loss": 0.3869,
"step": 317
},
{
"epoch": 1.3687230989956958,
"grad_norm": 0.4050619304180145,
"learning_rate": 6.602246254733431e-06,
"loss": 0.4052,
"step": 318
},
{
"epoch": 1.3730272596843616,
"grad_norm": 0.3807079493999481,
"learning_rate": 6.578456825473055e-06,
"loss": 0.4164,
"step": 319
},
{
"epoch": 1.3773314203730274,
"grad_norm": 0.39028626680374146,
"learning_rate": 6.554627642025807e-06,
"loss": 0.3715,
"step": 320
},
{
"epoch": 1.381635581061693,
"grad_norm": 0.4095914959907532,
"learning_rate": 6.53075930454102e-06,
"loss": 0.4294,
"step": 321
},
{
"epoch": 1.3859397417503587,
"grad_norm": 0.3441252112388611,
"learning_rate": 6.506852414154138e-06,
"loss": 0.395,
"step": 322
},
{
"epoch": 1.3902439024390243,
"grad_norm": 0.38136348128318787,
"learning_rate": 6.482907572971584e-06,
"loss": 0.4512,
"step": 323
},
{
"epoch": 1.39454806312769,
"grad_norm": 0.3387623429298401,
"learning_rate": 6.4589253840555856e-06,
"loss": 0.3324,
"step": 324
},
{
"epoch": 1.3988522238163559,
"grad_norm": 0.3917059600353241,
"learning_rate": 6.434906451408991e-06,
"loss": 0.4544,
"step": 325
},
{
"epoch": 1.4031563845050214,
"grad_norm": 0.35180503129959106,
"learning_rate": 6.41085137996006e-06,
"loss": 0.3851,
"step": 326
},
{
"epoch": 1.4074605451936872,
"grad_norm": 0.35865458846092224,
"learning_rate": 6.386760775547221e-06,
"loss": 0.4026,
"step": 327
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.34770017862319946,
"learning_rate": 6.362635244903818e-06,
"loss": 0.397,
"step": 328
},
{
"epoch": 1.4160688665710186,
"grad_norm": 0.397268682718277,
"learning_rate": 6.338475395642834e-06,
"loss": 0.4378,
"step": 329
},
{
"epoch": 1.4203730272596844,
"grad_norm": 0.35202059149742126,
"learning_rate": 6.314281836241573e-06,
"loss": 0.4169,
"step": 330
},
{
"epoch": 1.4246771879483502,
"grad_norm": 0.35058972239494324,
"learning_rate": 6.2900551760263564e-06,
"loss": 0.4158,
"step": 331
},
{
"epoch": 1.4289813486370158,
"grad_norm": 0.4056157171726227,
"learning_rate": 6.265796025157154e-06,
"loss": 0.376,
"step": 332
},
{
"epoch": 1.4332855093256816,
"grad_norm": 0.3331606090068817,
"learning_rate": 6.241504994612237e-06,
"loss": 0.4236,
"step": 333
},
{
"epoch": 1.4375896700143471,
"grad_norm": 0.3854994475841522,
"learning_rate": 6.217182696172776e-06,
"loss": 0.4023,
"step": 334
},
{
"epoch": 1.441893830703013,
"grad_norm": 0.36590859293937683,
"learning_rate": 6.192829742407442e-06,
"loss": 0.4245,
"step": 335
},
{
"epoch": 1.4461979913916787,
"grad_norm": 0.3405327796936035,
"learning_rate": 6.168446746656973e-06,
"loss": 0.4134,
"step": 336
},
{
"epoch": 1.4505021520803443,
"grad_norm": 0.35657453536987305,
"learning_rate": 6.144034323018728e-06,
"loss": 0.4235,
"step": 337
},
{
"epoch": 1.45480631276901,
"grad_norm": 0.3618459701538086,
"learning_rate": 6.119593086331225e-06,
"loss": 0.3759,
"step": 338
},
{
"epoch": 1.4591104734576756,
"grad_norm": 0.3598412871360779,
"learning_rate": 6.095123652158648e-06,
"loss": 0.4016,
"step": 339
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.3681644797325134,
"learning_rate": 6.070626636775349e-06,
"loss": 0.4307,
"step": 340
},
{
"epoch": 1.4677187948350072,
"grad_norm": 0.32633137702941895,
"learning_rate": 6.046102657150328e-06,
"loss": 0.3716,
"step": 341
},
{
"epoch": 1.472022955523673,
"grad_norm": 0.3902093172073364,
"learning_rate": 6.021552330931693e-06,
"loss": 0.4195,
"step": 342
},
{
"epoch": 1.4763271162123386,
"grad_norm": 0.397809237241745,
"learning_rate": 5.996976276431097e-06,
"loss": 0.4444,
"step": 343
},
{
"epoch": 1.4806312769010044,
"grad_norm": 0.36122214794158936,
"learning_rate": 5.972375112608182e-06,
"loss": 0.3855,
"step": 344
},
{
"epoch": 1.48493543758967,
"grad_norm": 0.3352508842945099,
"learning_rate": 5.947749459054972e-06,
"loss": 0.4506,
"step": 345
},
{
"epoch": 1.4892395982783357,
"grad_norm": 0.36873918771743774,
"learning_rate": 5.923099935980278e-06,
"loss": 0.419,
"step": 346
},
{
"epoch": 1.4935437589670015,
"grad_norm": 0.3478635847568512,
"learning_rate": 5.898427164194084e-06,
"loss": 0.3614,
"step": 347
},
{
"epoch": 1.497847919655667,
"grad_norm": 0.3391861915588379,
"learning_rate": 5.8737317650918905e-06,
"loss": 0.3903,
"step": 348
},
{
"epoch": 1.502152080344333,
"grad_norm": 0.3982401490211487,
"learning_rate": 5.849014360639087e-06,
"loss": 0.4654,
"step": 349
},
{
"epoch": 1.5064562410329985,
"grad_norm": 0.3000711500644684,
"learning_rate": 5.824275573355278e-06,
"loss": 0.3453,
"step": 350
},
{
"epoch": 1.5107604017216643,
"grad_norm": 0.39935576915740967,
"learning_rate": 5.799516026298601e-06,
"loss": 0.4482,
"step": 351
},
{
"epoch": 1.51506456241033,
"grad_norm": 0.33801084756851196,
"learning_rate": 5.7747363430500395e-06,
"loss": 0.3708,
"step": 352
},
{
"epoch": 1.5193687230989958,
"grad_norm": 0.40248093008995056,
"learning_rate": 5.74993714769772e-06,
"loss": 0.4319,
"step": 353
},
{
"epoch": 1.5236728837876614,
"grad_norm": 0.38056331872940063,
"learning_rate": 5.725119064821185e-06,
"loss": 0.4125,
"step": 354
},
{
"epoch": 1.527977044476327,
"grad_norm": 0.34441474080085754,
"learning_rate": 5.700282719475672e-06,
"loss": 0.3856,
"step": 355
},
{
"epoch": 1.5322812051649928,
"grad_norm": 0.36836621165275574,
"learning_rate": 5.675428737176367e-06,
"loss": 0.4293,
"step": 356
},
{
"epoch": 1.5365853658536586,
"grad_norm": 0.33872583508491516,
"learning_rate": 5.65055774388265e-06,
"loss": 0.4381,
"step": 357
},
{
"epoch": 1.5408895265423244,
"grad_norm": 0.3220119774341583,
"learning_rate": 5.625670365982332e-06,
"loss": 0.3737,
"step": 358
},
{
"epoch": 1.54519368723099,
"grad_norm": 0.41862717270851135,
"learning_rate": 5.600767230275878e-06,
"loss": 0.4366,
"step": 359
},
{
"epoch": 1.5494978479196555,
"grad_norm": 0.36884069442749023,
"learning_rate": 5.575848963960621e-06,
"loss": 0.3808,
"step": 360
},
{
"epoch": 1.5538020086083213,
"grad_norm": 0.37247779965400696,
"learning_rate": 5.5509161946149635e-06,
"loss": 0.436,
"step": 361
},
{
"epoch": 1.558106169296987,
"grad_norm": 0.3505042791366577,
"learning_rate": 5.525969550182577e-06,
"loss": 0.3985,
"step": 362
},
{
"epoch": 1.5624103299856529,
"grad_norm": 0.37073713541030884,
"learning_rate": 5.501009658956583e-06,
"loss": 0.3789,
"step": 363
},
{
"epoch": 1.5667144906743187,
"grad_norm": 0.3942989706993103,
"learning_rate": 5.4760371495637256e-06,
"loss": 0.4073,
"step": 364
},
{
"epoch": 1.5710186513629842,
"grad_norm": 0.3741658329963684,
"learning_rate": 5.451052650948549e-06,
"loss": 0.4109,
"step": 365
},
{
"epoch": 1.5753228120516498,
"grad_norm": 0.3425599932670593,
"learning_rate": 5.426056792357552e-06,
"loss": 0.3824,
"step": 366
},
{
"epoch": 1.5796269727403156,
"grad_norm": 0.4195639491081238,
"learning_rate": 5.40105020332333e-06,
"loss": 0.407,
"step": 367
},
{
"epoch": 1.5839311334289814,
"grad_norm": 0.37894967198371887,
"learning_rate": 5.376033513648743e-06,
"loss": 0.4505,
"step": 368
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.3418775498867035,
"learning_rate": 5.3510073533910344e-06,
"loss": 0.363,
"step": 369
},
{
"epoch": 1.5925394548063128,
"grad_norm": 0.40504151582717896,
"learning_rate": 5.325972352845965e-06,
"loss": 0.4566,
"step": 370
},
{
"epoch": 1.5968436154949783,
"grad_norm": 0.37350550293922424,
"learning_rate": 5.30092914253195e-06,
"loss": 0.4217,
"step": 371
},
{
"epoch": 1.6011477761836441,
"grad_norm": 0.3756239414215088,
"learning_rate": 5.2758783531741655e-06,
"loss": 0.3634,
"step": 372
},
{
"epoch": 1.60545193687231,
"grad_norm": 0.4232361316680908,
"learning_rate": 5.25082061568867e-06,
"loss": 0.4412,
"step": 373
},
{
"epoch": 1.6097560975609757,
"grad_norm": 0.376746267080307,
"learning_rate": 5.225756561166521e-06,
"loss": 0.4105,
"step": 374
},
{
"epoch": 1.6140602582496413,
"grad_norm": 0.36558979749679565,
"learning_rate": 5.200686820857862e-06,
"loss": 0.414,
"step": 375
},
{
"epoch": 1.618364418938307,
"grad_norm": 0.31528496742248535,
"learning_rate": 5.175612026156045e-06,
"loss": 0.375,
"step": 376
},
{
"epoch": 1.6226685796269726,
"grad_norm": 0.3867856562137604,
"learning_rate": 5.150532808581718e-06,
"loss": 0.4215,
"step": 377
},
{
"epoch": 1.6269727403156384,
"grad_norm": 0.36438819766044617,
"learning_rate": 5.125449799766916e-06,
"loss": 0.3651,
"step": 378
},
{
"epoch": 1.6312769010043042,
"grad_norm": 0.34973829984664917,
"learning_rate": 5.100363631439162e-06,
"loss": 0.4132,
"step": 379
},
{
"epoch": 1.63558106169297,
"grad_norm": 0.32271939516067505,
"learning_rate": 5.075274935405554e-06,
"loss": 0.3972,
"step": 380
},
{
"epoch": 1.6398852223816356,
"grad_norm": 0.36957699060440063,
"learning_rate": 5.0501843435368495e-06,
"loss": 0.3961,
"step": 381
},
{
"epoch": 1.6441893830703012,
"grad_norm": 0.3583492338657379,
"learning_rate": 5.025092487751552e-06,
"loss": 0.4226,
"step": 382
},
{
"epoch": 1.648493543758967,
"grad_norm": 0.3203399181365967,
"learning_rate": 5e-06,
"loss": 0.3844,
"step": 383
},
{
"epoch": 1.6527977044476327,
"grad_norm": 0.3899810314178467,
"learning_rate": 4.974907512248451e-06,
"loss": 0.412,
"step": 384
},
{
"epoch": 1.6571018651362985,
"grad_norm": 0.3584853410720825,
"learning_rate": 4.949815656463151e-06,
"loss": 0.4106,
"step": 385
},
{
"epoch": 1.661406025824964,
"grad_norm": 0.3646269142627716,
"learning_rate": 4.924725064594448e-06,
"loss": 0.3887,
"step": 386
},
{
"epoch": 1.6657101865136297,
"grad_norm": 0.3446120321750641,
"learning_rate": 4.89963636856084e-06,
"loss": 0.4095,
"step": 387
},
{
"epoch": 1.6700143472022955,
"grad_norm": 0.36433911323547363,
"learning_rate": 4.874550200233085e-06,
"loss": 0.4447,
"step": 388
},
{
"epoch": 1.6743185078909613,
"grad_norm": 0.3468761444091797,
"learning_rate": 4.8494671914182835e-06,
"loss": 0.4033,
"step": 389
},
{
"epoch": 1.678622668579627,
"grad_norm": 0.34094858169555664,
"learning_rate": 4.824387973843957e-06,
"loss": 0.4097,
"step": 390
},
{
"epoch": 1.6829268292682928,
"grad_norm": 0.348934531211853,
"learning_rate": 4.7993131791421385e-06,
"loss": 0.376,
"step": 391
},
{
"epoch": 1.6872309899569584,
"grad_norm": 0.36609116196632385,
"learning_rate": 4.7742434388334815e-06,
"loss": 0.4128,
"step": 392
},
{
"epoch": 1.691535150645624,
"grad_norm": 0.3412748873233795,
"learning_rate": 4.749179384311331e-06,
"loss": 0.4038,
"step": 393
},
{
"epoch": 1.6958393113342898,
"grad_norm": 0.3432169556617737,
"learning_rate": 4.724121646825838e-06,
"loss": 0.3885,
"step": 394
},
{
"epoch": 1.7001434720229556,
"grad_norm": 0.3838341236114502,
"learning_rate": 4.699070857468052e-06,
"loss": 0.4334,
"step": 395
},
{
"epoch": 1.7044476327116214,
"grad_norm": 0.32072073221206665,
"learning_rate": 4.674027647154037e-06,
"loss": 0.3766,
"step": 396
},
{
"epoch": 1.708751793400287,
"grad_norm": 0.3909914493560791,
"learning_rate": 4.648992646608968e-06,
"loss": 0.4301,
"step": 397
},
{
"epoch": 1.7130559540889525,
"grad_norm": 0.3556109666824341,
"learning_rate": 4.623966486351257e-06,
"loss": 0.3922,
"step": 398
},
{
"epoch": 1.7173601147776183,
"grad_norm": 0.31979143619537354,
"learning_rate": 4.598949796676672e-06,
"loss": 0.3612,
"step": 399
},
{
"epoch": 1.721664275466284,
"grad_norm": 0.3717595040798187,
"learning_rate": 4.573943207642452e-06,
"loss": 0.4423,
"step": 400
},
{
"epoch": 1.7259684361549499,
"grad_norm": 0.3407822847366333,
"learning_rate": 4.548947349051452e-06,
"loss": 0.406,
"step": 401
},
{
"epoch": 1.7302725968436155,
"grad_norm": 0.3208092451095581,
"learning_rate": 4.523962850436276e-06,
"loss": 0.3839,
"step": 402
},
{
"epoch": 1.7345767575322812,
"grad_norm": 0.3565741181373596,
"learning_rate": 4.498990341043419e-06,
"loss": 0.4238,
"step": 403
},
{
"epoch": 1.7388809182209468,
"grad_norm": 0.3671400547027588,
"learning_rate": 4.474030449817423e-06,
"loss": 0.416,
"step": 404
},
{
"epoch": 1.7431850789096126,
"grad_norm": 0.3624553382396698,
"learning_rate": 4.449083805385037e-06,
"loss": 0.4226,
"step": 405
},
{
"epoch": 1.7474892395982784,
"grad_norm": 0.30359649658203125,
"learning_rate": 4.424151036039381e-06,
"loss": 0.3579,
"step": 406
},
{
"epoch": 1.7517934002869442,
"grad_norm": 0.38856279850006104,
"learning_rate": 4.3992327697241225e-06,
"loss": 0.419,
"step": 407
},
{
"epoch": 1.7560975609756098,
"grad_norm": 0.370222806930542,
"learning_rate": 4.3743296340176694e-06,
"loss": 0.3893,
"step": 408
},
{
"epoch": 1.7604017216642753,
"grad_norm": 0.3267267048358917,
"learning_rate": 4.3494422561173515e-06,
"loss": 0.3741,
"step": 409
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.3811343014240265,
"learning_rate": 4.3245712628236356e-06,
"loss": 0.4255,
"step": 410
},
{
"epoch": 1.769010043041607,
"grad_norm": 0.3775186538696289,
"learning_rate": 4.299717280524329e-06,
"loss": 0.4033,
"step": 411
},
{
"epoch": 1.7733142037302727,
"grad_norm": 0.34413671493530273,
"learning_rate": 4.274880935178817e-06,
"loss": 0.4254,
"step": 412
},
{
"epoch": 1.7776183644189383,
"grad_norm": 0.3962193429470062,
"learning_rate": 4.250062852302283e-06,
"loss": 0.4121,
"step": 413
},
{
"epoch": 1.781922525107604,
"grad_norm": 0.35180673003196716,
"learning_rate": 4.225263656949961e-06,
"loss": 0.3635,
"step": 414
},
{
"epoch": 1.7862266857962696,
"grad_norm": 0.3940863311290741,
"learning_rate": 4.200483973701401e-06,
"loss": 0.4167,
"step": 415
},
{
"epoch": 1.7905308464849354,
"grad_norm": 0.3632669448852539,
"learning_rate": 4.175724426644724e-06,
"loss": 0.368,
"step": 416
},
{
"epoch": 1.7948350071736012,
"grad_norm": 0.40205034613609314,
"learning_rate": 4.150985639360914e-06,
"loss": 0.4814,
"step": 417
},
{
"epoch": 1.799139167862267,
"grad_norm": 0.38346531987190247,
"learning_rate": 4.12626823490811e-06,
"loss": 0.377,
"step": 418
},
{
"epoch": 1.8034433285509326,
"grad_norm": 0.3857615888118744,
"learning_rate": 4.1015728358059185e-06,
"loss": 0.3983,
"step": 419
},
{
"epoch": 1.8077474892395982,
"grad_norm": 0.36002928018569946,
"learning_rate": 4.076900064019721e-06,
"loss": 0.4242,
"step": 420
},
{
"epoch": 1.812051649928264,
"grad_norm": 0.3512604832649231,
"learning_rate": 4.052250540945029e-06,
"loss": 0.3908,
"step": 421
},
{
"epoch": 1.8163558106169297,
"grad_norm": 0.37411588430404663,
"learning_rate": 4.02762488739182e-06,
"loss": 0.4181,
"step": 422
},
{
"epoch": 1.8206599713055955,
"grad_norm": 0.3773100674152374,
"learning_rate": 4.003023723568903e-06,
"loss": 0.4013,
"step": 423
},
{
"epoch": 1.824964131994261,
"grad_norm": 0.3702990710735321,
"learning_rate": 3.978447669068309e-06,
"loss": 0.4012,
"step": 424
},
{
"epoch": 1.8292682926829267,
"grad_norm": 0.38589340448379517,
"learning_rate": 3.953897342849673e-06,
"loss": 0.4213,
"step": 425
},
{
"epoch": 1.8335724533715925,
"grad_norm": 0.3330689072608948,
"learning_rate": 3.929373363224654e-06,
"loss": 0.4151,
"step": 426
},
{
"epoch": 1.8378766140602583,
"grad_norm": 0.3233312666416168,
"learning_rate": 3.904876347841354e-06,
"loss": 0.36,
"step": 427
},
{
"epoch": 1.842180774748924,
"grad_norm": 0.3806193172931671,
"learning_rate": 3.8804069136687775e-06,
"loss": 0.4324,
"step": 428
},
{
"epoch": 1.8464849354375896,
"grad_norm": 0.792082667350769,
"learning_rate": 3.8559656769812746e-06,
"loss": 0.396,
"step": 429
},
{
"epoch": 1.8507890961262554,
"grad_norm": 0.342756986618042,
"learning_rate": 3.8315532533430285e-06,
"loss": 0.4244,
"step": 430
},
{
"epoch": 1.855093256814921,
"grad_norm": 0.38503968715667725,
"learning_rate": 3.8071702575925594e-06,
"loss": 0.4343,
"step": 431
},
{
"epoch": 1.8593974175035868,
"grad_norm": 0.36260154843330383,
"learning_rate": 3.7828173038272266e-06,
"loss": 0.3749,
"step": 432
},
{
"epoch": 1.8637015781922526,
"grad_norm": 0.3732585310935974,
"learning_rate": 3.7584950053877646e-06,
"loss": 0.402,
"step": 433
},
{
"epoch": 1.8680057388809184,
"grad_norm": 0.31354090571403503,
"learning_rate": 3.7342039748428473e-06,
"loss": 0.3647,
"step": 434
},
{
"epoch": 1.872309899569584,
"grad_norm": 0.3549199402332306,
"learning_rate": 3.709944823973647e-06,
"loss": 0.4235,
"step": 435
},
{
"epoch": 1.8766140602582495,
"grad_norm": 0.354219913482666,
"learning_rate": 3.685718163758427e-06,
"loss": 0.4093,
"step": 436
},
{
"epoch": 1.8809182209469153,
"grad_norm": 0.35462522506713867,
"learning_rate": 3.6615246043571674e-06,
"loss": 0.4102,
"step": 437
},
{
"epoch": 1.885222381635581,
"grad_norm": 0.3581220805644989,
"learning_rate": 3.6373647550961834e-06,
"loss": 0.4017,
"step": 438
},
{
"epoch": 1.8895265423242469,
"grad_norm": 0.31649893522262573,
"learning_rate": 3.61323922445278e-06,
"loss": 0.398,
"step": 439
},
{
"epoch": 1.8938307030129125,
"grad_norm": 0.35182642936706543,
"learning_rate": 3.5891486200399413e-06,
"loss": 0.4358,
"step": 440
},
{
"epoch": 1.8981348637015782,
"grad_norm": 0.34824714064598083,
"learning_rate": 3.5650935485910103e-06,
"loss": 0.3855,
"step": 441
},
{
"epoch": 1.9024390243902438,
"grad_norm": 0.34262511134147644,
"learning_rate": 3.5410746159444165e-06,
"loss": 0.3932,
"step": 442
},
{
"epoch": 1.9067431850789096,
"grad_norm": 0.3388313353061676,
"learning_rate": 3.5170924270284166e-06,
"loss": 0.4004,
"step": 443
},
{
"epoch": 1.9110473457675754,
"grad_norm": 0.3349505364894867,
"learning_rate": 3.4931475858458634e-06,
"loss": 0.411,
"step": 444
},
{
"epoch": 1.9153515064562412,
"grad_norm": 0.3396126329898834,
"learning_rate": 3.469240695458983e-06,
"loss": 0.4102,
"step": 445
},
{
"epoch": 1.9196556671449068,
"grad_norm": 0.3140477240085602,
"learning_rate": 3.445372357974194e-06,
"loss": 0.3899,
"step": 446
},
{
"epoch": 1.9239598278335723,
"grad_norm": 0.37466076016426086,
"learning_rate": 3.4215431745269463e-06,
"loss": 0.4131,
"step": 447
},
{
"epoch": 1.9282639885222381,
"grad_norm": 0.3352051079273224,
"learning_rate": 3.397753745266571e-06,
"loss": 0.4151,
"step": 448
},
{
"epoch": 1.932568149210904,
"grad_norm": 0.3039771020412445,
"learning_rate": 3.374004669341173e-06,
"loss": 0.3685,
"step": 449
},
{
"epoch": 1.9368723098995697,
"grad_norm": 0.36881503462791443,
"learning_rate": 3.350296544882543e-06,
"loss": 0.4254,
"step": 450
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.31767529249191284,
"learning_rate": 3.326629968991083e-06,
"loss": 0.3936,
"step": 451
},
{
"epoch": 1.9454806312769009,
"grad_norm": 0.3398596942424774,
"learning_rate": 3.303005537720778e-06,
"loss": 0.4302,
"step": 452
},
{
"epoch": 1.9497847919655666,
"grad_norm": 0.33376285433769226,
"learning_rate": 3.2794238460641837e-06,
"loss": 0.3669,
"step": 453
},
{
"epoch": 1.9540889526542324,
"grad_norm": 0.391971617937088,
"learning_rate": 3.255885487937431e-06,
"loss": 0.4454,
"step": 454
},
{
"epoch": 1.9583931133428982,
"grad_norm": 0.34021979570388794,
"learning_rate": 3.2323910561652798e-06,
"loss": 0.4036,
"step": 455
},
{
"epoch": 1.9626972740315638,
"grad_norm": 0.35812684893608093,
"learning_rate": 3.2089411424661864e-06,
"loss": 0.3968,
"step": 456
},
{
"epoch": 1.9670014347202296,
"grad_norm": 0.35335344076156616,
"learning_rate": 3.185536337437393e-06,
"loss": 0.3989,
"step": 457
},
{
"epoch": 1.9713055954088952,
"grad_norm": 0.33904704451560974,
"learning_rate": 3.1621772305400603e-06,
"loss": 0.4135,
"step": 458
},
{
"epoch": 1.975609756097561,
"grad_norm": 0.3260011374950409,
"learning_rate": 3.138864410084419e-06,
"loss": 0.4434,
"step": 459
},
{
"epoch": 1.9799139167862267,
"grad_norm": 0.3138202428817749,
"learning_rate": 3.1155984632149565e-06,
"loss": 0.3821,
"step": 460
},
{
"epoch": 1.9842180774748925,
"grad_norm": 0.33707910776138306,
"learning_rate": 3.0923799758956265e-06,
"loss": 0.4051,
"step": 461
},
{
"epoch": 1.988522238163558,
"grad_norm": 0.350344181060791,
"learning_rate": 3.0692095328950843e-06,
"loss": 0.422,
"step": 462
},
{
"epoch": 1.9928263988522237,
"grad_norm": 0.3441988527774811,
"learning_rate": 3.0460877177719763e-06,
"loss": 0.4065,
"step": 463
},
{
"epoch": 1.9971305595408895,
"grad_norm": 0.35449886322021484,
"learning_rate": 3.023015112860228e-06,
"loss": 0.4182,
"step": 464
},
{
"epoch": 2.0014347202295553,
"grad_norm": 0.6956499218940735,
"learning_rate": 2.9999922992543777e-06,
"loss": 0.6638,
"step": 465
},
{
"epoch": 2.005738880918221,
"grad_norm": 0.4010846018791199,
"learning_rate": 2.977019856794955e-06,
"loss": 0.3728,
"step": 466
},
{
"epoch": 2.010043041606887,
"grad_norm": 0.4007827043533325,
"learning_rate": 2.9540983640538635e-06,
"loss": 0.3776,
"step": 467
},
{
"epoch": 2.014347202295552,
"grad_norm": 0.38452622294425964,
"learning_rate": 2.93122839831981e-06,
"loss": 0.3774,
"step": 468
},
{
"epoch": 2.018651362984218,
"grad_norm": 0.3884631097316742,
"learning_rate": 2.908410535583777e-06,
"loss": 0.401,
"step": 469
},
{
"epoch": 2.022955523672884,
"grad_norm": 0.37267711758613586,
"learning_rate": 2.8856453505245018e-06,
"loss": 0.3421,
"step": 470
},
{
"epoch": 2.0272596843615496,
"grad_norm": 0.41387253999710083,
"learning_rate": 2.8629334164940127e-06,
"loss": 0.3963,
"step": 471
},
{
"epoch": 2.0315638450502154,
"grad_norm": 0.38894036412239075,
"learning_rate": 2.840275305503186e-06,
"loss": 0.411,
"step": 472
},
{
"epoch": 2.0358680057388807,
"grad_norm": 0.3363220989704132,
"learning_rate": 2.817671588207338e-06,
"loss": 0.3524,
"step": 473
},
{
"epoch": 2.0401721664275465,
"grad_norm": 0.3584243357181549,
"learning_rate": 2.7951228338918506e-06,
"loss": 0.4025,
"step": 474
},
{
"epoch": 2.0444763271162123,
"grad_norm": 0.34380313754081726,
"learning_rate": 2.77262961045784e-06,
"loss": 0.3464,
"step": 475
},
{
"epoch": 2.048780487804878,
"grad_norm": 0.34843042492866516,
"learning_rate": 2.7501924844078538e-06,
"loss": 0.3732,
"step": 476
},
{
"epoch": 2.053084648493544,
"grad_norm": 0.37281617522239685,
"learning_rate": 2.7278120208315927e-06,
"loss": 0.3836,
"step": 477
},
{
"epoch": 2.0573888091822097,
"grad_norm": 0.38437455892562866,
"learning_rate": 2.7054887833916933e-06,
"loss": 0.4123,
"step": 478
},
{
"epoch": 2.061692969870875,
"grad_norm": 0.3231838643550873,
"learning_rate": 2.6832233343095225e-06,
"loss": 0.3418,
"step": 479
},
{
"epoch": 2.065997130559541,
"grad_norm": 0.3335508406162262,
"learning_rate": 2.6610162343510183e-06,
"loss": 0.3768,
"step": 480
},
{
"epoch": 2.0703012912482066,
"grad_norm": 0.38742804527282715,
"learning_rate": 2.6388680428125657e-06,
"loss": 0.3966,
"step": 481
},
{
"epoch": 2.0746054519368724,
"grad_norm": 0.3358624577522278,
"learning_rate": 2.616779317506921e-06,
"loss": 0.3589,
"step": 482
},
{
"epoch": 2.078909612625538,
"grad_norm": 0.32186657190322876,
"learning_rate": 2.594750614749148e-06,
"loss": 0.3813,
"step": 483
},
{
"epoch": 2.0832137733142035,
"grad_norm": 0.3297998607158661,
"learning_rate": 2.572782489342617e-06,
"loss": 0.3774,
"step": 484
},
{
"epoch": 2.0875179340028693,
"grad_norm": 0.33904680609703064,
"learning_rate": 2.5508754945650305e-06,
"loss": 0.3792,
"step": 485
},
{
"epoch": 2.091822094691535,
"grad_norm": 0.34405916929244995,
"learning_rate": 2.5290301821544826e-06,
"loss": 0.3801,
"step": 486
},
{
"epoch": 2.096126255380201,
"grad_norm": 0.30022576451301575,
"learning_rate": 2.5072471022955703e-06,
"loss": 0.3291,
"step": 487
},
{
"epoch": 2.1004304160688667,
"grad_norm": 0.33434441685676575,
"learning_rate": 2.4855268036055346e-06,
"loss": 0.3989,
"step": 488
},
{
"epoch": 2.104734576757532,
"grad_norm": 0.31754809617996216,
"learning_rate": 2.4638698331204404e-06,
"loss": 0.3622,
"step": 489
},
{
"epoch": 2.109038737446198,
"grad_norm": 0.343357115983963,
"learning_rate": 2.4422767362814045e-06,
"loss": 0.4263,
"step": 490
},
{
"epoch": 2.1133428981348636,
"grad_norm": 0.30014854669570923,
"learning_rate": 2.420748056920856e-06,
"loss": 0.326,
"step": 491
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.33290165662765503,
"learning_rate": 2.3992843372488357e-06,
"loss": 0.3947,
"step": 492
},
{
"epoch": 2.1219512195121952,
"grad_norm": 0.33419376611709595,
"learning_rate": 2.3778861178393453e-06,
"loss": 0.3595,
"step": 493
},
{
"epoch": 2.126255380200861,
"grad_norm": 0.34245628118515015,
"learning_rate": 2.3565539376167295e-06,
"loss": 0.4118,
"step": 494
},
{
"epoch": 2.1305595408895264,
"grad_norm": 0.3388068377971649,
"learning_rate": 2.3352883338421085e-06,
"loss": 0.3537,
"step": 495
},
{
"epoch": 2.134863701578192,
"grad_norm": 0.336579829454422,
"learning_rate": 2.3140898420998425e-06,
"loss": 0.3787,
"step": 496
},
{
"epoch": 2.139167862266858,
"grad_norm": 0.3296073377132416,
"learning_rate": 2.2929589962840375e-06,
"loss": 0.3837,
"step": 497
},
{
"epoch": 2.1434720229555237,
"grad_norm": 0.3224649131298065,
"learning_rate": 2.271896328585114e-06,
"loss": 0.3662,
"step": 498
},
{
"epoch": 2.1477761836441895,
"grad_norm": 0.34240302443504333,
"learning_rate": 2.2509023694763844e-06,
"loss": 0.3793,
"step": 499
},
{
"epoch": 2.152080344332855,
"grad_norm": 0.3357837200164795,
"learning_rate": 2.2299776477007073e-06,
"loss": 0.4106,
"step": 500
},
{
"epoch": 2.1563845050215207,
"grad_norm": 0.33356544375419617,
"learning_rate": 2.2091226902571673e-06,
"loss": 0.3712,
"step": 501
},
{
"epoch": 2.1606886657101865,
"grad_norm": 0.3245741128921509,
"learning_rate": 2.1883380223878004e-06,
"loss": 0.3951,
"step": 502
},
{
"epoch": 2.1649928263988523,
"grad_norm": 0.3323533236980438,
"learning_rate": 2.1676241675643627e-06,
"loss": 0.3972,
"step": 503
},
{
"epoch": 2.169296987087518,
"grad_norm": 0.30774545669555664,
"learning_rate": 2.1469816474751566e-06,
"loss": 0.3501,
"step": 504
},
{
"epoch": 2.173601147776184,
"grad_norm": 0.343365877866745,
"learning_rate": 2.1264109820118783e-06,
"loss": 0.3766,
"step": 505
},
{
"epoch": 2.177905308464849,
"grad_norm": 0.3731037378311157,
"learning_rate": 2.105912689256533e-06,
"loss": 0.3999,
"step": 506
},
{
"epoch": 2.182209469153515,
"grad_norm": 0.34139010310173035,
"learning_rate": 2.0854872854683877e-06,
"loss": 0.3815,
"step": 507
},
{
"epoch": 2.186513629842181,
"grad_norm": 0.3012133240699768,
"learning_rate": 2.0651352850709656e-06,
"loss": 0.3492,
"step": 508
},
{
"epoch": 2.1908177905308466,
"grad_norm": 0.2982615530490875,
"learning_rate": 2.0448572006390875e-06,
"loss": 0.3643,
"step": 509
},
{
"epoch": 2.1951219512195124,
"grad_norm": 0.29477861523628235,
"learning_rate": 2.0246535428859652e-06,
"loss": 0.3599,
"step": 510
},
{
"epoch": 2.1994261119081777,
"grad_norm": 0.3607404828071594,
"learning_rate": 2.0045248206503454e-06,
"loss": 0.3724,
"step": 511
},
{
"epoch": 2.2037302725968435,
"grad_norm": 0.34111490845680237,
"learning_rate": 1.984471540883679e-06,
"loss": 0.3729,
"step": 512
},
{
"epoch": 2.2080344332855093,
"grad_norm": 0.3487270772457123,
"learning_rate": 1.964494208637369e-06,
"loss": 0.3818,
"step": 513
},
{
"epoch": 2.212338593974175,
"grad_norm": 0.32086241245269775,
"learning_rate": 1.9445933270500444e-06,
"loss": 0.3413,
"step": 514
},
{
"epoch": 2.216642754662841,
"grad_norm": 0.3221076428890228,
"learning_rate": 1.9247693973348834e-06,
"loss": 0.4038,
"step": 515
},
{
"epoch": 2.2209469153515062,
"grad_norm": 0.3006744086742401,
"learning_rate": 1.905022918766995e-06,
"loss": 0.3628,
"step": 516
},
{
"epoch": 2.225251076040172,
"grad_norm": 0.3235909044742584,
"learning_rate": 1.8853543886708498e-06,
"loss": 0.3845,
"step": 517
},
{
"epoch": 2.229555236728838,
"grad_norm": 0.324224591255188,
"learning_rate": 1.8657643024077431e-06,
"loss": 0.3699,
"step": 518
},
{
"epoch": 2.2338593974175036,
"grad_norm": 0.3157562017440796,
"learning_rate": 1.8462531533633238e-06,
"loss": 0.3697,
"step": 519
},
{
"epoch": 2.2381635581061694,
"grad_norm": 0.28793036937713623,
"learning_rate": 1.8268214329351797e-06,
"loss": 0.3522,
"step": 520
},
{
"epoch": 2.242467718794835,
"grad_norm": 0.33378034830093384,
"learning_rate": 1.8074696305204397e-06,
"loss": 0.4332,
"step": 521
},
{
"epoch": 2.2467718794835005,
"grad_norm": 0.3230198323726654,
"learning_rate": 1.7881982335034625e-06,
"loss": 0.3762,
"step": 522
},
{
"epoch": 2.2510760401721663,
"grad_norm": 0.3061073422431946,
"learning_rate": 1.7690077272435636e-06,
"loss": 0.3588,
"step": 523
},
{
"epoch": 2.255380200860832,
"grad_norm": 0.34299224615097046,
"learning_rate": 1.7498985950627794e-06,
"loss": 0.4348,
"step": 524
},
{
"epoch": 2.259684361549498,
"grad_norm": 0.3042338192462921,
"learning_rate": 1.7308713182337044e-06,
"loss": 0.3497,
"step": 525
},
{
"epoch": 2.2639885222381637,
"grad_norm": 0.2962125837802887,
"learning_rate": 1.7119263759673677e-06,
"loss": 0.3568,
"step": 526
},
{
"epoch": 2.2682926829268295,
"grad_norm": 0.3360259532928467,
"learning_rate": 1.6930642454011647e-06,
"loss": 0.3752,
"step": 527
},
{
"epoch": 2.272596843615495,
"grad_norm": 0.31935688853263855,
"learning_rate": 1.6742854015868349e-06,
"loss": 0.4088,
"step": 528
},
{
"epoch": 2.2769010043041606,
"grad_norm": 0.3136122226715088,
"learning_rate": 1.655590317478501e-06,
"loss": 0.3581,
"step": 529
},
{
"epoch": 2.2812051649928264,
"grad_norm": 0.3090389370918274,
"learning_rate": 1.6369794639207626e-06,
"loss": 0.3688,
"step": 530
},
{
"epoch": 2.2855093256814922,
"grad_norm": 0.3279600441455841,
"learning_rate": 1.6184533096368277e-06,
"loss": 0.3735,
"step": 531
},
{
"epoch": 2.289813486370158,
"grad_norm": 0.30414533615112305,
"learning_rate": 1.6000123212167158e-06,
"loss": 0.3703,
"step": 532
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.32438209652900696,
"learning_rate": 1.581656963105504e-06,
"loss": 0.3871,
"step": 533
},
{
"epoch": 2.298421807747489,
"grad_norm": 0.3222774863243103,
"learning_rate": 1.5633876975916261e-06,
"loss": 0.3851,
"step": 534
},
{
"epoch": 2.302725968436155,
"grad_norm": 0.33032676577568054,
"learning_rate": 1.5452049847952338e-06,
"loss": 0.3581,
"step": 535
},
{
"epoch": 2.3070301291248207,
"grad_norm": 0.3343183994293213,
"learning_rate": 1.5271092826566108e-06,
"loss": 0.3837,
"step": 536
},
{
"epoch": 2.3113342898134865,
"grad_norm": 0.30717727541923523,
"learning_rate": 1.5091010469246303e-06,
"loss": 0.3783,
"step": 537
},
{
"epoch": 2.315638450502152,
"grad_norm": 0.3004843294620514,
"learning_rate": 1.4911807311452874e-06,
"loss": 0.3855,
"step": 538
},
{
"epoch": 2.3199426111908177,
"grad_norm": 0.3067179322242737,
"learning_rate": 1.4733487866502698e-06,
"loss": 0.3829,
"step": 539
},
{
"epoch": 2.3242467718794835,
"grad_norm": 0.29565826058387756,
"learning_rate": 1.4556056625455922e-06,
"loss": 0.3657,
"step": 540
},
{
"epoch": 2.3285509325681493,
"grad_norm": 0.303459495306015,
"learning_rate": 1.4379518057002834e-06,
"loss": 0.37,
"step": 541
},
{
"epoch": 2.332855093256815,
"grad_norm": 0.3352012038230896,
"learning_rate": 1.4203876607351347e-06,
"loss": 0.3895,
"step": 542
},
{
"epoch": 2.3371592539454804,
"grad_norm": 0.31783953309059143,
"learning_rate": 1.4029136700115031e-06,
"loss": 0.3876,
"step": 543
},
{
"epoch": 2.341463414634146,
"grad_norm": 0.2909916341304779,
"learning_rate": 1.3855302736201686e-06,
"loss": 0.3556,
"step": 544
},
{
"epoch": 2.345767575322812,
"grad_norm": 0.30111250281333923,
"learning_rate": 1.3682379093702447e-06,
"loss": 0.3431,
"step": 545
},
{
"epoch": 2.350071736011478,
"grad_norm": 0.30051347613334656,
"learning_rate": 1.3510370127781635e-06,
"loss": 0.3874,
"step": 546
},
{
"epoch": 2.3543758967001436,
"grad_norm": 0.31402793526649475,
"learning_rate": 1.3339280170566959e-06,
"loss": 0.3869,
"step": 547
},
{
"epoch": 2.3586800573888094,
"grad_norm": 0.2965995967388153,
"learning_rate": 1.3169113531040462e-06,
"loss": 0.3299,
"step": 548
},
{
"epoch": 2.3629842180774747,
"grad_norm": 0.36196231842041016,
"learning_rate": 1.2999874494930004e-06,
"loss": 0.445,
"step": 549
},
{
"epoch": 2.3672883787661405,
"grad_norm": 0.3123576045036316,
"learning_rate": 1.2831567324601325e-06,
"loss": 0.3626,
"step": 550
},
{
"epoch": 2.3715925394548063,
"grad_norm": 0.28615859150886536,
"learning_rate": 1.266419625895064e-06,
"loss": 0.3355,
"step": 551
},
{
"epoch": 2.375896700143472,
"grad_norm": 0.30293789505958557,
"learning_rate": 1.2497765513297976e-06,
"loss": 0.3665,
"step": 552
},
{
"epoch": 2.380200860832138,
"grad_norm": 0.29470351338386536,
"learning_rate": 1.2332279279280907e-06,
"loss": 0.3904,
"step": 553
},
{
"epoch": 2.3845050215208037,
"grad_norm": 0.31690046191215515,
"learning_rate": 1.2167741724749026e-06,
"loss": 0.3656,
"step": 554
},
{
"epoch": 2.388809182209469,
"grad_norm": 0.30208268761634827,
"learning_rate": 1.2004156993659028e-06,
"loss": 0.3829,
"step": 555
},
{
"epoch": 2.393113342898135,
"grad_norm": 0.32322484254837036,
"learning_rate": 1.1841529205970281e-06,
"loss": 0.4277,
"step": 556
},
{
"epoch": 2.3974175035868006,
"grad_norm": 0.29321643710136414,
"learning_rate": 1.1679862457541052e-06,
"loss": 0.3543,
"step": 557
},
{
"epoch": 2.4017216642754664,
"grad_norm": 0.329012006521225,
"learning_rate": 1.1519160820025382e-06,
"loss": 0.384,
"step": 558
},
{
"epoch": 2.406025824964132,
"grad_norm": 0.31029197573661804,
"learning_rate": 1.1359428340770567e-06,
"loss": 0.3625,
"step": 559
},
{
"epoch": 2.4103299856527975,
"grad_norm": 0.29086750745773315,
"learning_rate": 1.1200669042715163e-06,
"loss": 0.3786,
"step": 560
},
{
"epoch": 2.4146341463414633,
"grad_norm": 0.3073953092098236,
"learning_rate": 1.104288692428766e-06,
"loss": 0.3716,
"step": 561
},
{
"epoch": 2.418938307030129,
"grad_norm": 0.29252809286117554,
"learning_rate": 1.0886085959305915e-06,
"loss": 0.3679,
"step": 562
},
{
"epoch": 2.423242467718795,
"grad_norm": 0.32353729009628296,
"learning_rate": 1.0730270096876876e-06,
"loss": 0.3757,
"step": 563
},
{
"epoch": 2.4275466284074607,
"grad_norm": 0.3193880617618561,
"learning_rate": 1.057544326129723e-06,
"loss": 0.3637,
"step": 564
},
{
"epoch": 2.431850789096126,
"grad_norm": 0.3040056824684143,
"learning_rate": 1.0421609351954599e-06,
"loss": 0.3614,
"step": 565
},
{
"epoch": 2.436154949784792,
"grad_norm": 0.30761218070983887,
"learning_rate": 1.026877224322923e-06,
"loss": 0.3713,
"step": 566
},
{
"epoch": 2.4404591104734576,
"grad_norm": 0.3160081207752228,
"learning_rate": 1.0116935784396482e-06,
"loss": 0.3988,
"step": 567
},
{
"epoch": 2.4447632711621234,
"grad_norm": 0.3210393488407135,
"learning_rate": 9.966103799529891e-07,
"loss": 0.3731,
"step": 568
},
{
"epoch": 2.4490674318507892,
"grad_norm": 0.30738240480422974,
"learning_rate": 9.816280087404851e-07,
"loss": 0.3778,
"step": 569
},
{
"epoch": 2.4533715925394546,
"grad_norm": 0.2967623770236969,
"learning_rate": 9.66746842140287e-07,
"loss": 0.3615,
"step": 570
},
{
"epoch": 2.4576757532281204,
"grad_norm": 0.31555816531181335,
"learning_rate": 9.519672549416659e-07,
"loss": 0.3846,
"step": 571
},
{
"epoch": 2.461979913916786,
"grad_norm": 0.3220941722393036,
"learning_rate": 9.372896193755621e-07,
"loss": 0.3698,
"step": 572
},
{
"epoch": 2.466284074605452,
"grad_norm": 0.31963497400283813,
"learning_rate": 9.227143051052162e-07,
"loss": 0.3646,
"step": 573
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.3254527449607849,
"learning_rate": 9.082416792168608e-07,
"loss": 0.4002,
"step": 574
},
{
"epoch": 2.4748923959827835,
"grad_norm": 0.301438570022583,
"learning_rate": 8.938721062104727e-07,
"loss": 0.3935,
"step": 575
},
{
"epoch": 2.479196556671449,
"grad_norm": 0.27928128838539124,
"learning_rate": 8.7960594799059e-07,
"loss": 0.3782,
"step": 576
},
{
"epoch": 2.4835007173601147,
"grad_norm": 0.3043065667152405,
"learning_rate": 8.654435638572e-07,
"loss": 0.392,
"step": 577
},
{
"epoch": 2.4878048780487805,
"grad_norm": 0.2972443699836731,
"learning_rate": 8.513853104966951e-07,
"loss": 0.3619,
"step": 578
},
{
"epoch": 2.4921090387374463,
"grad_norm": 0.3183661997318268,
"learning_rate": 8.374315419728784e-07,
"loss": 0.4039,
"step": 579
},
{
"epoch": 2.496413199426112,
"grad_norm": 0.28181061148643494,
"learning_rate": 8.235826097180566e-07,
"loss": 0.3168,
"step": 580
},
{
"epoch": 2.500717360114778,
"grad_norm": 0.3265518546104431,
"learning_rate": 8.098388625241854e-07,
"loss": 0.3845,
"step": 581
},
{
"epoch": 2.505021520803443,
"grad_norm": 0.31692567467689514,
"learning_rate": 7.962006465340821e-07,
"loss": 0.3945,
"step": 582
},
{
"epoch": 2.509325681492109,
"grad_norm": 0.2771775722503662,
"learning_rate": 7.8266830523271e-07,
"loss": 0.3473,
"step": 583
},
{
"epoch": 2.513629842180775,
"grad_norm": 0.32277750968933105,
"learning_rate": 7.692421794385313e-07,
"loss": 0.4079,
"step": 584
},
{
"epoch": 2.5179340028694406,
"grad_norm": 0.31330087780952454,
"learning_rate": 7.559226072949166e-07,
"loss": 0.393,
"step": 585
},
{
"epoch": 2.5222381635581064,
"grad_norm": 0.41533592343330383,
"learning_rate": 7.427099242616348e-07,
"loss": 0.3804,
"step": 586
},
{
"epoch": 2.5265423242467717,
"grad_norm": 0.29984721541404724,
"learning_rate": 7.296044631064014e-07,
"loss": 0.3765,
"step": 587
},
{
"epoch": 2.5308464849354375,
"grad_norm": 0.30083853006362915,
"learning_rate": 7.166065538964955e-07,
"loss": 0.3867,
"step": 588
},
{
"epoch": 2.5351506456241033,
"grad_norm": 0.30099427700042725,
"learning_rate": 7.037165239904514e-07,
"loss": 0.3831,
"step": 589
},
{
"epoch": 2.539454806312769,
"grad_norm": 0.3282623589038849,
"learning_rate": 6.909346980298093e-07,
"loss": 0.3948,
"step": 590
},
{
"epoch": 2.543758967001435,
"grad_norm": 0.27795639634132385,
"learning_rate": 6.782613979309443e-07,
"loss": 0.3888,
"step": 591
},
{
"epoch": 2.5480631276901002,
"grad_norm": 0.2930023968219757,
"learning_rate": 6.656969428769567e-07,
"loss": 0.3468,
"step": 592
},
{
"epoch": 2.552367288378766,
"grad_norm": 0.3128500282764435,
"learning_rate": 6.532416493096272e-07,
"loss": 0.3889,
"step": 593
},
{
"epoch": 2.556671449067432,
"grad_norm": 0.3318694829940796,
"learning_rate": 6.408958309214597e-07,
"loss": 0.3928,
"step": 594
},
{
"epoch": 2.5609756097560976,
"grad_norm": 0.335523784160614,
"learning_rate": 6.286597986477683e-07,
"loss": 0.3666,
"step": 595
},
{
"epoch": 2.5652797704447634,
"grad_norm": 0.29514452815055847,
"learning_rate": 6.165338606588517e-07,
"loss": 0.3416,
"step": 596
},
{
"epoch": 2.5695839311334288,
"grad_norm": 0.35578909516334534,
"learning_rate": 6.045183223522339e-07,
"loss": 0.4163,
"step": 597
},
{
"epoch": 2.5738880918220945,
"grad_norm": 0.3049773871898651,
"learning_rate": 5.926134863449712e-07,
"loss": 0.3584,
"step": 598
},
{
"epoch": 2.5781922525107603,
"grad_norm": 0.2862030267715454,
"learning_rate": 5.808196524660253e-07,
"loss": 0.3552,
"step": 599
},
{
"epoch": 2.582496413199426,
"grad_norm": 0.32195690274238586,
"learning_rate": 5.691371177487215e-07,
"loss": 0.4089,
"step": 600
},
{
"epoch": 2.586800573888092,
"grad_norm": 0.3421590030193329,
"learning_rate": 5.575661764232593e-07,
"loss": 0.3779,
"step": 601
},
{
"epoch": 2.5911047345767573,
"grad_norm": 0.30208882689476013,
"learning_rate": 5.461071199093048e-07,
"loss": 0.4002,
"step": 602
},
{
"epoch": 2.5954088952654235,
"grad_norm": 0.2821211516857147,
"learning_rate": 5.347602368086563e-07,
"loss": 0.3484,
"step": 603
},
{
"epoch": 2.599713055954089,
"grad_norm": 0.2918562889099121,
"learning_rate": 5.235258128979676e-07,
"loss": 0.3596,
"step": 604
},
{
"epoch": 2.6040172166427547,
"grad_norm": 0.2854442596435547,
"learning_rate": 5.124041311215544e-07,
"loss": 0.3605,
"step": 605
},
{
"epoch": 2.6083213773314204,
"grad_norm": 0.2967503070831299,
"learning_rate": 5.0139547158427e-07,
"loss": 0.4133,
"step": 606
},
{
"epoch": 2.6126255380200862,
"grad_norm": 0.29674726724624634,
"learning_rate": 4.905001115444475e-07,
"loss": 0.3752,
"step": 607
},
{
"epoch": 2.616929698708752,
"grad_norm": 0.30667829513549805,
"learning_rate": 4.797183254069176e-07,
"loss": 0.3829,
"step": 608
},
{
"epoch": 2.6212338593974174,
"grad_norm": 0.3153044879436493,
"learning_rate": 4.690503847160982e-07,
"loss": 0.4159,
"step": 609
},
{
"epoch": 2.625538020086083,
"grad_norm": 0.3004702925682068,
"learning_rate": 4.5849655814915683e-07,
"loss": 0.3548,
"step": 610
},
{
"epoch": 2.629842180774749,
"grad_norm": 0.2953874468803406,
"learning_rate": 4.4805711150924304e-07,
"loss": 0.3677,
"step": 611
},
{
"epoch": 2.6341463414634148,
"grad_norm": 0.30996379256248474,
"learning_rate": 4.3773230771879004e-07,
"loss": 0.4144,
"step": 612
},
{
"epoch": 2.6384505021520805,
"grad_norm": 0.2848236560821533,
"learning_rate": 4.2752240681290027e-07,
"loss": 0.3533,
"step": 613
},
{
"epoch": 2.642754662840746,
"grad_norm": 0.31599825620651245,
"learning_rate": 4.1742766593278974e-07,
"loss": 0.3949,
"step": 614
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.2851794362068176,
"learning_rate": 4.074483393193135e-07,
"loss": 0.3824,
"step": 615
},
{
"epoch": 2.6513629842180775,
"grad_norm": 0.2868153154850006,
"learning_rate": 3.9758467830656623e-07,
"loss": 0.3311,
"step": 616
},
{
"epoch": 2.6556671449067433,
"grad_norm": 0.312959223985672,
"learning_rate": 3.8783693131554836e-07,
"loss": 0.3832,
"step": 617
},
{
"epoch": 2.659971305595409,
"grad_norm": 0.320313423871994,
"learning_rate": 3.782053438479094e-07,
"loss": 0.3637,
"step": 618
},
{
"epoch": 2.6642754662840744,
"grad_norm": 0.2905493974685669,
"learning_rate": 3.686901584797675e-07,
"loss": 0.3638,
"step": 619
},
{
"epoch": 2.66857962697274,
"grad_norm": 0.2887527644634247,
"learning_rate": 3.5929161485559694e-07,
"loss": 0.376,
"step": 620
},
{
"epoch": 2.672883787661406,
"grad_norm": 0.2931526303291321,
"learning_rate": 3.5000994968219406e-07,
"loss": 0.372,
"step": 621
},
{
"epoch": 2.677187948350072,
"grad_norm": 0.2868293821811676,
"learning_rate": 3.4084539672271764e-07,
"loss": 0.361,
"step": 622
},
{
"epoch": 2.6814921090387376,
"grad_norm": 0.3189477026462555,
"learning_rate": 3.3179818679079936e-07,
"loss": 0.3799,
"step": 623
},
{
"epoch": 2.685796269727403,
"grad_norm": 0.30111587047576904,
"learning_rate": 3.228685477447291e-07,
"loss": 0.3515,
"step": 624
},
{
"epoch": 2.6901004304160687,
"grad_norm": 0.31718146800994873,
"learning_rate": 3.140567044817172e-07,
"loss": 0.383,
"step": 625
},
{
"epoch": 2.6944045911047345,
"grad_norm": 0.3003697991371155,
"learning_rate": 3.0536287893223603e-07,
"loss": 0.3733,
"step": 626
},
{
"epoch": 2.6987087517934003,
"grad_norm": 0.2880096435546875,
"learning_rate": 2.967872900544194e-07,
"loss": 0.3751,
"step": 627
},
{
"epoch": 2.703012912482066,
"grad_norm": 0.27889522910118103,
"learning_rate": 2.883301538285582e-07,
"loss": 0.3578,
"step": 628
},
{
"epoch": 2.7073170731707314,
"grad_norm": 0.29290032386779785,
"learning_rate": 2.799916832516575e-07,
"loss": 0.4041,
"step": 629
},
{
"epoch": 2.7116212338593977,
"grad_norm": 0.2837107181549072,
"learning_rate": 2.717720883320685e-07,
"loss": 0.3779,
"step": 630
},
{
"epoch": 2.715925394548063,
"grad_norm": 0.2823951840400696,
"learning_rate": 2.6367157608420347e-07,
"loss": 0.3444,
"step": 631
},
{
"epoch": 2.720229555236729,
"grad_norm": 0.32553017139434814,
"learning_rate": 2.556903505233216e-07,
"loss": 0.3881,
"step": 632
},
{
"epoch": 2.7245337159253946,
"grad_norm": 0.28817129135131836,
"learning_rate": 2.4782861266038904e-07,
"loss": 0.3657,
"step": 633
},
{
"epoch": 2.7288378766140604,
"grad_norm": 0.29924365878105164,
"learning_rate": 2.4008656049701875e-07,
"loss": 0.3845,
"step": 634
},
{
"epoch": 2.733142037302726,
"grad_norm": 0.2882951498031616,
"learning_rate": 2.3246438902048196e-07,
"loss": 0.3815,
"step": 635
},
{
"epoch": 2.7374461979913915,
"grad_norm": 0.2708081305027008,
"learning_rate": 2.2496229019879635e-07,
"loss": 0.3297,
"step": 636
},
{
"epoch": 2.7417503586800573,
"grad_norm": 0.2893579602241516,
"learning_rate": 2.175804529758929e-07,
"loss": 0.3956,
"step": 637
},
{
"epoch": 2.746054519368723,
"grad_norm": 0.30808335542678833,
"learning_rate": 2.1031906326685946e-07,
"loss": 0.3644,
"step": 638
},
{
"epoch": 2.750358680057389,
"grad_norm": 0.3239193856716156,
"learning_rate": 2.0317830395325255e-07,
"loss": 0.4271,
"step": 639
},
{
"epoch": 2.7546628407460547,
"grad_norm": 0.29798826575279236,
"learning_rate": 1.9615835487849677e-07,
"loss": 0.3978,
"step": 640
},
{
"epoch": 2.75896700143472,
"grad_norm": 0.31455305218696594,
"learning_rate": 1.8925939284335225e-07,
"loss": 0.4061,
"step": 641
},
{
"epoch": 2.763271162123386,
"grad_norm": 0.283496618270874,
"learning_rate": 1.824815916014644e-07,
"loss": 0.3499,
"step": 642
},
{
"epoch": 2.7675753228120517,
"grad_norm": 0.2762824296951294,
"learning_rate": 1.7582512185498446e-07,
"loss": 0.3588,
"step": 643
},
{
"epoch": 2.7718794835007174,
"grad_norm": 0.30749961733818054,
"learning_rate": 1.6929015125027314e-07,
"loss": 0.3768,
"step": 644
},
{
"epoch": 2.7761836441893832,
"grad_norm": 0.2810935974121094,
"learning_rate": 1.6287684437367724e-07,
"loss": 0.3517,
"step": 645
},
{
"epoch": 2.7804878048780486,
"grad_norm": 0.2916150689125061,
"learning_rate": 1.5658536274738623e-07,
"loss": 0.3764,
"step": 646
},
{
"epoch": 2.7847919655667144,
"grad_norm": 0.2854636013507843,
"learning_rate": 1.504158648253584e-07,
"loss": 0.3823,
"step": 647
},
{
"epoch": 2.78909612625538,
"grad_norm": 0.2888132929801941,
"learning_rate": 1.443685059893396e-07,
"loss": 0.3456,
"step": 648
},
{
"epoch": 2.793400286944046,
"grad_norm": 0.2825074791908264,
"learning_rate": 1.3844343854494123e-07,
"loss": 0.3695,
"step": 649
},
{
"epoch": 2.7977044476327118,
"grad_norm": 0.2972644865512848,
"learning_rate": 1.3264081171780797e-07,
"loss": 0.3661,
"step": 650
},
{
"epoch": 2.802008608321377,
"grad_norm": 0.2751018702983856,
"learning_rate": 1.2696077164986e-07,
"loss": 0.3741,
"step": 651
},
{
"epoch": 2.806312769010043,
"grad_norm": 0.2982390224933624,
"learning_rate": 1.2140346139561277e-07,
"loss": 0.3896,
"step": 652
},
{
"epoch": 2.8106169296987087,
"grad_norm": 0.29119473695755005,
"learning_rate": 1.1596902091857043e-07,
"loss": 0.3667,
"step": 653
},
{
"epoch": 2.8149210903873745,
"grad_norm": 0.30065134167671204,
"learning_rate": 1.1065758708770468e-07,
"loss": 0.3591,
"step": 654
},
{
"epoch": 2.8192252510760403,
"grad_norm": 0.28981831669807434,
"learning_rate": 1.0546929367400705e-07,
"loss": 0.3288,
"step": 655
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.3213971257209778,
"learning_rate": 1.004042713471165e-07,
"loss": 0.393,
"step": 656
},
{
"epoch": 2.827833572453372,
"grad_norm": 0.27952906489372253,
"learning_rate": 9.546264767203328e-08,
"loss": 0.3538,
"step": 657
},
{
"epoch": 2.832137733142037,
"grad_norm": 0.32866302132606506,
"learning_rate": 9.064454710590253e-08,
"loss": 0.3927,
"step": 658
},
{
"epoch": 2.836441893830703,
"grad_norm": 0.3011215329170227,
"learning_rate": 8.595009099488238e-08,
"loss": 0.377,
"step": 659
},
{
"epoch": 2.840746054519369,
"grad_norm": 0.2855226397514343,
"learning_rate": 8.137939757108526e-08,
"loss": 0.3939,
"step": 660
},
{
"epoch": 2.8450502152080346,
"grad_norm": 0.2855052649974823,
"learning_rate": 7.693258194960252e-08,
"loss": 0.3562,
"step": 661
},
{
"epoch": 2.8493543758967004,
"grad_norm": 0.30429205298423767,
"learning_rate": 7.260975612560173e-08,
"loss": 0.3841,
"step": 662
},
{
"epoch": 2.8536585365853657,
"grad_norm": 0.28108206391334534,
"learning_rate": 6.84110289715112e-08,
"loss": 0.3377,
"step": 663
},
{
"epoch": 2.8579626972740315,
"grad_norm": 0.2989685833454132,
"learning_rate": 6.433650623427379e-08,
"loss": 0.4037,
"step": 664
},
{
"epoch": 2.8622668579626973,
"grad_norm": 0.2865199148654938,
"learning_rate": 6.038629053268464e-08,
"loss": 0.3728,
"step": 665
},
{
"epoch": 2.866571018651363,
"grad_norm": 0.2750672698020935,
"learning_rate": 5.6560481354807625e-08,
"loss": 0.356,
"step": 666
},
{
"epoch": 2.870875179340029,
"grad_norm": 0.2924894094467163,
"learning_rate": 5.285917505546967e-08,
"loss": 0.3998,
"step": 667
},
{
"epoch": 2.8751793400286942,
"grad_norm": 0.2777416706085205,
"learning_rate": 4.928246485383148e-08,
"loss": 0.3753,
"step": 668
},
{
"epoch": 2.87948350071736,
"grad_norm": 0.3033868968486786,
"learning_rate": 4.583044083104282e-08,
"loss": 0.393,
"step": 669
},
{
"epoch": 2.883787661406026,
"grad_norm": 0.32299816608428955,
"learning_rate": 4.250318992797375e-08,
"loss": 0.3897,
"step": 670
},
{
"epoch": 2.8880918220946916,
"grad_norm": 0.2930081784725189,
"learning_rate": 3.9300795943021943e-08,
"loss": 0.3355,
"step": 671
},
{
"epoch": 2.8923959827833574,
"grad_norm": 0.30189839005470276,
"learning_rate": 3.622333953000601e-08,
"loss": 0.379,
"step": 672
},
{
"epoch": 2.8967001434720228,
"grad_norm": 0.29947370290756226,
"learning_rate": 3.3270898196129944e-08,
"loss": 0.3706,
"step": 673
},
{
"epoch": 2.9010043041606886,
"grad_norm": 0.29969486594200134,
"learning_rate": 3.0443546300035764e-08,
"loss": 0.3586,
"step": 674
},
{
"epoch": 2.9053084648493543,
"grad_norm": 0.3195127546787262,
"learning_rate": 2.77413550499267e-08,
"loss": 0.4084,
"step": 675
},
{
"epoch": 2.90961262553802,
"grad_norm": 0.27233725786209106,
"learning_rate": 2.516439250177749e-08,
"loss": 0.3481,
"step": 676
},
{
"epoch": 2.913916786226686,
"grad_norm": 0.2925049960613251,
"learning_rate": 2.2712723557616335e-08,
"loss": 0.378,
"step": 677
},
{
"epoch": 2.9182209469153513,
"grad_norm": 0.2842983603477478,
"learning_rate": 2.038640996389285e-08,
"loss": 0.3678,
"step": 678
},
{
"epoch": 2.922525107604017,
"grad_norm": 0.2895757853984833,
"learning_rate": 1.818551030992377e-08,
"loss": 0.3775,
"step": 679
},
{
"epoch": 2.926829268292683,
"grad_norm": 0.2898154854774475,
"learning_rate": 1.6110080026414123e-08,
"loss": 0.3711,
"step": 680
},
{
"epoch": 2.9311334289813487,
"grad_norm": 0.27027469873428345,
"learning_rate": 1.4160171384064447e-08,
"loss": 0.3636,
"step": 681
},
{
"epoch": 2.9354375896700144,
"grad_norm": 0.29894378781318665,
"learning_rate": 1.2335833492252425e-08,
"loss": 0.3957,
"step": 682
},
{
"epoch": 2.93974175035868,
"grad_norm": 0.29910367727279663,
"learning_rate": 1.063711229779718e-08,
"loss": 0.382,
"step": 683
},
{
"epoch": 2.944045911047346,
"grad_norm": 0.28962647914886475,
"learning_rate": 9.06405058380022e-09,
"loss": 0.3698,
"step": 684
},
{
"epoch": 2.9483500717360114,
"grad_norm": 0.28313586115837097,
"learning_rate": 7.61668796857018e-09,
"loss": 0.3711,
"step": 685
},
{
"epoch": 2.952654232424677,
"grad_norm": 0.29572346806526184,
"learning_rate": 6.295060904623618e-09,
"loss": 0.3881,
"step": 686
},
{
"epoch": 2.956958393113343,
"grad_norm": 0.29672494530677795,
"learning_rate": 5.099202677767978e-09,
"loss": 0.4193,
"step": 687
},
{
"epoch": 2.9612625538020088,
"grad_norm": 0.27398431301116943,
"learning_rate": 4.02914340626226e-09,
"loss": 0.3409,
"step": 688
},
{
"epoch": 2.9655667144906745,
"grad_norm": 0.2979412376880646,
"learning_rate": 3.0849100400587307e-09,
"loss": 0.336,
"step": 689
},
{
"epoch": 2.96987087517934,
"grad_norm": 0.311142235994339,
"learning_rate": 2.2665263601240328e-09,
"loss": 0.3638,
"step": 690
},
{
"epoch": 2.9741750358680057,
"grad_norm": 0.27928242087364197,
"learning_rate": 1.5740129778413215e-09,
"loss": 0.3601,
"step": 691
},
{
"epoch": 2.9784791965566715,
"grad_norm": 0.30028945207595825,
"learning_rate": 1.0073873344895735e-09,
"loss": 0.4064,
"step": 692
},
{
"epoch": 2.9827833572453373,
"grad_norm": 0.30556008219718933,
"learning_rate": 5.666637008061582e-10,
"loss": 0.3881,
"step": 693
},
{
"epoch": 2.987087517934003,
"grad_norm": 0.29040220379829407,
"learning_rate": 2.5185317662490547e-10,
"loss": 0.358,
"step": 694
},
{
"epoch": 2.9913916786226684,
"grad_norm": 0.2974533140659332,
"learning_rate": 6.296369059854978e-11,
"loss": 0.3728,
"step": 695
},
{
"epoch": 2.995695839311334,
"grad_norm": 0.28847527503967285,
"learning_rate": 0.0,
"loss": 0.3634,
"step": 696
},
{
"epoch": 2.995695839311334,
"step": 696,
"total_flos": 890435552804864.0,
"train_loss": 0.4294309826760456,
"train_runtime": 14084.5907,
"train_samples_per_second": 4.746,
"train_steps_per_second": 0.049
}
],
"logging_steps": 1.0,
"max_steps": 696,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 890435552804864.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}