{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 9480,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010548523206751054,
"grad_norm": 1.224403738975525,
"learning_rate": 0.00015789473684210527,
"loss": 7.6018,
"step": 10
},
{
"epoch": 0.002109704641350211,
"grad_norm": 1.0830174684524536,
"learning_rate": 0.00031578947368421053,
"loss": 7.0526,
"step": 20
},
{
"epoch": 0.0031645569620253164,
"grad_norm": 0.8968034982681274,
"learning_rate": 0.00047368421052631577,
"loss": 6.4082,
"step": 30
},
{
"epoch": 0.004219409282700422,
"grad_norm": 0.6159543395042419,
"learning_rate": 0.0006315789473684211,
"loss": 5.8731,
"step": 40
},
{
"epoch": 0.005274261603375527,
"grad_norm": 0.39616236090660095,
"learning_rate": 0.0007894736842105263,
"loss": 5.4938,
"step": 50
},
{
"epoch": 0.006329113924050633,
"grad_norm": 0.6134156584739685,
"learning_rate": 0.0009473684210526315,
"loss": 5.0902,
"step": 60
},
{
"epoch": 0.007383966244725738,
"grad_norm": 0.33243003487586975,
"learning_rate": 0.0011052631578947368,
"loss": 4.6618,
"step": 70
},
{
"epoch": 0.008438818565400843,
"grad_norm": 0.23324701189994812,
"learning_rate": 0.0012631578947368421,
"loss": 4.339,
"step": 80
},
{
"epoch": 0.00949367088607595,
"grad_norm": 0.4921824336051941,
"learning_rate": 0.0014210526315789472,
"loss": 4.1184,
"step": 90
},
{
"epoch": 0.010548523206751054,
"grad_norm": 0.3369714319705963,
"learning_rate": 0.0014999989494847376,
"loss": 3.9508,
"step": 100
},
{
"epoch": 0.011603375527426161,
"grad_norm": 0.27228906750679016,
"learning_rate": 0.0014999905453802946,
"loss": 3.8005,
"step": 110
},
{
"epoch": 0.012658227848101266,
"grad_norm": 0.2569299340248108,
"learning_rate": 0.0014999737372655805,
"loss": 3.6694,
"step": 120
},
{
"epoch": 0.013713080168776372,
"grad_norm": 0.6321362257003784,
"learning_rate": 0.0014999485253289388,
"loss": 3.5732,
"step": 130
},
{
"epoch": 0.014767932489451477,
"grad_norm": 0.265931636095047,
"learning_rate": 0.0014999149098528814,
"loss": 3.4612,
"step": 140
},
{
"epoch": 0.015822784810126583,
"grad_norm": 0.36976251006126404,
"learning_rate": 0.0014998728912140862,
"loss": 3.3647,
"step": 150
},
{
"epoch": 0.016877637130801686,
"grad_norm": 0.29034847021102905,
"learning_rate": 0.0014998224698833922,
"loss": 3.2909,
"step": 160
},
{
"epoch": 0.017932489451476793,
"grad_norm": 0.31680262088775635,
"learning_rate": 0.0014997636464257956,
"loss": 3.2053,
"step": 170
},
{
"epoch": 0.0189873417721519,
"grad_norm": 0.3954222500324249,
"learning_rate": 0.0014996964215004416,
"loss": 3.1432,
"step": 180
},
{
"epoch": 0.020042194092827006,
"grad_norm": 0.40973424911499023,
"learning_rate": 0.0014996207958606182,
"loss": 3.0757,
"step": 190
},
{
"epoch": 0.02109704641350211,
"grad_norm": 0.457225501537323,
"learning_rate": 0.001499536770353748,
"loss": 3.0013,
"step": 200
},
{
"epoch": 0.022151898734177215,
"grad_norm": 0.5260195732116699,
"learning_rate": 0.0014994443459213774,
"loss": 2.9466,
"step": 210
},
{
"epoch": 0.023206751054852322,
"grad_norm": 0.33434826135635376,
"learning_rate": 0.001499343523599168,
"loss": 2.8918,
"step": 220
},
{
"epoch": 0.024261603375527425,
"grad_norm": 0.3714075982570648,
"learning_rate": 0.0014992343045168823,
"loss": 2.8235,
"step": 230
},
{
"epoch": 0.02531645569620253,
"grad_norm": 0.3708628714084625,
"learning_rate": 0.0014991166898983739,
"loss": 2.769,
"step": 240
},
{
"epoch": 0.026371308016877638,
"grad_norm": 0.37826642394065857,
"learning_rate": 0.001498990681061572,
"loss": 2.7176,
"step": 250
},
{
"epoch": 0.027426160337552744,
"grad_norm": 0.3962521255016327,
"learning_rate": 0.001498856279418467,
"loss": 2.6607,
"step": 260
},
{
"epoch": 0.028481012658227847,
"grad_norm": 0.4650373160839081,
"learning_rate": 0.0014987134864750948,
"loss": 2.6222,
"step": 270
},
{
"epoch": 0.029535864978902954,
"grad_norm": 0.36165574193000793,
"learning_rate": 0.0014985623038315206,
"loss": 2.571,
"step": 280
},
{
"epoch": 0.03059071729957806,
"grad_norm": 0.4171614944934845,
"learning_rate": 0.0014984027331818193,
"loss": 2.5323,
"step": 290
},
{
"epoch": 0.03164556962025317,
"grad_norm": 0.5617461204528809,
"learning_rate": 0.0014982347763140584,
"loss": 2.4971,
"step": 300
},
{
"epoch": 0.03270042194092827,
"grad_norm": 0.6129226684570312,
"learning_rate": 0.0014980584351102762,
"loss": 2.4632,
"step": 310
},
{
"epoch": 0.03375527426160337,
"grad_norm": 0.46741676330566406,
"learning_rate": 0.001497873711546462,
"loss": 2.4185,
"step": 320
},
{
"epoch": 0.03481012658227848,
"grad_norm": 0.451256662607193,
"learning_rate": 0.0014976806076925334,
"loss": 2.3998,
"step": 330
},
{
"epoch": 0.035864978902953586,
"grad_norm": 0.43502041697502136,
"learning_rate": 0.0014974791257123137,
"loss": 2.367,
"step": 340
},
{
"epoch": 0.03691983122362869,
"grad_norm": 0.3662364184856415,
"learning_rate": 0.001497269267863507,
"loss": 2.3158,
"step": 350
},
{
"epoch": 0.0379746835443038,
"grad_norm": 0.4172323942184448,
"learning_rate": 0.0014970510364976724,
"loss": 2.2855,
"step": 360
},
{
"epoch": 0.039029535864978905,
"grad_norm": 0.3845485746860504,
"learning_rate": 0.0014968244340601996,
"loss": 2.2709,
"step": 370
},
{
"epoch": 0.04008438818565401,
"grad_norm": 0.5006797313690186,
"learning_rate": 0.001496589463090279,
"loss": 2.2583,
"step": 380
},
{
"epoch": 0.04113924050632911,
"grad_norm": 0.6061115264892578,
"learning_rate": 0.001496346126220875,
"loss": 2.2285,
"step": 390
},
{
"epoch": 0.04219409282700422,
"grad_norm": 0.36179161071777344,
"learning_rate": 0.0014960944261786966,
"loss": 2.1955,
"step": 400
},
{
"epoch": 0.043248945147679324,
"grad_norm": 0.3911934792995453,
"learning_rate": 0.0014958343657841655,
"loss": 2.1831,
"step": 410
},
{
"epoch": 0.04430379746835443,
"grad_norm": 0.4641297459602356,
"learning_rate": 0.001495565947951385,
"loss": 2.1568,
"step": 420
},
{
"epoch": 0.04535864978902954,
"grad_norm": 0.3561594486236572,
"learning_rate": 0.0014952891756881085,
"loss": 2.1319,
"step": 430
},
{
"epoch": 0.046413502109704644,
"grad_norm": 0.3676946759223938,
"learning_rate": 0.0014950040520957037,
"loss": 2.1113,
"step": 440
},
{
"epoch": 0.04746835443037975,
"grad_norm": 0.397060364484787,
"learning_rate": 0.0014947105803691204,
"loss": 2.1096,
"step": 450
},
{
"epoch": 0.04852320675105485,
"grad_norm": 0.5825448036193848,
"learning_rate": 0.0014944087637968522,
"loss": 2.0875,
"step": 460
},
{
"epoch": 0.049578059071729956,
"grad_norm": 0.3827154040336609,
"learning_rate": 0.0014940986057609012,
"loss": 2.0607,
"step": 470
},
{
"epoch": 0.05063291139240506,
"grad_norm": 0.44185370206832886,
"learning_rate": 0.0014937801097367396,
"loss": 2.0567,
"step": 480
},
{
"epoch": 0.05168776371308017,
"grad_norm": 0.37032192945480347,
"learning_rate": 0.001493453279293271,
"loss": 2.0288,
"step": 490
},
{
"epoch": 0.052742616033755275,
"grad_norm": 0.4027710556983948,
"learning_rate": 0.0014931181180927902,
"loss": 2.0196,
"step": 500
},
{
"epoch": 0.05379746835443038,
"grad_norm": 0.4590783715248108,
"learning_rate": 0.001492774629890942,
"loss": 2.0054,
"step": 510
},
{
"epoch": 0.05485232067510549,
"grad_norm": 0.43571019172668457,
"learning_rate": 0.001492422818536679,
"loss": 1.9931,
"step": 520
},
{
"epoch": 0.05590717299578059,
"grad_norm": 0.36802706122398376,
"learning_rate": 0.00149206268797222,
"loss": 1.9784,
"step": 530
},
{
"epoch": 0.056962025316455694,
"grad_norm": 0.42650750279426575,
"learning_rate": 0.0014916942422330032,
"loss": 1.9588,
"step": 540
},
{
"epoch": 0.0580168776371308,
"grad_norm": 0.3881192207336426,
"learning_rate": 0.001491317485447643,
"loss": 1.9576,
"step": 550
},
{
"epoch": 0.05907172995780591,
"grad_norm": 0.37164539098739624,
"learning_rate": 0.0014909324218378838,
"loss": 1.9199,
"step": 560
},
{
"epoch": 0.060126582278481014,
"grad_norm": 0.4603891372680664,
"learning_rate": 0.0014905390557185508,
"loss": 1.9272,
"step": 570
},
{
"epoch": 0.06118143459915612,
"grad_norm": 0.4279107451438904,
"learning_rate": 0.0014901373914975036,
"loss": 1.9275,
"step": 580
},
{
"epoch": 0.06223628691983123,
"grad_norm": 0.4203340411186218,
"learning_rate": 0.0014897274336755856,
"loss": 1.9022,
"step": 590
},
{
"epoch": 0.06329113924050633,
"grad_norm": 0.47206705808639526,
"learning_rate": 0.001489309186846575,
"loss": 1.8864,
"step": 600
},
{
"epoch": 0.06434599156118144,
"grad_norm": 0.46364250779151917,
"learning_rate": 0.0014888826556971313,
"loss": 1.8886,
"step": 610
},
{
"epoch": 0.06540084388185655,
"grad_norm": 0.41355302929878235,
"learning_rate": 0.0014884478450067444,
"loss": 1.8716,
"step": 620
},
{
"epoch": 0.06645569620253164,
"grad_norm": 0.5133219957351685,
"learning_rate": 0.0014880047596476807,
"loss": 1.8572,
"step": 630
},
{
"epoch": 0.06751054852320675,
"grad_norm": 0.5945013761520386,
"learning_rate": 0.0014875534045849274,
"loss": 1.8732,
"step": 640
},
{
"epoch": 0.06856540084388185,
"grad_norm": 0.37314435839653015,
"learning_rate": 0.0014870937848761388,
"loss": 1.8569,
"step": 650
},
{
"epoch": 0.06962025316455696,
"grad_norm": 0.36437222361564636,
"learning_rate": 0.001486625905671578,
"loss": 1.8432,
"step": 660
},
{
"epoch": 0.07067510548523206,
"grad_norm": 0.4192541539669037,
"learning_rate": 0.00148614977221406,
"loss": 1.8279,
"step": 670
},
{
"epoch": 0.07172995780590717,
"grad_norm": 0.35485246777534485,
"learning_rate": 0.0014856653898388927,
"loss": 1.8186,
"step": 680
},
{
"epoch": 0.07278481012658228,
"grad_norm": 0.43379876017570496,
"learning_rate": 0.001485172763973817,
"loss": 1.8302,
"step": 690
},
{
"epoch": 0.07383966244725738,
"grad_norm": 0.36517855525016785,
"learning_rate": 0.0014846719001389466,
"loss": 1.8109,
"step": 700
},
{
"epoch": 0.07489451476793249,
"grad_norm": 0.3900572955608368,
"learning_rate": 0.001484162803946705,
"loss": 1.8012,
"step": 710
},
{
"epoch": 0.0759493670886076,
"grad_norm": 0.5315179824829102,
"learning_rate": 0.0014836454811017635,
"loss": 1.787,
"step": 720
},
{
"epoch": 0.0770042194092827,
"grad_norm": 0.3824789524078369,
"learning_rate": 0.0014831199374009778,
"loss": 1.7798,
"step": 730
},
{
"epoch": 0.07805907172995781,
"grad_norm": 0.3980269730091095,
"learning_rate": 0.0014825861787333208,
"loss": 1.7912,
"step": 740
},
{
"epoch": 0.07911392405063292,
"grad_norm": 0.4327452778816223,
"learning_rate": 0.0014820442110798197,
"loss": 1.7699,
"step": 750
},
{
"epoch": 0.08016877637130802,
"grad_norm": 0.4212438762187958,
"learning_rate": 0.0014814940405134865,
"loss": 1.7656,
"step": 760
},
{
"epoch": 0.08122362869198312,
"grad_norm": 0.44190293550491333,
"learning_rate": 0.001480935673199251,
"loss": 1.7593,
"step": 770
},
{
"epoch": 0.08227848101265822,
"grad_norm": 0.45607924461364746,
"learning_rate": 0.0014803691153938915,
"loss": 1.7574,
"step": 780
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.37207379937171936,
"learning_rate": 0.0014797943734459653,
"loss": 1.758,
"step": 790
},
{
"epoch": 0.08438818565400844,
"grad_norm": 0.49604687094688416,
"learning_rate": 0.001479211453795736,
"loss": 1.7343,
"step": 800
},
{
"epoch": 0.08544303797468354,
"grad_norm": 0.41798657178878784,
"learning_rate": 0.0014786203629751033,
"loss": 1.72,
"step": 810
},
{
"epoch": 0.08649789029535865,
"grad_norm": 0.5231151580810547,
"learning_rate": 0.0014780211076075279,
"loss": 1.7314,
"step": 820
},
{
"epoch": 0.08755274261603375,
"grad_norm": 0.3442583978176117,
"learning_rate": 0.0014774136944079594,
"loss": 1.7333,
"step": 830
},
{
"epoch": 0.08860759493670886,
"grad_norm": 0.3477618992328644,
"learning_rate": 0.0014767981301827592,
"loss": 1.7091,
"step": 840
},
{
"epoch": 0.08966244725738397,
"grad_norm": 0.3565273582935333,
"learning_rate": 0.0014761744218296249,
"loss": 1.7073,
"step": 850
},
{
"epoch": 0.09071729957805907,
"grad_norm": 0.3818325102329254,
"learning_rate": 0.001475542576337513,
"loss": 1.7096,
"step": 860
},
{
"epoch": 0.09177215189873418,
"grad_norm": 0.3839077055454254,
"learning_rate": 0.001474902600786561,
"loss": 1.7062,
"step": 870
},
{
"epoch": 0.09282700421940929,
"grad_norm": 0.5717847347259521,
"learning_rate": 0.0014742545023480075,
"loss": 1.7008,
"step": 880
},
{
"epoch": 0.0938818565400844,
"grad_norm": 0.361653596162796,
"learning_rate": 0.0014735982882841117,
"loss": 1.6898,
"step": 890
},
{
"epoch": 0.0949367088607595,
"grad_norm": 0.34179946780204773,
"learning_rate": 0.0014729339659480727,
"loss": 1.6894,
"step": 900
},
{
"epoch": 0.09599156118143459,
"grad_norm": 0.37158748507499695,
"learning_rate": 0.0014722615427839468,
"loss": 1.6956,
"step": 910
},
{
"epoch": 0.0970464135021097,
"grad_norm": 0.36093300580978394,
"learning_rate": 0.0014715810263265633,
"loss": 1.6844,
"step": 920
},
{
"epoch": 0.0981012658227848,
"grad_norm": 0.3625962436199188,
"learning_rate": 0.0014708924242014423,
"loss": 1.6677,
"step": 930
},
{
"epoch": 0.09915611814345991,
"grad_norm": 0.39460405707359314,
"learning_rate": 0.0014701957441247064,
"loss": 1.6769,
"step": 940
},
{
"epoch": 0.10021097046413502,
"grad_norm": 0.4288382828235626,
"learning_rate": 0.0014694909939029959,
"loss": 1.6593,
"step": 950
},
{
"epoch": 0.10126582278481013,
"grad_norm": 0.4599696397781372,
"learning_rate": 0.0014687781814333814,
"loss": 1.6695,
"step": 960
},
{
"epoch": 0.10232067510548523,
"grad_norm": 0.34559813141822815,
"learning_rate": 0.0014680573147032746,
"loss": 1.6685,
"step": 970
},
{
"epoch": 0.10337552742616034,
"grad_norm": 0.4324914515018463,
"learning_rate": 0.0014673284017903392,
"loss": 1.647,
"step": 980
},
{
"epoch": 0.10443037974683544,
"grad_norm": 0.3713063895702362,
"learning_rate": 0.0014665914508624,
"loss": 1.6466,
"step": 990
},
{
"epoch": 0.10548523206751055,
"grad_norm": 0.36441338062286377,
"learning_rate": 0.0014658464701773526,
"loss": 1.6539,
"step": 1000
},
{
"epoch": 0.10654008438818566,
"grad_norm": 0.3947252929210663,
"learning_rate": 0.0014650934680830688,
"loss": 1.644,
"step": 1010
},
{
"epoch": 0.10759493670886076,
"grad_norm": 0.39086753129959106,
"learning_rate": 0.0014643324530173051,
"loss": 1.6356,
"step": 1020
},
{
"epoch": 0.10864978902953587,
"grad_norm": 0.4416680932044983,
"learning_rate": 0.0014635634335076067,
"loss": 1.6387,
"step": 1030
},
{
"epoch": 0.10970464135021098,
"grad_norm": 0.39951401948928833,
"learning_rate": 0.001462786418171213,
"loss": 1.6317,
"step": 1040
},
{
"epoch": 0.11075949367088607,
"grad_norm": 0.33045339584350586,
"learning_rate": 0.0014620014157149597,
"loss": 1.6345,
"step": 1050
},
{
"epoch": 0.11181434599156118,
"grad_norm": 0.3590207099914551,
"learning_rate": 0.001461208434935183,
"loss": 1.6198,
"step": 1060
},
{
"epoch": 0.11286919831223628,
"grad_norm": 0.38089054822921753,
"learning_rate": 0.0014604074847176197,
"loss": 1.6115,
"step": 1070
},
{
"epoch": 0.11392405063291139,
"grad_norm": 0.3784695863723755,
"learning_rate": 0.0014595985740373082,
"loss": 1.6124,
"step": 1080
},
{
"epoch": 0.1149789029535865,
"grad_norm": 0.46172553300857544,
"learning_rate": 0.0014587817119584873,
"loss": 1.6223,
"step": 1090
},
{
"epoch": 0.1160337552742616,
"grad_norm": 0.41999849677085876,
"learning_rate": 0.001457956907634496,
"loss": 1.6097,
"step": 1100
},
{
"epoch": 0.11708860759493671,
"grad_norm": 0.32970911264419556,
"learning_rate": 0.0014571241703076692,
"loss": 1.6125,
"step": 1110
},
{
"epoch": 0.11814345991561181,
"grad_norm": 0.34392091631889343,
"learning_rate": 0.0014562835093092348,
"loss": 1.6047,
"step": 1120
},
{
"epoch": 0.11919831223628692,
"grad_norm": 0.4005463421344757,
"learning_rate": 0.0014554349340592104,
"loss": 1.5932,
"step": 1130
},
{
"epoch": 0.12025316455696203,
"grad_norm": 0.4000810980796814,
"learning_rate": 0.001454578454066296,
"loss": 1.6066,
"step": 1140
},
{
"epoch": 0.12130801687763713,
"grad_norm": 0.5581340789794922,
"learning_rate": 0.0014537140789277678,
"loss": 1.5937,
"step": 1150
},
{
"epoch": 0.12236286919831224,
"grad_norm": 0.3587999641895294,
"learning_rate": 0.0014528418183293716,
"loss": 1.5998,
"step": 1160
},
{
"epoch": 0.12341772151898735,
"grad_norm": 0.3587309718132019,
"learning_rate": 0.001451961682045213,
"loss": 1.5899,
"step": 1170
},
{
"epoch": 0.12447257383966245,
"grad_norm": 0.38655397295951843,
"learning_rate": 0.001451073679937649,
"loss": 1.5741,
"step": 1180
},
{
"epoch": 0.12552742616033755,
"grad_norm": 0.36412402987480164,
"learning_rate": 0.0014501778219571766,
"loss": 1.5749,
"step": 1190
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.3607521653175354,
"learning_rate": 0.0014492741181423225,
"loss": 1.5859,
"step": 1200
},
{
"epoch": 0.12763713080168776,
"grad_norm": 0.3544326722621918,
"learning_rate": 0.0014483625786195285,
"loss": 1.5767,
"step": 1210
},
{
"epoch": 0.12869198312236288,
"grad_norm": 0.3618502616882324,
"learning_rate": 0.0014474432136030405,
"loss": 1.5659,
"step": 1220
},
{
"epoch": 0.12974683544303797,
"grad_norm": 0.3665081858634949,
"learning_rate": 0.0014465160333947923,
"loss": 1.5577,
"step": 1230
},
{
"epoch": 0.1308016877637131,
"grad_norm": 0.3543892502784729,
"learning_rate": 0.0014455810483842908,
"loss": 1.5737,
"step": 1240
},
{
"epoch": 0.13185654008438819,
"grad_norm": 0.435255229473114,
"learning_rate": 0.0014446382690484997,
"loss": 1.5761,
"step": 1250
},
{
"epoch": 0.13291139240506328,
"grad_norm": 0.3356614410877228,
"learning_rate": 0.0014436877059517215,
"loss": 1.5561,
"step": 1260
},
{
"epoch": 0.1339662447257384,
"grad_norm": 0.32167506217956543,
"learning_rate": 0.0014427293697454803,
"loss": 1.5594,
"step": 1270
},
{
"epoch": 0.1350210970464135,
"grad_norm": 0.34562185406684875,
"learning_rate": 0.001441763271168401,
"loss": 1.5668,
"step": 1280
},
{
"epoch": 0.1360759493670886,
"grad_norm": 0.3838872015476227,
"learning_rate": 0.00144078942104609,
"loss": 1.5561,
"step": 1290
},
{
"epoch": 0.1371308016877637,
"grad_norm": 0.3552131652832031,
"learning_rate": 0.001439807830291013,
"loss": 1.5432,
"step": 1300
},
{
"epoch": 0.13818565400843882,
"grad_norm": 0.3455946743488312,
"learning_rate": 0.0014388185099023744,
"loss": 1.55,
"step": 1310
},
{
"epoch": 0.13924050632911392,
"grad_norm": 0.3956320583820343,
"learning_rate": 0.0014378214709659916,
"loss": 1.5514,
"step": 1320
},
{
"epoch": 0.14029535864978904,
"grad_norm": 0.4718681573867798,
"learning_rate": 0.0014368167246541733,
"loss": 1.5419,
"step": 1330
},
{
"epoch": 0.14135021097046413,
"grad_norm": 0.37479954957962036,
"learning_rate": 0.0014358042822255918,
"loss": 1.5427,
"step": 1340
},
{
"epoch": 0.14240506329113925,
"grad_norm": 0.384405255317688,
"learning_rate": 0.0014347841550251597,
"loss": 1.5463,
"step": 1350
},
{
"epoch": 0.14345991561181434,
"grad_norm": 0.41001036763191223,
"learning_rate": 0.0014337563544838997,
"loss": 1.5386,
"step": 1360
},
{
"epoch": 0.14451476793248946,
"grad_norm": 0.4328341782093048,
"learning_rate": 0.001432720892118819,
"loss": 1.5383,
"step": 1370
},
{
"epoch": 0.14556962025316456,
"grad_norm": 0.4074663519859314,
"learning_rate": 0.0014316777795327794,
"loss": 1.5296,
"step": 1380
},
{
"epoch": 0.14662447257383968,
"grad_norm": 0.45945727825164795,
"learning_rate": 0.001430627028414366,
"loss": 1.5394,
"step": 1390
},
{
"epoch": 0.14767932489451477,
"grad_norm": 0.582301676273346,
"learning_rate": 0.0014295686505377586,
"loss": 1.5195,
"step": 1400
},
{
"epoch": 0.14873417721518986,
"grad_norm": 0.42094552516937256,
"learning_rate": 0.0014285026577625982,
"loss": 1.5244,
"step": 1410
},
{
"epoch": 0.14978902953586498,
"grad_norm": 0.36885884404182434,
"learning_rate": 0.0014274290620338542,
"loss": 1.5288,
"step": 1420
},
{
"epoch": 0.15084388185654007,
"grad_norm": 0.3604186773300171,
"learning_rate": 0.0014263478753816906,
"loss": 1.521,
"step": 1430
},
{
"epoch": 0.1518987341772152,
"grad_norm": 0.35973232984542847,
"learning_rate": 0.0014252591099213326,
"loss": 1.5165,
"step": 1440
},
{
"epoch": 0.1529535864978903,
"grad_norm": 0.3624361455440521,
"learning_rate": 0.001424162777852928,
"loss": 1.5175,
"step": 1450
},
{
"epoch": 0.1540084388185654,
"grad_norm": 0.35657012462615967,
"learning_rate": 0.0014230588914614134,
"loss": 1.5148,
"step": 1460
},
{
"epoch": 0.1550632911392405,
"grad_norm": 0.44882655143737793,
"learning_rate": 0.0014219474631163745,
"loss": 1.5112,
"step": 1470
},
{
"epoch": 0.15611814345991562,
"grad_norm": 0.4358622133731842,
"learning_rate": 0.001420828505271909,
"loss": 1.5153,
"step": 1480
},
{
"epoch": 0.1571729957805907,
"grad_norm": 0.42174819111824036,
"learning_rate": 0.0014197020304664856,
"loss": 1.5155,
"step": 1490
},
{
"epoch": 0.15822784810126583,
"grad_norm": 0.40337657928466797,
"learning_rate": 0.0014185680513228048,
"loss": 1.5093,
"step": 1500
},
{
"epoch": 0.15928270042194093,
"grad_norm": 0.43946516513824463,
"learning_rate": 0.0014174265805476564,
"loss": 1.5124,
"step": 1510
},
{
"epoch": 0.16033755274261605,
"grad_norm": 0.5411512851715088,
"learning_rate": 0.0014162776309317778,
"loss": 1.5142,
"step": 1520
},
{
"epoch": 0.16139240506329114,
"grad_norm": 0.3460575044155121,
"learning_rate": 0.0014151212153497108,
"loss": 1.4959,
"step": 1530
},
{
"epoch": 0.16244725738396623,
"grad_norm": 0.4668763279914856,
"learning_rate": 0.0014139573467596561,
"loss": 1.4874,
"step": 1540
},
{
"epoch": 0.16350210970464135,
"grad_norm": 0.3798237144947052,
"learning_rate": 0.00141278603820333,
"loss": 1.4891,
"step": 1550
},
{
"epoch": 0.16455696202531644,
"grad_norm": 0.3989150822162628,
"learning_rate": 0.0014116073028058165,
"loss": 1.4922,
"step": 1560
},
{
"epoch": 0.16561181434599156,
"grad_norm": 0.3972000181674957,
"learning_rate": 0.0014104211537754217,
"loss": 1.4833,
"step": 1570
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.4881563186645508,
"learning_rate": 0.001409227604403524,
"loss": 1.4954,
"step": 1580
},
{
"epoch": 0.16772151898734178,
"grad_norm": 0.40641921758651733,
"learning_rate": 0.0014080266680644277,
"loss": 1.49,
"step": 1590
},
{
"epoch": 0.16877637130801687,
"grad_norm": 0.3936208188533783,
"learning_rate": 0.0014068183582152103,
"loss": 1.4916,
"step": 1600
},
{
"epoch": 0.169831223628692,
"grad_norm": 0.3647351562976837,
"learning_rate": 0.001405602688395574,
"loss": 1.4895,
"step": 1610
},
{
"epoch": 0.17088607594936708,
"grad_norm": 0.4019123613834381,
"learning_rate": 0.0014043796722276924,
"loss": 1.4603,
"step": 1620
},
{
"epoch": 0.1719409282700422,
"grad_norm": 0.39883333444595337,
"learning_rate": 0.0014031493234160591,
"loss": 1.4766,
"step": 1630
},
{
"epoch": 0.1729957805907173,
"grad_norm": 0.36587414145469666,
"learning_rate": 0.0014019116557473332,
"loss": 1.4798,
"step": 1640
},
{
"epoch": 0.17405063291139242,
"grad_norm": 0.44427385926246643,
"learning_rate": 0.0014006666830901854,
"loss": 1.4702,
"step": 1650
},
{
"epoch": 0.1751054852320675,
"grad_norm": 0.34642136096954346,
"learning_rate": 0.001399414419395142,
"loss": 1.4708,
"step": 1660
},
{
"epoch": 0.17616033755274263,
"grad_norm": 0.4188520014286041,
"learning_rate": 0.0013981548786944293,
"loss": 1.4838,
"step": 1670
},
{
"epoch": 0.17721518987341772,
"grad_norm": 0.4440024495124817,
"learning_rate": 0.0013968880751018158,
"loss": 1.4674,
"step": 1680
},
{
"epoch": 0.17827004219409281,
"grad_norm": 0.5161822438240051,
"learning_rate": 0.0013956140228124545,
"loss": 1.4627,
"step": 1690
},
{
"epoch": 0.17932489451476794,
"grad_norm": 0.43483030796051025,
"learning_rate": 0.0013943327361027231,
"loss": 1.482,
"step": 1700
},
{
"epoch": 0.18037974683544303,
"grad_norm": 0.3826151490211487,
"learning_rate": 0.0013930442293300649,
"loss": 1.4594,
"step": 1710
},
{
"epoch": 0.18143459915611815,
"grad_norm": 0.351190984249115,
"learning_rate": 0.0013917485169328279,
"loss": 1.4576,
"step": 1720
},
{
"epoch": 0.18248945147679324,
"grad_norm": 0.38680753111839294,
"learning_rate": 0.0013904456134301016,
"loss": 1.4569,
"step": 1730
},
{
"epoch": 0.18354430379746836,
"grad_norm": 0.34136396646499634,
"learning_rate": 0.0013891355334215562,
"loss": 1.459,
"step": 1740
},
{
"epoch": 0.18459915611814345,
"grad_norm": 0.35594695806503296,
"learning_rate": 0.0013878182915872776,
"loss": 1.4808,
"step": 1750
},
{
"epoch": 0.18565400843881857,
"grad_norm": 0.3368285596370697,
"learning_rate": 0.001386493902687604,
"loss": 1.4578,
"step": 1760
},
{
"epoch": 0.18670886075949367,
"grad_norm": 0.33792853355407715,
"learning_rate": 0.00138516238156296,
"loss": 1.4599,
"step": 1770
},
{
"epoch": 0.1877637130801688,
"grad_norm": 0.35535696148872375,
"learning_rate": 0.0013838237431336895,
"loss": 1.4613,
"step": 1780
},
{
"epoch": 0.18881856540084388,
"grad_norm": 0.3883817791938782,
"learning_rate": 0.0013824780023998899,
"loss": 1.463,
"step": 1790
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.3484611511230469,
"learning_rate": 0.0013811251744412431,
"loss": 1.4543,
"step": 1800
},
{
"epoch": 0.1909282700421941,
"grad_norm": 0.3812905550003052,
"learning_rate": 0.0013797652744168473,
"loss": 1.4457,
"step": 1810
},
{
"epoch": 0.19198312236286919,
"grad_norm": 0.3451510965824127,
"learning_rate": 0.0013783983175650457,
"loss": 1.4416,
"step": 1820
},
{
"epoch": 0.1930379746835443,
"grad_norm": 0.391886442899704,
"learning_rate": 0.0013770243192032581,
"loss": 1.4458,
"step": 1830
},
{
"epoch": 0.1940928270042194,
"grad_norm": 0.3714632987976074,
"learning_rate": 0.0013756432947278064,
"loss": 1.447,
"step": 1840
},
{
"epoch": 0.19514767932489452,
"grad_norm": 0.35794293880462646,
"learning_rate": 0.0013742552596137444,
"loss": 1.4432,
"step": 1850
},
{
"epoch": 0.1962025316455696,
"grad_norm": 0.350976824760437,
"learning_rate": 0.0013728602294146833,
"loss": 1.4472,
"step": 1860
},
{
"epoch": 0.19725738396624473,
"grad_norm": 0.40909984707832336,
"learning_rate": 0.0013714582197626175,
"loss": 1.4506,
"step": 1870
},
{
"epoch": 0.19831223628691982,
"grad_norm": 0.3741796612739563,
"learning_rate": 0.0013700492463677501,
"loss": 1.4352,
"step": 1880
},
{
"epoch": 0.19936708860759494,
"grad_norm": 0.36106088757514954,
"learning_rate": 0.0013686333250183154,
"loss": 1.4426,
"step": 1890
},
{
"epoch": 0.20042194092827004,
"grad_norm": 0.3645462095737457,
"learning_rate": 0.001367210471580404,
"loss": 1.437,
"step": 1900
},
{
"epoch": 0.20147679324894516,
"grad_norm": 0.39476481080055237,
"learning_rate": 0.0013657807019977835,
"loss": 1.4379,
"step": 1910
},
{
"epoch": 0.20253164556962025,
"grad_norm": 0.3888195753097534,
"learning_rate": 0.0013643440322917198,
"loss": 1.4382,
"step": 1920
},
{
"epoch": 0.20358649789029537,
"grad_norm": 0.37703272700309753,
"learning_rate": 0.0013629004785607989,
"loss": 1.4363,
"step": 1930
},
{
"epoch": 0.20464135021097046,
"grad_norm": 0.3618707060813904,
"learning_rate": 0.0013614500569807445,
"loss": 1.4306,
"step": 1940
},
{
"epoch": 0.20569620253164558,
"grad_norm": 0.39237603545188904,
"learning_rate": 0.0013599927838042394,
"loss": 1.4269,
"step": 1950
},
{
"epoch": 0.20675105485232068,
"grad_norm": 0.3404817283153534,
"learning_rate": 0.0013585286753607408,
"loss": 1.4289,
"step": 1960
},
{
"epoch": 0.20780590717299577,
"grad_norm": 0.37156736850738525,
"learning_rate": 0.0013570577480562986,
"loss": 1.44,
"step": 1970
},
{
"epoch": 0.2088607594936709,
"grad_norm": 0.3893108069896698,
"learning_rate": 0.0013555800183733717,
"loss": 1.4195,
"step": 1980
},
{
"epoch": 0.20991561181434598,
"grad_norm": 0.3751927316188812,
"learning_rate": 0.0013540955028706425,
"loss": 1.418,
"step": 1990
},
{
"epoch": 0.2109704641350211,
"grad_norm": 0.35814914107322693,
"learning_rate": 0.0013526042181828324,
"loss": 1.4193,
"step": 2000
},
{
"epoch": 0.2120253164556962,
"grad_norm": 0.36299997568130493,
"learning_rate": 0.0013511061810205143,
"loss": 1.4257,
"step": 2010
},
{
"epoch": 0.21308016877637131,
"grad_norm": 0.4335196614265442,
"learning_rate": 0.001349601408169926,
"loss": 1.4377,
"step": 2020
},
{
"epoch": 0.2141350210970464,
"grad_norm": 0.41183096170425415,
"learning_rate": 0.0013480899164927823,
"loss": 1.4227,
"step": 2030
},
{
"epoch": 0.21518987341772153,
"grad_norm": 0.3383892774581909,
"learning_rate": 0.0013465717229260853,
"loss": 1.4217,
"step": 2040
},
{
"epoch": 0.21624472573839662,
"grad_norm": 0.4018089175224304,
"learning_rate": 0.001345046844481935,
"loss": 1.418,
"step": 2050
},
{
"epoch": 0.21729957805907174,
"grad_norm": 0.3578946590423584,
"learning_rate": 0.0013435152982473396,
"loss": 1.4104,
"step": 2060
},
{
"epoch": 0.21835443037974683,
"grad_norm": 0.35767829418182373,
"learning_rate": 0.0013419771013840217,
"loss": 1.4171,
"step": 2070
},
{
"epoch": 0.21940928270042195,
"grad_norm": 0.37678778171539307,
"learning_rate": 0.001340432271128229,
"loss": 1.421,
"step": 2080
},
{
"epoch": 0.22046413502109705,
"grad_norm": 0.3800208866596222,
"learning_rate": 0.0013388808247905381,
"loss": 1.4097,
"step": 2090
},
{
"epoch": 0.22151898734177214,
"grad_norm": 0.41718804836273193,
"learning_rate": 0.0013373227797556634,
"loss": 1.4164,
"step": 2100
},
{
"epoch": 0.22257383966244726,
"grad_norm": 0.39602622389793396,
"learning_rate": 0.00133575815348226,
"loss": 1.4052,
"step": 2110
},
{
"epoch": 0.22362869198312235,
"grad_norm": 0.33915984630584717,
"learning_rate": 0.0013341869635027292,
"loss": 1.405,
"step": 2120
},
{
"epoch": 0.22468354430379747,
"grad_norm": 0.387729674577713,
"learning_rate": 0.001332609227423022,
"loss": 1.4054,
"step": 2130
},
{
"epoch": 0.22573839662447256,
"grad_norm": 0.37279507517814636,
"learning_rate": 0.0013310249629224417,
"loss": 1.4021,
"step": 2140
},
{
"epoch": 0.22679324894514769,
"grad_norm": 0.43018755316734314,
"learning_rate": 0.0013294341877534454,
"loss": 1.4107,
"step": 2150
},
{
"epoch": 0.22784810126582278,
"grad_norm": 0.36134418845176697,
"learning_rate": 0.0013278369197414458,
"loss": 1.4214,
"step": 2160
},
{
"epoch": 0.2289029535864979,
"grad_norm": 0.34999746084213257,
"learning_rate": 0.0013262331767846104,
"loss": 1.4059,
"step": 2170
},
{
"epoch": 0.229957805907173,
"grad_norm": 0.3411415219306946,
"learning_rate": 0.0013246229768536628,
"loss": 1.3894,
"step": 2180
},
{
"epoch": 0.2310126582278481,
"grad_norm": 0.48818546533584595,
"learning_rate": 0.001323006337991679,
"loss": 1.4056,
"step": 2190
},
{
"epoch": 0.2320675105485232,
"grad_norm": 0.3689866364002228,
"learning_rate": 0.0013213832783138873,
"loss": 1.4079,
"step": 2200
},
{
"epoch": 0.23312236286919832,
"grad_norm": 0.47169798612594604,
"learning_rate": 0.0013197538160074633,
"loss": 1.3925,
"step": 2210
},
{
"epoch": 0.23417721518987342,
"grad_norm": 0.36988016963005066,
"learning_rate": 0.0013181179693313283,
"loss": 1.4046,
"step": 2220
},
{
"epoch": 0.23523206751054854,
"grad_norm": 0.3319711983203888,
"learning_rate": 0.0013164757566159428,
"loss": 1.397,
"step": 2230
},
{
"epoch": 0.23628691983122363,
"grad_norm": 0.3849964141845703,
"learning_rate": 0.001314827196263102,
"loss": 1.3931,
"step": 2240
},
{
"epoch": 0.23734177215189872,
"grad_norm": 0.355008602142334,
"learning_rate": 0.0013131723067457302,
"loss": 1.3916,
"step": 2250
},
{
"epoch": 0.23839662447257384,
"grad_norm": 0.40703532099723816,
"learning_rate": 0.0013115111066076721,
"loss": 1.3824,
"step": 2260
},
{
"epoch": 0.23945147679324894,
"grad_norm": 0.36470040678977966,
"learning_rate": 0.0013098436144634862,
"loss": 1.4164,
"step": 2270
},
{
"epoch": 0.24050632911392406,
"grad_norm": 0.3571237623691559,
"learning_rate": 0.0013081698489982364,
"loss": 1.402,
"step": 2280
},
{
"epoch": 0.24156118143459915,
"grad_norm": 0.39680442214012146,
"learning_rate": 0.001306489828967282,
"loss": 1.3905,
"step": 2290
},
{
"epoch": 0.24261603375527427,
"grad_norm": 0.39895832538604736,
"learning_rate": 0.0013048035731960679,
"loss": 1.3893,
"step": 2300
},
{
"epoch": 0.24367088607594936,
"grad_norm": 0.373119980096817,
"learning_rate": 0.0013031111005799133,
"loss": 1.3882,
"step": 2310
},
{
"epoch": 0.24472573839662448,
"grad_norm": 0.3598041534423828,
"learning_rate": 0.0013014124300838004,
"loss": 1.3977,
"step": 2320
},
{
"epoch": 0.24578059071729957,
"grad_norm": 0.3711779713630676,
"learning_rate": 0.0012997075807421612,
"loss": 1.3823,
"step": 2330
},
{
"epoch": 0.2468354430379747,
"grad_norm": 0.34823834896087646,
"learning_rate": 0.0012979965716586653,
"loss": 1.3766,
"step": 2340
},
{
"epoch": 0.2478902953586498,
"grad_norm": 0.3646671175956726,
"learning_rate": 0.0012962794220060048,
"loss": 1.3777,
"step": 2350
},
{
"epoch": 0.2489451476793249,
"grad_norm": 0.43606895208358765,
"learning_rate": 0.0012945561510256801,
"loss": 1.3853,
"step": 2360
},
{
"epoch": 0.25,
"grad_norm": 0.3553369641304016,
"learning_rate": 0.001292826778027784,
"loss": 1.3832,
"step": 2370
},
{
"epoch": 0.2510548523206751,
"grad_norm": 0.33519911766052246,
"learning_rate": 0.0012910913223907856,
"loss": 1.3786,
"step": 2380
},
{
"epoch": 0.2521097046413502,
"grad_norm": 0.3543609082698822,
"learning_rate": 0.0012893498035613123,
"loss": 1.3813,
"step": 2390
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.35886815190315247,
"learning_rate": 0.001287602241053933,
"loss": 1.381,
"step": 2400
},
{
"epoch": 0.2542194092827004,
"grad_norm": 0.36419469118118286,
"learning_rate": 0.0012858486544509392,
"loss": 1.3838,
"step": 2410
},
{
"epoch": 0.2552742616033755,
"grad_norm": 0.377615749835968,
"learning_rate": 0.0012840890634021249,
"loss": 1.377,
"step": 2420
},
{
"epoch": 0.2563291139240506,
"grad_norm": 0.34815406799316406,
"learning_rate": 0.0012823234876245667,
"loss": 1.3734,
"step": 2430
},
{
"epoch": 0.25738396624472576,
"grad_norm": 0.3714103698730469,
"learning_rate": 0.0012805519469024035,
"loss": 1.3823,
"step": 2440
},
{
"epoch": 0.25843881856540085,
"grad_norm": 0.395673930644989,
"learning_rate": 0.0012787744610866143,
"loss": 1.3688,
"step": 2450
},
{
"epoch": 0.25949367088607594,
"grad_norm": 0.34790366888046265,
"learning_rate": 0.0012769910500947954,
"loss": 1.3829,
"step": 2460
},
{
"epoch": 0.26054852320675104,
"grad_norm": 0.34407252073287964,
"learning_rate": 0.0012752017339109376,
"loss": 1.3743,
"step": 2470
},
{
"epoch": 0.2616033755274262,
"grad_norm": 0.3564300835132599,
"learning_rate": 0.0012734065325852029,
"loss": 1.3773,
"step": 2480
},
{
"epoch": 0.2626582278481013,
"grad_norm": 0.3561383783817291,
"learning_rate": 0.0012716054662336987,
"loss": 1.3661,
"step": 2490
},
{
"epoch": 0.26371308016877637,
"grad_norm": 0.46398332715034485,
"learning_rate": 0.001269798555038252,
"loss": 1.3706,
"step": 2500
},
{
"epoch": 0.26476793248945146,
"grad_norm": 0.34344884753227234,
"learning_rate": 0.0012679858192461864,
"loss": 1.3699,
"step": 2510
},
{
"epoch": 0.26582278481012656,
"grad_norm": 0.40484264492988586,
"learning_rate": 0.0012661672791700906,
"loss": 1.3746,
"step": 2520
},
{
"epoch": 0.2668776371308017,
"grad_norm": 0.3783828318119049,
"learning_rate": 0.0012643429551875945,
"loss": 1.3701,
"step": 2530
},
{
"epoch": 0.2679324894514768,
"grad_norm": 0.4477307200431824,
"learning_rate": 0.0012625128677411388,
"loss": 1.3695,
"step": 2540
},
{
"epoch": 0.2689873417721519,
"grad_norm": 0.47914379835128784,
"learning_rate": 0.0012606770373377475,
"loss": 1.3659,
"step": 2550
},
{
"epoch": 0.270042194092827,
"grad_norm": 0.35758259892463684,
"learning_rate": 0.0012588354845487959,
"loss": 1.3724,
"step": 2560
},
{
"epoch": 0.27109704641350213,
"grad_norm": 0.3747115135192871,
"learning_rate": 0.001256988230009783,
"loss": 1.3646,
"step": 2570
},
{
"epoch": 0.2721518987341772,
"grad_norm": 0.3435615301132202,
"learning_rate": 0.0012551352944200976,
"loss": 1.3652,
"step": 2580
},
{
"epoch": 0.2732067510548523,
"grad_norm": 0.3976442217826843,
"learning_rate": 0.0012532766985427874,
"loss": 1.3688,
"step": 2590
},
{
"epoch": 0.2742616033755274,
"grad_norm": 0.3415212631225586,
"learning_rate": 0.0012514124632043272,
"loss": 1.3703,
"step": 2600
},
{
"epoch": 0.27531645569620256,
"grad_norm": 0.3382601737976074,
"learning_rate": 0.0012495426092943842,
"loss": 1.3649,
"step": 2610
},
{
"epoch": 0.27637130801687765,
"grad_norm": 0.35768750309944153,
"learning_rate": 0.0012476671577655845,
"loss": 1.3642,
"step": 2620
},
{
"epoch": 0.27742616033755274,
"grad_norm": 0.3365379869937897,
"learning_rate": 0.0012457861296332774,
"loss": 1.3542,
"step": 2630
},
{
"epoch": 0.27848101265822783,
"grad_norm": 0.363199383020401,
"learning_rate": 0.001243899545975303,
"loss": 1.3624,
"step": 2640
},
{
"epoch": 0.2795358649789029,
"grad_norm": 0.3533865213394165,
"learning_rate": 0.0012420074279317515,
"loss": 1.3562,
"step": 2650
},
{
"epoch": 0.2805907172995781,
"grad_norm": 0.44423454999923706,
"learning_rate": 0.0012401097967047298,
"loss": 1.3517,
"step": 2660
},
{
"epoch": 0.28164556962025317,
"grad_norm": 0.36125123500823975,
"learning_rate": 0.001238206673558122,
"loss": 1.3569,
"step": 2670
},
{
"epoch": 0.28270042194092826,
"grad_norm": 0.35034412145614624,
"learning_rate": 0.0012362980798173526,
"loss": 1.3548,
"step": 2680
},
{
"epoch": 0.28375527426160335,
"grad_norm": 0.3607058823108673,
"learning_rate": 0.0012343840368691462,
"loss": 1.3552,
"step": 2690
},
{
"epoch": 0.2848101265822785,
"grad_norm": 0.36174681782722473,
"learning_rate": 0.0012324645661612886,
"loss": 1.3468,
"step": 2700
},
{
"epoch": 0.2858649789029536,
"grad_norm": 0.37004148960113525,
"learning_rate": 0.0012305396892023867,
"loss": 1.356,
"step": 2710
},
{
"epoch": 0.2869198312236287,
"grad_norm": 0.34047403931617737,
"learning_rate": 0.0012286094275616264,
"loss": 1.3549,
"step": 2720
},
{
"epoch": 0.2879746835443038,
"grad_norm": 0.3488231301307678,
"learning_rate": 0.0012266738028685318,
"loss": 1.3395,
"step": 2730
},
{
"epoch": 0.2890295358649789,
"grad_norm": 0.3499659597873688,
"learning_rate": 0.001224732836812723,
"loss": 1.3459,
"step": 2740
},
{
"epoch": 0.290084388185654,
"grad_norm": 0.3290250599384308,
"learning_rate": 0.0012227865511436724,
"loss": 1.3501,
"step": 2750
},
{
"epoch": 0.2911392405063291,
"grad_norm": 0.4917152523994446,
"learning_rate": 0.001220834967670461,
"loss": 1.3623,
"step": 2760
},
{
"epoch": 0.2921940928270042,
"grad_norm": 0.3639252185821533,
"learning_rate": 0.0012188781082615346,
"loss": 1.3574,
"step": 2770
},
{
"epoch": 0.29324894514767935,
"grad_norm": 0.34025079011917114,
"learning_rate": 0.0012169159948444588,
"loss": 1.3537,
"step": 2780
},
{
"epoch": 0.29430379746835444,
"grad_norm": 0.342074453830719,
"learning_rate": 0.001214948649405672,
"loss": 1.3491,
"step": 2790
},
{
"epoch": 0.29535864978902954,
"grad_norm": 0.3434010446071625,
"learning_rate": 0.0012129760939902407,
"loss": 1.3473,
"step": 2800
},
{
"epoch": 0.29641350210970463,
"grad_norm": 0.37486496567726135,
"learning_rate": 0.0012109983507016114,
"loss": 1.3492,
"step": 2810
},
{
"epoch": 0.2974683544303797,
"grad_norm": 0.3642338812351227,
"learning_rate": 0.0012090154417013636,
"loss": 1.3532,
"step": 2820
},
{
"epoch": 0.29852320675105487,
"grad_norm": 0.31444185972213745,
"learning_rate": 0.0012070273892089605,
"loss": 1.3255,
"step": 2830
},
{
"epoch": 0.29957805907172996,
"grad_norm": 0.3436901271343231,
"learning_rate": 0.0012050342155015012,
"loss": 1.3333,
"step": 2840
},
{
"epoch": 0.30063291139240506,
"grad_norm": 0.3226953148841858,
"learning_rate": 0.0012030359429134707,
"loss": 1.3377,
"step": 2850
},
{
"epoch": 0.30168776371308015,
"grad_norm": 0.32987070083618164,
"learning_rate": 0.0012010325938364883,
"loss": 1.3423,
"step": 2860
},
{
"epoch": 0.3027426160337553,
"grad_norm": 0.3697974979877472,
"learning_rate": 0.0011990241907190592,
"loss": 1.3434,
"step": 2870
},
{
"epoch": 0.3037974683544304,
"grad_norm": 0.35540589690208435,
"learning_rate": 0.001197010756066321,
"loss": 1.3316,
"step": 2880
},
{
"epoch": 0.3048523206751055,
"grad_norm": 0.3480130434036255,
"learning_rate": 0.0011949923124397917,
"loss": 1.3328,
"step": 2890
},
{
"epoch": 0.3059071729957806,
"grad_norm": 0.3540296256542206,
"learning_rate": 0.001192968882457118,
"loss": 1.3406,
"step": 2900
},
{
"epoch": 0.3069620253164557,
"grad_norm": 0.351339727640152,
"learning_rate": 0.001190940488791821,
"loss": 1.3406,
"step": 2910
},
{
"epoch": 0.3080168776371308,
"grad_norm": 0.393935889005661,
"learning_rate": 0.0011889071541730419,
"loss": 1.3353,
"step": 2920
},
{
"epoch": 0.3090717299578059,
"grad_norm": 0.34540510177612305,
"learning_rate": 0.001186868901385288,
"loss": 1.3329,
"step": 2930
},
{
"epoch": 0.310126582278481,
"grad_norm": 0.36295583844184875,
"learning_rate": 0.001184825753268177,
"loss": 1.3359,
"step": 2940
},
{
"epoch": 0.3111814345991561,
"grad_norm": 0.33286401629447937,
"learning_rate": 0.0011827777327161814,
"loss": 1.3453,
"step": 2950
},
{
"epoch": 0.31223628691983124,
"grad_norm": 0.36974620819091797,
"learning_rate": 0.0011807248626783714,
"loss": 1.3248,
"step": 2960
},
{
"epoch": 0.31329113924050633,
"grad_norm": 0.3501949608325958,
"learning_rate": 0.0011786671661581584,
"loss": 1.3218,
"step": 2970
},
{
"epoch": 0.3143459915611814,
"grad_norm": 0.3529147207736969,
"learning_rate": 0.001176604666213036,
"loss": 1.3278,
"step": 2980
},
{
"epoch": 0.3154008438818565,
"grad_norm": 0.33370137214660645,
"learning_rate": 0.0011745373859543236,
"loss": 1.3411,
"step": 2990
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.3584390878677368,
"learning_rate": 0.0011724653485469063,
"loss": 1.3221,
"step": 3000
},
{
"epoch": 0.31751054852320676,
"grad_norm": 0.3395592272281647,
"learning_rate": 0.0011703885772089743,
"loss": 1.3285,
"step": 3010
},
{
"epoch": 0.31856540084388185,
"grad_norm": 0.33367520570755005,
"learning_rate": 0.0011683070952117646,
"loss": 1.3291,
"step": 3020
},
{
"epoch": 0.31962025316455694,
"grad_norm": 0.4275783896446228,
"learning_rate": 0.0011662209258792998,
"loss": 1.3179,
"step": 3030
},
{
"epoch": 0.3206751054852321,
"grad_norm": 0.385898619890213,
"learning_rate": 0.0011641300925881257,
"loss": 1.3256,
"step": 3040
},
{
"epoch": 0.3217299578059072,
"grad_norm": 0.3597271144390106,
"learning_rate": 0.0011620346187670501,
"loss": 1.3219,
"step": 3050
},
{
"epoch": 0.3227848101265823,
"grad_norm": 0.34854650497436523,
"learning_rate": 0.0011599345278968806,
"loss": 1.3354,
"step": 3060
},
{
"epoch": 0.32383966244725737,
"grad_norm": 0.34655341506004333,
"learning_rate": 0.0011578298435101604,
"loss": 1.3218,
"step": 3070
},
{
"epoch": 0.32489451476793246,
"grad_norm": 0.34645092487335205,
"learning_rate": 0.0011557205891909062,
"loss": 1.3141,
"step": 3080
},
{
"epoch": 0.3259493670886076,
"grad_norm": 0.338767945766449,
"learning_rate": 0.0011536067885743423,
"loss": 1.3299,
"step": 3090
},
{
"epoch": 0.3270042194092827,
"grad_norm": 0.34491071105003357,
"learning_rate": 0.001151488465346637,
"loss": 1.3169,
"step": 3100
},
{
"epoch": 0.3280590717299578,
"grad_norm": 0.3524955213069916,
"learning_rate": 0.0011493656432446362,
"loss": 1.3186,
"step": 3110
},
{
"epoch": 0.3291139240506329,
"grad_norm": 0.37061384320259094,
"learning_rate": 0.0011472383460555983,
"loss": 1.3234,
"step": 3120
},
{
"epoch": 0.33016877637130804,
"grad_norm": 0.36625683307647705,
"learning_rate": 0.001145106597616927,
"loss": 1.3338,
"step": 3130
},
{
"epoch": 0.33122362869198313,
"grad_norm": 0.384627103805542,
"learning_rate": 0.001142970421815904,
"loss": 1.3098,
"step": 3140
},
{
"epoch": 0.3322784810126582,
"grad_norm": 0.3396291136741638,
"learning_rate": 0.0011408298425894226,
"loss": 1.3219,
"step": 3150
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.3925064206123352,
"learning_rate": 0.0011386848839237186,
"loss": 1.3212,
"step": 3160
},
{
"epoch": 0.33438818565400846,
"grad_norm": 0.3743514120578766,
"learning_rate": 0.0011365355698541005,
"loss": 1.3204,
"step": 3170
},
{
"epoch": 0.33544303797468356,
"grad_norm": 0.37474945187568665,
"learning_rate": 0.0011343819244646824,
"loss": 1.3147,
"step": 3180
},
{
"epoch": 0.33649789029535865,
"grad_norm": 0.41234973073005676,
"learning_rate": 0.001132223971888112,
"loss": 1.3149,
"step": 3190
},
{
"epoch": 0.33755274261603374,
"grad_norm": 0.3496325612068176,
"learning_rate": 0.0011300617363053024,
"loss": 1.31,
"step": 3200
},
{
"epoch": 0.33860759493670883,
"grad_norm": 0.3794001042842865,
"learning_rate": 0.0011278952419451586,
"loss": 1.3232,
"step": 3210
},
{
"epoch": 0.339662447257384,
"grad_norm": 0.38375622034072876,
"learning_rate": 0.0011257245130843077,
"loss": 1.3148,
"step": 3220
},
{
"epoch": 0.3407172995780591,
"grad_norm": 0.3591216802597046,
"learning_rate": 0.0011235495740468265,
"loss": 1.3092,
"step": 3230
},
{
"epoch": 0.34177215189873417,
"grad_norm": 0.3375595211982727,
"learning_rate": 0.0011213704492039694,
"loss": 1.29,
"step": 3240
},
{
"epoch": 0.34282700421940926,
"grad_norm": 0.37955981492996216,
"learning_rate": 0.001119187162973894,
"loss": 1.3161,
"step": 3250
},
{
"epoch": 0.3438818565400844,
"grad_norm": 0.35981568694114685,
"learning_rate": 0.001116999739821388,
"loss": 1.3048,
"step": 3260
},
{
"epoch": 0.3449367088607595,
"grad_norm": 0.35022369027137756,
"learning_rate": 0.0011148082042575968,
"loss": 1.3239,
"step": 3270
},
{
"epoch": 0.3459915611814346,
"grad_norm": 0.3633371889591217,
"learning_rate": 0.0011126125808397461,
"loss": 1.322,
"step": 3280
},
{
"epoch": 0.3470464135021097,
"grad_norm": 0.4322781264781952,
"learning_rate": 0.0011104128941708683,
"loss": 1.3041,
"step": 3290
},
{
"epoch": 0.34810126582278483,
"grad_norm": 0.34436267614364624,
"learning_rate": 0.001108209168899527,
"loss": 1.3112,
"step": 3300
},
{
"epoch": 0.3491561181434599,
"grad_norm": 0.3594895601272583,
"learning_rate": 0.0011060014297195396,
"loss": 1.3094,
"step": 3310
},
{
"epoch": 0.350210970464135,
"grad_norm": 0.35558730363845825,
"learning_rate": 0.0011037897013697015,
"loss": 1.3153,
"step": 3320
},
{
"epoch": 0.3512658227848101,
"grad_norm": 0.3509814441204071,
"learning_rate": 0.0011015740086335092,
"loss": 1.303,
"step": 3330
},
{
"epoch": 0.35232067510548526,
"grad_norm": 0.39170390367507935,
"learning_rate": 0.0010993543763388814,
"loss": 1.3075,
"step": 3340
},
{
"epoch": 0.35337552742616035,
"grad_norm": 0.38959237933158875,
"learning_rate": 0.0010971308293578814,
"loss": 1.3028,
"step": 3350
},
{
"epoch": 0.35443037974683544,
"grad_norm": 0.4082445204257965,
"learning_rate": 0.0010949033926064397,
"loss": 1.3028,
"step": 3360
},
{
"epoch": 0.35548523206751054,
"grad_norm": 0.45328179001808167,
"learning_rate": 0.0010926720910440725,
"loss": 1.3113,
"step": 3370
},
{
"epoch": 0.35654008438818563,
"grad_norm": 0.40795767307281494,
"learning_rate": 0.001090436949673603,
"loss": 1.3048,
"step": 3380
},
{
"epoch": 0.3575949367088608,
"grad_norm": 0.34642016887664795,
"learning_rate": 0.0010881979935408815,
"loss": 1.2998,
"step": 3390
},
{
"epoch": 0.35864978902953587,
"grad_norm": 0.4289262592792511,
"learning_rate": 0.0010859552477345052,
"loss": 1.3076,
"step": 3400
},
{
"epoch": 0.35970464135021096,
"grad_norm": 0.3674197494983673,
"learning_rate": 0.001083708737385536,
"loss": 1.3056,
"step": 3410
},
{
"epoch": 0.36075949367088606,
"grad_norm": 0.3692392110824585,
"learning_rate": 0.0010814584876672187,
"loss": 1.2879,
"step": 3420
},
{
"epoch": 0.3618143459915612,
"grad_norm": 0.3641003966331482,
"learning_rate": 0.0010792045237947008,
"loss": 1.2997,
"step": 3430
},
{
"epoch": 0.3628691983122363,
"grad_norm": 0.3449475169181824,
"learning_rate": 0.0010769468710247478,
"loss": 1.3001,
"step": 3440
},
{
"epoch": 0.3639240506329114,
"grad_norm": 0.3534505367279053,
"learning_rate": 0.0010746855546554612,
"loss": 1.2934,
"step": 3450
},
{
"epoch": 0.3649789029535865,
"grad_norm": 0.383791983127594,
"learning_rate": 0.0010724206000259954,
"loss": 1.2894,
"step": 3460
},
{
"epoch": 0.36603375527426163,
"grad_norm": 0.3717612028121948,
"learning_rate": 0.0010701520325162727,
"loss": 1.3072,
"step": 3470
},
{
"epoch": 0.3670886075949367,
"grad_norm": 0.3450316786766052,
"learning_rate": 0.0010678798775467001,
"loss": 1.3092,
"step": 3480
},
{
"epoch": 0.3681434599156118,
"grad_norm": 0.3202390670776367,
"learning_rate": 0.0010656041605778832,
"loss": 1.3014,
"step": 3490
},
{
"epoch": 0.3691983122362869,
"grad_norm": 0.3394745886325836,
"learning_rate": 0.001063324907110342,
"loss": 1.2814,
"step": 3500
},
{
"epoch": 0.370253164556962,
"grad_norm": 0.3754570782184601,
"learning_rate": 0.0010610421426842241,
"loss": 1.3015,
"step": 3510
},
{
"epoch": 0.37130801687763715,
"grad_norm": 0.367051899433136,
"learning_rate": 0.00105875589287902,
"loss": 1.2982,
"step": 3520
},
{
"epoch": 0.37236286919831224,
"grad_norm": 0.39652103185653687,
"learning_rate": 0.0010564661833132752,
"loss": 1.3068,
"step": 3530
},
{
"epoch": 0.37341772151898733,
"grad_norm": 0.35056713223457336,
"learning_rate": 0.001054173039644303,
"loss": 1.2984,
"step": 3540
},
{
"epoch": 0.3744725738396624,
"grad_norm": 0.37702634930610657,
"learning_rate": 0.0010518764875678981,
"loss": 1.2922,
"step": 3550
},
{
"epoch": 0.3755274261603376,
"grad_norm": 0.35276398062705994,
"learning_rate": 0.001049576552818048,
"loss": 1.2941,
"step": 3560
},
{
"epoch": 0.37658227848101267,
"grad_norm": 0.3581322729587555,
"learning_rate": 0.0010472732611666448,
"loss": 1.2877,
"step": 3570
},
{
"epoch": 0.37763713080168776,
"grad_norm": 0.4018959701061249,
"learning_rate": 0.0010449666384231954,
"loss": 1.2892,
"step": 3580
},
{
"epoch": 0.37869198312236285,
"grad_norm": 0.382752925157547,
"learning_rate": 0.0010426567104345346,
"loss": 1.3005,
"step": 3590
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.37618669867515564,
"learning_rate": 0.0010403435030845332,
"loss": 1.2861,
"step": 3600
},
{
"epoch": 0.3808016877637131,
"grad_norm": 0.35512155294418335,
"learning_rate": 0.0010380270422938093,
"loss": 1.284,
"step": 3610
},
{
"epoch": 0.3818565400843882,
"grad_norm": 0.47898605465888977,
"learning_rate": 0.0010357073540194362,
"loss": 1.2887,
"step": 3620
},
{
"epoch": 0.3829113924050633,
"grad_norm": 0.3588807284832001,
"learning_rate": 0.001033384464254655,
"loss": 1.2857,
"step": 3630
},
{
"epoch": 0.38396624472573837,
"grad_norm": 0.3950936198234558,
"learning_rate": 0.001031058399028579,
"loss": 1.2873,
"step": 3640
},
{
"epoch": 0.3850210970464135,
"grad_norm": 0.3477298319339752,
"learning_rate": 0.001028729184405905,
"loss": 1.2954,
"step": 3650
},
{
"epoch": 0.3860759493670886,
"grad_norm": 0.36207327246665955,
"learning_rate": 0.0010263968464866201,
"loss": 1.2892,
"step": 3660
},
{
"epoch": 0.3871308016877637,
"grad_norm": 0.44805607199668884,
"learning_rate": 0.0010240614114057098,
"loss": 1.2815,
"step": 3670
},
{
"epoch": 0.3881856540084388,
"grad_norm": 0.43384575843811035,
"learning_rate": 0.001021722905332864,
"loss": 1.2851,
"step": 3680
},
{
"epoch": 0.38924050632911394,
"grad_norm": 0.35028716921806335,
"learning_rate": 0.0010193813544721855,
"loss": 1.2895,
"step": 3690
},
{
"epoch": 0.39029535864978904,
"grad_norm": 0.3653584122657776,
"learning_rate": 0.001017036785061895,
"loss": 1.2941,
"step": 3700
},
{
"epoch": 0.39135021097046413,
"grad_norm": 0.34270143508911133,
"learning_rate": 0.0010146892233740376,
"loss": 1.2841,
"step": 3710
},
{
"epoch": 0.3924050632911392,
"grad_norm": 0.41713616251945496,
"learning_rate": 0.0010123386957141883,
"loss": 1.2662,
"step": 3720
},
{
"epoch": 0.39345991561181437,
"grad_norm": 0.3545433282852173,
"learning_rate": 0.0010099852284211573,
"loss": 1.2843,
"step": 3730
},
{
"epoch": 0.39451476793248946,
"grad_norm": 0.3524881601333618,
"learning_rate": 0.0010076288478666944,
"loss": 1.2817,
"step": 3740
},
{
"epoch": 0.39556962025316456,
"grad_norm": 0.3451279103755951,
"learning_rate": 0.0010052695804551946,
"loss": 1.2877,
"step": 3750
},
{
"epoch": 0.39662447257383965,
"grad_norm": 0.36001715064048767,
"learning_rate": 0.0010029074526234014,
"loss": 1.275,
"step": 3760
},
{
"epoch": 0.39767932489451474,
"grad_norm": 0.3992428183555603,
"learning_rate": 0.0010005424908401104,
"loss": 1.2909,
"step": 3770
},
{
"epoch": 0.3987341772151899,
"grad_norm": 0.3687851130962372,
"learning_rate": 0.0009981747216058728,
"loss": 1.2761,
"step": 3780
},
{
"epoch": 0.399789029535865,
"grad_norm": 0.39275357127189636,
"learning_rate": 0.0009958041714526998,
"loss": 1.2806,
"step": 3790
},
{
"epoch": 0.4008438818565401,
"grad_norm": 0.3487364947795868,
"learning_rate": 0.0009934308669437627,
"loss": 1.2866,
"step": 3800
},
{
"epoch": 0.40189873417721517,
"grad_norm": 0.3634994924068451,
"learning_rate": 0.0009910548346730972,
"loss": 1.2745,
"step": 3810
},
{
"epoch": 0.4029535864978903,
"grad_norm": 0.35668596625328064,
"learning_rate": 0.0009886761012653062,
"loss": 1.2684,
"step": 3820
},
{
"epoch": 0.4040084388185654,
"grad_norm": 0.3821624517440796,
"learning_rate": 0.000986294693375258,
"loss": 1.2716,
"step": 3830
},
{
"epoch": 0.4050632911392405,
"grad_norm": 0.3544701039791107,
"learning_rate": 0.000983910637687791,
"loss": 1.2849,
"step": 3840
},
{
"epoch": 0.4061181434599156,
"grad_norm": 0.3593977987766266,
"learning_rate": 0.0009815239609174138,
"loss": 1.268,
"step": 3850
},
{
"epoch": 0.40717299578059074,
"grad_norm": 0.3400956988334656,
"learning_rate": 0.0009791346898080043,
"loss": 1.28,
"step": 3860
},
{
"epoch": 0.40822784810126583,
"grad_norm": 0.3652053475379944,
"learning_rate": 0.0009767428511325122,
"loss": 1.2733,
"step": 3870
},
{
"epoch": 0.4092827004219409,
"grad_norm": 0.37829872965812683,
"learning_rate": 0.0009743484716926576,
"loss": 1.2622,
"step": 3880
},
{
"epoch": 0.410337552742616,
"grad_norm": 0.36608776450157166,
"learning_rate": 0.0009719515783186319,
"loss": 1.2642,
"step": 3890
},
{
"epoch": 0.41139240506329117,
"grad_norm": 0.3877875506877899,
"learning_rate": 0.0009695521978687951,
"loss": 1.2656,
"step": 3900
},
{
"epoch": 0.41244725738396626,
"grad_norm": 0.37717291712760925,
"learning_rate": 0.0009671503572293767,
"loss": 1.2705,
"step": 3910
},
{
"epoch": 0.41350210970464135,
"grad_norm": 0.3739968538284302,
"learning_rate": 0.0009647460833141742,
"loss": 1.2633,
"step": 3920
},
{
"epoch": 0.41455696202531644,
"grad_norm": 0.3548559546470642,
"learning_rate": 0.0009623394030642507,
"loss": 1.2697,
"step": 3930
},
{
"epoch": 0.41561181434599154,
"grad_norm": 0.3345405161380768,
"learning_rate": 0.0009599303434476334,
"loss": 1.2639,
"step": 3940
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.3435943126678467,
"learning_rate": 0.0009575189314590118,
"loss": 1.2698,
"step": 3950
},
{
"epoch": 0.4177215189873418,
"grad_norm": 0.3582841157913208,
"learning_rate": 0.0009551051941194346,
"loss": 1.2699,
"step": 3960
},
{
"epoch": 0.41877637130801687,
"grad_norm": 0.39052221179008484,
"learning_rate": 0.0009526891584760071,
"loss": 1.2617,
"step": 3970
},
{
"epoch": 0.41983122362869196,
"grad_norm": 0.34528857469558716,
"learning_rate": 0.0009502708516015889,
"loss": 1.2704,
"step": 3980
},
{
"epoch": 0.4208860759493671,
"grad_norm": 0.35940083861351013,
"learning_rate": 0.0009478503005944888,
"loss": 1.2612,
"step": 3990
},
{
"epoch": 0.4219409282700422,
"grad_norm": 0.4698091447353363,
"learning_rate": 0.0009454275325781632,
"loss": 1.2752,
"step": 4000
},
{
"epoch": 0.4229957805907173,
"grad_norm": 0.35269781947135925,
"learning_rate": 0.0009430025747009104,
"loss": 1.2627,
"step": 4010
},
{
"epoch": 0.4240506329113924,
"grad_norm": 0.35480797290802,
"learning_rate": 0.0009405754541355677,
"loss": 1.2644,
"step": 4020
},
{
"epoch": 0.42510548523206754,
"grad_norm": 0.35654541850090027,
"learning_rate": 0.0009381461980792061,
"loss": 1.2581,
"step": 4030
},
{
"epoch": 0.42616033755274263,
"grad_norm": 0.4350382387638092,
"learning_rate": 0.0009357148337528256,
"loss": 1.265,
"step": 4040
},
{
"epoch": 0.4272151898734177,
"grad_norm": 0.43560877442359924,
"learning_rate": 0.0009332813884010511,
"loss": 1.2698,
"step": 4050
},
{
"epoch": 0.4282700421940928,
"grad_norm": 0.3371978998184204,
"learning_rate": 0.0009308458892918259,
"loss": 1.271,
"step": 4060
},
{
"epoch": 0.4293248945147679,
"grad_norm": 0.3576991856098175,
"learning_rate": 0.0009284083637161064,
"loss": 1.2678,
"step": 4070
},
{
"epoch": 0.43037974683544306,
"grad_norm": 0.39786258339881897,
"learning_rate": 0.0009259688389875574,
"loss": 1.2712,
"step": 4080
},
{
"epoch": 0.43143459915611815,
"grad_norm": 0.40100061893463135,
"learning_rate": 0.0009235273424422442,
"loss": 1.264,
"step": 4090
},
{
"epoch": 0.43248945147679324,
"grad_norm": 0.3555314242839813,
"learning_rate": 0.0009210839014383282,
"loss": 1.2598,
"step": 4100
},
{
"epoch": 0.43354430379746833,
"grad_norm": 0.38855957984924316,
"learning_rate": 0.0009186385433557584,
"loss": 1.2577,
"step": 4110
},
{
"epoch": 0.4345991561181435,
"grad_norm": 0.42936259508132935,
"learning_rate": 0.0009161912955959668,
"loss": 1.2618,
"step": 4120
},
{
"epoch": 0.4356540084388186,
"grad_norm": 0.3469286561012268,
"learning_rate": 0.000913742185581559,
"loss": 1.2507,
"step": 4130
},
{
"epoch": 0.43670886075949367,
"grad_norm": 0.338794082403183,
"learning_rate": 0.0009112912407560086,
"loss": 1.2559,
"step": 4140
},
{
"epoch": 0.43776371308016876,
"grad_norm": 0.38378769159317017,
"learning_rate": 0.0009088384885833495,
"loss": 1.2606,
"step": 4150
},
{
"epoch": 0.4388185654008439,
"grad_norm": 0.37784048914909363,
"learning_rate": 0.000906383956547867,
"loss": 1.2539,
"step": 4160
},
{
"epoch": 0.439873417721519,
"grad_norm": 0.3505966067314148,
"learning_rate": 0.0009039276721537915,
"loss": 1.2546,
"step": 4170
},
{
"epoch": 0.4409282700421941,
"grad_norm": 0.37530627846717834,
"learning_rate": 0.0009014696629249886,
"loss": 1.253,
"step": 4180
},
{
"epoch": 0.4419831223628692,
"grad_norm": 0.37926673889160156,
"learning_rate": 0.0008990099564046522,
"loss": 1.258,
"step": 4190
},
{
"epoch": 0.4430379746835443,
"grad_norm": 0.3719365894794464,
"learning_rate": 0.0008965485801549946,
"loss": 1.2585,
"step": 4200
},
{
"epoch": 0.4440928270042194,
"grad_norm": 0.34479984641075134,
"learning_rate": 0.000894085561756939,
"loss": 1.2487,
"step": 4210
},
{
"epoch": 0.4451476793248945,
"grad_norm": 0.3615456521511078,
"learning_rate": 0.0008916209288098088,
"loss": 1.257,
"step": 4220
},
{
"epoch": 0.4462025316455696,
"grad_norm": 0.3427673280239105,
"learning_rate": 0.0008891547089310198,
"loss": 1.2533,
"step": 4230
},
{
"epoch": 0.4472573839662447,
"grad_norm": 0.34341564774513245,
"learning_rate": 0.0008866869297557699,
"loss": 1.2475,
"step": 4240
},
{
"epoch": 0.44831223628691985,
"grad_norm": 0.34663861989974976,
"learning_rate": 0.0008842176189367299,
"loss": 1.254,
"step": 4250
},
{
"epoch": 0.44936708860759494,
"grad_norm": 0.35745692253112793,
"learning_rate": 0.0008817468041437329,
"loss": 1.2599,
"step": 4260
},
{
"epoch": 0.45042194092827004,
"grad_norm": 0.35569432377815247,
"learning_rate": 0.0008792745130634654,
"loss": 1.2498,
"step": 4270
},
{
"epoch": 0.45147679324894513,
"grad_norm": 0.34108027815818787,
"learning_rate": 0.0008768007733991561,
"loss": 1.2439,
"step": 4280
},
{
"epoch": 0.4525316455696203,
"grad_norm": 0.3217419385910034,
"learning_rate": 0.0008743256128702658,
"loss": 1.255,
"step": 4290
},
{
"epoch": 0.45358649789029537,
"grad_norm": 0.39865434169769287,
"learning_rate": 0.0008718490592121768,
"loss": 1.2542,
"step": 4300
},
{
"epoch": 0.45464135021097046,
"grad_norm": 0.3799431025981903,
"learning_rate": 0.0008693711401758822,
"loss": 1.2478,
"step": 4310
},
{
"epoch": 0.45569620253164556,
"grad_norm": 0.3823153078556061,
"learning_rate": 0.0008668918835276747,
"loss": 1.2481,
"step": 4320
},
{
"epoch": 0.45675105485232065,
"grad_norm": 0.3550722599029541,
"learning_rate": 0.0008644113170488355,
"loss": 1.2545,
"step": 4330
},
{
"epoch": 0.4578059071729958,
"grad_norm": 0.3509629964828491,
"learning_rate": 0.0008619294685353235,
"loss": 1.2467,
"step": 4340
},
{
"epoch": 0.4588607594936709,
"grad_norm": 0.36587169766426086,
"learning_rate": 0.0008594463657974627,
"loss": 1.2447,
"step": 4350
},
{
"epoch": 0.459915611814346,
"grad_norm": 0.3689899146556854,
"learning_rate": 0.0008569620366596322,
"loss": 1.2618,
"step": 4360
},
{
"epoch": 0.4609704641350211,
"grad_norm": 0.3530426621437073,
"learning_rate": 0.000854476508959953,
"loss": 1.2376,
"step": 4370
},
{
"epoch": 0.4620253164556962,
"grad_norm": 0.35614463686943054,
"learning_rate": 0.0008519898105499762,
"loss": 1.2429,
"step": 4380
},
{
"epoch": 0.4630801687763713,
"grad_norm": 0.351806104183197,
"learning_rate": 0.0008495019692943721,
"loss": 1.2407,
"step": 4390
},
{
"epoch": 0.4641350210970464,
"grad_norm": 0.37533193826675415,
"learning_rate": 0.0008470130130706166,
"loss": 1.2446,
"step": 4400
},
{
"epoch": 0.4651898734177215,
"grad_norm": 0.3407420814037323,
"learning_rate": 0.0008445229697686795,
"loss": 1.259,
"step": 4410
},
{
"epoch": 0.46624472573839665,
"grad_norm": 0.3381921648979187,
"learning_rate": 0.0008420318672907119,
"loss": 1.2444,
"step": 4420
},
{
"epoch": 0.46729957805907174,
"grad_norm": 0.33693447709083557,
"learning_rate": 0.0008395397335507334,
"loss": 1.2468,
"step": 4430
},
{
"epoch": 0.46835443037974683,
"grad_norm": 0.39592495560646057,
"learning_rate": 0.0008370465964743196,
"loss": 1.2399,
"step": 4440
},
{
"epoch": 0.4694092827004219,
"grad_norm": 0.3851742446422577,
"learning_rate": 0.0008345524839982886,
"loss": 1.2503,
"step": 4450
},
{
"epoch": 0.4704641350210971,
"grad_norm": 0.3582525849342346,
"learning_rate": 0.0008320574240703886,
"loss": 1.2354,
"step": 4460
},
{
"epoch": 0.47151898734177217,
"grad_norm": 0.3568393290042877,
"learning_rate": 0.0008295614446489842,
"loss": 1.2437,
"step": 4470
},
{
"epoch": 0.47257383966244726,
"grad_norm": 0.3440687656402588,
"learning_rate": 0.0008270645737027441,
"loss": 1.2495,
"step": 4480
},
{
"epoch": 0.47362869198312235,
"grad_norm": 0.3949049711227417,
"learning_rate": 0.0008245668392103259,
"loss": 1.2386,
"step": 4490
},
{
"epoch": 0.47468354430379744,
"grad_norm": 0.33692407608032227,
"learning_rate": 0.0008220682691600645,
"loss": 1.2374,
"step": 4500
},
{
"epoch": 0.4757383966244726,
"grad_norm": 0.3463355600833893,
"learning_rate": 0.0008195688915496571,
"loss": 1.2433,
"step": 4510
},
{
"epoch": 0.4767932489451477,
"grad_norm": 0.34808549284935,
"learning_rate": 0.0008170687343858506,
"loss": 1.2243,
"step": 4520
},
{
"epoch": 0.4778481012658228,
"grad_norm": 0.49410006403923035,
"learning_rate": 0.0008145678256841265,
"loss": 1.2453,
"step": 4530
},
{
"epoch": 0.47890295358649787,
"grad_norm": 0.3992741107940674,
"learning_rate": 0.0008120661934683879,
"loss": 1.2427,
"step": 4540
},
{
"epoch": 0.479957805907173,
"grad_norm": 0.34049564599990845,
"learning_rate": 0.0008095638657706456,
"loss": 1.2424,
"step": 4550
},
{
"epoch": 0.4810126582278481,
"grad_norm": 0.33464476466178894,
"learning_rate": 0.000807060870630703,
"loss": 1.2317,
"step": 4560
},
{
"epoch": 0.4820675105485232,
"grad_norm": 0.356582909822464,
"learning_rate": 0.000804557236095843,
"loss": 1.2142,
"step": 4570
},
{
"epoch": 0.4831223628691983,
"grad_norm": 0.39746856689453125,
"learning_rate": 0.0008020529902205129,
"loss": 1.2384,
"step": 4580
},
{
"epoch": 0.48417721518987344,
"grad_norm": 0.36983558535575867,
"learning_rate": 0.0007995481610660108,
"loss": 1.2247,
"step": 4590
},
{
"epoch": 0.48523206751054854,
"grad_norm": 0.3777245581150055,
"learning_rate": 0.0007970427767001702,
"loss": 1.241,
"step": 4600
},
{
"epoch": 0.48628691983122363,
"grad_norm": 0.35135287046432495,
"learning_rate": 0.0007945368651970464,
"loss": 1.2422,
"step": 4610
},
{
"epoch": 0.4873417721518987,
"grad_norm": 0.3400181829929352,
"learning_rate": 0.0007920304546366013,
"loss": 1.2234,
"step": 4620
},
{
"epoch": 0.4883966244725738,
"grad_norm": 0.41734570264816284,
"learning_rate": 0.000789523573104389,
"loss": 1.2398,
"step": 4630
},
{
"epoch": 0.48945147679324896,
"grad_norm": 0.3581525385379791,
"learning_rate": 0.0007870162486912414,
"loss": 1.2394,
"step": 4640
},
{
"epoch": 0.49050632911392406,
"grad_norm": 0.35023796558380127,
"learning_rate": 0.0007845085094929527,
"loss": 1.2366,
"step": 4650
},
{
"epoch": 0.49156118143459915,
"grad_norm": 0.39396238327026367,
"learning_rate": 0.0007820003836099649,
"loss": 1.232,
"step": 4660
},
{
"epoch": 0.49261603375527424,
"grad_norm": 0.38713324069976807,
"learning_rate": 0.0007794918991470537,
"loss": 1.2324,
"step": 4670
},
{
"epoch": 0.4936708860759494,
"grad_norm": 0.3926469385623932,
"learning_rate": 0.0007769830842130119,
"loss": 1.2275,
"step": 4680
},
{
"epoch": 0.4947257383966245,
"grad_norm": 0.32987460494041443,
"learning_rate": 0.0007744739669203361,
"loss": 1.2215,
"step": 4690
},
{
"epoch": 0.4957805907172996,
"grad_norm": 0.37924712896347046,
"learning_rate": 0.0007719645753849108,
"loss": 1.2297,
"step": 4700
},
{
"epoch": 0.49683544303797467,
"grad_norm": 0.3345460593700409,
"learning_rate": 0.0007694549377256932,
"loss": 1.2321,
"step": 4710
},
{
"epoch": 0.4978902953586498,
"grad_norm": 0.4350821077823639,
"learning_rate": 0.0007669450820643987,
"loss": 1.2302,
"step": 4720
},
{
"epoch": 0.4989451476793249,
"grad_norm": 0.36744949221611023,
"learning_rate": 0.0007644350365251855,
"loss": 1.2244,
"step": 4730
},
{
"epoch": 0.5,
"grad_norm": 0.35269591212272644,
"learning_rate": 0.0007619248292343399,
"loss": 1.2237,
"step": 4740
},
{
"epoch": 0.5010548523206751,
"grad_norm": 0.3423689603805542,
"learning_rate": 0.0007594144883199599,
"loss": 1.2441,
"step": 4750
},
{
"epoch": 0.5021097046413502,
"grad_norm": 0.33250823616981506,
"learning_rate": 0.0007569040419116413,
"loss": 1.2274,
"step": 4760
},
{
"epoch": 0.5031645569620253,
"grad_norm": 0.4062730073928833,
"learning_rate": 0.000754393518140162,
"loss": 1.2284,
"step": 4770
},
{
"epoch": 0.5042194092827004,
"grad_norm": 0.33738499879837036,
"learning_rate": 0.0007518829451371665,
"loss": 1.2336,
"step": 4780
},
{
"epoch": 0.5052742616033755,
"grad_norm": 0.3440134525299072,
"learning_rate": 0.0007493723510348516,
"loss": 1.22,
"step": 4790
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.35827070474624634,
"learning_rate": 0.0007468617639656496,
"loss": 1.2188,
"step": 4800
},
{
"epoch": 0.5073839662447257,
"grad_norm": 0.3508043885231018,
"learning_rate": 0.0007443512120619144,
"loss": 1.2208,
"step": 4810
},
{
"epoch": 0.5084388185654009,
"grad_norm": 0.3430309593677521,
"learning_rate": 0.0007418407234556067,
"loss": 1.2432,
"step": 4820
},
{
"epoch": 0.509493670886076,
"grad_norm": 0.43787845969200134,
"learning_rate": 0.0007393303262779767,
"loss": 1.2253,
"step": 4830
},
{
"epoch": 0.510548523206751,
"grad_norm": 0.39787721633911133,
"learning_rate": 0.0007368200486592507,
"loss": 1.2201,
"step": 4840
},
{
"epoch": 0.5116033755274262,
"grad_norm": 0.40354400873184204,
"learning_rate": 0.0007343099187283149,
"loss": 1.2322,
"step": 4850
},
{
"epoch": 0.5126582278481012,
"grad_norm": 0.4470100402832031,
"learning_rate": 0.0007317999646124011,
"loss": 1.2214,
"step": 4860
},
{
"epoch": 0.5137130801687764,
"grad_norm": 0.3791539967060089,
"learning_rate": 0.0007292902144367704,
"loss": 1.2157,
"step": 4870
},
{
"epoch": 0.5147679324894515,
"grad_norm": 0.3646821081638336,
"learning_rate": 0.0007267806963243995,
"loss": 1.2323,
"step": 4880
},
{
"epoch": 0.5158227848101266,
"grad_norm": 0.34422731399536133,
"learning_rate": 0.0007242714383956639,
"loss": 1.2224,
"step": 4890
},
{
"epoch": 0.5168776371308017,
"grad_norm": 0.3907710313796997,
"learning_rate": 0.000721762468768024,
"loss": 1.2291,
"step": 4900
},
{
"epoch": 0.5179324894514767,
"grad_norm": 0.37104958295822144,
"learning_rate": 0.0007192538155557094,
"loss": 1.2241,
"step": 4910
},
{
"epoch": 0.5189873417721519,
"grad_norm": 0.4097077250480652,
"learning_rate": 0.0007167455068694046,
"loss": 1.2126,
"step": 4920
},
{
"epoch": 0.520042194092827,
"grad_norm": 0.3646121919155121,
"learning_rate": 0.000714237570815933,
"loss": 1.2246,
"step": 4930
},
{
"epoch": 0.5210970464135021,
"grad_norm": 0.39202240109443665,
"learning_rate": 0.0007117300354979423,
"loss": 1.2217,
"step": 4940
},
{
"epoch": 0.5221518987341772,
"grad_norm": 0.35184553265571594,
"learning_rate": 0.000709222929013591,
"loss": 1.2289,
"step": 4950
},
{
"epoch": 0.5232067510548524,
"grad_norm": 0.39552363753318787,
"learning_rate": 0.0007067162794562309,
"loss": 1.212,
"step": 4960
},
{
"epoch": 0.5242616033755274,
"grad_norm": 0.3939037024974823,
"learning_rate": 0.0007042101149140943,
"loss": 1.219,
"step": 4970
},
{
"epoch": 0.5253164556962026,
"grad_norm": 0.3670158088207245,
"learning_rate": 0.0007017044634699787,
"loss": 1.2112,
"step": 4980
},
{
"epoch": 0.5263713080168776,
"grad_norm": 0.3672795295715332,
"learning_rate": 0.0006991993532009319,
"loss": 1.2155,
"step": 4990
},
{
"epoch": 0.5274261603375527,
"grad_norm": 0.37238597869873047,
"learning_rate": 0.0006966948121779378,
"loss": 1.2237,
"step": 5000
},
{
"epoch": 0.5284810126582279,
"grad_norm": 0.34321847558021545,
"learning_rate": 0.000694190868465601,
"loss": 1.2167,
"step": 5010
},
{
"epoch": 0.5295358649789029,
"grad_norm": 0.35932210087776184,
"learning_rate": 0.0006916875501218343,
"loss": 1.2118,
"step": 5020
},
{
"epoch": 0.5305907172995781,
"grad_norm": 0.33948570489883423,
"learning_rate": 0.0006891848851975416,
"loss": 1.197,
"step": 5030
},
{
"epoch": 0.5316455696202531,
"grad_norm": 0.3529501259326935,
"learning_rate": 0.0006866829017363054,
"loss": 1.2252,
"step": 5040
},
{
"epoch": 0.5327004219409283,
"grad_norm": 0.3476143479347229,
"learning_rate": 0.0006841816277740722,
"loss": 1.2125,
"step": 5050
},
{
"epoch": 0.5337552742616034,
"grad_norm": 0.38103732466697693,
"learning_rate": 0.0006816810913388379,
"loss": 1.2257,
"step": 5060
},
{
"epoch": 0.5348101265822784,
"grad_norm": 0.3743418753147125,
"learning_rate": 0.0006791813204503342,
"loss": 1.2119,
"step": 5070
},
{
"epoch": 0.5358649789029536,
"grad_norm": 0.3913949131965637,
"learning_rate": 0.0006766823431197147,
"loss": 1.2239,
"step": 5080
},
{
"epoch": 0.5369198312236287,
"grad_norm": 0.36954477429389954,
"learning_rate": 0.0006741841873492406,
"loss": 1.215,
"step": 5090
},
{
"epoch": 0.5379746835443038,
"grad_norm": 0.3766242563724518,
"learning_rate": 0.0006716868811319671,
"loss": 1.2004,
"step": 5100
},
{
"epoch": 0.5390295358649789,
"grad_norm": 0.36319437623023987,
"learning_rate": 0.0006691904524514297,
"loss": 1.2125,
"step": 5110
},
{
"epoch": 0.540084388185654,
"grad_norm": 0.35376471281051636,
"learning_rate": 0.0006666949292813306,
"loss": 1.2082,
"step": 5120
},
{
"epoch": 0.5411392405063291,
"grad_norm": 0.3592052757740021,
"learning_rate": 0.0006642003395852258,
"loss": 1.2081,
"step": 5130
},
{
"epoch": 0.5421940928270043,
"grad_norm": 0.3607555031776428,
"learning_rate": 0.0006617067113162103,
"loss": 1.2217,
"step": 5140
},
{
"epoch": 0.5432489451476793,
"grad_norm": 0.34854963421821594,
"learning_rate": 0.0006592140724166073,
"loss": 1.2167,
"step": 5150
},
{
"epoch": 0.5443037974683544,
"grad_norm": 0.3794786036014557,
"learning_rate": 0.0006567224508176523,
"loss": 1.2116,
"step": 5160
},
{
"epoch": 0.5453586497890295,
"grad_norm": 0.352797269821167,
"learning_rate": 0.0006542318744391821,
"loss": 1.2106,
"step": 5170
},
{
"epoch": 0.5464135021097046,
"grad_norm": 0.3626922369003296,
"learning_rate": 0.0006517423711893209,
"loss": 1.2181,
"step": 5180
},
{
"epoch": 0.5474683544303798,
"grad_norm": 0.3714168667793274,
"learning_rate": 0.0006492539689641685,
"loss": 1.2022,
"step": 5190
},
{
"epoch": 0.5485232067510548,
"grad_norm": 0.39416635036468506,
"learning_rate": 0.0006467666956474865,
"loss": 1.2142,
"step": 5200
},
{
"epoch": 0.54957805907173,
"grad_norm": 0.36706307530403137,
"learning_rate": 0.0006442805791103873,
"loss": 1.2016,
"step": 5210
},
{
"epoch": 0.5506329113924051,
"grad_norm": 0.3452375829219818,
"learning_rate": 0.0006417956472110205,
"loss": 1.2108,
"step": 5220
},
{
"epoch": 0.5516877637130801,
"grad_norm": 0.4015010893344879,
"learning_rate": 0.0006393119277942614,
"loss": 1.2049,
"step": 5230
},
{
"epoch": 0.5527426160337553,
"grad_norm": 0.42108821868896484,
"learning_rate": 0.0006368294486913987,
"loss": 1.209,
"step": 5240
},
{
"epoch": 0.5537974683544303,
"grad_norm": 0.3588517904281616,
"learning_rate": 0.0006343482377198232,
"loss": 1.2027,
"step": 5250
},
{
"epoch": 0.5548523206751055,
"grad_norm": 0.3523201048374176,
"learning_rate": 0.0006318683226827151,
"loss": 1.2083,
"step": 5260
},
{
"epoch": 0.5559071729957806,
"grad_norm": 0.38106194138526917,
"learning_rate": 0.0006293897313687331,
"loss": 1.2161,
"step": 5270
},
{
"epoch": 0.5569620253164557,
"grad_norm": 0.38883480429649353,
"learning_rate": 0.0006269124915517037,
"loss": 1.2054,
"step": 5280
},
{
"epoch": 0.5580168776371308,
"grad_norm": 0.3482867181301117,
"learning_rate": 0.0006244366309903084,
"loss": 1.2067,
"step": 5290
},
{
"epoch": 0.5590717299578059,
"grad_norm": 0.3491329848766327,
"learning_rate": 0.0006219621774277737,
"loss": 1.2112,
"step": 5300
},
{
"epoch": 0.560126582278481,
"grad_norm": 0.35099756717681885,
"learning_rate": 0.00061948915859156,
"loss": 1.1956,
"step": 5310
},
{
"epoch": 0.5611814345991561,
"grad_norm": 0.3603163957595825,
"learning_rate": 0.0006170176021930509,
"loss": 1.2005,
"step": 5320
},
{
"epoch": 0.5622362869198312,
"grad_norm": 0.3842976689338684,
"learning_rate": 0.0006145475359272424,
"loss": 1.207,
"step": 5330
},
{
"epoch": 0.5632911392405063,
"grad_norm": 0.35818201303482056,
"learning_rate": 0.0006120789874724336,
"loss": 1.1974,
"step": 5340
},
{
"epoch": 0.5643459915611815,
"grad_norm": 0.34876447916030884,
"learning_rate": 0.0006096119844899151,
"loss": 1.1943,
"step": 5350
},
{
"epoch": 0.5654008438818565,
"grad_norm": 0.3554922938346863,
"learning_rate": 0.0006071465546236601,
"loss": 1.2042,
"step": 5360
},
{
"epoch": 0.5664556962025317,
"grad_norm": 0.3837421238422394,
"learning_rate": 0.0006046827255000135,
"loss": 1.2043,
"step": 5370
},
{
"epoch": 0.5675105485232067,
"grad_norm": 0.4092616140842438,
"learning_rate": 0.0006022205247273845,
"loss": 1.1952,
"step": 5380
},
{
"epoch": 0.5685654008438819,
"grad_norm": 0.37640655040740967,
"learning_rate": 0.0005997599798959343,
"loss": 1.1963,
"step": 5390
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.35962364077568054,
"learning_rate": 0.0005973011185772694,
"loss": 1.1974,
"step": 5400
},
{
"epoch": 0.570675105485232,
"grad_norm": 0.37594228982925415,
"learning_rate": 0.0005948439683241318,
"loss": 1.2018,
"step": 5410
},
{
"epoch": 0.5717299578059072,
"grad_norm": 0.36223894357681274,
"learning_rate": 0.0005923885566700896,
"loss": 1.1931,
"step": 5420
},
{
"epoch": 0.5727848101265823,
"grad_norm": 0.3633806109428406,
"learning_rate": 0.0005899349111292293,
"loss": 1.1905,
"step": 5430
},
{
"epoch": 0.5738396624472574,
"grad_norm": 0.38978320360183716,
"learning_rate": 0.0005874830591958474,
"loss": 1.1868,
"step": 5440
},
{
"epoch": 0.5748945147679325,
"grad_norm": 0.35502946376800537,
"learning_rate": 0.000585033028344142,
"loss": 1.1953,
"step": 5450
},
{
"epoch": 0.5759493670886076,
"grad_norm": 0.3547123670578003,
"learning_rate": 0.0005825848460279048,
"loss": 1.1962,
"step": 5460
},
{
"epoch": 0.5770042194092827,
"grad_norm": 0.3606116473674774,
"learning_rate": 0.0005801385396802146,
"loss": 1.1962,
"step": 5470
},
{
"epoch": 0.5780590717299579,
"grad_norm": 0.3635554313659668,
"learning_rate": 0.0005776941367131282,
"loss": 1.1984,
"step": 5480
},
{
"epoch": 0.5791139240506329,
"grad_norm": 0.4081866145133972,
"learning_rate": 0.0005752516645173745,
"loss": 1.2017,
"step": 5490
},
{
"epoch": 0.580168776371308,
"grad_norm": 0.3537876009941101,
"learning_rate": 0.0005728111504620472,
"loss": 1.1939,
"step": 5500
},
{
"epoch": 0.5812236286919831,
"grad_norm": 0.40251171588897705,
"learning_rate": 0.0005703726218942976,
"loss": 1.1996,
"step": 5510
},
{
"epoch": 0.5822784810126582,
"grad_norm": 0.4112827777862549,
"learning_rate": 0.0005679361061390295,
"loss": 1.192,
"step": 5520
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.39286091923713684,
"learning_rate": 0.0005655016304985908,
"loss": 1.1874,
"step": 5530
},
{
"epoch": 0.5843881856540084,
"grad_norm": 0.3851916193962097,
"learning_rate": 0.0005630692222524709,
"loss": 1.1848,
"step": 5540
},
{
"epoch": 0.5854430379746836,
"grad_norm": 0.3481987416744232,
"learning_rate": 0.0005606389086569911,
"loss": 1.1994,
"step": 5550
},
{
"epoch": 0.5864978902953587,
"grad_norm": 0.36745893955230713,
"learning_rate": 0.0005582107169450023,
"loss": 1.2026,
"step": 5560
},
{
"epoch": 0.5875527426160337,
"grad_norm": 0.3689424991607666,
"learning_rate": 0.0005557846743255783,
"loss": 1.1887,
"step": 5570
},
{
"epoch": 0.5886075949367089,
"grad_norm": 0.3701568841934204,
"learning_rate": 0.0005533608079837109,
"loss": 1.1889,
"step": 5580
},
{
"epoch": 0.5896624472573839,
"grad_norm": 0.35756173729896545,
"learning_rate": 0.0005509391450800061,
"loss": 1.1906,
"step": 5590
},
{
"epoch": 0.5907172995780591,
"grad_norm": 0.38948529958724976,
"learning_rate": 0.0005485197127503795,
"loss": 1.1877,
"step": 5600
},
{
"epoch": 0.5917721518987342,
"grad_norm": 0.401199072599411,
"learning_rate": 0.0005461025381057516,
"loss": 1.1901,
"step": 5610
},
{
"epoch": 0.5928270042194093,
"grad_norm": 0.3774757385253906,
"learning_rate": 0.0005436876482317444,
"loss": 1.1974,
"step": 5620
},
{
"epoch": 0.5938818565400844,
"grad_norm": 0.3717878460884094,
"learning_rate": 0.0005412750701883782,
"loss": 1.1923,
"step": 5630
},
{
"epoch": 0.5949367088607594,
"grad_norm": 0.4294511675834656,
"learning_rate": 0.0005388648310097682,
"loss": 1.2008,
"step": 5640
},
{
"epoch": 0.5959915611814346,
"grad_norm": 0.36430466175079346,
"learning_rate": 0.000536456957703821,
"loss": 1.1945,
"step": 5650
},
{
"epoch": 0.5970464135021097,
"grad_norm": 0.3893060088157654,
"learning_rate": 0.0005340514772519324,
"loss": 1.1896,
"step": 5660
},
{
"epoch": 0.5981012658227848,
"grad_norm": 0.3386535942554474,
"learning_rate": 0.0005316484166086863,
"loss": 1.1962,
"step": 5670
},
{
"epoch": 0.5991561181434599,
"grad_norm": 0.372712105512619,
"learning_rate": 0.00052924780270155,
"loss": 1.1963,
"step": 5680
},
{
"epoch": 0.6002109704641351,
"grad_norm": 0.36947306990623474,
"learning_rate": 0.0005268496624305747,
"loss": 1.1764,
"step": 5690
},
{
"epoch": 0.6012658227848101,
"grad_norm": 0.35860103368759155,
"learning_rate": 0.0005244540226680931,
"loss": 1.182,
"step": 5700
},
{
"epoch": 0.6023206751054853,
"grad_norm": 0.3623606562614441,
"learning_rate": 0.0005220609102584185,
"loss": 1.1873,
"step": 5710
},
{
"epoch": 0.6033755274261603,
"grad_norm": 0.3721083998680115,
"learning_rate": 0.0005196703520175437,
"loss": 1.1913,
"step": 5720
},
{
"epoch": 0.6044303797468354,
"grad_norm": 0.40760260820388794,
"learning_rate": 0.0005172823747328415,
"loss": 1.1886,
"step": 5730
},
{
"epoch": 0.6054852320675106,
"grad_norm": 0.35366445779800415,
"learning_rate": 0.0005148970051627632,
"loss": 1.185,
"step": 5740
},
{
"epoch": 0.6065400843881856,
"grad_norm": 0.3632625937461853,
"learning_rate": 0.0005125142700365394,
"loss": 1.159,
"step": 5750
},
{
"epoch": 0.6075949367088608,
"grad_norm": 0.40271735191345215,
"learning_rate": 0.000510134196053881,
"loss": 1.1697,
"step": 5760
},
{
"epoch": 0.6086497890295358,
"grad_norm": 0.37240713834762573,
"learning_rate": 0.0005077568098846789,
"loss": 1.1709,
"step": 5770
},
{
"epoch": 0.609704641350211,
"grad_norm": 0.40513280034065247,
"learning_rate": 0.000505382138168706,
"loss": 1.1806,
"step": 5780
},
{
"epoch": 0.6107594936708861,
"grad_norm": 0.3707410991191864,
"learning_rate": 0.0005030102075153181,
"loss": 1.1779,
"step": 5790
},
{
"epoch": 0.6118143459915611,
"grad_norm": 0.39285629987716675,
"learning_rate": 0.0005006410445031569,
"loss": 1.1814,
"step": 5800
},
{
"epoch": 0.6128691983122363,
"grad_norm": 0.36199751496315,
"learning_rate": 0.0004982746756798507,
"loss": 1.184,
"step": 5810
},
{
"epoch": 0.6139240506329114,
"grad_norm": 0.36564263701438904,
"learning_rate": 0.0004959111275617174,
"loss": 1.1968,
"step": 5820
},
{
"epoch": 0.6149789029535865,
"grad_norm": 0.3686883747577667,
"learning_rate": 0.0004935504266334677,
"loss": 1.1806,
"step": 5830
},
{
"epoch": 0.6160337552742616,
"grad_norm": 0.3595767617225647,
"learning_rate": 0.0004911925993479085,
"loss": 1.1911,
"step": 5840
},
{
"epoch": 0.6170886075949367,
"grad_norm": 0.380021870136261,
"learning_rate": 0.0004888376721256456,
"loss": 1.1966,
"step": 5850
},
{
"epoch": 0.6181434599156118,
"grad_norm": 0.3993105888366699,
"learning_rate": 0.00048648567135478805,
"loss": 1.1838,
"step": 5860
},
{
"epoch": 0.619198312236287,
"grad_norm": 0.3877832889556885,
"learning_rate": 0.0004841366233906538,
"loss": 1.1741,
"step": 5870
},
{
"epoch": 0.620253164556962,
"grad_norm": 0.3675520122051239,
"learning_rate": 0.0004817905545554717,
"loss": 1.1869,
"step": 5880
},
{
"epoch": 0.6213080168776371,
"grad_norm": 0.365359365940094,
"learning_rate": 0.00047944749113808884,
"loss": 1.1756,
"step": 5890
},
{
"epoch": 0.6223628691983122,
"grad_norm": 0.34352290630340576,
"learning_rate": 0.00047710745939367474,
"loss": 1.1786,
"step": 5900
},
{
"epoch": 0.6234177215189873,
"grad_norm": 0.3574180006980896,
"learning_rate": 0.0004747704855434278,
"loss": 1.1714,
"step": 5910
},
{
"epoch": 0.6244725738396625,
"grad_norm": 0.3392818868160248,
"learning_rate": 0.0004724365957742809,
"loss": 1.1683,
"step": 5920
},
{
"epoch": 0.6255274261603375,
"grad_norm": 0.33924534916877747,
"learning_rate": 0.00047010581623860883,
"loss": 1.1744,
"step": 5930
},
{
"epoch": 0.6265822784810127,
"grad_norm": 0.36496856808662415,
"learning_rate": 0.0004677781730539342,
"loss": 1.1808,
"step": 5940
},
{
"epoch": 0.6276371308016878,
"grad_norm": 0.35622742772102356,
"learning_rate": 0.0004654536923026356,
"loss": 1.172,
"step": 5950
},
{
"epoch": 0.6286919831223629,
"grad_norm": 0.35895973443984985,
"learning_rate": 0.00046313240003165466,
"loss": 1.1776,
"step": 5960
},
{
"epoch": 0.629746835443038,
"grad_norm": 0.3521154224872589,
"learning_rate": 0.0004608143222522048,
"loss": 1.1763,
"step": 5970
},
{
"epoch": 0.630801687763713,
"grad_norm": 0.34627509117126465,
"learning_rate": 0.0004584994849394795,
"loss": 1.1611,
"step": 5980
},
{
"epoch": 0.6318565400843882,
"grad_norm": 0.38166508078575134,
"learning_rate": 0.0004561879140323607,
"loss": 1.1793,
"step": 5990
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.37901800870895386,
"learning_rate": 0.0004538796354331298,
"loss": 1.1864,
"step": 6000
},
{
"epoch": 0.6339662447257384,
"grad_norm": 0.4082631766796112,
"learning_rate": 0.0004515746750071754,
"loss": 1.1753,
"step": 6010
},
{
"epoch": 0.6350210970464135,
"grad_norm": 0.3545146882534027,
"learning_rate": 0.0004492730585827046,
"loss": 1.1744,
"step": 6020
},
{
"epoch": 0.6360759493670886,
"grad_norm": 0.3988998830318451,
"learning_rate": 0.0004469748119504529,
"loss": 1.1594,
"step": 6030
},
{
"epoch": 0.6371308016877637,
"grad_norm": 0.38194578886032104,
"learning_rate": 0.0004446799608633964,
"loss": 1.1786,
"step": 6040
},
{
"epoch": 0.6381856540084389,
"grad_norm": 0.37773770093917847,
"learning_rate": 0.00044238853103646154,
"loss": 1.1644,
"step": 6050
},
{
"epoch": 0.6392405063291139,
"grad_norm": 0.39295461773872375,
"learning_rate": 0.00044010054814623925,
"loss": 1.18,
"step": 6060
},
{
"epoch": 0.640295358649789,
"grad_norm": 0.3779560327529907,
"learning_rate": 0.0004378160378306944,
"loss": 1.171,
"step": 6070
},
{
"epoch": 0.6413502109704642,
"grad_norm": 0.36832717061042786,
"learning_rate": 0.00043553502568888095,
"loss": 1.1697,
"step": 6080
},
{
"epoch": 0.6424050632911392,
"grad_norm": 0.3654510974884033,
"learning_rate": 0.0004332575372806534,
"loss": 1.1664,
"step": 6090
},
{
"epoch": 0.6434599156118144,
"grad_norm": 0.4271853268146515,
"learning_rate": 0.00043098359812638145,
"loss": 1.1625,
"step": 6100
},
{
"epoch": 0.6445147679324894,
"grad_norm": 0.3969768285751343,
"learning_rate": 0.00042871323370666383,
"loss": 1.1747,
"step": 6110
},
{
"epoch": 0.6455696202531646,
"grad_norm": 0.3841383755207062,
"learning_rate": 0.0004264464694620421,
"loss": 1.1684,
"step": 6120
},
{
"epoch": 0.6466244725738397,
"grad_norm": 0.3560936450958252,
"learning_rate": 0.000424183330792717,
"loss": 1.1723,
"step": 6130
},
{
"epoch": 0.6476793248945147,
"grad_norm": 0.48002129793167114,
"learning_rate": 0.0004219238430582621,
"loss": 1.1676,
"step": 6140
},
{
"epoch": 0.6487341772151899,
"grad_norm": 0.3844591975212097,
"learning_rate": 0.0004196680315773408,
"loss": 1.1788,
"step": 6150
},
{
"epoch": 0.6497890295358649,
"grad_norm": 0.3658396899700165,
"learning_rate": 0.00041741592162742214,
"loss": 1.1715,
"step": 6160
},
{
"epoch": 0.6508438818565401,
"grad_norm": 0.34545275568962097,
"learning_rate": 0.0004151675384444978,
"loss": 1.1503,
"step": 6170
},
{
"epoch": 0.6518987341772152,
"grad_norm": 0.3446422219276428,
"learning_rate": 0.00041292290722279914,
"loss": 1.1725,
"step": 6180
},
{
"epoch": 0.6529535864978903,
"grad_norm": 0.3521484434604645,
"learning_rate": 0.00041068205311451517,
"loss": 1.174,
"step": 6190
},
{
"epoch": 0.6540084388185654,
"grad_norm": 0.3657844662666321,
"learning_rate": 0.00040844500122951026,
"loss": 1.1691,
"step": 6200
},
{
"epoch": 0.6550632911392406,
"grad_norm": 0.38760536909103394,
"learning_rate": 0.00040621177663504313,
"loss": 1.1634,
"step": 6210
},
{
"epoch": 0.6561181434599156,
"grad_norm": 0.4080621898174286,
"learning_rate": 0.00040398240435548583,
"loss": 1.1728,
"step": 6220
},
{
"epoch": 0.6571729957805907,
"grad_norm": 0.37887656688690186,
"learning_rate": 0.00040175690937204324,
"loss": 1.1616,
"step": 6230
},
{
"epoch": 0.6582278481012658,
"grad_norm": 0.3961949050426483,
"learning_rate": 0.00039953531662247343,
"loss": 1.1635,
"step": 6240
},
{
"epoch": 0.6592827004219409,
"grad_norm": 0.36524301767349243,
"learning_rate": 0.0003973176510008075,
"loss": 1.1672,
"step": 6250
},
{
"epoch": 0.6603375527426161,
"grad_norm": 0.393848717212677,
"learning_rate": 0.00039510393735707233,
"loss": 1.1622,
"step": 6260
},
{
"epoch": 0.6613924050632911,
"grad_norm": 0.36277323961257935,
"learning_rate": 0.00039289420049700986,
"loss": 1.1707,
"step": 6270
},
{
"epoch": 0.6624472573839663,
"grad_norm": 0.3623575270175934,
"learning_rate": 0.0003906884651818006,
"loss": 1.1691,
"step": 6280
},
{
"epoch": 0.6635021097046413,
"grad_norm": 0.4982074797153473,
"learning_rate": 0.00038848675612778577,
"loss": 1.1671,
"step": 6290
},
{
"epoch": 0.6645569620253164,
"grad_norm": 0.35465696454048157,
"learning_rate": 0.00038628909800619046,
"loss": 1.1594,
"step": 6300
},
{
"epoch": 0.6656118143459916,
"grad_norm": 0.37563908100128174,
"learning_rate": 0.0003840955154428467,
"loss": 1.1604,
"step": 6310
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.3676312565803528,
"learning_rate": 0.00038190603301791864,
"loss": 1.1569,
"step": 6320
},
{
"epoch": 0.6677215189873418,
"grad_norm": 0.42015132308006287,
"learning_rate": 0.0003797206752656258,
"loss": 1.1656,
"step": 6330
},
{
"epoch": 0.6687763713080169,
"grad_norm": 0.36727169156074524,
"learning_rate": 0.0003775394666739688,
"loss": 1.1577,
"step": 6340
},
{
"epoch": 0.669831223628692,
"grad_norm": 0.37604963779449463,
"learning_rate": 0.00037536243168445507,
"loss": 1.1656,
"step": 6350
},
{
"epoch": 0.6708860759493671,
"grad_norm": 0.3591715097427368,
"learning_rate": 0.0003731895946918246,
"loss": 1.1637,
"step": 6360
},
{
"epoch": 0.6719409282700421,
"grad_norm": 0.4027197062969208,
"learning_rate": 0.0003710209800437769,
"loss": 1.1454,
"step": 6370
},
{
"epoch": 0.6729957805907173,
"grad_norm": 0.34489601850509644,
"learning_rate": 0.00036885661204069767,
"loss": 1.1495,
"step": 6380
},
{
"epoch": 0.6740506329113924,
"grad_norm": 0.3564709722995758,
"learning_rate": 0.0003666965149353878,
"loss": 1.1643,
"step": 6390
},
{
"epoch": 0.6751054852320675,
"grad_norm": 0.4152984917163849,
"learning_rate": 0.0003645407129327898,
"loss": 1.1621,
"step": 6400
},
{
"epoch": 0.6761603375527426,
"grad_norm": 0.407699316740036,
"learning_rate": 0.00036238923018971783,
"loss": 1.1588,
"step": 6410
},
{
"epoch": 0.6772151898734177,
"grad_norm": 0.3636753261089325,
"learning_rate": 0.0003602420908145865,
"loss": 1.1499,
"step": 6420
},
{
"epoch": 0.6782700421940928,
"grad_norm": 0.36881223320961,
"learning_rate": 0.00035809931886714093,
"loss": 1.1594,
"step": 6430
},
{
"epoch": 0.679324894514768,
"grad_norm": 0.36326852440834045,
"learning_rate": 0.00035596093835818683,
"loss": 1.1439,
"step": 6440
},
{
"epoch": 0.680379746835443,
"grad_norm": 0.35837841033935547,
"learning_rate": 0.00035382697324932245,
"loss": 1.1491,
"step": 6450
},
{
"epoch": 0.6814345991561181,
"grad_norm": 0.43497446179389954,
"learning_rate": 0.00035169744745266866,
"loss": 1.1627,
"step": 6460
},
{
"epoch": 0.6824894514767933,
"grad_norm": 0.379119336605072,
"learning_rate": 0.0003495723848306017,
"loss": 1.1577,
"step": 6470
},
{
"epoch": 0.6835443037974683,
"grad_norm": 0.3709147274494171,
"learning_rate": 0.0003474518091954859,
"loss": 1.1617,
"step": 6480
},
{
"epoch": 0.6845991561181435,
"grad_norm": 0.36736902594566345,
"learning_rate": 0.0003453357443094068,
"loss": 1.1582,
"step": 6490
},
{
"epoch": 0.6856540084388185,
"grad_norm": 0.391516774892807,
"learning_rate": 0.00034322421388390456,
"loss": 1.171,
"step": 6500
},
{
"epoch": 0.6867088607594937,
"grad_norm": 0.3488520681858063,
"learning_rate": 0.0003411172415797087,
"loss": 1.1706,
"step": 6510
},
{
"epoch": 0.6877637130801688,
"grad_norm": 0.36531731486320496,
"learning_rate": 0.0003390148510064727,
"loss": 1.1581,
"step": 6520
},
{
"epoch": 0.6888185654008439,
"grad_norm": 0.37665581703186035,
"learning_rate": 0.0003369170657225094,
"loss": 1.1484,
"step": 6530
},
{
"epoch": 0.689873417721519,
"grad_norm": 0.37849798798561096,
"learning_rate": 0.0003348239092345275,
"loss": 1.1588,
"step": 6540
},
{
"epoch": 0.6909282700421941,
"grad_norm": 0.38719886541366577,
"learning_rate": 0.0003327354049973672,
"loss": 1.1545,
"step": 6550
},
{
"epoch": 0.6919831223628692,
"grad_norm": 0.41207027435302734,
"learning_rate": 0.00033065157641373847,
"loss": 1.1541,
"step": 6560
},
{
"epoch": 0.6930379746835443,
"grad_norm": 0.42637428641319275,
"learning_rate": 0.0003285724468339576,
"loss": 1.1508,
"step": 6570
},
{
"epoch": 0.6940928270042194,
"grad_norm": 0.3876138925552368,
"learning_rate": 0.00032649803955568755,
"loss": 1.1502,
"step": 6580
},
{
"epoch": 0.6951476793248945,
"grad_norm": 0.3820565938949585,
"learning_rate": 0.00032442837782367434,
"loss": 1.1524,
"step": 6590
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.3652574419975281,
"learning_rate": 0.0003223634848294883,
"loss": 1.1583,
"step": 6600
},
{
"epoch": 0.6972573839662447,
"grad_norm": 0.4173029363155365,
"learning_rate": 0.00032030338371126374,
"loss": 1.1439,
"step": 6610
},
{
"epoch": 0.6983122362869199,
"grad_norm": 0.362628310918808,
"learning_rate": 0.0003182480975534395,
"loss": 1.1546,
"step": 6620
},
{
"epoch": 0.6993670886075949,
"grad_norm": 0.4287622272968292,
"learning_rate": 0.00031619764938650057,
"loss": 1.1467,
"step": 6630
},
{
"epoch": 0.70042194092827,
"grad_norm": 0.3689073324203491,
"learning_rate": 0.0003141520621867197,
"loss": 1.1522,
"step": 6640
},
{
"epoch": 0.7014767932489452,
"grad_norm": 0.38702285289764404,
"learning_rate": 0.00031211135887590074,
"loss": 1.1454,
"step": 6650
},
{
"epoch": 0.7025316455696202,
"grad_norm": 0.3564895689487457,
"learning_rate": 0.0003100755623211205,
"loss": 1.1472,
"step": 6660
},
{
"epoch": 0.7035864978902954,
"grad_norm": 0.36778292059898376,
"learning_rate": 0.0003080446953344735,
"loss": 1.1505,
"step": 6670
},
{
"epoch": 0.7046413502109705,
"grad_norm": 0.35331690311431885,
"learning_rate": 0.00030601878067281575,
"loss": 1.1511,
"step": 6680
},
{
"epoch": 0.7056962025316456,
"grad_norm": 0.36919105052948,
"learning_rate": 0.00030399784103751044,
"loss": 1.144,
"step": 6690
},
{
"epoch": 0.7067510548523207,
"grad_norm": 0.35663625597953796,
"learning_rate": 0.000301981899074173,
"loss": 1.1471,
"step": 6700
},
{
"epoch": 0.7078059071729957,
"grad_norm": 0.3756558299064636,
"learning_rate": 0.0002999709773724171,
"loss": 1.1485,
"step": 6710
},
{
"epoch": 0.7088607594936709,
"grad_norm": 0.3560740053653717,
"learning_rate": 0.00029796509846560294,
"loss": 1.1377,
"step": 6720
},
{
"epoch": 0.709915611814346,
"grad_norm": 0.3501749336719513,
"learning_rate": 0.0002959642848305828,
"loss": 1.1435,
"step": 6730
},
{
"epoch": 0.7109704641350211,
"grad_norm": 0.36983656883239746,
"learning_rate": 0.00029396855888745045,
"loss": 1.15,
"step": 6740
},
{
"epoch": 0.7120253164556962,
"grad_norm": 0.4085451662540436,
"learning_rate": 0.0002919779429992895,
"loss": 1.1543,
"step": 6750
},
{
"epoch": 0.7130801687763713,
"grad_norm": 0.37220215797424316,
"learning_rate": 0.0002899924594719231,
"loss": 1.1488,
"step": 6760
},
{
"epoch": 0.7141350210970464,
"grad_norm": 0.3557329475879669,
"learning_rate": 0.00028801213055366335,
"loss": 1.1492,
"step": 6770
},
{
"epoch": 0.7151898734177216,
"grad_norm": 0.37931346893310547,
"learning_rate": 0.00028603697843506315,
"loss": 1.1465,
"step": 6780
},
{
"epoch": 0.7162447257383966,
"grad_norm": 0.37470945715904236,
"learning_rate": 0.0002840670252486662,
"loss": 1.1506,
"step": 6790
},
{
"epoch": 0.7172995780590717,
"grad_norm": 0.37293675541877747,
"learning_rate": 0.00028210229306876,
"loss": 1.1384,
"step": 6800
},
{
"epoch": 0.7183544303797469,
"grad_norm": 0.3550741374492645,
"learning_rate": 0.0002801428039111279,
"loss": 1.1451,
"step": 6810
},
{
"epoch": 0.7194092827004219,
"grad_norm": 0.37655240297317505,
"learning_rate": 0.00027818857973280274,
"loss": 1.1438,
"step": 6820
},
{
"epoch": 0.7204641350210971,
"grad_norm": 0.3859023451805115,
"learning_rate": 0.0002762396424318206,
"loss": 1.1437,
"step": 6830
},
{
"epoch": 0.7215189873417721,
"grad_norm": 0.36085501313209534,
"learning_rate": 0.00027429601384697526,
"loss": 1.1343,
"step": 6840
},
{
"epoch": 0.7225738396624473,
"grad_norm": 0.37223494052886963,
"learning_rate": 0.00027235771575757466,
"loss": 1.1437,
"step": 6850
},
{
"epoch": 0.7236286919831224,
"grad_norm": 0.38057178258895874,
"learning_rate": 0.0002704247698831951,
"loss": 1.1365,
"step": 6860
},
{
"epoch": 0.7246835443037974,
"grad_norm": 0.36014029383659363,
"learning_rate": 0.0002684971978834389,
"loss": 1.135,
"step": 6870
},
{
"epoch": 0.7257383966244726,
"grad_norm": 0.34835484623908997,
"learning_rate": 0.0002665750213576914,
"loss": 1.1451,
"step": 6880
},
{
"epoch": 0.7267932489451476,
"grad_norm": 0.36837393045425415,
"learning_rate": 0.0002646582618448794,
"loss": 1.1347,
"step": 6890
},
{
"epoch": 0.7278481012658228,
"grad_norm": 0.37352582812309265,
"learning_rate": 0.00026274694082322896,
"loss": 1.1332,
"step": 6900
},
{
"epoch": 0.7289029535864979,
"grad_norm": 0.42627763748168945,
"learning_rate": 0.0002608410797100255,
"loss": 1.1583,
"step": 6910
},
{
"epoch": 0.729957805907173,
"grad_norm": 0.41890236735343933,
"learning_rate": 0.0002589406998613733,
"loss": 1.1393,
"step": 6920
},
{
"epoch": 0.7310126582278481,
"grad_norm": 0.3742920756340027,
"learning_rate": 0.0002570458225719567,
"loss": 1.1431,
"step": 6930
},
{
"epoch": 0.7320675105485233,
"grad_norm": 0.38046887516975403,
"learning_rate": 0.00025515646907480074,
"loss": 1.1469,
"step": 6940
},
{
"epoch": 0.7331223628691983,
"grad_norm": 0.3562321364879608,
"learning_rate": 0.00025327266054103395,
"loss": 1.1443,
"step": 6950
},
{
"epoch": 0.7341772151898734,
"grad_norm": 0.35831841826438904,
"learning_rate": 0.0002513944180796509,
"loss": 1.1379,
"step": 6960
},
{
"epoch": 0.7352320675105485,
"grad_norm": 0.3924483358860016,
"learning_rate": 0.0002495217627372752,
"loss": 1.1327,
"step": 6970
},
{
"epoch": 0.7362869198312236,
"grad_norm": 0.36096253991127014,
"learning_rate": 0.0002476547154979248,
"loss": 1.1359,
"step": 6980
},
{
"epoch": 0.7373417721518988,
"grad_norm": 0.37537628412246704,
"learning_rate": 0.00024579329728277534,
"loss": 1.1359,
"step": 6990
},
{
"epoch": 0.7383966244725738,
"grad_norm": 0.37565308809280396,
"learning_rate": 0.00024393752894992708,
"loss": 1.1476,
"step": 7000
},
{
"epoch": 0.739451476793249,
"grad_norm": 0.35901403427124023,
"learning_rate": 0.00024208743129417004,
"loss": 1.1208,
"step": 7010
},
{
"epoch": 0.740506329113924,
"grad_norm": 0.39581504464149475,
"learning_rate": 0.00024024302504675206,
"loss": 1.1346,
"step": 7020
},
{
"epoch": 0.7415611814345991,
"grad_norm": 0.3442055881023407,
"learning_rate": 0.0002384043308751454,
"loss": 1.1373,
"step": 7030
},
{
"epoch": 0.7426160337552743,
"grad_norm": 0.3714121878147125,
"learning_rate": 0.00023657136938281653,
"loss": 1.1447,
"step": 7040
},
{
"epoch": 0.7436708860759493,
"grad_norm": 0.38861745595932007,
"learning_rate": 0.00023474416110899377,
"loss": 1.1352,
"step": 7050
},
{
"epoch": 0.7447257383966245,
"grad_norm": 0.36330005526542664,
"learning_rate": 0.00023292272652843807,
"loss": 1.1367,
"step": 7060
},
{
"epoch": 0.7457805907172996,
"grad_norm": 0.3554536998271942,
"learning_rate": 0.00023110708605121317,
"loss": 1.143,
"step": 7070
},
{
"epoch": 0.7468354430379747,
"grad_norm": 0.37097129225730896,
"learning_rate": 0.00022929726002245728,
"loss": 1.1393,
"step": 7080
},
{
"epoch": 0.7478902953586498,
"grad_norm": 0.38714399933815,
"learning_rate": 0.00022749326872215472,
"loss": 1.1365,
"step": 7090
},
{
"epoch": 0.7489451476793249,
"grad_norm": 0.3776571452617645,
"learning_rate": 0.0002256951323649087,
"loss": 1.1327,
"step": 7100
},
{
"epoch": 0.75,
"grad_norm": 0.3625503182411194,
"learning_rate": 0.00022390287109971547,
"loss": 1.145,
"step": 7110
},
{
"epoch": 0.7510548523206751,
"grad_norm": 0.41759389638900757,
"learning_rate": 0.00022211650500973746,
"loss": 1.1381,
"step": 7120
},
{
"epoch": 0.7521097046413502,
"grad_norm": 0.3729211390018463,
"learning_rate": 0.0002203360541120789,
"loss": 1.1454,
"step": 7130
},
{
"epoch": 0.7531645569620253,
"grad_norm": 0.3729153573513031,
"learning_rate": 0.00021856153835756164,
"loss": 1.1291,
"step": 7140
},
{
"epoch": 0.7542194092827004,
"grad_norm": 0.39670905470848083,
"learning_rate": 0.00021679297763050104,
"loss": 1.1275,
"step": 7150
},
{
"epoch": 0.7552742616033755,
"grad_norm": 0.3594678044319153,
"learning_rate": 0.0002150303917484834,
"loss": 1.1357,
"step": 7160
},
{
"epoch": 0.7563291139240507,
"grad_norm": 0.40017199516296387,
"learning_rate": 0.0002132738004621446,
"loss": 1.1439,
"step": 7170
},
{
"epoch": 0.7573839662447257,
"grad_norm": 0.3732672333717346,
"learning_rate": 0.00021152322345494763,
"loss": 1.1231,
"step": 7180
},
{
"epoch": 0.7584388185654009,
"grad_norm": 0.39866897463798523,
"learning_rate": 0.00020977868034296253,
"loss": 1.1264,
"step": 7190
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.35106202960014343,
"learning_rate": 0.00020804019067464667,
"loss": 1.1298,
"step": 7200
},
{
"epoch": 0.760548523206751,
"grad_norm": 0.4421609342098236,
"learning_rate": 0.00020630777393062575,
"loss": 1.134,
"step": 7210
},
{
"epoch": 0.7616033755274262,
"grad_norm": 0.40509048104286194,
"learning_rate": 0.00020458144952347523,
"loss": 1.1451,
"step": 7220
},
{
"epoch": 0.7626582278481012,
"grad_norm": 0.3772280514240265,
"learning_rate": 0.00020286123679750314,
"loss": 1.1437,
"step": 7230
},
{
"epoch": 0.7637130801687764,
"grad_norm": 0.3716863691806793,
"learning_rate": 0.00020114715502853292,
"loss": 1.1409,
"step": 7240
},
{
"epoch": 0.7647679324894515,
"grad_norm": 0.41956421732902527,
"learning_rate": 0.0001994392234236878,
"loss": 1.1332,
"step": 7250
},
{
"epoch": 0.7658227848101266,
"grad_norm": 0.3653375804424286,
"learning_rate": 0.0001977374611211754,
"loss": 1.1337,
"step": 7260
},
{
"epoch": 0.7668776371308017,
"grad_norm": 0.3775090277194977,
"learning_rate": 0.00019604188719007313,
"loss": 1.1317,
"step": 7270
},
{
"epoch": 0.7679324894514767,
"grad_norm": 0.3718429505825043,
"learning_rate": 0.00019435252063011504,
"loss": 1.133,
"step": 7280
},
{
"epoch": 0.7689873417721519,
"grad_norm": 0.37331250309944153,
"learning_rate": 0.0001926693803714779,
"loss": 1.1447,
"step": 7290
},
{
"epoch": 0.770042194092827,
"grad_norm": 0.3802056312561035,
"learning_rate": 0.00019099248527457068,
"loss": 1.1459,
"step": 7300
},
{
"epoch": 0.7710970464135021,
"grad_norm": 0.36767441034317017,
"learning_rate": 0.0001893218541298216,
"loss": 1.1268,
"step": 7310
},
{
"epoch": 0.7721518987341772,
"grad_norm": 0.34967729449272156,
"learning_rate": 0.00018765750565746827,
"loss": 1.1214,
"step": 7320
},
{
"epoch": 0.7732067510548524,
"grad_norm": 0.3683644235134125,
"learning_rate": 0.00018599945850734812,
"loss": 1.1291,
"step": 7330
},
{
"epoch": 0.7742616033755274,
"grad_norm": 0.3712475001811981,
"learning_rate": 0.00018434773125868895,
"loss": 1.1222,
"step": 7340
},
{
"epoch": 0.7753164556962026,
"grad_norm": 0.3514195680618286,
"learning_rate": 0.00018270234241990108,
"loss": 1.1235,
"step": 7350
},
{
"epoch": 0.7763713080168776,
"grad_norm": 0.37643975019454956,
"learning_rate": 0.0001810633104283698,
"loss": 1.1306,
"step": 7360
},
{
"epoch": 0.7774261603375527,
"grad_norm": 0.38709020614624023,
"learning_rate": 0.0001794306536502492,
"loss": 1.1333,
"step": 7370
},
{
"epoch": 0.7784810126582279,
"grad_norm": 0.3852018117904663,
"learning_rate": 0.0001778043903802555,
"loss": 1.1191,
"step": 7380
},
{
"epoch": 0.7795358649789029,
"grad_norm": 0.36321789026260376,
"learning_rate": 0.0001761845388414627,
"loss": 1.128,
"step": 7390
},
{
"epoch": 0.7805907172995781,
"grad_norm": 0.3746873140335083,
"learning_rate": 0.00017457111718509831,
"loss": 1.1372,
"step": 7400
},
{
"epoch": 0.7816455696202531,
"grad_norm": 0.3483794033527374,
"learning_rate": 0.00017296414349033976,
"loss": 1.1182,
"step": 7410
},
{
"epoch": 0.7827004219409283,
"grad_norm": 0.35910797119140625,
"learning_rate": 0.00017136363576411172,
"loss": 1.1232,
"step": 7420
},
{
"epoch": 0.7837552742616034,
"grad_norm": 0.3550094664096832,
"learning_rate": 0.00016976961194088526,
"loss": 1.1108,
"step": 7430
},
{
"epoch": 0.7848101265822784,
"grad_norm": 0.3682705760002136,
"learning_rate": 0.00016818208988247533,
"loss": 1.1147,
"step": 7440
},
{
"epoch": 0.7858649789029536,
"grad_norm": 0.35992518067359924,
"learning_rate": 0.0001666010873778419,
"loss": 1.1181,
"step": 7450
},
{
"epoch": 0.7869198312236287,
"grad_norm": 0.3707635998725891,
"learning_rate": 0.00016502662214289,
"loss": 1.1287,
"step": 7460
},
{
"epoch": 0.7879746835443038,
"grad_norm": 0.3830517530441284,
"learning_rate": 0.00016345871182027124,
"loss": 1.1204,
"step": 7470
},
{
"epoch": 0.7890295358649789,
"grad_norm": 0.36266350746154785,
"learning_rate": 0.00016189737397918653,
"loss": 1.1258,
"step": 7480
},
{
"epoch": 0.790084388185654,
"grad_norm": 0.363208144903183,
"learning_rate": 0.0001603426261151884,
"loss": 1.1323,
"step": 7490
},
{
"epoch": 0.7911392405063291,
"grad_norm": 0.3821084201335907,
"learning_rate": 0.00015879448564998648,
"loss": 1.1273,
"step": 7500
},
{
"epoch": 0.7921940928270043,
"grad_norm": 0.3608359396457672,
"learning_rate": 0.0001572529699312501,
"loss": 1.133,
"step": 7510
},
{
"epoch": 0.7932489451476793,
"grad_norm": 0.3614010810852051,
"learning_rate": 0.0001557180962324158,
"loss": 1.1172,
"step": 7520
},
{
"epoch": 0.7943037974683544,
"grad_norm": 0.3900236487388611,
"learning_rate": 0.00015418988175249282,
"loss": 1.1198,
"step": 7530
},
{
"epoch": 0.7953586497890295,
"grad_norm": 0.40939193964004517,
"learning_rate": 0.00015266834361587063,
"loss": 1.1163,
"step": 7540
},
{
"epoch": 0.7964135021097046,
"grad_norm": 0.3722957670688629,
"learning_rate": 0.00015115349887212678,
"loss": 1.1139,
"step": 7550
},
{
"epoch": 0.7974683544303798,
"grad_norm": 0.3719434440135956,
"learning_rate": 0.00014964536449583657,
"loss": 1.119,
"step": 7560
},
{
"epoch": 0.7985232067510548,
"grad_norm": 0.37369686365127563,
"learning_rate": 0.00014814395738638195,
"loss": 1.1259,
"step": 7570
},
{
"epoch": 0.79957805907173,
"grad_norm": 0.3760931193828583,
"learning_rate": 0.00014664929436776278,
"loss": 1.1248,
"step": 7580
},
{
"epoch": 0.8006329113924051,
"grad_norm": 0.3629474937915802,
"learning_rate": 0.00014516139218840788,
"loss": 1.1123,
"step": 7590
},
{
"epoch": 0.8016877637130801,
"grad_norm": 0.3516145348548889,
"learning_rate": 0.00014368026752098782,
"loss": 1.1198,
"step": 7600
},
{
"epoch": 0.8027426160337553,
"grad_norm": 0.36271005868911743,
"learning_rate": 0.00014220593696222768,
"loss": 1.132,
"step": 7610
},
{
"epoch": 0.8037974683544303,
"grad_norm": 0.34938865900039673,
"learning_rate": 0.00014073841703272092,
"loss": 1.1069,
"step": 7620
},
{
"epoch": 0.8048523206751055,
"grad_norm": 0.3703489601612091,
"learning_rate": 0.00013927772417674558,
"loss": 1.1308,
"step": 7630
},
{
"epoch": 0.8059071729957806,
"grad_norm": 0.3835078477859497,
"learning_rate": 0.00013782387476207788,
"loss": 1.1225,
"step": 7640
},
{
"epoch": 0.8069620253164557,
"grad_norm": 0.3790573477745056,
"learning_rate": 0.00013637688507981064,
"loss": 1.1379,
"step": 7650
},
{
"epoch": 0.8080168776371308,
"grad_norm": 0.3718129098415375,
"learning_rate": 0.0001349367713441697,
"loss": 1.1152,
"step": 7660
},
{
"epoch": 0.8090717299578059,
"grad_norm": 0.36849159002304077,
"learning_rate": 0.0001335035496923326,
"loss": 1.122,
"step": 7670
},
{
"epoch": 0.810126582278481,
"grad_norm": 0.3801332116127014,
"learning_rate": 0.0001320772361842478,
"loss": 1.1326,
"step": 7680
},
{
"epoch": 0.8111814345991561,
"grad_norm": 0.3597279191017151,
"learning_rate": 0.00013065784680245442,
"loss": 1.1092,
"step": 7690
},
{
"epoch": 0.8122362869198312,
"grad_norm": 0.3624008893966675,
"learning_rate": 0.00012924539745190402,
"loss": 1.1181,
"step": 7700
},
{
"epoch": 0.8132911392405063,
"grad_norm": 0.38765400648117065,
"learning_rate": 0.0001278399039597809,
"loss": 1.1156,
"step": 7710
},
{
"epoch": 0.8143459915611815,
"grad_norm": 0.3619614243507385,
"learning_rate": 0.0001264413820753261,
"loss": 1.1205,
"step": 7720
},
{
"epoch": 0.8154008438818565,
"grad_norm": 0.36417073011398315,
"learning_rate": 0.00012504984746966003,
"loss": 1.126,
"step": 7730
},
{
"epoch": 0.8164556962025317,
"grad_norm": 0.35633671283721924,
"learning_rate": 0.00012366531573560754,
"loss": 1.1255,
"step": 7740
},
{
"epoch": 0.8175105485232067,
"grad_norm": 0.3816451132297516,
"learning_rate": 0.00012228780238752264,
"loss": 1.1127,
"step": 7750
},
{
"epoch": 0.8185654008438819,
"grad_norm": 0.3993898630142212,
"learning_rate": 0.00012091732286111514,
"loss": 1.1123,
"step": 7760
},
{
"epoch": 0.819620253164557,
"grad_norm": 0.36348336935043335,
"learning_rate": 0.00011955389251327737,
"loss": 1.1179,
"step": 7770
},
{
"epoch": 0.820675105485232,
"grad_norm": 0.3756120502948761,
"learning_rate": 0.00011819752662191197,
"loss": 1.1125,
"step": 7780
},
{
"epoch": 0.8217299578059072,
"grad_norm": 0.3802875876426697,
"learning_rate": 0.00011684824038576115,
"loss": 1.1294,
"step": 7790
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.3661904036998749,
"learning_rate": 0.00011550604892423593,
"loss": 1.1176,
"step": 7800
},
{
"epoch": 0.8238396624472574,
"grad_norm": 0.3550266623497009,
"learning_rate": 0.0001141709672772471,
"loss": 1.1166,
"step": 7810
},
{
"epoch": 0.8248945147679325,
"grad_norm": 0.36318251490592957,
"learning_rate": 0.00011284301040503625,
"loss": 1.1232,
"step": 7820
},
{
"epoch": 0.8259493670886076,
"grad_norm": 0.35563451051712036,
"learning_rate": 0.0001115221931880088,
"loss": 1.116,
"step": 7830
},
{
"epoch": 0.8270042194092827,
"grad_norm": 0.3556043803691864,
"learning_rate": 0.00011020853042656648,
"loss": 1.0999,
"step": 7840
},
{
"epoch": 0.8280590717299579,
"grad_norm": 0.36785390973091125,
"learning_rate": 0.000108902036840942,
"loss": 1.1088,
"step": 7850
},
{
"epoch": 0.8291139240506329,
"grad_norm": 0.34999290108680725,
"learning_rate": 0.00010760272707103389,
"loss": 1.0979,
"step": 7860
},
{
"epoch": 0.830168776371308,
"grad_norm": 0.3790420889854431,
"learning_rate": 0.00010631061567624259,
"loss": 1.1117,
"step": 7870
},
{
"epoch": 0.8312236286919831,
"grad_norm": 0.3649984896183014,
"learning_rate": 0.00010502571713530706,
"loss": 1.1302,
"step": 7880
},
{
"epoch": 0.8322784810126582,
"grad_norm": 0.3923533260822296,
"learning_rate": 0.00010374804584614308,
"loss": 1.1276,
"step": 7890
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.38636764883995056,
"learning_rate": 0.00010247761612568129,
"loss": 1.1056,
"step": 7900
},
{
"epoch": 0.8343881856540084,
"grad_norm": 0.38178226351737976,
"learning_rate": 0.0001012144422097069,
"loss": 1.1166,
"step": 7910
},
{
"epoch": 0.8354430379746836,
"grad_norm": 0.38185352087020874,
"learning_rate": 9.995853825270052e-05,
"loss": 1.1147,
"step": 7920
},
{
"epoch": 0.8364978902953587,
"grad_norm": 0.3619385063648224,
"learning_rate": 9.870991832767919e-05,
"loss": 1.1225,
"step": 7930
},
{
"epoch": 0.8375527426160337,
"grad_norm": 0.3500303328037262,
"learning_rate": 9.746859642603884e-05,
"loss": 1.1175,
"step": 7940
},
{
"epoch": 0.8386075949367089,
"grad_norm": 0.3563097417354584,
"learning_rate": 9.623458645739755e-05,
"loss": 1.1219,
"step": 7950
},
{
"epoch": 0.8396624472573839,
"grad_norm": 0.35280612111091614,
"learning_rate": 9.50079022494395e-05,
"loss": 1.1071,
"step": 7960
},
{
"epoch": 0.8407172995780591,
"grad_norm": 0.3701493442058563,
"learning_rate": 9.378855754776028e-05,
"loss": 1.1076,
"step": 7970
},
{
"epoch": 0.8417721518987342,
"grad_norm": 0.36128586530685425,
"learning_rate": 9.257656601571266e-05,
"loss": 1.1075,
"step": 7980
},
{
"epoch": 0.8428270042194093,
"grad_norm": 0.3913003206253052,
"learning_rate": 9.137194123425349e-05,
"loss": 1.1176,
"step": 7990
},
{
"epoch": 0.8438818565400844,
"grad_norm": 0.3562246859073639,
"learning_rate": 9.017469670179168e-05,
"loss": 1.1125,
"step": 8000
},
{
"epoch": 0.8449367088607594,
"grad_norm": 0.38052770495414734,
"learning_rate": 8.898484583403668e-05,
"loss": 1.1189,
"step": 8010
},
{
"epoch": 0.8459915611814346,
"grad_norm": 0.36378705501556396,
"learning_rate": 8.780240196384873e-05,
"loss": 1.1116,
"step": 8020
},
{
"epoch": 0.8470464135021097,
"grad_norm": 0.3562518358230591,
"learning_rate": 8.662737834108861e-05,
"loss": 1.1104,
"step": 8030
},
{
"epoch": 0.8481012658227848,
"grad_norm": 0.37795576453208923,
"learning_rate": 8.545978813246987e-05,
"loss": 1.1219,
"step": 8040
},
{
"epoch": 0.8491561181434599,
"grad_norm": 0.3587080240249634,
"learning_rate": 8.429964442141072e-05,
"loss": 1.1039,
"step": 8050
},
{
"epoch": 0.8502109704641351,
"grad_norm": 0.35426101088523865,
"learning_rate": 8.314696020788806e-05,
"loss": 1.1127,
"step": 8060
},
{
"epoch": 0.8512658227848101,
"grad_norm": 0.3638628423213959,
"learning_rate": 8.200174840829136e-05,
"loss": 1.1169,
"step": 8070
},
{
"epoch": 0.8523206751054853,
"grad_norm": 0.3730154037475586,
"learning_rate": 8.08640218552778e-05,
"loss": 1.119,
"step": 8080
},
{
"epoch": 0.8533755274261603,
"grad_norm": 0.3520881235599518,
"learning_rate": 7.973379329762925e-05,
"loss": 1.1072,
"step": 8090
},
{
"epoch": 0.8544303797468354,
"grad_norm": 0.3606489300727844,
"learning_rate": 7.861107540010845e-05,
"loss": 1.1025,
"step": 8100
},
{
"epoch": 0.8554852320675106,
"grad_norm": 0.40547868609428406,
"learning_rate": 7.749588074331762e-05,
"loss": 1.118,
"step": 8110
},
{
"epoch": 0.8565400843881856,
"grad_norm": 0.36077240109443665,
"learning_rate": 7.63882218235575e-05,
"loss": 1.1092,
"step": 8120
},
{
"epoch": 0.8575949367088608,
"grad_norm": 0.3992495834827423,
"learning_rate": 7.528811105268699e-05,
"loss": 1.1092,
"step": 8130
},
{
"epoch": 0.8586497890295358,
"grad_norm": 0.3654575049877167,
"learning_rate": 7.41955607579845e-05,
"loss": 1.108,
"step": 8140
},
{
"epoch": 0.859704641350211,
"grad_norm": 0.3554239869117737,
"learning_rate": 7.311058318200969e-05,
"loss": 1.1055,
"step": 8150
},
{
"epoch": 0.8607594936708861,
"grad_norm": 0.37257564067840576,
"learning_rate": 7.203319048246599e-05,
"loss": 1.1156,
"step": 8160
},
{
"epoch": 0.8618143459915611,
"grad_norm": 0.36394527554512024,
"learning_rate": 7.096339473206471e-05,
"loss": 1.1073,
"step": 8170
},
{
"epoch": 0.8628691983122363,
"grad_norm": 0.37236538529396057,
"learning_rate": 6.990120791838953e-05,
"loss": 1.1135,
"step": 8180
},
{
"epoch": 0.8639240506329114,
"grad_norm": 0.38898542523384094,
"learning_rate": 6.884664194376233e-05,
"loss": 1.106,
"step": 8190
},
{
"epoch": 0.8649789029535865,
"grad_norm": 0.36607620120048523,
"learning_rate": 6.779970862510989e-05,
"loss": 1.1171,
"step": 8200
},
{
"epoch": 0.8660337552742616,
"grad_norm": 0.36071106791496277,
"learning_rate": 6.676041969383107e-05,
"loss": 1.1126,
"step": 8210
},
{
"epoch": 0.8670886075949367,
"grad_norm": 0.35975003242492676,
"learning_rate": 6.572878679566605e-05,
"loss": 1.1197,
"step": 8220
},
{
"epoch": 0.8681434599156118,
"grad_norm": 0.3584916889667511,
"learning_rate": 6.470482149056509e-05,
"loss": 1.1115,
"step": 8230
},
{
"epoch": 0.869198312236287,
"grad_norm": 0.3623376488685608,
"learning_rate": 6.368853525255942e-05,
"loss": 1.1084,
"step": 8240
},
{
"epoch": 0.870253164556962,
"grad_norm": 0.36445266008377075,
"learning_rate": 6.267993946963249e-05,
"loss": 1.1233,
"step": 8250
},
{
"epoch": 0.8713080168776371,
"grad_norm": 0.37702205777168274,
"learning_rate": 6.167904544359265e-05,
"loss": 1.1153,
"step": 8260
},
{
"epoch": 0.8723628691983122,
"grad_norm": 0.36750108003616333,
"learning_rate": 6.068586438994617e-05,
"loss": 1.1131,
"step": 8270
},
{
"epoch": 0.8734177215189873,
"grad_norm": 0.3449283242225647,
"learning_rate": 5.970040743777161e-05,
"loss": 1.0963,
"step": 8280
},
{
"epoch": 0.8744725738396625,
"grad_norm": 0.36028075218200684,
"learning_rate": 5.8722685629595454e-05,
"loss": 1.0979,
"step": 8290
},
{
"epoch": 0.8755274261603375,
"grad_norm": 0.35771775245666504,
"learning_rate": 5.7752709921267855e-05,
"loss": 1.1218,
"step": 8300
},
{
"epoch": 0.8765822784810127,
"grad_norm": 0.3550027012825012,
"learning_rate": 5.6790491181840294e-05,
"loss": 1.1001,
"step": 8310
},
{
"epoch": 0.8776371308016878,
"grad_norm": 0.36124446988105774,
"learning_rate": 5.583604019344354e-05,
"loss": 1.1182,
"step": 8320
},
{
"epoch": 0.8786919831223629,
"grad_norm": 0.3725493252277374,
"learning_rate": 5.4889367651167007e-05,
"loss": 1.1114,
"step": 8330
},
{
"epoch": 0.879746835443038,
"grad_norm": 0.34998440742492676,
"learning_rate": 5.3950484162938714e-05,
"loss": 1.101,
"step": 8340
},
{
"epoch": 0.880801687763713,
"grad_norm": 0.36161261796951294,
"learning_rate": 5.3019400249406686e-05,
"loss": 1.1022,
"step": 8350
},
{
"epoch": 0.8818565400843882,
"grad_norm": 0.36241891980171204,
"learning_rate": 5.209612634382077e-05,
"loss": 1.1013,
"step": 8360
},
{
"epoch": 0.8829113924050633,
"grad_norm": 0.35523760318756104,
"learning_rate": 5.118067279191599e-05,
"loss": 1.1102,
"step": 8370
},
{
"epoch": 0.8839662447257384,
"grad_norm": 0.3642718195915222,
"learning_rate": 5.0273049851796205e-05,
"loss": 1.1132,
"step": 8380
},
{
"epoch": 0.8850210970464135,
"grad_norm": 0.36435121297836304,
"learning_rate": 4.9373267693819805e-05,
"loss": 1.105,
"step": 8390
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.3773235082626343,
"learning_rate": 4.848133640048513e-05,
"loss": 1.1051,
"step": 8400
},
{
"epoch": 0.8871308016877637,
"grad_norm": 0.3574071228504181,
"learning_rate": 4.75972659663178e-05,
"loss": 1.1247,
"step": 8410
},
{
"epoch": 0.8881856540084389,
"grad_norm": 0.3590472638607025,
"learning_rate": 4.672106629775882e-05,
"loss": 1.1096,
"step": 8420
},
{
"epoch": 0.8892405063291139,
"grad_norm": 0.3647381365299225,
"learning_rate": 4.585274721305333e-05,
"loss": 1.1088,
"step": 8430
},
{
"epoch": 0.890295358649789,
"grad_norm": 0.3700782060623169,
"learning_rate": 4.4992318442140575e-05,
"loss": 1.1025,
"step": 8440
},
{
"epoch": 0.8913502109704642,
"grad_norm": 0.3659191429615021,
"learning_rate": 4.413978962654508e-05,
"loss": 1.1094,
"step": 8450
},
{
"epoch": 0.8924050632911392,
"grad_norm": 0.3675118684768677,
"learning_rate": 4.3295170319268554e-05,
"loss": 1.1051,
"step": 8460
},
{
"epoch": 0.8934599156118144,
"grad_norm": 0.3624016344547272,
"learning_rate": 4.245846998468261e-05,
"loss": 1.1058,
"step": 8470
},
{
"epoch": 0.8945147679324894,
"grad_norm": 0.3543236553668976,
"learning_rate": 4.16296979984232e-05,
"loss": 1.1013,
"step": 8480
},
{
"epoch": 0.8955696202531646,
"grad_norm": 0.34951353073120117,
"learning_rate": 4.080886364728506e-05,
"loss": 1.1069,
"step": 8490
},
{
"epoch": 0.8966244725738397,
"grad_norm": 0.38264498114585876,
"learning_rate": 3.999597612911793e-05,
"loss": 1.0967,
"step": 8500
},
{
"epoch": 0.8976793248945147,
"grad_norm": 0.37308940291404724,
"learning_rate": 3.9191044552723345e-05,
"loss": 1.1087,
"step": 8510
},
{
"epoch": 0.8987341772151899,
"grad_norm": 0.38366225361824036,
"learning_rate": 3.839407793775268e-05,
"loss": 1.1049,
"step": 8520
},
{
"epoch": 0.8997890295358649,
"grad_norm": 0.36603066325187683,
"learning_rate": 3.760508521460584e-05,
"loss": 1.1171,
"step": 8530
},
{
"epoch": 0.9008438818565401,
"grad_norm": 0.35585013031959534,
"learning_rate": 3.682407522433173e-05,
"loss": 1.1129,
"step": 8540
},
{
"epoch": 0.9018987341772152,
"grad_norm": 0.38468706607818604,
"learning_rate": 3.605105671852854e-05,
"loss": 1.105,
"step": 8550
},
{
"epoch": 0.9029535864978903,
"grad_norm": 0.3588978350162506,
"learning_rate": 3.528603835924626e-05,
"loss": 1.0985,
"step": 8560
},
{
"epoch": 0.9040084388185654,
"grad_norm": 0.39159154891967773,
"learning_rate": 3.4529028718888935e-05,
"loss": 1.1147,
"step": 8570
},
{
"epoch": 0.9050632911392406,
"grad_norm": 0.3527233898639679,
"learning_rate": 3.378003628011938e-05,
"loss": 1.1063,
"step": 8580
},
{
"epoch": 0.9061181434599156,
"grad_norm": 0.3757486343383789,
"learning_rate": 3.303906943576346e-05,
"loss": 1.1108,
"step": 8590
},
{
"epoch": 0.9071729957805907,
"grad_norm": 0.3887219727039337,
"learning_rate": 3.230613648871661e-05,
"loss": 1.1016,
"step": 8600
},
{
"epoch": 0.9082278481012658,
"grad_norm": 0.3607241213321686,
"learning_rate": 3.158124565185022e-05,
"loss": 1.107,
"step": 8610
},
{
"epoch": 0.9092827004219409,
"grad_norm": 0.3600178360939026,
"learning_rate": 3.086440504792026e-05,
"loss": 1.1031,
"step": 8620
},
{
"epoch": 0.9103375527426161,
"grad_norm": 0.38059672713279724,
"learning_rate": 3.015562270947553e-05,
"loss": 1.1108,
"step": 8630
},
{
"epoch": 0.9113924050632911,
"grad_norm": 0.3696359097957611,
"learning_rate": 2.945490657876837e-05,
"loss": 1.0924,
"step": 8640
},
{
"epoch": 0.9124472573839663,
"grad_norm": 0.36051735281944275,
"learning_rate": 2.8762264507665113e-05,
"loss": 1.098,
"step": 8650
},
{
"epoch": 0.9135021097046413,
"grad_norm": 0.37566593289375305,
"learning_rate": 2.807770425755829e-05,
"loss": 1.0973,
"step": 8660
},
{
"epoch": 0.9145569620253164,
"grad_norm": 0.35185009241104126,
"learning_rate": 2.7401233499279866e-05,
"loss": 1.1077,
"step": 8670
},
{
"epoch": 0.9156118143459916,
"grad_norm": 0.36478352546691895,
"learning_rate": 2.6732859813014987e-05,
"loss": 1.1157,
"step": 8680
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.3797094225883484,
"learning_rate": 2.607259068821721e-05,
"loss": 1.1092,
"step": 8690
},
{
"epoch": 0.9177215189873418,
"grad_norm": 0.3498828113079071,
"learning_rate": 2.5420433523524493e-05,
"loss": 1.1054,
"step": 8700
},
{
"epoch": 0.9187763713080169,
"grad_norm": 0.36742252111434937,
"learning_rate": 2.4776395626676162e-05,
"loss": 1.1059,
"step": 8710
},
{
"epoch": 0.919831223628692,
"grad_norm": 0.3539731204509735,
"learning_rate": 2.414048421443141e-05,
"loss": 1.0994,
"step": 8720
},
{
"epoch": 0.9208860759493671,
"grad_norm": 0.36758720874786377,
"learning_rate": 2.3512706412488012e-05,
"loss": 1.1128,
"step": 8730
},
{
"epoch": 0.9219409282700421,
"grad_norm": 0.3628343343734741,
"learning_rate": 2.2893069255402993e-05,
"loss": 1.1013,
"step": 8740
},
{
"epoch": 0.9229957805907173,
"grad_norm": 0.3503541052341461,
"learning_rate": 2.2281579686513176e-05,
"loss": 1.0978,
"step": 8750
},
{
"epoch": 0.9240506329113924,
"grad_norm": 0.3604075014591217,
"learning_rate": 2.1678244557857663e-05,
"loss": 1.088,
"step": 8760
},
{
"epoch": 0.9251054852320675,
"grad_norm": 0.34871819615364075,
"learning_rate": 2.1083070630101232e-05,
"loss": 1.0962,
"step": 8770
},
{
"epoch": 0.9261603375527426,
"grad_norm": 0.35838931798934937,
"learning_rate": 2.0496064572458395e-05,
"loss": 1.1052,
"step": 8780
},
{
"epoch": 0.9272151898734177,
"grad_norm": 0.3570609986782074,
"learning_rate": 1.991723296261863e-05,
"loss": 1.0995,
"step": 8790
},
{
"epoch": 0.9282700421940928,
"grad_norm": 0.35785019397735596,
"learning_rate": 1.9346582286672686e-05,
"loss": 1.0957,
"step": 8800
},
{
"epoch": 0.929324894514768,
"grad_norm": 0.3643856644630432,
"learning_rate": 1.878411893904014e-05,
"loss": 1.1149,
"step": 8810
},
{
"epoch": 0.930379746835443,
"grad_norm": 0.3485681712627411,
"learning_rate": 1.822984922239737e-05,
"loss": 1.1,
"step": 8820
},
{
"epoch": 0.9314345991561181,
"grad_norm": 0.3704765737056732,
"learning_rate": 1.7683779347607286e-05,
"loss": 1.1168,
"step": 8830
},
{
"epoch": 0.9324894514767933,
"grad_norm": 0.35022515058517456,
"learning_rate": 1.714591543364938e-05,
"loss": 1.1045,
"step": 8840
},
{
"epoch": 0.9335443037974683,
"grad_norm": 0.37648147344589233,
"learning_rate": 1.6616263507551437e-05,
"loss": 1.1046,
"step": 8850
},
{
"epoch": 0.9345991561181435,
"grad_norm": 0.3764314651489258,
"learning_rate": 1.609482950432195e-05,
"loss": 1.1033,
"step": 8860
},
{
"epoch": 0.9356540084388185,
"grad_norm": 0.36434733867645264,
"learning_rate": 1.5581619266883563e-05,
"loss": 1.1028,
"step": 8870
},
{
"epoch": 0.9367088607594937,
"grad_norm": 0.37274083495140076,
"learning_rate": 1.5076638546007548e-05,
"loss": 1.1092,
"step": 8880
},
{
"epoch": 0.9377637130801688,
"grad_norm": 0.361558198928833,
"learning_rate": 1.457989300024945e-05,
"loss": 1.1047,
"step": 8890
},
{
"epoch": 0.9388185654008439,
"grad_norm": 0.3575560748577118,
"learning_rate": 1.4091388195885625e-05,
"loss": 1.0988,
"step": 8900
},
{
"epoch": 0.939873417721519,
"grad_norm": 0.36193081736564636,
"learning_rate": 1.3611129606851041e-05,
"loss": 1.098,
"step": 8910
},
{
"epoch": 0.9409282700421941,
"grad_norm": 0.3471304476261139,
"learning_rate": 1.313912261467759e-05,
"loss": 1.1043,
"step": 8920
},
{
"epoch": 0.9419831223628692,
"grad_norm": 0.35767582058906555,
"learning_rate": 1.267537250843412e-05,
"loss": 1.1029,
"step": 8930
},
{
"epoch": 0.9430379746835443,
"grad_norm": 0.3535594046115875,
"learning_rate": 1.2219884484667071e-05,
"loss": 1.1013,
"step": 8940
},
{
"epoch": 0.9440928270042194,
"grad_norm": 0.35520821809768677,
"learning_rate": 1.1772663647341947e-05,
"loss": 1.1031,
"step": 8950
},
{
"epoch": 0.9451476793248945,
"grad_norm": 0.35638466477394104,
"learning_rate": 1.1333715007786932e-05,
"loss": 1.1059,
"step": 8960
},
{
"epoch": 0.9462025316455697,
"grad_norm": 0.36158987879753113,
"learning_rate": 1.0903043484635694e-05,
"loss": 1.1043,
"step": 8970
},
{
"epoch": 0.9472573839662447,
"grad_norm": 0.37424609065055847,
"learning_rate": 1.0480653903772924e-05,
"loss": 1.0999,
"step": 8980
},
{
"epoch": 0.9483122362869199,
"grad_norm": 0.35378211736679077,
"learning_rate": 1.0066550998280132e-05,
"loss": 1.1059,
"step": 8990
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.3605569303035736,
"learning_rate": 9.660739408382608e-06,
"loss": 1.1022,
"step": 9000
},
{
"epoch": 0.95042194092827,
"grad_norm": 0.3631739020347595,
"learning_rate": 9.26322368139737e-06,
"loss": 1.0971,
"step": 9010
},
{
"epoch": 0.9514767932489452,
"grad_norm": 0.3738535940647125,
"learning_rate": 8.874008271682222e-06,
"loss": 1.0982,
"step": 9020
},
{
"epoch": 0.9525316455696202,
"grad_norm": 0.3611426055431366,
"learning_rate": 8.493097540585775e-06,
"loss": 1.117,
"step": 9030
},
{
"epoch": 0.9535864978902954,
"grad_norm": 0.36006152629852295,
"learning_rate": 8.120495756399005e-06,
"loss": 1.0943,
"step": 9040
},
{
"epoch": 0.9546413502109705,
"grad_norm": 0.3576846718788147,
"learning_rate": 7.756207094306605e-06,
"loss": 1.1044,
"step": 9050
},
{
"epoch": 0.9556962025316456,
"grad_norm": 0.36890554428100586,
"learning_rate": 7.400235636340957e-06,
"loss": 1.1062,
"step": 9060
},
{
"epoch": 0.9567510548523207,
"grad_norm": 0.3569638729095459,
"learning_rate": 7.0525853713362395e-06,
"loss": 1.1074,
"step": 9070
},
{
"epoch": 0.9578059071729957,
"grad_norm": 0.35618263483047485,
"learning_rate": 6.71326019488322e-06,
"loss": 1.1086,
"step": 9080
},
{
"epoch": 0.9588607594936709,
"grad_norm": 0.37716180086135864,
"learning_rate": 6.3822639092862846e-06,
"loss": 1.1082,
"step": 9090
},
{
"epoch": 0.959915611814346,
"grad_norm": 0.3503626883029938,
"learning_rate": 6.059600223520478e-06,
"loss": 1.0881,
"step": 9100
},
{
"epoch": 0.9609704641350211,
"grad_norm": 0.36392760276794434,
"learning_rate": 5.745272753189784e-06,
"loss": 1.0974,
"step": 9110
},
{
"epoch": 0.9620253164556962,
"grad_norm": 0.35463055968284607,
"learning_rate": 5.439285020487156e-06,
"loss": 1.1097,
"step": 9120
},
{
"epoch": 0.9630801687763713,
"grad_norm": 0.35742634534835815,
"learning_rate": 5.141640454154467e-06,
"loss": 1.0939,
"step": 9130
},
{
"epoch": 0.9641350210970464,
"grad_norm": 0.36812102794647217,
"learning_rate": 4.852342389444458e-06,
"loss": 1.1132,
"step": 9140
},
{
"epoch": 0.9651898734177216,
"grad_norm": 0.37311768531799316,
"learning_rate": 4.571394068083185e-06,
"loss": 1.1018,
"step": 9150
},
{
"epoch": 0.9662447257383966,
"grad_norm": 0.3573930263519287,
"learning_rate": 4.298798638233709e-06,
"loss": 1.1089,
"step": 9160
},
{
"epoch": 0.9672995780590717,
"grad_norm": 0.3686988949775696,
"learning_rate": 4.034559154461049e-06,
"loss": 1.1077,
"step": 9170
},
{
"epoch": 0.9683544303797469,
"grad_norm": 0.34888580441474915,
"learning_rate": 3.7786785776976198e-06,
"loss": 1.0993,
"step": 9180
},
{
"epoch": 0.9694092827004219,
"grad_norm": 0.371014803647995,
"learning_rate": 3.5311597752100964e-06,
"loss": 1.0974,
"step": 9190
},
{
"epoch": 0.9704641350210971,
"grad_norm": 0.36439818143844604,
"learning_rate": 3.2920055205676867e-06,
"loss": 1.1026,
"step": 9200
},
{
"epoch": 0.9715189873417721,
"grad_norm": 0.34828996658325195,
"learning_rate": 3.06121849361049e-06,
"loss": 1.1013,
"step": 9210
},
{
"epoch": 0.9725738396624473,
"grad_norm": 0.3600952625274658,
"learning_rate": 2.838801280419856e-06,
"loss": 1.1016,
"step": 9220
},
{
"epoch": 0.9736286919831224,
"grad_norm": 0.35362669825553894,
"learning_rate": 2.624756373289322e-06,
"loss": 1.0932,
"step": 9230
},
{
"epoch": 0.9746835443037974,
"grad_norm": 0.35408636927604675,
"learning_rate": 2.419086170696472e-06,
"loss": 1.0874,
"step": 9240
},
{
"epoch": 0.9757383966244726,
"grad_norm": 0.35020846128463745,
"learning_rate": 2.2217929772764545e-06,
"loss": 1.0993,
"step": 9250
},
{
"epoch": 0.9767932489451476,
"grad_norm": 0.3536199629306793,
"learning_rate": 2.0328790037957568e-06,
"loss": 1.0937,
"step": 9260
},
{
"epoch": 0.9778481012658228,
"grad_norm": 0.35778024792671204,
"learning_rate": 1.8523463671278052e-06,
"loss": 1.1041,
"step": 9270
},
{
"epoch": 0.9789029535864979,
"grad_norm": 0.34945109486579895,
"learning_rate": 1.6801970902288188e-06,
"loss": 1.0929,
"step": 9280
},
{
"epoch": 0.979957805907173,
"grad_norm": 0.36028343439102173,
"learning_rate": 1.5164331021155774e-06,
"loss": 1.1024,
"step": 9290
},
{
"epoch": 0.9810126582278481,
"grad_norm": 0.3585197329521179,
"learning_rate": 1.3610562378435221e-06,
"loss": 1.0997,
"step": 9300
},
{
"epoch": 0.9820675105485233,
"grad_norm": 0.3567984998226166,
"learning_rate": 1.2140682384862712e-06,
"loss": 1.1006,
"step": 9310
},
{
"epoch": 0.9831223628691983,
"grad_norm": 0.3587944209575653,
"learning_rate": 1.0754707511161365e-06,
"loss": 1.0956,
"step": 9320
},
{
"epoch": 0.9841772151898734,
"grad_norm": 0.360120564699173,
"learning_rate": 9.452653287856383e-07,
"loss": 1.1082,
"step": 9330
},
{
"epoch": 0.9852320675105485,
"grad_norm": 0.36455395817756653,
"learning_rate": 8.234534305101015e-07,
"loss": 1.105,
"step": 9340
},
{
"epoch": 0.9862869198312236,
"grad_norm": 0.3427172303199768,
"learning_rate": 7.100364212513367e-07,
"loss": 1.116,
"step": 9350
},
{
"epoch": 0.9873417721518988,
"grad_norm": 0.3511948585510254,
"learning_rate": 6.050155719023176e-07,
"loss": 1.1072,
"step": 9360
},
{
"epoch": 0.9883966244725738,
"grad_norm": 0.3564155399799347,
"learning_rate": 5.08392059272944e-07,
"loss": 1.1053,
"step": 9370
},
{
"epoch": 0.989451476793249,
"grad_norm": 0.3686405420303345,
"learning_rate": 4.2016696607680147e-07,
"loss": 1.0926,
"step": 9380
},
{
"epoch": 0.990506329113924,
"grad_norm": 0.3639363646507263,
"learning_rate": 3.4034128091917085e-07,
"loss": 1.0972,
"step": 9390
},
{
"epoch": 0.9915611814345991,
"grad_norm": 0.35043707489967346,
"learning_rate": 2.689158982859541e-07,
"loss": 1.1099,
"step": 9400
},
{
"epoch": 0.9926160337552743,
"grad_norm": 0.35883840918540955,
"learning_rate": 2.05891618533266e-07,
"loss": 1.0861,
"step": 9410
},
{
"epoch": 0.9936708860759493,
"grad_norm": 0.3705896735191345,
"learning_rate": 1.5126914787894074e-07,
"loss": 1.1049,
"step": 9420
},
{
"epoch": 0.9947257383966245,
"grad_norm": 0.34956392645835876,
"learning_rate": 1.0504909839462173e-07,
"loss": 1.1043,
"step": 9430
},
{
"epoch": 0.9957805907172996,
"grad_norm": 0.3487434983253479,
"learning_rate": 6.723198799826746e-08,
"loss": 1.1014,
"step": 9440
},
{
"epoch": 0.9968354430379747,
"grad_norm": 0.34953734278678894,
"learning_rate": 3.781824044932214e-08,
"loss": 1.1097,
"step": 9450
},
{
"epoch": 0.9978902953586498,
"grad_norm": 0.3557943105697632,
"learning_rate": 1.6808185342970238e-08,
"loss": 1.0945,
"step": 9460
},
{
"epoch": 0.9989451476793249,
"grad_norm": 0.3526889979839325,
"learning_rate": 4.202058107305451e-09,
"loss": 1.1073,
"step": 9470
},
{
"epoch": 1.0,
"grad_norm": 1.046021819114685,
"learning_rate": 0.0,
"loss": 1.0947,
"step": 9480
}
],
"logging_steps": 10,
"max_steps": 9480,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.036240179760947e+16,
"train_batch_size": 1024,
"trial_name": null,
"trial_params": null
}