phi_mini_selection / trainer_state.json
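This is the raw Hugging Face Trainer state for the phi_mini selection fine-tuning run (best eval_loss 0.1583 at checkpoint-8828, 11035 optimizer steps over ~5 epochs). As a minimal sketch, assuming the complete trainer_state.json is saved locally, the snippet below parses log_history and separates training-loss entries from eval entries:

import json

# Assumption: a local copy of this file named "trainer_state.json".
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss" instead.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best checkpoint: {state['best_model_checkpoint']} "
      f"(eval_loss = {state['best_metric']:.4f})")
print(f"{len(train_log)} training entries, {len(eval_log)} eval entries")

# Quick sanity check: the last few logged training steps.
for entry in train_log[-3:]:
    print(f"step {entry['step']:>6}  lr {entry['learning_rate']:.2e}  loss {entry['loss']:.4f}")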
{
"best_metric": 0.15827356278896332,
"best_model_checkpoint": "finetuned_models/selection/phi_mini/checkpoint-8828",
"epoch": 4.999660287623145,
"eval_steps": 500,
"global_step": 11035,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004529498358056845,
"grad_norm": 2.3706889152526855,
"learning_rate": 3.7735849056603773e-06,
"loss": 3.1411,
"step": 10
},
{
"epoch": 0.00905899671611369,
"grad_norm": 1.1344068050384521,
"learning_rate": 7.547169811320755e-06,
"loss": 2.7341,
"step": 20
},
{
"epoch": 0.013588495074170535,
"grad_norm": 0.5109199285507202,
"learning_rate": 1.1320754716981132e-05,
"loss": 2.3962,
"step": 30
},
{
"epoch": 0.01811799343222738,
"grad_norm": 0.2340932935476303,
"learning_rate": 1.509433962264151e-05,
"loss": 2.1062,
"step": 40
},
{
"epoch": 0.022647491790284226,
"grad_norm": 0.16189691424369812,
"learning_rate": 1.8867924528301888e-05,
"loss": 1.9221,
"step": 50
},
{
"epoch": 0.02717699014834107,
"grad_norm": 0.14064399898052216,
"learning_rate": 2.2641509433962265e-05,
"loss": 1.7971,
"step": 60
},
{
"epoch": 0.031706488506397915,
"grad_norm": 0.1196412444114685,
"learning_rate": 2.641509433962264e-05,
"loss": 1.7255,
"step": 70
},
{
"epoch": 0.03623598686445476,
"grad_norm": 0.15146440267562866,
"learning_rate": 3.018867924528302e-05,
"loss": 1.6767,
"step": 80
},
{
"epoch": 0.040765485222511604,
"grad_norm": 0.13450802862644196,
"learning_rate": 3.39622641509434e-05,
"loss": 1.5943,
"step": 90
},
{
"epoch": 0.04529498358056845,
"grad_norm": 0.15073299407958984,
"learning_rate": 3.7735849056603776e-05,
"loss": 1.5428,
"step": 100
},
{
"epoch": 0.0498244819386253,
"grad_norm": 0.13764727115631104,
"learning_rate": 4.150943396226415e-05,
"loss": 1.4956,
"step": 110
},
{
"epoch": 0.05435398029668214,
"grad_norm": 0.23157894611358643,
"learning_rate": 4.528301886792453e-05,
"loss": 1.4492,
"step": 120
},
{
"epoch": 0.05888347865473899,
"grad_norm": 0.1756928712129593,
"learning_rate": 4.9056603773584906e-05,
"loss": 1.4258,
"step": 130
},
{
"epoch": 0.06341297701279583,
"grad_norm": 0.19877882301807404,
"learning_rate": 5.283018867924528e-05,
"loss": 1.3863,
"step": 140
},
{
"epoch": 0.06794247537085268,
"grad_norm": 0.19395482540130615,
"learning_rate": 5.660377358490566e-05,
"loss": 1.3469,
"step": 150
},
{
"epoch": 0.07247197372890953,
"grad_norm": 0.2622753083705902,
"learning_rate": 6.037735849056604e-05,
"loss": 1.3177,
"step": 160
},
{
"epoch": 0.07700147208696637,
"grad_norm": 0.47893616557121277,
"learning_rate": 6.415094339622641e-05,
"loss": 1.2414,
"step": 170
},
{
"epoch": 0.08153097044502321,
"grad_norm": 0.2570054233074188,
"learning_rate": 6.79245283018868e-05,
"loss": 1.2046,
"step": 180
},
{
"epoch": 0.08606046880308006,
"grad_norm": 0.31944283843040466,
"learning_rate": 7.169811320754717e-05,
"loss": 1.2254,
"step": 190
},
{
"epoch": 0.0905899671611369,
"grad_norm": 0.35244274139404297,
"learning_rate": 7.547169811320755e-05,
"loss": 1.1671,
"step": 200
},
{
"epoch": 0.09511946551919374,
"grad_norm": 0.23283237218856812,
"learning_rate": 7.924528301886794e-05,
"loss": 1.2043,
"step": 210
},
{
"epoch": 0.0996489638772506,
"grad_norm": 0.38952431082725525,
"learning_rate": 8.30188679245283e-05,
"loss": 1.202,
"step": 220
},
{
"epoch": 0.10417846223530744,
"grad_norm": 0.28450387716293335,
"learning_rate": 8.679245283018869e-05,
"loss": 1.1323,
"step": 230
},
{
"epoch": 0.10870796059336428,
"grad_norm": 0.30833789706230164,
"learning_rate": 9.056603773584906e-05,
"loss": 1.1101,
"step": 240
},
{
"epoch": 0.11323745895142114,
"grad_norm": 0.31221917271614075,
"learning_rate": 9.433962264150944e-05,
"loss": 1.0949,
"step": 250
},
{
"epoch": 0.11776695730947798,
"grad_norm": 0.3738393187522888,
"learning_rate": 9.811320754716981e-05,
"loss": 1.1372,
"step": 260
},
{
"epoch": 0.12229645566753482,
"grad_norm": 0.2999807596206665,
"learning_rate": 0.0001018867924528302,
"loss": 1.1028,
"step": 270
},
{
"epoch": 0.12682595402559166,
"grad_norm": 0.4104474186897278,
"learning_rate": 0.00010566037735849057,
"loss": 1.0796,
"step": 280
},
{
"epoch": 0.1313554523836485,
"grad_norm": 0.2639298141002655,
"learning_rate": 0.00010943396226415095,
"loss": 1.0626,
"step": 290
},
{
"epoch": 0.13588495074170537,
"grad_norm": 0.2657984495162964,
"learning_rate": 0.00011320754716981132,
"loss": 1.0418,
"step": 300
},
{
"epoch": 0.1404144490997622,
"grad_norm": 0.2493669092655182,
"learning_rate": 0.0001169811320754717,
"loss": 1.0157,
"step": 310
},
{
"epoch": 0.14494394745781905,
"grad_norm": 0.21642285585403442,
"learning_rate": 0.00012075471698113207,
"loss": 0.9852,
"step": 320
},
{
"epoch": 0.1494734458158759,
"grad_norm": 0.2093484252691269,
"learning_rate": 0.00012452830188679244,
"loss": 0.9938,
"step": 330
},
{
"epoch": 0.15400294417393273,
"grad_norm": 0.2212437391281128,
"learning_rate": 0.00012830188679245283,
"loss": 1.0289,
"step": 340
},
{
"epoch": 0.1585324425319896,
"grad_norm": 0.22111104428768158,
"learning_rate": 0.0001320754716981132,
"loss": 0.9656,
"step": 350
},
{
"epoch": 0.16306194089004641,
"grad_norm": 0.31839072704315186,
"learning_rate": 0.0001358490566037736,
"loss": 0.9723,
"step": 360
},
{
"epoch": 0.16759143924810327,
"grad_norm": 0.26599910855293274,
"learning_rate": 0.00013962264150943395,
"loss": 0.9503,
"step": 370
},
{
"epoch": 0.17212093760616012,
"grad_norm": 0.273809552192688,
"learning_rate": 0.00014339622641509434,
"loss": 0.9786,
"step": 380
},
{
"epoch": 0.17665043596421695,
"grad_norm": 0.1905912607908249,
"learning_rate": 0.00014716981132075472,
"loss": 0.9271,
"step": 390
},
{
"epoch": 0.1811799343222738,
"grad_norm": 0.21957655251026154,
"learning_rate": 0.0001509433962264151,
"loss": 0.911,
"step": 400
},
{
"epoch": 0.18570943268033066,
"grad_norm": 0.21992002427577972,
"learning_rate": 0.0001547169811320755,
"loss": 0.9434,
"step": 410
},
{
"epoch": 0.1902389310383875,
"grad_norm": 0.2033444494009018,
"learning_rate": 0.00015849056603773587,
"loss": 0.9189,
"step": 420
},
{
"epoch": 0.19476842939644434,
"grad_norm": 0.2479432225227356,
"learning_rate": 0.00016226415094339625,
"loss": 0.9137,
"step": 430
},
{
"epoch": 0.1992979277545012,
"grad_norm": 0.26578351855278015,
"learning_rate": 0.0001660377358490566,
"loss": 0.9172,
"step": 440
},
{
"epoch": 0.20382742611255802,
"grad_norm": 0.17441338300704956,
"learning_rate": 0.000169811320754717,
"loss": 0.8783,
"step": 450
},
{
"epoch": 0.20835692447061488,
"grad_norm": 0.18898604810237885,
"learning_rate": 0.00017358490566037738,
"loss": 0.874,
"step": 460
},
{
"epoch": 0.21288642282867173,
"grad_norm": 0.18335719406604767,
"learning_rate": 0.00017735849056603776,
"loss": 0.8604,
"step": 470
},
{
"epoch": 0.21741592118672856,
"grad_norm": 0.20873741805553436,
"learning_rate": 0.00018113207547169812,
"loss": 0.8368,
"step": 480
},
{
"epoch": 0.22194541954478542,
"grad_norm": 0.2140520066022873,
"learning_rate": 0.0001849056603773585,
"loss": 0.8729,
"step": 490
},
{
"epoch": 0.22647491790284227,
"grad_norm": 0.20203453302383423,
"learning_rate": 0.00018867924528301889,
"loss": 0.836,
"step": 500
},
{
"epoch": 0.2310044162608991,
"grad_norm": 0.185277059674263,
"learning_rate": 0.00019245283018867927,
"loss": 0.8224,
"step": 510
},
{
"epoch": 0.23553391461895595,
"grad_norm": 0.207021564245224,
"learning_rate": 0.00019622641509433963,
"loss": 0.8482,
"step": 520
},
{
"epoch": 0.2400634129770128,
"grad_norm": 0.19016426801681519,
"learning_rate": 0.0002,
"loss": 0.8296,
"step": 530
},
{
"epoch": 0.24459291133506963,
"grad_norm": 0.20634956657886505,
"learning_rate": 0.00019999983174896345,
"loss": 0.8294,
"step": 540
},
{
"epoch": 0.2491224096931265,
"grad_norm": 0.16894035041332245,
"learning_rate": 0.00019999932699641984,
"loss": 0.7966,
"step": 550
},
{
"epoch": 0.2536519080511833,
"grad_norm": 0.21543951332569122,
"learning_rate": 0.00019999848574406778,
"loss": 0.819,
"step": 560
},
{
"epoch": 0.2581814064092402,
"grad_norm": 0.18474166095256805,
"learning_rate": 0.000199997307994738,
"loss": 0.8073,
"step": 570
},
{
"epoch": 0.262710904767297,
"grad_norm": 0.1627601534128189,
"learning_rate": 0.0001999957937523937,
"loss": 0.798,
"step": 580
},
{
"epoch": 0.26724040312535385,
"grad_norm": 0.16344527900218964,
"learning_rate": 0.0001999939430221304,
"loss": 0.7846,
"step": 590
},
{
"epoch": 0.27176990148341074,
"grad_norm": 0.1784357726573944,
"learning_rate": 0.00019999175581017573,
"loss": 0.7892,
"step": 600
},
{
"epoch": 0.27629939984146756,
"grad_norm": 0.1735469251871109,
"learning_rate": 0.00019998923212388977,
"loss": 0.7624,
"step": 610
},
{
"epoch": 0.2808288981995244,
"grad_norm": 0.20232649147510529,
"learning_rate": 0.00019998637197176478,
"loss": 0.7754,
"step": 620
},
{
"epoch": 0.2853583965575813,
"grad_norm": 0.21980105340480804,
"learning_rate": 0.00019998317536342524,
"loss": 0.7896,
"step": 630
},
{
"epoch": 0.2898878949156381,
"grad_norm": 0.15072612464427948,
"learning_rate": 0.00019997964230962774,
"loss": 0.7451,
"step": 640
},
{
"epoch": 0.2944173932736949,
"grad_norm": 0.17559681832790375,
"learning_rate": 0.00019997577282226115,
"loss": 0.719,
"step": 650
},
{
"epoch": 0.2989468916317518,
"grad_norm": 0.17159104347229004,
"learning_rate": 0.00019997156691434632,
"loss": 0.7356,
"step": 660
},
{
"epoch": 0.30347638998980864,
"grad_norm": 0.20724473893642426,
"learning_rate": 0.00019996702460003623,
"loss": 0.7257,
"step": 670
},
{
"epoch": 0.30800588834786546,
"grad_norm": 0.15702813863754272,
"learning_rate": 0.00019996214589461592,
"loss": 0.7104,
"step": 680
},
{
"epoch": 0.31253538670592235,
"grad_norm": 0.185310959815979,
"learning_rate": 0.00019995693081450227,
"loss": 0.7192,
"step": 690
},
{
"epoch": 0.3170648850639792,
"grad_norm": 0.17659538984298706,
"learning_rate": 0.00019995137937724413,
"loss": 0.7084,
"step": 700
},
{
"epoch": 0.321594383422036,
"grad_norm": 0.16541071236133575,
"learning_rate": 0.00019994549160152225,
"loss": 0.7179,
"step": 710
},
{
"epoch": 0.32612388178009283,
"grad_norm": 0.16881656646728516,
"learning_rate": 0.00019993926750714905,
"loss": 0.7039,
"step": 720
},
{
"epoch": 0.3306533801381497,
"grad_norm": 0.18213717639446259,
"learning_rate": 0.0001999327071150688,
"loss": 0.712,
"step": 730
},
{
"epoch": 0.33518287849620654,
"grad_norm": 0.16946811974048615,
"learning_rate": 0.00019992581044735736,
"loss": 0.7041,
"step": 740
},
{
"epoch": 0.33971237685426336,
"grad_norm": 0.20027601718902588,
"learning_rate": 0.00019991857752722208,
"loss": 0.6937,
"step": 750
},
{
"epoch": 0.34424187521232025,
"grad_norm": 0.17900145053863525,
"learning_rate": 0.000199911008379002,
"loss": 0.689,
"step": 760
},
{
"epoch": 0.3487713735703771,
"grad_norm": 0.1626042276620865,
"learning_rate": 0.00019990310302816738,
"loss": 0.6923,
"step": 770
},
{
"epoch": 0.3533008719284339,
"grad_norm": 0.1776456981897354,
"learning_rate": 0.00019989486150131987,
"loss": 0.6725,
"step": 780
},
{
"epoch": 0.3578303702864908,
"grad_norm": 0.16232900321483612,
"learning_rate": 0.00019988628382619242,
"loss": 0.6621,
"step": 790
},
{
"epoch": 0.3623598686445476,
"grad_norm": 0.16653478145599365,
"learning_rate": 0.00019987737003164912,
"loss": 0.6825,
"step": 800
},
{
"epoch": 0.36688936700260444,
"grad_norm": 0.16946111619472504,
"learning_rate": 0.00019986812014768503,
"loss": 0.6634,
"step": 810
},
{
"epoch": 0.3714188653606613,
"grad_norm": 0.16169489920139313,
"learning_rate": 0.00019985853420542617,
"loss": 0.6592,
"step": 820
},
{
"epoch": 0.37594836371871815,
"grad_norm": 0.1830553561449051,
"learning_rate": 0.0001998486122371295,
"loss": 0.6661,
"step": 830
},
{
"epoch": 0.380477862076775,
"grad_norm": 0.18185435235500336,
"learning_rate": 0.00019983835427618262,
"loss": 0.6331,
"step": 840
},
{
"epoch": 0.38500736043483186,
"grad_norm": 0.17038173973560333,
"learning_rate": 0.0001998277603571038,
"loss": 0.6274,
"step": 850
},
{
"epoch": 0.3895368587928887,
"grad_norm": 0.15142400562763214,
"learning_rate": 0.00019981683051554174,
"loss": 0.6282,
"step": 860
},
{
"epoch": 0.3940663571509455,
"grad_norm": 0.18170781433582306,
"learning_rate": 0.00019980556478827564,
"loss": 0.605,
"step": 870
},
{
"epoch": 0.3985958555090024,
"grad_norm": 0.1576147973537445,
"learning_rate": 0.0001997939632132149,
"loss": 0.6393,
"step": 880
},
{
"epoch": 0.4031253538670592,
"grad_norm": 0.17267905175685883,
"learning_rate": 0.00019978202582939902,
"loss": 0.6274,
"step": 890
},
{
"epoch": 0.40765485222511605,
"grad_norm": 0.19358091056346893,
"learning_rate": 0.00019976975267699758,
"loss": 0.5976,
"step": 900
},
{
"epoch": 0.41218435058317293,
"grad_norm": 0.20368127524852753,
"learning_rate": 0.00019975714379730998,
"loss": 0.637,
"step": 910
},
{
"epoch": 0.41671384894122976,
"grad_norm": 0.17673739790916443,
"learning_rate": 0.00019974419923276537,
"loss": 0.6014,
"step": 920
},
{
"epoch": 0.4212433472992866,
"grad_norm": 0.1759296953678131,
"learning_rate": 0.0001997309190269225,
"loss": 0.5822,
"step": 930
},
{
"epoch": 0.42577284565734347,
"grad_norm": 0.15785963833332062,
"learning_rate": 0.00019971730322446949,
"loss": 0.5856,
"step": 940
},
{
"epoch": 0.4303023440154003,
"grad_norm": 0.16193810105323792,
"learning_rate": 0.00019970335187122383,
"loss": 0.5854,
"step": 950
},
{
"epoch": 0.4348318423734571,
"grad_norm": 0.1555752456188202,
"learning_rate": 0.0001996890650141321,
"loss": 0.5852,
"step": 960
},
{
"epoch": 0.439361340731514,
"grad_norm": 0.17118428647518158,
"learning_rate": 0.00019967444270126988,
"loss": 0.5816,
"step": 970
},
{
"epoch": 0.44389083908957083,
"grad_norm": 0.15966954827308655,
"learning_rate": 0.00019965948498184153,
"loss": 0.5641,
"step": 980
},
{
"epoch": 0.44842033744762766,
"grad_norm": 0.20606863498687744,
"learning_rate": 0.0001996441919061801,
"loss": 0.588,
"step": 990
},
{
"epoch": 0.45294983580568454,
"grad_norm": 0.17158259451389313,
"learning_rate": 0.0001996285635257471,
"loss": 0.5437,
"step": 1000
},
{
"epoch": 0.45747933416374137,
"grad_norm": 0.1764381229877472,
"learning_rate": 0.0001996125998931324,
"loss": 0.5546,
"step": 1010
},
{
"epoch": 0.4620088325217982,
"grad_norm": 0.17307806015014648,
"learning_rate": 0.0001995963010620539,
"loss": 0.5442,
"step": 1020
},
{
"epoch": 0.4665383308798551,
"grad_norm": 0.17395785450935364,
"learning_rate": 0.00019957966708735754,
"loss": 0.5198,
"step": 1030
},
{
"epoch": 0.4710678292379119,
"grad_norm": 0.17280320823192596,
"learning_rate": 0.00019956269802501696,
"loss": 0.5235,
"step": 1040
},
{
"epoch": 0.47559732759596873,
"grad_norm": 0.1894276738166809,
"learning_rate": 0.00019954539393213344,
"loss": 0.539,
"step": 1050
},
{
"epoch": 0.4801268259540256,
"grad_norm": 0.19094131886959076,
"learning_rate": 0.0001995277548669356,
"loss": 0.5445,
"step": 1060
},
{
"epoch": 0.48465632431208244,
"grad_norm": 0.182444766163826,
"learning_rate": 0.00019950978088877923,
"loss": 0.526,
"step": 1070
},
{
"epoch": 0.48918582267013927,
"grad_norm": 0.2150132805109024,
"learning_rate": 0.00019949147205814715,
"loss": 0.5334,
"step": 1080
},
{
"epoch": 0.49371532102819615,
"grad_norm": 0.17609047889709473,
"learning_rate": 0.000199472828436649,
"loss": 0.5239,
"step": 1090
},
{
"epoch": 0.498244819386253,
"grad_norm": 0.18994882702827454,
"learning_rate": 0.0001994538500870209,
"loss": 0.5163,
"step": 1100
},
{
"epoch": 0.5027743177443098,
"grad_norm": 0.1678932011127472,
"learning_rate": 0.00019943453707312544,
"loss": 0.5379,
"step": 1110
},
{
"epoch": 0.5073038161023666,
"grad_norm": 0.18330644071102142,
"learning_rate": 0.00019941488945995125,
"loss": 0.5037,
"step": 1120
},
{
"epoch": 0.5118333144604235,
"grad_norm": 0.1946277767419815,
"learning_rate": 0.00019939490731361298,
"loss": 0.5169,
"step": 1130
},
{
"epoch": 0.5163628128184804,
"grad_norm": 0.1769060641527176,
"learning_rate": 0.00019937459070135097,
"loss": 0.5016,
"step": 1140
},
{
"epoch": 0.5208923111765372,
"grad_norm": 0.1812835931777954,
"learning_rate": 0.00019935393969153106,
"loss": 0.4974,
"step": 1150
},
{
"epoch": 0.525421809534594,
"grad_norm": 0.17336933314800262,
"learning_rate": 0.00019933295435364432,
"loss": 0.4936,
"step": 1160
},
{
"epoch": 0.5299513078926509,
"grad_norm": 0.19504410028457642,
"learning_rate": 0.00019931163475830682,
"loss": 0.4892,
"step": 1170
},
{
"epoch": 0.5344808062507077,
"grad_norm": 0.17446300387382507,
"learning_rate": 0.00019928998097725945,
"loss": 0.4851,
"step": 1180
},
{
"epoch": 0.5390103046087645,
"grad_norm": 0.2062528431415558,
"learning_rate": 0.00019926799308336767,
"loss": 0.4796,
"step": 1190
},
{
"epoch": 0.5435398029668215,
"grad_norm": 0.17791499197483063,
"learning_rate": 0.00019924567115062116,
"loss": 0.4704,
"step": 1200
},
{
"epoch": 0.5480693013248783,
"grad_norm": 0.20112474262714386,
"learning_rate": 0.00019922301525413368,
"loss": 0.4848,
"step": 1210
},
{
"epoch": 0.5525987996829351,
"grad_norm": 0.1905170977115631,
"learning_rate": 0.00019920002547014283,
"loss": 0.4848,
"step": 1220
},
{
"epoch": 0.557128298040992,
"grad_norm": 0.2167678326368332,
"learning_rate": 0.00019917670187600967,
"loss": 0.475,
"step": 1230
},
{
"epoch": 0.5616577963990488,
"grad_norm": 0.1879906803369522,
"learning_rate": 0.00019915304455021859,
"loss": 0.4661,
"step": 1240
},
{
"epoch": 0.5661872947571056,
"grad_norm": 0.17811033129692078,
"learning_rate": 0.00019912905357237701,
"loss": 0.4758,
"step": 1250
},
{
"epoch": 0.5707167931151625,
"grad_norm": 0.18101903796195984,
"learning_rate": 0.00019910472902321503,
"loss": 0.4668,
"step": 1260
},
{
"epoch": 0.5752462914732194,
"grad_norm": 0.1657211035490036,
"learning_rate": 0.0001990800709845853,
"loss": 0.4645,
"step": 1270
},
{
"epoch": 0.5797757898312762,
"grad_norm": 0.32196566462516785,
"learning_rate": 0.00019905507953946257,
"loss": 0.4442,
"step": 1280
},
{
"epoch": 0.584305288189333,
"grad_norm": 0.2010417878627777,
"learning_rate": 0.00019902975477194363,
"loss": 0.4633,
"step": 1290
},
{
"epoch": 0.5888347865473899,
"grad_norm": 0.18759405612945557,
"learning_rate": 0.00019900409676724682,
"loss": 0.4642,
"step": 1300
},
{
"epoch": 0.5933642849054467,
"grad_norm": 0.19315552711486816,
"learning_rate": 0.00019897810561171189,
"loss": 0.4308,
"step": 1310
},
{
"epoch": 0.5978937832635036,
"grad_norm": 0.194192036986351,
"learning_rate": 0.00019895178139279956,
"loss": 0.4424,
"step": 1320
},
{
"epoch": 0.6024232816215604,
"grad_norm": 0.17403574287891388,
"learning_rate": 0.00019892512419909138,
"loss": 0.4491,
"step": 1330
},
{
"epoch": 0.6069527799796173,
"grad_norm": 0.20866619050502777,
"learning_rate": 0.00019889813412028942,
"loss": 0.4546,
"step": 1340
},
{
"epoch": 0.6114822783376741,
"grad_norm": 0.1847338080406189,
"learning_rate": 0.00019887081124721583,
"loss": 0.4354,
"step": 1350
},
{
"epoch": 0.6160117766957309,
"grad_norm": 0.20528827607631683,
"learning_rate": 0.00019884315567181263,
"loss": 0.432,
"step": 1360
},
{
"epoch": 0.6205412750537878,
"grad_norm": 0.19688895344734192,
"learning_rate": 0.00019881516748714137,
"loss": 0.4256,
"step": 1370
},
{
"epoch": 0.6250707734118447,
"grad_norm": 0.1834789514541626,
"learning_rate": 0.00019878684678738295,
"loss": 0.4142,
"step": 1380
},
{
"epoch": 0.6296002717699015,
"grad_norm": 0.1904083490371704,
"learning_rate": 0.00019875819366783705,
"loss": 0.4072,
"step": 1390
},
{
"epoch": 0.6341297701279583,
"grad_norm": 0.24558007717132568,
"learning_rate": 0.00019872920822492206,
"loss": 0.4168,
"step": 1400
},
{
"epoch": 0.6386592684860152,
"grad_norm": 0.19825737178325653,
"learning_rate": 0.0001986998905561745,
"loss": 0.4102,
"step": 1410
},
{
"epoch": 0.643188766844072,
"grad_norm": 0.2427905946969986,
"learning_rate": 0.00019867024076024908,
"loss": 0.4266,
"step": 1420
},
{
"epoch": 0.6477182652021288,
"grad_norm": 0.20517700910568237,
"learning_rate": 0.00019864025893691784,
"loss": 0.4155,
"step": 1430
},
{
"epoch": 0.6522477635601857,
"grad_norm": 0.19519874453544617,
"learning_rate": 0.00019860994518707036,
"loss": 0.4093,
"step": 1440
},
{
"epoch": 0.6567772619182426,
"grad_norm": 0.17730577290058136,
"learning_rate": 0.0001985792996127129,
"loss": 0.3932,
"step": 1450
},
{
"epoch": 0.6613067602762994,
"grad_norm": 0.1811046451330185,
"learning_rate": 0.00019854832231696855,
"loss": 0.3953,
"step": 1460
},
{
"epoch": 0.6658362586343562,
"grad_norm": 0.18473340570926666,
"learning_rate": 0.00019851701340407654,
"loss": 0.3846,
"step": 1470
},
{
"epoch": 0.6703657569924131,
"grad_norm": 0.1876707524061203,
"learning_rate": 0.000198485372979392,
"loss": 0.3947,
"step": 1480
},
{
"epoch": 0.6748952553504699,
"grad_norm": 0.21453642845153809,
"learning_rate": 0.00019845340114938562,
"loss": 0.3893,
"step": 1490
},
{
"epoch": 0.6794247537085267,
"grad_norm": 0.19314515590667725,
"learning_rate": 0.00019842109802164327,
"loss": 0.3857,
"step": 1500
},
{
"epoch": 0.6839542520665837,
"grad_norm": 0.18713776767253876,
"learning_rate": 0.0001983884637048656,
"loss": 0.3945,
"step": 1510
},
{
"epoch": 0.6884837504246405,
"grad_norm": 0.18545708060264587,
"learning_rate": 0.00019835549830886785,
"loss": 0.3829,
"step": 1520
},
{
"epoch": 0.6930132487826973,
"grad_norm": 0.163354754447937,
"learning_rate": 0.00019832220194457919,
"loss": 0.3681,
"step": 1530
},
{
"epoch": 0.6975427471407541,
"grad_norm": 0.19729359447956085,
"learning_rate": 0.0001982885747240426,
"loss": 0.376,
"step": 1540
},
{
"epoch": 0.702072245498811,
"grad_norm": 0.19601188600063324,
"learning_rate": 0.00019825461676041436,
"loss": 0.3738,
"step": 1550
},
{
"epoch": 0.7066017438568678,
"grad_norm": 0.184451162815094,
"learning_rate": 0.00019822032816796376,
"loss": 0.3689,
"step": 1560
},
{
"epoch": 0.7111312422149247,
"grad_norm": 0.16905899345874786,
"learning_rate": 0.0001981857090620726,
"loss": 0.3667,
"step": 1570
},
{
"epoch": 0.7156607405729816,
"grad_norm": 0.17829935252666473,
"learning_rate": 0.0001981507595592349,
"loss": 0.3718,
"step": 1580
},
{
"epoch": 0.7201902389310384,
"grad_norm": 0.17314116656780243,
"learning_rate": 0.0001981154797770564,
"loss": 0.3711,
"step": 1590
},
{
"epoch": 0.7247197372890952,
"grad_norm": 0.17752452194690704,
"learning_rate": 0.0001980798698342544,
"loss": 0.3711,
"step": 1600
},
{
"epoch": 0.729249235647152,
"grad_norm": 0.16267523169517517,
"learning_rate": 0.00019804392985065702,
"loss": 0.3461,
"step": 1610
},
{
"epoch": 0.7337787340052089,
"grad_norm": 0.1715889424085617,
"learning_rate": 0.00019800765994720308,
"loss": 0.3542,
"step": 1620
},
{
"epoch": 0.7383082323632658,
"grad_norm": 0.2011169195175171,
"learning_rate": 0.00019797106024594153,
"loss": 0.3602,
"step": 1630
},
{
"epoch": 0.7428377307213226,
"grad_norm": 0.16859227418899536,
"learning_rate": 0.00019793413087003115,
"loss": 0.3509,
"step": 1640
},
{
"epoch": 0.7473672290793795,
"grad_norm": 0.18904465436935425,
"learning_rate": 0.0001978968719437401,
"loss": 0.3619,
"step": 1650
},
{
"epoch": 0.7518967274374363,
"grad_norm": 0.1918095499277115,
"learning_rate": 0.00019785928359244533,
"loss": 0.3529,
"step": 1660
},
{
"epoch": 0.7564262257954931,
"grad_norm": 0.16930030286312103,
"learning_rate": 0.0001978213659426325,
"loss": 0.3505,
"step": 1670
},
{
"epoch": 0.76095572415355,
"grad_norm": 0.19345726072788239,
"learning_rate": 0.00019778311912189528,
"loss": 0.3548,
"step": 1680
},
{
"epoch": 0.7654852225116069,
"grad_norm": 0.1755731701850891,
"learning_rate": 0.000197744543258935,
"loss": 0.3549,
"step": 1690
},
{
"epoch": 0.7700147208696637,
"grad_norm": 0.17827914655208588,
"learning_rate": 0.00019770563848356024,
"loss": 0.3622,
"step": 1700
},
{
"epoch": 0.7745442192277205,
"grad_norm": 0.1955813765525818,
"learning_rate": 0.0001976664049266864,
"loss": 0.3412,
"step": 1710
},
{
"epoch": 0.7790737175857774,
"grad_norm": 0.18960636854171753,
"learning_rate": 0.00019762684272033515,
"loss": 0.3438,
"step": 1720
},
{
"epoch": 0.7836032159438342,
"grad_norm": 0.20935559272766113,
"learning_rate": 0.00019758695199763418,
"loss": 0.3497,
"step": 1730
},
{
"epoch": 0.788132714301891,
"grad_norm": 0.18760916590690613,
"learning_rate": 0.00019754673289281663,
"loss": 0.3299,
"step": 1740
},
{
"epoch": 0.792662212659948,
"grad_norm": 0.2013741135597229,
"learning_rate": 0.0001975061855412206,
"loss": 0.3395,
"step": 1750
},
{
"epoch": 0.7971917110180048,
"grad_norm": 0.18885807693004608,
"learning_rate": 0.0001974653100792887,
"loss": 0.3321,
"step": 1760
},
{
"epoch": 0.8017212093760616,
"grad_norm": 0.18193817138671875,
"learning_rate": 0.00019742410664456777,
"loss": 0.3387,
"step": 1770
},
{
"epoch": 0.8062507077341184,
"grad_norm": 0.16840125620365143,
"learning_rate": 0.00019738257537570822,
"loss": 0.3302,
"step": 1780
},
{
"epoch": 0.8107802060921753,
"grad_norm": 0.1618867665529251,
"learning_rate": 0.00019734071641246365,
"loss": 0.3212,
"step": 1790
},
{
"epoch": 0.8153097044502321,
"grad_norm": 0.20026183128356934,
"learning_rate": 0.00019729852989569028,
"loss": 0.3274,
"step": 1800
},
{
"epoch": 0.819839202808289,
"grad_norm": 0.18741321563720703,
"learning_rate": 0.00019725601596734668,
"loss": 0.3267,
"step": 1810
},
{
"epoch": 0.8243687011663459,
"grad_norm": 0.17450092732906342,
"learning_rate": 0.000197213174770493,
"loss": 0.3193,
"step": 1820
},
{
"epoch": 0.8288981995244027,
"grad_norm": 0.1721801608800888,
"learning_rate": 0.00019717000644929087,
"loss": 0.3127,
"step": 1830
},
{
"epoch": 0.8334276978824595,
"grad_norm": 0.18926140666007996,
"learning_rate": 0.00019712651114900257,
"loss": 0.3214,
"step": 1840
},
{
"epoch": 0.8379571962405163,
"grad_norm": 0.17309771478176117,
"learning_rate": 0.0001970826890159906,
"loss": 0.318,
"step": 1850
},
{
"epoch": 0.8424866945985732,
"grad_norm": 0.18818823993206024,
"learning_rate": 0.00019703854019771742,
"loss": 0.3154,
"step": 1860
},
{
"epoch": 0.84701619295663,
"grad_norm": 0.18680931627750397,
"learning_rate": 0.00019699406484274468,
"loss": 0.3104,
"step": 1870
},
{
"epoch": 0.8515456913146869,
"grad_norm": 0.16489103436470032,
"learning_rate": 0.0001969492631007329,
"loss": 0.3232,
"step": 1880
},
{
"epoch": 0.8560751896727438,
"grad_norm": 0.17721644043922424,
"learning_rate": 0.0001969041351224409,
"loss": 0.3034,
"step": 1890
},
{
"epoch": 0.8606046880308006,
"grad_norm": 0.19497451186180115,
"learning_rate": 0.00019685868105972517,
"loss": 0.3092,
"step": 1900
},
{
"epoch": 0.8651341863888574,
"grad_norm": 0.20427413284778595,
"learning_rate": 0.00019681290106553969,
"loss": 0.3158,
"step": 1910
},
{
"epoch": 0.8696636847469142,
"grad_norm": 0.18642422556877136,
"learning_rate": 0.00019676679529393498,
"loss": 0.3058,
"step": 1920
},
{
"epoch": 0.8741931831049711,
"grad_norm": 0.16172035038471222,
"learning_rate": 0.00019672036390005798,
"loss": 0.3069,
"step": 1930
},
{
"epoch": 0.878722681463028,
"grad_norm": 0.15888796746730804,
"learning_rate": 0.00019667360704015127,
"loss": 0.3075,
"step": 1940
},
{
"epoch": 0.8832521798210848,
"grad_norm": 0.16608227789402008,
"learning_rate": 0.0001966265248715527,
"loss": 0.295,
"step": 1950
},
{
"epoch": 0.8877816781791417,
"grad_norm": 0.18529315292835236,
"learning_rate": 0.00019657911755269466,
"loss": 0.3087,
"step": 1960
},
{
"epoch": 0.8923111765371985,
"grad_norm": 0.1623723804950714,
"learning_rate": 0.0001965313852431038,
"loss": 0.318,
"step": 1970
},
{
"epoch": 0.8968406748952553,
"grad_norm": 0.18999403715133667,
"learning_rate": 0.0001964833281034004,
"loss": 0.3013,
"step": 1980
},
{
"epoch": 0.9013701732533121,
"grad_norm": 0.1742704212665558,
"learning_rate": 0.0001964349462952976,
"loss": 0.2906,
"step": 1990
},
{
"epoch": 0.9058996716113691,
"grad_norm": 0.15007524192333221,
"learning_rate": 0.00019638623998160127,
"loss": 0.2909,
"step": 2000
},
{
"epoch": 0.9104291699694259,
"grad_norm": 0.18087700009346008,
"learning_rate": 0.00019633720932620916,
"loss": 0.2852,
"step": 2010
},
{
"epoch": 0.9149586683274827,
"grad_norm": 0.172203928232193,
"learning_rate": 0.0001962878544941104,
"loss": 0.2894,
"step": 2020
},
{
"epoch": 0.9194881666855396,
"grad_norm": 0.1811007559299469,
"learning_rate": 0.00019623817565138512,
"loss": 0.2905,
"step": 2030
},
{
"epoch": 0.9240176650435964,
"grad_norm": 0.17736268043518066,
"learning_rate": 0.00019618817296520355,
"loss": 0.2855,
"step": 2040
},
{
"epoch": 0.9285471634016532,
"grad_norm": 0.1875537484884262,
"learning_rate": 0.00019613784660382582,
"loss": 0.3006,
"step": 2050
},
{
"epoch": 0.9330766617597102,
"grad_norm": 0.16459111869335175,
"learning_rate": 0.00019608719673660117,
"loss": 0.2928,
"step": 2060
},
{
"epoch": 0.937606160117767,
"grad_norm": 0.19852280616760254,
"learning_rate": 0.00019603622353396745,
"loss": 0.2877,
"step": 2070
},
{
"epoch": 0.9421356584758238,
"grad_norm": 0.1441079080104828,
"learning_rate": 0.00019598492716745055,
"loss": 0.2722,
"step": 2080
},
{
"epoch": 0.9466651568338806,
"grad_norm": 0.17091263830661774,
"learning_rate": 0.00019593330780966377,
"loss": 0.2845,
"step": 2090
},
{
"epoch": 0.9511946551919375,
"grad_norm": 0.17907531559467316,
"learning_rate": 0.00019588136563430735,
"loss": 0.2881,
"step": 2100
},
{
"epoch": 0.9557241535499943,
"grad_norm": 0.18411681056022644,
"learning_rate": 0.00019582910081616782,
"loss": 0.2906,
"step": 2110
},
{
"epoch": 0.9602536519080512,
"grad_norm": 0.19341252744197845,
"learning_rate": 0.00019577651353111733,
"loss": 0.2926,
"step": 2120
},
{
"epoch": 0.9647831502661081,
"grad_norm": 0.17022013664245605,
"learning_rate": 0.00019572360395611317,
"loss": 0.2728,
"step": 2130
},
{
"epoch": 0.9693126486241649,
"grad_norm": 0.17077523469924927,
"learning_rate": 0.00019567037226919721,
"loss": 0.2754,
"step": 2140
},
{
"epoch": 0.9738421469822217,
"grad_norm": 0.16188162565231323,
"learning_rate": 0.00019561681864949514,
"loss": 0.2761,
"step": 2150
},
{
"epoch": 0.9783716453402785,
"grad_norm": 0.16258101165294647,
"learning_rate": 0.00019556294327721603,
"loss": 0.2724,
"step": 2160
},
{
"epoch": 0.9829011436983354,
"grad_norm": 0.1751459836959839,
"learning_rate": 0.00019550874633365162,
"loss": 0.2844,
"step": 2170
},
{
"epoch": 0.9874306420563923,
"grad_norm": 0.14674732089042664,
"learning_rate": 0.0001954542280011757,
"loss": 0.2818,
"step": 2180
},
{
"epoch": 0.9919601404144491,
"grad_norm": 0.1843065619468689,
"learning_rate": 0.00019539938846324363,
"loss": 0.2736,
"step": 2190
},
{
"epoch": 0.996489638772506,
"grad_norm": 0.18449115753173828,
"learning_rate": 0.00019534422790439164,
"loss": 0.2828,
"step": 2200
},
{
"epoch": 0.9996602876231457,
"eval_loss": 0.26604515314102173,
"eval_runtime": 617.1505,
"eval_samples_per_second": 12.752,
"eval_steps_per_second": 1.594,
"step": 2207
},
{
"epoch": 1.001358849507417,
"grad_norm": 0.15234586596488953,
"learning_rate": 0.00019528874651023606,
"loss": 0.2608,
"step": 2210
},
{
"epoch": 1.0058883478654739,
"grad_norm": 0.15887659788131714,
"learning_rate": 0.00019523294446747297,
"loss": 0.2417,
"step": 2220
},
{
"epoch": 1.0104178462235307,
"grad_norm": 0.16629189252853394,
"learning_rate": 0.00019517682196387744,
"loss": 0.2306,
"step": 2230
},
{
"epoch": 1.0149473445815875,
"grad_norm": 0.17960551381111145,
"learning_rate": 0.00019512037918830282,
"loss": 0.2279,
"step": 2240
},
{
"epoch": 1.0194768429396444,
"grad_norm": 0.1671302169561386,
"learning_rate": 0.0001950636163306802,
"loss": 0.2181,
"step": 2250
},
{
"epoch": 1.0240063412977012,
"grad_norm": 0.16400860249996185,
"learning_rate": 0.0001950065335820178,
"loss": 0.2333,
"step": 2260
},
{
"epoch": 1.0285358396557582,
"grad_norm": 0.15259268879890442,
"learning_rate": 0.00019494913113440022,
"loss": 0.2307,
"step": 2270
},
{
"epoch": 1.033065338013815,
"grad_norm": 0.1612786501646042,
"learning_rate": 0.00019489140918098796,
"loss": 0.2349,
"step": 2280
},
{
"epoch": 1.0375948363718719,
"grad_norm": 0.15766066312789917,
"learning_rate": 0.00019483336791601655,
"loss": 0.23,
"step": 2290
},
{
"epoch": 1.0421243347299287,
"grad_norm": 0.16044190526008606,
"learning_rate": 0.00019477500753479603,
"loss": 0.2234,
"step": 2300
},
{
"epoch": 1.0466538330879855,
"grad_norm": 0.18357709050178528,
"learning_rate": 0.00019471632823371028,
"loss": 0.2208,
"step": 2310
},
{
"epoch": 1.0511833314460424,
"grad_norm": 0.1702904850244522,
"learning_rate": 0.00019465733021021645,
"loss": 0.2248,
"step": 2320
},
{
"epoch": 1.0557128298040992,
"grad_norm": 0.15621191263198853,
"learning_rate": 0.00019459801366284403,
"loss": 0.2286,
"step": 2330
},
{
"epoch": 1.060242328162156,
"grad_norm": 0.1782391220331192,
"learning_rate": 0.00019453837879119444,
"loss": 0.2304,
"step": 2340
},
{
"epoch": 1.0647718265202128,
"grad_norm": 0.16530479490756989,
"learning_rate": 0.00019447842579594027,
"loss": 0.2306,
"step": 2350
},
{
"epoch": 1.0693013248782697,
"grad_norm": 0.16082873940467834,
"learning_rate": 0.00019441815487882463,
"loss": 0.2252,
"step": 2360
},
{
"epoch": 1.0738308232363265,
"grad_norm": 0.15404802560806274,
"learning_rate": 0.00019435756624266035,
"loss": 0.216,
"step": 2370
},
{
"epoch": 1.0783603215943833,
"grad_norm": 0.14842167496681213,
"learning_rate": 0.00019429666009132944,
"loss": 0.2218,
"step": 2380
},
{
"epoch": 1.0828898199524404,
"grad_norm": 0.16312135756015778,
"learning_rate": 0.00019423543662978245,
"loss": 0.212,
"step": 2390
},
{
"epoch": 1.0874193183104972,
"grad_norm": 0.17386338114738464,
"learning_rate": 0.00019417389606403752,
"loss": 0.2251,
"step": 2400
},
{
"epoch": 1.091948816668554,
"grad_norm": 0.17737415432929993,
"learning_rate": 0.00019411203860117995,
"loss": 0.2304,
"step": 2410
},
{
"epoch": 1.0964783150266109,
"grad_norm": 0.16693584620952606,
"learning_rate": 0.00019404986444936136,
"loss": 0.2175,
"step": 2420
},
{
"epoch": 1.1010078133846677,
"grad_norm": 0.1775166392326355,
"learning_rate": 0.00019398737381779913,
"loss": 0.2209,
"step": 2430
},
{
"epoch": 1.1055373117427245,
"grad_norm": 0.1629152148962021,
"learning_rate": 0.00019392456691677546,
"loss": 0.2113,
"step": 2440
},
{
"epoch": 1.1100668101007813,
"grad_norm": 0.1428159475326538,
"learning_rate": 0.0001938614439576369,
"loss": 0.2141,
"step": 2450
},
{
"epoch": 1.1145963084588382,
"grad_norm": 0.1580020934343338,
"learning_rate": 0.0001937980051527935,
"loss": 0.2193,
"step": 2460
},
{
"epoch": 1.119125806816895,
"grad_norm": 0.13861976563930511,
"learning_rate": 0.0001937342507157182,
"loss": 0.2091,
"step": 2470
},
{
"epoch": 1.1236553051749518,
"grad_norm": 0.16170430183410645,
"learning_rate": 0.00019367018086094594,
"loss": 0.2175,
"step": 2480
},
{
"epoch": 1.1281848035330087,
"grad_norm": 0.15579678118228912,
"learning_rate": 0.00019360579580407315,
"loss": 0.2091,
"step": 2490
},
{
"epoch": 1.1327143018910655,
"grad_norm": 0.15239351987838745,
"learning_rate": 0.00019354109576175685,
"loss": 0.2189,
"step": 2500
},
{
"epoch": 1.1372438002491223,
"grad_norm": 0.16122813522815704,
"learning_rate": 0.00019347608095171407,
"loss": 0.2159,
"step": 2510
},
{
"epoch": 1.1417732986071791,
"grad_norm": 0.14791563153266907,
"learning_rate": 0.00019341075159272096,
"loss": 0.2093,
"step": 2520
},
{
"epoch": 1.1463027969652362,
"grad_norm": 0.138755664229393,
"learning_rate": 0.0001933451079046122,
"loss": 0.2231,
"step": 2530
},
{
"epoch": 1.150832295323293,
"grad_norm": 0.15061049163341522,
"learning_rate": 0.0001932791501082801,
"loss": 0.2067,
"step": 2540
},
{
"epoch": 1.1553617936813498,
"grad_norm": 0.17541393637657166,
"learning_rate": 0.00019321287842567408,
"loss": 0.2197,
"step": 2550
},
{
"epoch": 1.1598912920394067,
"grad_norm": 0.17274054884910583,
"learning_rate": 0.00019314629307979968,
"loss": 0.2179,
"step": 2560
},
{
"epoch": 1.1644207903974635,
"grad_norm": 0.16083642840385437,
"learning_rate": 0.000193079394294718,
"loss": 0.2139,
"step": 2570
},
{
"epoch": 1.1689502887555203,
"grad_norm": 0.16815818846225739,
"learning_rate": 0.00019301218229554482,
"loss": 0.2158,
"step": 2580
},
{
"epoch": 1.1734797871135771,
"grad_norm": 0.15939727425575256,
"learning_rate": 0.0001929446573084499,
"loss": 0.2139,
"step": 2590
},
{
"epoch": 1.178009285471634,
"grad_norm": 0.14855942130088806,
"learning_rate": 0.00019287681956065624,
"loss": 0.2156,
"step": 2600
},
{
"epoch": 1.1825387838296908,
"grad_norm": 0.16065727174282074,
"learning_rate": 0.00019280866928043927,
"loss": 0.2131,
"step": 2610
},
{
"epoch": 1.1870682821877476,
"grad_norm": 0.15156914293766022,
"learning_rate": 0.00019274020669712608,
"loss": 0.2133,
"step": 2620
},
{
"epoch": 1.1915977805458047,
"grad_norm": 0.15163294970989227,
"learning_rate": 0.00019267143204109469,
"loss": 0.2172,
"step": 2630
},
{
"epoch": 1.1961272789038615,
"grad_norm": 0.14060626924037933,
"learning_rate": 0.00019260234554377325,
"loss": 0.2048,
"step": 2640
},
{
"epoch": 1.2006567772619183,
"grad_norm": 0.16215626895427704,
"learning_rate": 0.00019253294743763925,
"loss": 0.2077,
"step": 2650
},
{
"epoch": 1.2051862756199752,
"grad_norm": 0.13906173408031464,
"learning_rate": 0.00019246323795621875,
"loss": 0.2125,
"step": 2660
},
{
"epoch": 1.209715773978032,
"grad_norm": 0.15761959552764893,
"learning_rate": 0.0001923932173340856,
"loss": 0.2104,
"step": 2670
},
{
"epoch": 1.2142452723360888,
"grad_norm": 0.16140113770961761,
"learning_rate": 0.00019232288580686068,
"loss": 0.2131,
"step": 2680
},
{
"epoch": 1.2187747706941456,
"grad_norm": 0.13611847162246704,
"learning_rate": 0.000192252243611211,
"loss": 0.2042,
"step": 2690
},
{
"epoch": 1.2233042690522025,
"grad_norm": 0.14395853877067566,
"learning_rate": 0.00019218129098484902,
"loss": 0.2144,
"step": 2700
},
{
"epoch": 1.2278337674102593,
"grad_norm": 0.14826107025146484,
"learning_rate": 0.0001921100281665318,
"loss": 0.2119,
"step": 2710
},
{
"epoch": 1.2323632657683161,
"grad_norm": 0.1515769064426422,
"learning_rate": 0.0001920384553960602,
"loss": 0.2051,
"step": 2720
},
{
"epoch": 1.236892764126373,
"grad_norm": 0.15898488461971283,
"learning_rate": 0.00019196657291427807,
"loss": 0.2127,
"step": 2730
},
{
"epoch": 1.2414222624844298,
"grad_norm": 0.13833607733249664,
"learning_rate": 0.00019189438096307146,
"loss": 0.2097,
"step": 2740
},
{
"epoch": 1.2459517608424866,
"grad_norm": 0.14516334235668182,
"learning_rate": 0.0001918218797853678,
"loss": 0.1958,
"step": 2750
},
{
"epoch": 1.2504812592005434,
"grad_norm": 0.13684655725955963,
"learning_rate": 0.00019174906962513504,
"loss": 0.2196,
"step": 2760
},
{
"epoch": 1.2550107575586003,
"grad_norm": 0.16645090281963348,
"learning_rate": 0.00019167595072738084,
"loss": 0.2095,
"step": 2770
},
{
"epoch": 1.2595402559166573,
"grad_norm": 0.1568327695131302,
"learning_rate": 0.00019160252333815187,
"loss": 0.2046,
"step": 2780
},
{
"epoch": 1.2640697542747141,
"grad_norm": 0.15349489450454712,
"learning_rate": 0.00019152878770453279,
"loss": 0.2124,
"step": 2790
},
{
"epoch": 1.268599252632771,
"grad_norm": 0.16242361068725586,
"learning_rate": 0.00019145474407464554,
"loss": 0.2059,
"step": 2800
},
{
"epoch": 1.2731287509908278,
"grad_norm": 0.15133287012577057,
"learning_rate": 0.00019138039269764846,
"loss": 0.2068,
"step": 2810
},
{
"epoch": 1.2776582493488846,
"grad_norm": 0.1698140799999237,
"learning_rate": 0.00019130573382373549,
"loss": 0.2165,
"step": 2820
},
{
"epoch": 1.2821877477069414,
"grad_norm": 0.16591964662075043,
"learning_rate": 0.00019123076770413526,
"loss": 0.2052,
"step": 2830
},
{
"epoch": 1.2867172460649983,
"grad_norm": 0.14136140048503876,
"learning_rate": 0.00019115549459111034,
"loss": 0.1972,
"step": 2840
},
{
"epoch": 1.291246744423055,
"grad_norm": 0.15886986255645752,
"learning_rate": 0.0001910799147379563,
"loss": 0.2178,
"step": 2850
},
{
"epoch": 1.295776242781112,
"grad_norm": 0.143589586019516,
"learning_rate": 0.00019100402839900097,
"loss": 0.2139,
"step": 2860
},
{
"epoch": 1.300305741139169,
"grad_norm": 0.16037988662719727,
"learning_rate": 0.0001909278358296034,
"loss": 0.2073,
"step": 2870
},
{
"epoch": 1.3048352394972258,
"grad_norm": 0.1397211104631424,
"learning_rate": 0.00019085133728615313,
"loss": 0.2045,
"step": 2880
},
{
"epoch": 1.3093647378552826,
"grad_norm": 0.1394536942243576,
"learning_rate": 0.00019077453302606944,
"loss": 0.194,
"step": 2890
},
{
"epoch": 1.3138942362133395,
"grad_norm": 0.1598595380783081,
"learning_rate": 0.00019069742330780014,
"loss": 0.205,
"step": 2900
},
{
"epoch": 1.3184237345713963,
"grad_norm": 0.16302059590816498,
"learning_rate": 0.00019062000839082115,
"loss": 0.2044,
"step": 2910
},
{
"epoch": 1.322953232929453,
"grad_norm": 0.15237270295619965,
"learning_rate": 0.0001905422885356352,
"loss": 0.2061,
"step": 2920
},
{
"epoch": 1.32748273128751,
"grad_norm": 0.16175110638141632,
"learning_rate": 0.00019046426400377123,
"loss": 0.2127,
"step": 2930
},
{
"epoch": 1.3320122296455668,
"grad_norm": 0.17352445423603058,
"learning_rate": 0.00019038593505778343,
"loss": 0.2121,
"step": 2940
},
{
"epoch": 1.3365417280036236,
"grad_norm": 0.15539845824241638,
"learning_rate": 0.0001903073019612503,
"loss": 0.1996,
"step": 2950
},
{
"epoch": 1.3410712263616804,
"grad_norm": 0.1654234081506729,
"learning_rate": 0.00019022836497877382,
"loss": 0.1982,
"step": 2960
},
{
"epoch": 1.3456007247197372,
"grad_norm": 0.15698087215423584,
"learning_rate": 0.00019014912437597862,
"loss": 0.2006,
"step": 2970
},
{
"epoch": 1.350130223077794,
"grad_norm": 0.15171001851558685,
"learning_rate": 0.00019006958041951094,
"loss": 0.2066,
"step": 2980
},
{
"epoch": 1.354659721435851,
"grad_norm": 0.15153132379055023,
"learning_rate": 0.00018998973337703784,
"loss": 0.1969,
"step": 2990
},
{
"epoch": 1.3591892197939077,
"grad_norm": 0.14000695943832397,
"learning_rate": 0.00018990958351724634,
"loss": 0.2081,
"step": 3000
},
{
"epoch": 1.3637187181519645,
"grad_norm": 0.14371009171009064,
"learning_rate": 0.00018982913110984225,
"loss": 0.1964,
"step": 3010
},
{
"epoch": 1.3682482165100216,
"grad_norm": 0.1594901829957962,
"learning_rate": 0.0001897483764255497,
"loss": 0.2004,
"step": 3020
},
{
"epoch": 1.3727777148680784,
"grad_norm": 0.15266938507556915,
"learning_rate": 0.00018966731973610985,
"loss": 0.2081,
"step": 3030
},
{
"epoch": 1.3773072132261353,
"grad_norm": 0.17764367163181305,
"learning_rate": 0.0001895859613142801,
"loss": 0.2028,
"step": 3040
},
{
"epoch": 1.381836711584192,
"grad_norm": 0.15086011588573456,
"learning_rate": 0.0001895043014338333,
"loss": 0.1984,
"step": 3050
},
{
"epoch": 1.386366209942249,
"grad_norm": 0.1648501455783844,
"learning_rate": 0.00018942234036955659,
"loss": 0.2019,
"step": 3060
},
{
"epoch": 1.3908957083003057,
"grad_norm": 0.1467510610818863,
"learning_rate": 0.00018934007839725063,
"loss": 0.1972,
"step": 3070
},
{
"epoch": 1.3954252066583626,
"grad_norm": 0.17046092450618744,
"learning_rate": 0.0001892575157937287,
"loss": 0.2053,
"step": 3080
},
{
"epoch": 1.3999547050164194,
"grad_norm": 0.14983297884464264,
"learning_rate": 0.0001891746528368157,
"loss": 0.1986,
"step": 3090
},
{
"epoch": 1.4044842033744762,
"grad_norm": 0.16196715831756592,
"learning_rate": 0.00018909148980534712,
"loss": 0.1982,
"step": 3100
},
{
"epoch": 1.409013701732533,
"grad_norm": 0.15527282655239105,
"learning_rate": 0.00018900802697916836,
"loss": 0.2028,
"step": 3110
},
{
"epoch": 1.41354320009059,
"grad_norm": 0.1645379364490509,
"learning_rate": 0.0001889242646391335,
"loss": 0.1939,
"step": 3120
},
{
"epoch": 1.418072698448647,
"grad_norm": 0.1684643030166626,
"learning_rate": 0.0001888402030671046,
"loss": 0.1931,
"step": 3130
},
{
"epoch": 1.4226021968067037,
"grad_norm": 0.15814268589019775,
"learning_rate": 0.00018875584254595055,
"loss": 0.1951,
"step": 3140
},
{
"epoch": 1.4271316951647606,
"grad_norm": 0.1520155966281891,
"learning_rate": 0.00018867118335954625,
"loss": 0.1886,
"step": 3150
},
{
"epoch": 1.4316611935228174,
"grad_norm": 0.16438494622707367,
"learning_rate": 0.0001885862257927717,
"loss": 0.2015,
"step": 3160
},
{
"epoch": 1.4361906918808742,
"grad_norm": 0.15568524599075317,
"learning_rate": 0.00018850097013151077,
"loss": 0.1898,
"step": 3170
},
{
"epoch": 1.440720190238931,
"grad_norm": 0.15463340282440186,
"learning_rate": 0.00018841541666265058,
"loss": 0.1988,
"step": 3180
},
{
"epoch": 1.4452496885969879,
"grad_norm": 0.14274995028972626,
"learning_rate": 0.00018832956567408032,
"loss": 0.1884,
"step": 3190
},
{
"epoch": 1.4497791869550447,
"grad_norm": 0.17546044290065765,
"learning_rate": 0.00018824341745469033,
"loss": 0.1959,
"step": 3200
},
{
"epoch": 1.4543086853131015,
"grad_norm": 0.14111734926700592,
"learning_rate": 0.0001881569722943712,
"loss": 0.1929,
"step": 3210
},
{
"epoch": 1.4588381836711584,
"grad_norm": 0.1645372211933136,
"learning_rate": 0.00018807023048401263,
"loss": 0.1913,
"step": 3220
},
{
"epoch": 1.4633676820292152,
"grad_norm": 0.16762864589691162,
"learning_rate": 0.00018798319231550265,
"loss": 0.1876,
"step": 3230
},
{
"epoch": 1.467897180387272,
"grad_norm": 0.14765408635139465,
"learning_rate": 0.00018789585808172649,
"loss": 0.1935,
"step": 3240
},
{
"epoch": 1.4724266787453288,
"grad_norm": 0.16272325813770294,
"learning_rate": 0.0001878082280765656,
"loss": 0.199,
"step": 3250
},
{
"epoch": 1.4769561771033857,
"grad_norm": 0.14496152102947235,
"learning_rate": 0.0001877203025948969,
"loss": 0.1987,
"step": 3260
},
{
"epoch": 1.4814856754614427,
"grad_norm": 0.1556200087070465,
"learning_rate": 0.00018763208193259132,
"loss": 0.1938,
"step": 3270
},
{
"epoch": 1.4860151738194995,
"grad_norm": 0.14785943925380707,
"learning_rate": 0.00018754356638651332,
"loss": 0.1905,
"step": 3280
},
{
"epoch": 1.4905446721775564,
"grad_norm": 0.14636161923408508,
"learning_rate": 0.00018745475625451947,
"loss": 0.1928,
"step": 3290
},
{
"epoch": 1.4950741705356132,
"grad_norm": 0.16059593856334686,
"learning_rate": 0.00018736565183545773,
"loss": 0.1967,
"step": 3300
},
{
"epoch": 1.49960366889367,
"grad_norm": 0.15864983201026917,
"learning_rate": 0.00018727625342916633,
"loss": 0.1984,
"step": 3310
},
{
"epoch": 1.5041331672517269,
"grad_norm": 0.14578469097614288,
"learning_rate": 0.00018718656133647277,
"loss": 0.1848,
"step": 3320
},
{
"epoch": 1.5086626656097837,
"grad_norm": 0.16975462436676025,
"learning_rate": 0.00018709657585919275,
"loss": 0.1914,
"step": 3330
},
{
"epoch": 1.5131921639678405,
"grad_norm": 0.14356206357479095,
"learning_rate": 0.00018700629730012934,
"loss": 0.1978,
"step": 3340
},
{
"epoch": 1.5177216623258976,
"grad_norm": 0.14980971813201904,
"learning_rate": 0.00018691572596307173,
"loss": 0.1993,
"step": 3350
},
{
"epoch": 1.5222511606839544,
"grad_norm": 0.1422482430934906,
"learning_rate": 0.00018682486215279435,
"loss": 0.187,
"step": 3360
},
{
"epoch": 1.5267806590420112,
"grad_norm": 0.1586323380470276,
"learning_rate": 0.00018673370617505576,
"loss": 0.1843,
"step": 3370
},
{
"epoch": 1.531310157400068,
"grad_norm": 0.1464434564113617,
"learning_rate": 0.00018664225833659777,
"loss": 0.1973,
"step": 3380
},
{
"epoch": 1.5358396557581249,
"grad_norm": 0.16265639662742615,
"learning_rate": 0.00018655051894514424,
"loss": 0.1873,
"step": 3390
},
{
"epoch": 1.5403691541161817,
"grad_norm": 0.13967713713645935,
"learning_rate": 0.00018645848830940013,
"loss": 0.1834,
"step": 3400
},
{
"epoch": 1.5448986524742385,
"grad_norm": 0.12256325781345367,
"learning_rate": 0.0001863661667390504,
"loss": 0.1849,
"step": 3410
},
{
"epoch": 1.5494281508322953,
"grad_norm": 0.14708378911018372,
"learning_rate": 0.00018627355454475908,
"loss": 0.1921,
"step": 3420
},
{
"epoch": 1.5539576491903522,
"grad_norm": 0.14427697658538818,
"learning_rate": 0.00018618065203816812,
"loss": 0.1863,
"step": 3430
},
{
"epoch": 1.558487147548409,
"grad_norm": 0.1333187371492386,
"learning_rate": 0.0001860874595318964,
"loss": 0.1927,
"step": 3440
},
{
"epoch": 1.5630166459064658,
"grad_norm": 0.15604457259178162,
"learning_rate": 0.00018599397733953858,
"loss": 0.1841,
"step": 3450
},
{
"epoch": 1.5675461442645227,
"grad_norm": 0.147917240858078,
"learning_rate": 0.00018590020577566424,
"loss": 0.1886,
"step": 3460
},
{
"epoch": 1.5720756426225795,
"grad_norm": 0.14821654558181763,
"learning_rate": 0.0001858061451558166,
"loss": 0.1833,
"step": 3470
},
{
"epoch": 1.5766051409806363,
"grad_norm": 0.12086760997772217,
"learning_rate": 0.00018571179579651159,
"loss": 0.1918,
"step": 3480
},
{
"epoch": 1.5811346393386931,
"grad_norm": 0.16424959897994995,
"learning_rate": 0.0001856171580152368,
"loss": 0.1792,
"step": 3490
},
{
"epoch": 1.58566413769675,
"grad_norm": 0.14219975471496582,
"learning_rate": 0.00018552223213045028,
"loss": 0.1946,
"step": 3500
},
{
"epoch": 1.5901936360548068,
"grad_norm": 0.1768968552350998,
"learning_rate": 0.00018542701846157962,
"loss": 0.1843,
"step": 3510
},
{
"epoch": 1.5947231344128636,
"grad_norm": 0.12454737722873688,
"learning_rate": 0.0001853315173290208,
"loss": 0.1836,
"step": 3520
},
{
"epoch": 1.5992526327709207,
"grad_norm": 0.14064136147499084,
"learning_rate": 0.00018523572905413709,
"loss": 0.1841,
"step": 3530
},
{
"epoch": 1.6037821311289775,
"grad_norm": 0.15816141664981842,
"learning_rate": 0.00018513965395925802,
"loss": 0.1882,
"step": 3540
},
{
"epoch": 1.6083116294870343,
"grad_norm": 0.15514902770519257,
"learning_rate": 0.00018504329236767832,
"loss": 0.1881,
"step": 3550
},
{
"epoch": 1.6128411278450911,
"grad_norm": 0.15803417563438416,
"learning_rate": 0.00018494664460365668,
"loss": 0.1859,
"step": 3560
},
{
"epoch": 1.617370626203148,
"grad_norm": 0.12781353294849396,
"learning_rate": 0.00018484971099241485,
"loss": 0.1832,
"step": 3570
},
{
"epoch": 1.6219001245612048,
"grad_norm": 0.16309882700443268,
"learning_rate": 0.0001847524918601365,
"loss": 0.1962,
"step": 3580
},
{
"epoch": 1.6264296229192619,
"grad_norm": 0.12590362131595612,
"learning_rate": 0.00018465498753396595,
"loss": 0.1928,
"step": 3590
},
{
"epoch": 1.6309591212773187,
"grad_norm": 0.1451760232448578,
"learning_rate": 0.00018455719834200728,
"loss": 0.1837,
"step": 3600
},
{
"epoch": 1.6354886196353755,
"grad_norm": 0.14908108115196228,
"learning_rate": 0.0001844591246133232,
"loss": 0.1866,
"step": 3610
},
{
"epoch": 1.6400181179934323,
"grad_norm": 0.13437342643737793,
"learning_rate": 0.00018436076667793382,
"loss": 0.1886,
"step": 3620
},
{
"epoch": 1.6445476163514892,
"grad_norm": 0.13465970754623413,
"learning_rate": 0.00018426212486681562,
"loss": 0.183,
"step": 3630
},
{
"epoch": 1.649077114709546,
"grad_norm": 0.13650234043598175,
"learning_rate": 0.00018416319951190032,
"loss": 0.177,
"step": 3640
},
{
"epoch": 1.6536066130676028,
"grad_norm": 0.1663140207529068,
"learning_rate": 0.00018406399094607386,
"loss": 0.187,
"step": 3650
},
{
"epoch": 1.6581361114256596,
"grad_norm": 0.16565509140491486,
"learning_rate": 0.00018396449950317504,
"loss": 0.1837,
"step": 3660
},
{
"epoch": 1.6626656097837165,
"grad_norm": 0.18802668154239655,
"learning_rate": 0.0001838647255179947,
"loss": 0.1814,
"step": 3670
},
{
"epoch": 1.6671951081417733,
"grad_norm": 0.17005442082881927,
"learning_rate": 0.0001837646693262743,
"loss": 0.1871,
"step": 3680
},
{
"epoch": 1.6717246064998301,
"grad_norm": 0.14796973764896393,
"learning_rate": 0.00018366433126470506,
"loss": 0.1781,
"step": 3690
},
{
"epoch": 1.676254104857887,
"grad_norm": 0.1405303180217743,
"learning_rate": 0.0001835637116709266,
"loss": 0.1792,
"step": 3700
},
{
"epoch": 1.6807836032159438,
"grad_norm": 0.1343483328819275,
"learning_rate": 0.00018346281088352592,
"loss": 0.1807,
"step": 3710
},
{
"epoch": 1.6853131015740006,
"grad_norm": 0.14635176956653595,
"learning_rate": 0.00018336162924203632,
"loss": 0.176,
"step": 3720
},
{
"epoch": 1.6898425999320574,
"grad_norm": 0.13452979922294617,
"learning_rate": 0.0001832601670869361,
"loss": 0.1822,
"step": 3730
},
{
"epoch": 1.6943720982901143,
"grad_norm": 0.14736182987689972,
"learning_rate": 0.00018315842475964748,
"loss": 0.1828,
"step": 3740
},
{
"epoch": 1.698901596648171,
"grad_norm": 0.13288873434066772,
"learning_rate": 0.00018305640260253553,
"loss": 0.1749,
"step": 3750
},
{
"epoch": 1.703431095006228,
"grad_norm": 0.146206796169281,
"learning_rate": 0.00018295410095890696,
"loss": 0.191,
"step": 3760
},
{
"epoch": 1.7079605933642847,
"grad_norm": 0.13878855109214783,
"learning_rate": 0.00018285152017300885,
"loss": 0.1827,
"step": 3770
},
{
"epoch": 1.7124900917223418,
"grad_norm": 0.14912264049053192,
"learning_rate": 0.00018275895908433733,
"loss": 0.173,
"step": 3780
},
{
"epoch": 1.7170195900803986,
"grad_norm": 0.14632469415664673,
"learning_rate": 0.0001826558488798913,
"loss": 0.1776,
"step": 3790
},
{
"epoch": 1.7215490884384554,
"grad_norm": 0.14830105006694794,
"learning_rate": 0.0001825524605368002,
"loss": 0.1762,
"step": 3800
},
{
"epoch": 1.7260785867965123,
"grad_norm": 0.15307176113128662,
"learning_rate": 0.00018244879440296793,
"loss": 0.1753,
"step": 3810
},
{
"epoch": 1.730608085154569,
"grad_norm": 0.15168583393096924,
"learning_rate": 0.0001823448508272332,
"loss": 0.1774,
"step": 3820
},
{
"epoch": 1.735137583512626,
"grad_norm": 0.14207693934440613,
"learning_rate": 0.0001822406301593683,
"loss": 0.1765,
"step": 3830
},
{
"epoch": 1.739667081870683,
"grad_norm": 0.15022936463356018,
"learning_rate": 0.0001821361327500779,
"loss": 0.1852,
"step": 3840
},
{
"epoch": 1.7441965802287398,
"grad_norm": 0.14267757534980774,
"learning_rate": 0.00018203135895099797,
"loss": 0.1788,
"step": 3850
},
{
"epoch": 1.7487260785867966,
"grad_norm": 0.13068848848342896,
"learning_rate": 0.00018192630911469454,
"loss": 0.1834,
"step": 3860
},
{
"epoch": 1.7532555769448535,
"grad_norm": 0.13527341187000275,
"learning_rate": 0.00018182098359466244,
"loss": 0.1878,
"step": 3870
},
{
"epoch": 1.7577850753029103,
"grad_norm": 0.14090019464492798,
"learning_rate": 0.00018171538274532428,
"loss": 0.1825,
"step": 3880
},
{
"epoch": 1.762314573660967,
"grad_norm": 0.16419830918312073,
"learning_rate": 0.00018160950692202907,
"loss": 0.1735,
"step": 3890
},
{
"epoch": 1.766844072019024,
"grad_norm": 0.13737310469150543,
"learning_rate": 0.00018150335648105118,
"loss": 0.1798,
"step": 3900
},
{
"epoch": 1.7713735703770808,
"grad_norm": 0.13491977751255035,
"learning_rate": 0.00018139693177958902,
"loss": 0.1814,
"step": 3910
},
{
"epoch": 1.7759030687351376,
"grad_norm": 0.13069839775562286,
"learning_rate": 0.0001812902331757639,
"loss": 0.1795,
"step": 3920
},
{
"epoch": 1.7804325670931944,
"grad_norm": 0.14693836867809296,
"learning_rate": 0.0001811832610286189,
"loss": 0.1798,
"step": 3930
},
{
"epoch": 1.7849620654512512,
"grad_norm": 0.15298062562942505,
"learning_rate": 0.00018107601569811746,
"loss": 0.1717,
"step": 3940
},
{
"epoch": 1.789491563809308,
"grad_norm": 0.1533603072166443,
"learning_rate": 0.0001809684975451423,
"loss": 0.1825,
"step": 3950
},
{
"epoch": 1.794021062167365,
"grad_norm": 0.15522614121437073,
"learning_rate": 0.00018086070693149435,
"loss": 0.1843,
"step": 3960
},
{
"epoch": 1.7985505605254217,
"grad_norm": 0.12531672418117523,
"learning_rate": 0.00018075264421989117,
"loss": 0.1839,
"step": 3970
},
{
"epoch": 1.8030800588834786,
"grad_norm": 0.1647823303937912,
"learning_rate": 0.00018064430977396607,
"loss": 0.1842,
"step": 3980
},
{
"epoch": 1.8076095572415354,
"grad_norm": 0.14417417347431183,
"learning_rate": 0.00018053570395826666,
"loss": 0.17,
"step": 3990
},
{
"epoch": 1.8121390555995922,
"grad_norm": 0.14394541084766388,
"learning_rate": 0.00018042682713825377,
"loss": 0.181,
"step": 4000
},
{
"epoch": 1.816668553957649,
"grad_norm": 0.13082464039325714,
"learning_rate": 0.0001803176796803002,
"loss": 0.1759,
"step": 4010
},
{
"epoch": 1.821198052315706,
"grad_norm": 0.13551370799541473,
"learning_rate": 0.00018020826195168938,
"loss": 0.1737,
"step": 4020
},
{
"epoch": 1.825727550673763,
"grad_norm": 0.16460978984832764,
"learning_rate": 0.00018009857432061424,
"loss": 0.1788,
"step": 4030
},
{
"epoch": 1.8302570490318197,
"grad_norm": 0.1246340349316597,
"learning_rate": 0.00017998861715617595,
"loss": 0.1648,
"step": 4040
},
{
"epoch": 1.8347865473898766,
"grad_norm": 0.14473074674606323,
"learning_rate": 0.00017987839082838264,
"loss": 0.1683,
"step": 4050
},
{
"epoch": 1.8393160457479334,
"grad_norm": 0.13617068529129028,
"learning_rate": 0.00017976789570814812,
"loss": 0.1731,
"step": 4060
},
{
"epoch": 1.8438455441059902,
"grad_norm": 0.1399005949497223,
"learning_rate": 0.00017965713216729084,
"loss": 0.1726,
"step": 4070
},
{
"epoch": 1.8483750424640473,
"grad_norm": 0.15167087316513062,
"learning_rate": 0.00017954610057853242,
"loss": 0.1769,
"step": 4080
},
{
"epoch": 1.852904540822104,
"grad_norm": 0.1486155092716217,
"learning_rate": 0.00017943480131549637,
"loss": 0.1735,
"step": 4090
},
{
"epoch": 1.857434039180161,
"grad_norm": 0.12672476470470428,
"learning_rate": 0.00017932323475270713,
"loss": 0.1692,
"step": 4100
},
{
"epoch": 1.8619635375382178,
"grad_norm": 0.12943005561828613,
"learning_rate": 0.0001792114012655884,
"loss": 0.1736,
"step": 4110
},
{
"epoch": 1.8664930358962746,
"grad_norm": 0.1305234730243683,
"learning_rate": 0.00017909930123046226,
"loss": 0.1693,
"step": 4120
},
{
"epoch": 1.8710225342543314,
"grad_norm": 0.15078797936439514,
"learning_rate": 0.00017898693502454757,
"loss": 0.1714,
"step": 4130
},
{
"epoch": 1.8755520326123882,
"grad_norm": 0.13605743646621704,
"learning_rate": 0.00017887430302595902,
"loss": 0.1742,
"step": 4140
},
{
"epoch": 1.880081530970445,
"grad_norm": 0.15072084963321686,
"learning_rate": 0.0001787614056137056,
"loss": 0.1761,
"step": 4150
},
{
"epoch": 1.8846110293285019,
"grad_norm": 0.12788626551628113,
"learning_rate": 0.0001786482431676894,
"loss": 0.1698,
"step": 4160
},
{
"epoch": 1.8891405276865587,
"grad_norm": 0.11104808747768402,
"learning_rate": 0.00017853481606870447,
"loss": 0.1673,
"step": 4170
},
{
"epoch": 1.8936700260446155,
"grad_norm": 0.15082287788391113,
"learning_rate": 0.00017842112469843526,
"loss": 0.1771,
"step": 4180
},
{
"epoch": 1.8981995244026724,
"grad_norm": 0.13250093162059784,
"learning_rate": 0.00017830716943945566,
"loss": 0.1693,
"step": 4190
},
{
"epoch": 1.9027290227607292,
"grad_norm": 0.1345834881067276,
"learning_rate": 0.00017819295067522746,
"loss": 0.1657,
"step": 4200
},
{
"epoch": 1.907258521118786,
"grad_norm": 0.12472589313983917,
"learning_rate": 0.00017807846879009916,
"loss": 0.1673,
"step": 4210
},
{
"epoch": 1.9117880194768428,
"grad_norm": 0.14480777084827423,
"learning_rate": 0.00017796372416930466,
"loss": 0.1617,
"step": 4220
},
{
"epoch": 1.9163175178348997,
"grad_norm": 0.12188120186328888,
"learning_rate": 0.00017784871719896207,
"loss": 0.1697,
"step": 4230
},
{
"epoch": 1.9208470161929565,
"grad_norm": 0.13561968505382538,
"learning_rate": 0.0001777334482660721,
"loss": 0.1675,
"step": 4240
},
{
"epoch": 1.9253765145510133,
"grad_norm": 0.1565788984298706,
"learning_rate": 0.0001776179177585172,
"loss": 0.1695,
"step": 4250
},
{
"epoch": 1.9299060129090702,
"grad_norm": 0.13274118304252625,
"learning_rate": 0.00017750212606505988,
"loss": 0.173,
"step": 4260
},
{
"epoch": 1.9344355112671272,
"grad_norm": 0.13509687781333923,
"learning_rate": 0.0001773860735753416,
"loss": 0.1711,
"step": 4270
},
{
"epoch": 1.938965009625184,
"grad_norm": 0.14307665824890137,
"learning_rate": 0.0001772697606798814,
"loss": 0.1752,
"step": 4280
},
{
"epoch": 1.9434945079832409,
"grad_norm": 0.14142999053001404,
"learning_rate": 0.0001771531877700746,
"loss": 0.1746,
"step": 4290
},
{
"epoch": 1.9480240063412977,
"grad_norm": 0.13015881180763245,
"learning_rate": 0.0001770363552381914,
"loss": 0.1624,
"step": 4300
},
{
"epoch": 1.9525535046993545,
"grad_norm": 0.15056206285953522,
"learning_rate": 0.00017691926347737573,
"loss": 0.1683,
"step": 4310
},
{
"epoch": 1.9570830030574113,
"grad_norm": 0.1449085772037506,
"learning_rate": 0.00017680191288164382,
"loss": 0.1652,
"step": 4320
},
{
"epoch": 1.9616125014154684,
"grad_norm": 0.13363459706306458,
"learning_rate": 0.00017668430384588278,
"loss": 0.1755,
"step": 4330
},
{
"epoch": 1.9661419997735252,
"grad_norm": 0.11182225495576859,
"learning_rate": 0.00017656643676584955,
"loss": 0.1649,
"step": 4340
},
{
"epoch": 1.970671498131582,
"grad_norm": 0.1344953030347824,
"learning_rate": 0.00017644831203816926,
"loss": 0.1699,
"step": 4350
},
{
"epoch": 1.9752009964896389,
"grad_norm": 0.14654122292995453,
"learning_rate": 0.000176329930060334,
"loss": 0.1646,
"step": 4360
},
{
"epoch": 1.9797304948476957,
"grad_norm": 0.12001664191484451,
"learning_rate": 0.00017621129123070167,
"loss": 0.1732,
"step": 4370
},
{
"epoch": 1.9842599932057525,
"grad_norm": 0.12289103865623474,
"learning_rate": 0.00017609239594849435,
"loss": 0.1665,
"step": 4380
},
{
"epoch": 1.9887894915638094,
"grad_norm": 0.15383568406105042,
"learning_rate": 0.00017597324461379716,
"loss": 0.1668,
"step": 4390
},
{
"epoch": 1.9933189899218662,
"grad_norm": 0.11333877593278885,
"learning_rate": 0.0001758538376275568,
"loss": 0.1699,
"step": 4400
},
{
"epoch": 1.997848488279923,
"grad_norm": 0.13718217611312866,
"learning_rate": 0.00017573417539158017,
"loss": 0.1674,
"step": 4410
},
{
"epoch": 1.9996602876231457,
"eval_loss": 0.17693181335926056,
"eval_runtime": 617.1958,
"eval_samples_per_second": 12.751,
"eval_steps_per_second": 1.594,
"step": 4414
},
{
"epoch": 2.002717699014834,
"grad_norm": 0.12558519840240479,
"learning_rate": 0.0001756142583085333,
"loss": 0.1601,
"step": 4420
},
{
"epoch": 2.007247197372891,
"grad_norm": 0.171942800283432,
"learning_rate": 0.00017549408678193962,
"loss": 0.1325,
"step": 4430
},
{
"epoch": 2.0117766957309478,
"grad_norm": 0.12557823956012726,
"learning_rate": 0.0001753736612161788,
"loss": 0.1337,
"step": 4440
},
{
"epoch": 2.0163061940890046,
"grad_norm": 0.1112385243177414,
"learning_rate": 0.00017525298201648534,
"loss": 0.1353,
"step": 4450
},
{
"epoch": 2.0208356924470614,
"grad_norm": 0.10396666824817657,
"learning_rate": 0.00017513204958894728,
"loss": 0.1344,
"step": 4460
},
{
"epoch": 2.0253651908051182,
"grad_norm": 0.11958423256874084,
"learning_rate": 0.0001750108643405047,
"loss": 0.1325,
"step": 4470
},
{
"epoch": 2.029894689163175,
"grad_norm": 0.13883349299430847,
"learning_rate": 0.00017488942667894856,
"loss": 0.1308,
"step": 4480
},
{
"epoch": 2.034424187521232,
"grad_norm": 0.12778469920158386,
"learning_rate": 0.00017476773701291905,
"loss": 0.1285,
"step": 4490
},
{
"epoch": 2.0389536858792887,
"grad_norm": 0.12921588122844696,
"learning_rate": 0.00017464579575190444,
"loss": 0.1286,
"step": 4500
},
{
"epoch": 2.0434831842373455,
"grad_norm": 0.14378762245178223,
"learning_rate": 0.00017452360330623957,
"loss": 0.1389,
"step": 4510
},
{
"epoch": 2.0480126825954024,
"grad_norm": 0.13812440633773804,
"learning_rate": 0.00017440116008710457,
"loss": 0.1342,
"step": 4520
},
{
"epoch": 2.052542180953459,
"grad_norm": 0.15414589643478394,
"learning_rate": 0.00017427846650652342,
"loss": 0.1381,
"step": 4530
},
{
"epoch": 2.0570716793115165,
"grad_norm": 0.11771693825721741,
"learning_rate": 0.00017415552297736256,
"loss": 0.1344,
"step": 4540
},
{
"epoch": 2.0616011776695733,
"grad_norm": 0.13729040324687958,
"learning_rate": 0.00017403232991332953,
"loss": 0.1323,
"step": 4550
},
{
"epoch": 2.06613067602763,
"grad_norm": 0.11777821183204651,
"learning_rate": 0.00017390888772897148,
"loss": 0.1354,
"step": 4560
},
{
"epoch": 2.070660174385687,
"grad_norm": 0.11759165674448013,
"learning_rate": 0.00017378519683967399,
"loss": 0.1359,
"step": 4570
},
{
"epoch": 2.0751896727437438,
"grad_norm": 0.14665256440639496,
"learning_rate": 0.00017366125766165943,
"loss": 0.1295,
"step": 4580
},
{
"epoch": 2.0797191711018006,
"grad_norm": 0.12388816475868225,
"learning_rate": 0.00017353707061198574,
"loss": 0.1366,
"step": 4590
},
{
"epoch": 2.0842486694598574,
"grad_norm": 0.12518715858459473,
"learning_rate": 0.00017341263610854487,
"loss": 0.1372,
"step": 4600
},
{
"epoch": 2.0887781678179143,
"grad_norm": 0.1429567039012909,
"learning_rate": 0.00017328795457006153,
"loss": 0.1326,
"step": 4610
},
{
"epoch": 2.093307666175971,
"grad_norm": 0.11989770084619522,
"learning_rate": 0.00017316302641609167,
"loss": 0.134,
"step": 4620
},
{
"epoch": 2.097837164534028,
"grad_norm": 0.11995401233434677,
"learning_rate": 0.00017303785206702115,
"loss": 0.136,
"step": 4630
},
{
"epoch": 2.1023666628920847,
"grad_norm": 0.11321832239627838,
"learning_rate": 0.0001729124319440642,
"loss": 0.1371,
"step": 4640
},
{
"epoch": 2.1068961612501416,
"grad_norm": 0.11317916214466095,
"learning_rate": 0.00017278676646926219,
"loss": 0.1303,
"step": 4650
},
{
"epoch": 2.1114256596081984,
"grad_norm": 0.11971450597047806,
"learning_rate": 0.00017266085606548197,
"loss": 0.1363,
"step": 4660
},
{
"epoch": 2.115955157966255,
"grad_norm": 0.12779143452644348,
"learning_rate": 0.00017253470115641473,
"loss": 0.1395,
"step": 4670
},
{
"epoch": 2.120484656324312,
"grad_norm": 0.12094374746084213,
"learning_rate": 0.00017240830216657432,
"loss": 0.1337,
"step": 4680
},
{
"epoch": 2.125014154682369,
"grad_norm": 0.11902227252721786,
"learning_rate": 0.00017228165952129601,
"loss": 0.1342,
"step": 4690
},
{
"epoch": 2.1295436530404257,
"grad_norm": 0.12663759291172028,
"learning_rate": 0.00017215477364673486,
"loss": 0.1356,
"step": 4700
},
{
"epoch": 2.1340731513984825,
"grad_norm": 0.12311159074306488,
"learning_rate": 0.0001720276449698645,
"loss": 0.1364,
"step": 4710
},
{
"epoch": 2.1386026497565394,
"grad_norm": 0.134132981300354,
"learning_rate": 0.00017190027391847555,
"loss": 0.1352,
"step": 4720
},
{
"epoch": 2.143132148114596,
"grad_norm": 0.1177242249250412,
"learning_rate": 0.00017177266092117428,
"loss": 0.132,
"step": 4730
},
{
"epoch": 2.147661646472653,
"grad_norm": 0.11641071736812592,
"learning_rate": 0.00017164480640738101,
"loss": 0.1359,
"step": 4740
},
{
"epoch": 2.15219114483071,
"grad_norm": 0.1303935945034027,
"learning_rate": 0.00017151671080732888,
"loss": 0.1354,
"step": 4750
},
{
"epoch": 2.1567206431887667,
"grad_norm": 0.13929632306098938,
"learning_rate": 0.0001713883745520622,
"loss": 0.1303,
"step": 4760
},
{
"epoch": 2.1612501415468235,
"grad_norm": 0.13775485754013062,
"learning_rate": 0.00017125979807343519,
"loss": 0.1379,
"step": 4770
},
{
"epoch": 2.1657796399048808,
"grad_norm": 0.10667065531015396,
"learning_rate": 0.00017113098180411026,
"loss": 0.1323,
"step": 4780
},
{
"epoch": 2.1703091382629376,
"grad_norm": 0.12592215836048126,
"learning_rate": 0.00017100192617755693,
"loss": 0.1326,
"step": 4790
},
{
"epoch": 2.1748386366209944,
"grad_norm": 0.12523461878299713,
"learning_rate": 0.00017087263162805,
"loss": 0.1361,
"step": 4800
},
{
"epoch": 2.1793681349790512,
"grad_norm": 0.13614587485790253,
"learning_rate": 0.00017074309859066837,
"loss": 0.136,
"step": 4810
},
{
"epoch": 2.183897633337108,
"grad_norm": 0.13419945538043976,
"learning_rate": 0.00017061332750129332,
"loss": 0.1299,
"step": 4820
},
{
"epoch": 2.188427131695165,
"grad_norm": 0.10393204540014267,
"learning_rate": 0.00017048331879660733,
"loss": 0.1334,
"step": 4830
},
{
"epoch": 2.1929566300532217,
"grad_norm": 0.12654437124729156,
"learning_rate": 0.00017035307291409234,
"loss": 0.138,
"step": 4840
},
{
"epoch": 2.1974861284112785,
"grad_norm": 0.12029164284467697,
"learning_rate": 0.00017022259029202843,
"loss": 0.1329,
"step": 4850
},
{
"epoch": 2.2020156267693354,
"grad_norm": 0.1427529752254486,
"learning_rate": 0.00017009187136949238,
"loss": 0.1314,
"step": 4860
},
{
"epoch": 2.206545125127392,
"grad_norm": 0.10956190526485443,
"learning_rate": 0.00016996091658635603,
"loss": 0.1324,
"step": 4870
},
{
"epoch": 2.211074623485449,
"grad_norm": 0.12758436799049377,
"learning_rate": 0.00016982972638328496,
"loss": 0.1326,
"step": 4880
},
{
"epoch": 2.215604121843506,
"grad_norm": 0.10729292035102844,
"learning_rate": 0.00016969830120173692,
"loss": 0.1317,
"step": 4890
},
{
"epoch": 2.2201336202015627,
"grad_norm": 0.14230488240718842,
"learning_rate": 0.0001695666414839604,
"loss": 0.1387,
"step": 4900
},
{
"epoch": 2.2246631185596195,
"grad_norm": 0.13682898879051208,
"learning_rate": 0.00016943474767299298,
"loss": 0.1341,
"step": 4910
},
{
"epoch": 2.2291926169176763,
"grad_norm": 0.14022116363048553,
"learning_rate": 0.0001693026202126602,
"loss": 0.1345,
"step": 4920
},
{
"epoch": 2.233722115275733,
"grad_norm": 0.12787717580795288,
"learning_rate": 0.00016917025954757365,
"loss": 0.138,
"step": 4930
},
{
"epoch": 2.23825161363379,
"grad_norm": 0.12592186033725739,
"learning_rate": 0.00016903766612312967,
"loss": 0.135,
"step": 4940
},
{
"epoch": 2.242781111991847,
"grad_norm": 0.12485472112894058,
"learning_rate": 0.00016890484038550792,
"loss": 0.1305,
"step": 4950
},
{
"epoch": 2.2473106103499036,
"grad_norm": 0.12487582862377167,
"learning_rate": 0.0001687717827816698,
"loss": 0.1352,
"step": 4960
},
{
"epoch": 2.2518401087079605,
"grad_norm": 0.1367800235748291,
"learning_rate": 0.0001686384937593568,
"loss": 0.1377,
"step": 4970
},
{
"epoch": 2.2563696070660173,
"grad_norm": 0.12008614093065262,
"learning_rate": 0.00016850497376708935,
"loss": 0.1399,
"step": 4980
},
{
"epoch": 2.260899105424074,
"grad_norm": 0.1453281044960022,
"learning_rate": 0.00016837122325416494,
"loss": 0.134,
"step": 4990
},
{
"epoch": 2.265428603782131,
"grad_norm": 0.1182338148355484,
"learning_rate": 0.00016823724267065683,
"loss": 0.1386,
"step": 5000
},
{
"epoch": 2.269958102140188,
"grad_norm": 0.1372307538986206,
"learning_rate": 0.00016810303246741245,
"loss": 0.1336,
"step": 5010
},
{
"epoch": 2.2744876004982446,
"grad_norm": 0.1213153526186943,
"learning_rate": 0.00016796859309605195,
"loss": 0.1345,
"step": 5020
},
{
"epoch": 2.2790170988563014,
"grad_norm": 0.12057512998580933,
"learning_rate": 0.00016783392500896652,
"loss": 0.1324,
"step": 5030
},
{
"epoch": 2.2835465972143583,
"grad_norm": 0.13681593537330627,
"learning_rate": 0.00016769902865931718,
"loss": 0.1377,
"step": 5040
},
{
"epoch": 2.2880760955724155,
"grad_norm": 0.12073809653520584,
"learning_rate": 0.00016756390450103285,
"loss": 0.1358,
"step": 5050
},
{
"epoch": 2.2926055939304724,
"grad_norm": 0.1260959357023239,
"learning_rate": 0.00016742855298880916,
"loss": 0.1327,
"step": 5060
},
{
"epoch": 2.297135092288529,
"grad_norm": 0.12705475091934204,
"learning_rate": 0.0001672929745781068,
"loss": 0.1326,
"step": 5070
},
{
"epoch": 2.301664590646586,
"grad_norm": 0.12451212108135223,
"learning_rate": 0.00016715716972514984,
"loss": 0.1357,
"step": 5080
},
{
"epoch": 2.306194089004643,
"grad_norm": 0.10446886718273163,
"learning_rate": 0.00016702113888692448,
"loss": 0.1346,
"step": 5090
},
{
"epoch": 2.3107235873626997,
"grad_norm": 0.1240820363163948,
"learning_rate": 0.0001668848825211773,
"loss": 0.1376,
"step": 5100
},
{
"epoch": 2.3152530857207565,
"grad_norm": 0.11466921865940094,
"learning_rate": 0.00016674840108641382,
"loss": 0.1347,
"step": 5110
},
{
"epoch": 2.3197825840788133,
"grad_norm": 0.12086183577775955,
"learning_rate": 0.00016661169504189686,
"loss": 0.1392,
"step": 5120
},
{
"epoch": 2.32431208243687,
"grad_norm": 0.12020442634820938,
"learning_rate": 0.0001664747648476451,
"loss": 0.1326,
"step": 5130
},
{
"epoch": 2.328841580794927,
"grad_norm": 0.1300458312034607,
"learning_rate": 0.0001663376109644315,
"loss": 0.1382,
"step": 5140
},
{
"epoch": 2.333371079152984,
"grad_norm": 0.11588041484355927,
"learning_rate": 0.00016620023385378172,
"loss": 0.1348,
"step": 5150
},
{
"epoch": 2.3379005775110406,
"grad_norm": 0.11398044973611832,
"learning_rate": 0.0001660626339779726,
"loss": 0.1335,
"step": 5160
},
{
"epoch": 2.3424300758690975,
"grad_norm": 0.10993365198373795,
"learning_rate": 0.0001659248118000305,
"loss": 0.1314,
"step": 5170
},
{
"epoch": 2.3469595742271543,
"grad_norm": 0.11220837384462357,
"learning_rate": 0.00016578676778373,
"loss": 0.1376,
"step": 5180
},
{
"epoch": 2.351489072585211,
"grad_norm": 0.12188950926065445,
"learning_rate": 0.000165648502393592,
"loss": 0.1371,
"step": 5190
},
{
"epoch": 2.356018570943268,
"grad_norm": 0.11867307126522064,
"learning_rate": 0.00016551001609488246,
"loss": 0.1335,
"step": 5200
},
{
"epoch": 2.3605480693013248,
"grad_norm": 0.14046625792980194,
"learning_rate": 0.00016537130935361064,
"loss": 0.1392,
"step": 5210
},
{
"epoch": 2.3650775676593816,
"grad_norm": 0.11454641073942184,
"learning_rate": 0.00016523238263652757,
"loss": 0.139,
"step": 5220
},
{
"epoch": 2.3696070660174384,
"grad_norm": 0.1256382018327713,
"learning_rate": 0.00016509323641112456,
"loss": 0.1366,
"step": 5230
},
{
"epoch": 2.3741365643754953,
"grad_norm": 0.11187759041786194,
"learning_rate": 0.00016495387114563153,
"loss": 0.1338,
"step": 5240
},
{
"epoch": 2.378666062733552,
"grad_norm": 0.14559686183929443,
"learning_rate": 0.0001648142873090155,
"loss": 0.136,
"step": 5250
},
{
"epoch": 2.3831955610916093,
"grad_norm": 0.12695267796516418,
"learning_rate": 0.00016467448537097894,
"loss": 0.1365,
"step": 5260
},
{
"epoch": 2.387725059449666,
"grad_norm": 0.1341744363307953,
"learning_rate": 0.0001645344658019583,
"loss": 0.1354,
"step": 5270
},
{
"epoch": 2.392254557807723,
"grad_norm": 0.12615807354450226,
"learning_rate": 0.0001643942290731223,
"loss": 0.1317,
"step": 5280
},
{
"epoch": 2.39678405616578,
"grad_norm": 0.1132565289735794,
"learning_rate": 0.00016425377565637054,
"loss": 0.1322,
"step": 5290
},
{
"epoch": 2.4013135545238367,
"grad_norm": 0.11671450734138489,
"learning_rate": 0.00016411310602433156,
"loss": 0.1296,
"step": 5300
},
{
"epoch": 2.4058430528818935,
"grad_norm": 0.1351209580898285,
"learning_rate": 0.00016397222065036164,
"loss": 0.1304,
"step": 5310
},
{
"epoch": 2.4103725512399503,
"grad_norm": 0.1276492178440094,
"learning_rate": 0.000163831120008543,
"loss": 0.1361,
"step": 5320
},
{
"epoch": 2.414902049598007,
"grad_norm": 0.13524995744228363,
"learning_rate": 0.00016368980457368216,
"loss": 0.133,
"step": 5330
},
{
"epoch": 2.419431547956064,
"grad_norm": 0.1324642449617386,
"learning_rate": 0.00016354827482130855,
"loss": 0.1373,
"step": 5340
},
{
"epoch": 2.423961046314121,
"grad_norm": 0.13200613856315613,
"learning_rate": 0.0001634065312276727,
"loss": 0.1367,
"step": 5350
},
{
"epoch": 2.4284905446721776,
"grad_norm": 0.12052213400602341,
"learning_rate": 0.00016326457426974475,
"loss": 0.1335,
"step": 5360
},
{
"epoch": 2.4330200430302344,
"grad_norm": 0.1289413571357727,
"learning_rate": 0.00016312240442521278,
"loss": 0.1358,
"step": 5370
},
{
"epoch": 2.4375495413882913,
"grad_norm": 0.11921897530555725,
"learning_rate": 0.00016298002217248131,
"loss": 0.1322,
"step": 5380
},
{
"epoch": 2.442079039746348,
"grad_norm": 0.14872752130031586,
"learning_rate": 0.00016283742799066953,
"loss": 0.1385,
"step": 5390
},
{
"epoch": 2.446608538104405,
"grad_norm": 0.11772260814905167,
"learning_rate": 0.00016269462235960985,
"loss": 0.1336,
"step": 5400
},
{
"epoch": 2.4511380364624618,
"grad_norm": 0.13925409317016602,
"learning_rate": 0.00016255160575984616,
"loss": 0.137,
"step": 5410
},
{
"epoch": 2.4556675348205186,
"grad_norm": 0.1357075273990631,
"learning_rate": 0.00016240837867263227,
"loss": 0.1349,
"step": 5420
},
{
"epoch": 2.4601970331785754,
"grad_norm": 0.1274648904800415,
"learning_rate": 0.00016226494157993036,
"loss": 0.1307,
"step": 5430
},
{
"epoch": 2.4647265315366322,
"grad_norm": 0.1424674391746521,
"learning_rate": 0.00016212129496440914,
"loss": 0.1359,
"step": 5440
},
{
"epoch": 2.469256029894689,
"grad_norm": 0.1157744899392128,
"learning_rate": 0.00016197743930944247,
"loss": 0.1371,
"step": 5450
},
{
"epoch": 2.473785528252746,
"grad_norm": 0.1353282928466797,
"learning_rate": 0.00016183337509910762,
"loss": 0.1399,
"step": 5460
},
{
"epoch": 2.4783150266108027,
"grad_norm": 0.11779867857694626,
"learning_rate": 0.00016168910281818367,
"loss": 0.1348,
"step": 5470
},
{
"epoch": 2.4828445249688595,
"grad_norm": 0.11190491169691086,
"learning_rate": 0.00016154462295214984,
"loss": 0.1341,
"step": 5480
},
{
"epoch": 2.4873740233269164,
"grad_norm": 0.1286158561706543,
"learning_rate": 0.0001613999359871838,
"loss": 0.1323,
"step": 5490
},
{
"epoch": 2.491903521684973,
"grad_norm": 0.12542322278022766,
"learning_rate": 0.0001612550424101603,
"loss": 0.1365,
"step": 5500
},
{
"epoch": 2.49643302004303,
"grad_norm": 0.12170036882162094,
"learning_rate": 0.00016110994270864912,
"loss": 0.1344,
"step": 5510
},
{
"epoch": 2.500962518401087,
"grad_norm": 0.13724590837955475,
"learning_rate": 0.00016096463737091382,
"loss": 0.1325,
"step": 5520
},
{
"epoch": 2.5054920167591437,
"grad_norm": 0.11381508409976959,
"learning_rate": 0.00016081912688590988,
"loss": 0.1339,
"step": 5530
},
{
"epoch": 2.5100215151172005,
"grad_norm": 0.12289192527532578,
"learning_rate": 0.00016067341174328306,
"loss": 0.1302,
"step": 5540
},
{
"epoch": 2.514551013475258,
"grad_norm": 0.12465256452560425,
"learning_rate": 0.00016052749243336786,
"loss": 0.1354,
"step": 5550
},
{
"epoch": 2.5190805118333146,
"grad_norm": 0.12437895685434341,
"learning_rate": 0.0001603813694471858,
"loss": 0.1321,
"step": 5560
},
{
"epoch": 2.5236100101913714,
"grad_norm": 0.12177952378988266,
"learning_rate": 0.00016023504327644376,
"loss": 0.1387,
"step": 5570
},
{
"epoch": 2.5281395085494283,
"grad_norm": 0.12667645514011383,
"learning_rate": 0.00016008851441353232,
"loss": 0.1383,
"step": 5580
},
{
"epoch": 2.532669006907485,
"grad_norm": 0.13816499710083008,
"learning_rate": 0.00015994178335152412,
"loss": 0.1419,
"step": 5590
},
{
"epoch": 2.537198505265542,
"grad_norm": 0.13884486258029938,
"learning_rate": 0.00015979485058417226,
"loss": 0.1345,
"step": 5600
},
{
"epoch": 2.5417280036235987,
"grad_norm": 0.13231264054775238,
"learning_rate": 0.0001596477166059085,
"loss": 0.1386,
"step": 5610
},
{
"epoch": 2.5462575019816556,
"grad_norm": 0.10923223942518234,
"learning_rate": 0.00015950038191184178,
"loss": 0.1382,
"step": 5620
},
{
"epoch": 2.5507870003397124,
"grad_norm": 0.1239657923579216,
"learning_rate": 0.00015935284699775638,
"loss": 0.1345,
"step": 5630
},
{
"epoch": 2.5553164986977692,
"grad_norm": 0.11910531669855118,
"learning_rate": 0.00015920511236011038,
"loss": 0.1321,
"step": 5640
},
{
"epoch": 2.559845997055826,
"grad_norm": 0.1176079511642456,
"learning_rate": 0.00015905717849603384,
"loss": 0.1379,
"step": 5650
},
{
"epoch": 2.564375495413883,
"grad_norm": 0.10820971429347992,
"learning_rate": 0.0001589090459033273,
"loss": 0.1353,
"step": 5660
},
{
"epoch": 2.5689049937719397,
"grad_norm": 0.11455655097961426,
"learning_rate": 0.00015876071508046002,
"loss": 0.1375,
"step": 5670
},
{
"epoch": 2.5734344921299965,
"grad_norm": 0.13477309048175812,
"learning_rate": 0.00015861218652656826,
"loss": 0.1345,
"step": 5680
},
{
"epoch": 2.5779639904880534,
"grad_norm": 0.1447640061378479,
"learning_rate": 0.00015846346074145374,
"loss": 0.1398,
"step": 5690
},
{
"epoch": 2.58249348884611,
"grad_norm": 0.11953482776880264,
"learning_rate": 0.00015831453822558178,
"loss": 0.1323,
"step": 5700
},
{
"epoch": 2.587022987204167,
"grad_norm": 0.11846103519201279,
"learning_rate": 0.00015816541948007967,
"loss": 0.1359,
"step": 5710
},
{
"epoch": 2.591552485562224,
"grad_norm": 0.1382216066122055,
"learning_rate": 0.00015801610500673524,
"loss": 0.1406,
"step": 5720
},
{
"epoch": 2.5960819839202807,
"grad_norm": 0.12505120038986206,
"learning_rate": 0.0001578665953079946,
"loss": 0.1315,
"step": 5730
},
{
"epoch": 2.600611482278338,
"grad_norm": 0.13036322593688965,
"learning_rate": 0.00015771689088696112,
"loss": 0.1322,
"step": 5740
},
{
"epoch": 2.6051409806363948,
"grad_norm": 0.10827736556529999,
"learning_rate": 0.00015756699224739323,
"loss": 0.1346,
"step": 5750
},
{
"epoch": 2.6096704789944516,
"grad_norm": 0.12595966458320618,
"learning_rate": 0.00015741689989370294,
"loss": 0.1318,
"step": 5760
},
{
"epoch": 2.6141999773525084,
"grad_norm": 0.12824150919914246,
"learning_rate": 0.0001572666143309542,
"loss": 0.1287,
"step": 5770
},
{
"epoch": 2.6187294757105652,
"grad_norm": 0.12415400892496109,
"learning_rate": 0.00015711613606486096,
"loss": 0.1329,
"step": 5780
},
{
"epoch": 2.623258974068622,
"grad_norm": 0.1439315378665924,
"learning_rate": 0.0001569654656017858,
"loss": 0.1307,
"step": 5790
},
{
"epoch": 2.627788472426679,
"grad_norm": 0.11085296422243118,
"learning_rate": 0.00015681460344873786,
"loss": 0.1343,
"step": 5800
},
{
"epoch": 2.6323179707847357,
"grad_norm": 0.12394888699054718,
"learning_rate": 0.00015666355011337147,
"loss": 0.132,
"step": 5810
},
{
"epoch": 2.6368474691427926,
"grad_norm": 0.1326746642589569,
"learning_rate": 0.0001565123061039842,
"loss": 0.1354,
"step": 5820
},
{
"epoch": 2.6413769675008494,
"grad_norm": 0.11657778173685074,
"learning_rate": 0.00015636087192951527,
"loss": 0.1354,
"step": 5830
},
{
"epoch": 2.645906465858906,
"grad_norm": 0.12350430339574814,
"learning_rate": 0.0001562092480995439,
"loss": 0.137,
"step": 5840
},
{
"epoch": 2.650435964216963,
"grad_norm": 0.1291380524635315,
"learning_rate": 0.0001560574351242873,
"loss": 0.1332,
"step": 5850
},
{
"epoch": 2.65496546257502,
"grad_norm": 0.13578584790229797,
"learning_rate": 0.00015590543351459937,
"loss": 0.1338,
"step": 5860
},
{
"epoch": 2.6594949609330767,
"grad_norm": 0.11825544387102127,
"learning_rate": 0.00015575324378196866,
"loss": 0.1304,
"step": 5870
},
{
"epoch": 2.6640244592911335,
"grad_norm": 0.11767857521772385,
"learning_rate": 0.00015560086643851676,
"loss": 0.1346,
"step": 5880
},
{
"epoch": 2.6685539576491903,
"grad_norm": 0.12600229680538177,
"learning_rate": 0.00015544830199699662,
"loss": 0.1335,
"step": 5890
},
{
"epoch": 2.673083456007247,
"grad_norm": 0.11990875750780106,
"learning_rate": 0.00015529555097079065,
"loss": 0.1341,
"step": 5900
},
{
"epoch": 2.677612954365304,
"grad_norm": 0.10967559367418289,
"learning_rate": 0.00015514261387390935,
"loss": 0.1305,
"step": 5910
},
{
"epoch": 2.682142452723361,
"grad_norm": 0.1208115667104721,
"learning_rate": 0.00015498949122098914,
"loss": 0.1329,
"step": 5920
},
{
"epoch": 2.6866719510814177,
"grad_norm": 0.12302912771701813,
"learning_rate": 0.00015483618352729093,
"loss": 0.141,
"step": 5930
},
{
"epoch": 2.6912014494394745,
"grad_norm": 0.14282426238059998,
"learning_rate": 0.00015468269130869834,
"loss": 0.1312,
"step": 5940
},
{
"epoch": 2.6957309477975313,
"grad_norm": 0.1203923374414444,
"learning_rate": 0.0001545290150817158,
"loss": 0.1327,
"step": 5950
},
{
"epoch": 2.700260446155588,
"grad_norm": 0.141504168510437,
"learning_rate": 0.00015437515536346704,
"loss": 0.1307,
"step": 5960
},
{
"epoch": 2.704789944513645,
"grad_norm": 0.12170039117336273,
"learning_rate": 0.00015422111267169322,
"loss": 0.139,
"step": 5970
},
{
"epoch": 2.709319442871702,
"grad_norm": 0.13064149022102356,
"learning_rate": 0.0001540668875247511,
"loss": 0.1358,
"step": 5980
},
{
"epoch": 2.7138489412297586,
"grad_norm": 0.11947247385978699,
"learning_rate": 0.00015391248044161162,
"loss": 0.1301,
"step": 5990
},
{
"epoch": 2.7183784395878154,
"grad_norm": 0.10719356685876846,
"learning_rate": 0.00015375789194185772,
"loss": 0.1296,
"step": 6000
},
{
"epoch": 2.7229079379458723,
"grad_norm": 0.11288373172283173,
"learning_rate": 0.00015360312254568295,
"loss": 0.1336,
"step": 6010
},
{
"epoch": 2.727437436303929,
"grad_norm": 0.12122143059968948,
"learning_rate": 0.00015344817277388955,
"loss": 0.1293,
"step": 6020
},
{
"epoch": 2.731966934661986,
"grad_norm": 0.11723847687244415,
"learning_rate": 0.0001532930431478867,
"loss": 0.133,
"step": 6030
},
{
"epoch": 2.736496433020043,
"grad_norm": 0.11670687049627304,
"learning_rate": 0.00015313773418968878,
"loss": 0.127,
"step": 6040
},
{
"epoch": 2.7410259313781,
"grad_norm": 0.13267673552036285,
"learning_rate": 0.00015298224642191368,
"loss": 0.1287,
"step": 6050
},
{
"epoch": 2.745555429736157,
"grad_norm": 0.12557269632816315,
"learning_rate": 0.00015282658036778094,
"loss": 0.1371,
"step": 6060
},
{
"epoch": 2.7500849280942137,
"grad_norm": 0.12416243553161621,
"learning_rate": 0.0001526707365511101,
"loss": 0.1339,
"step": 6070
},
{
"epoch": 2.7546144264522705,
"grad_norm": 0.13237670063972473,
"learning_rate": 0.00015251471549631882,
"loss": 0.1307,
"step": 6080
},
{
"epoch": 2.7591439248103273,
"grad_norm": 0.10942938178777695,
"learning_rate": 0.00015235851772842115,
"loss": 0.1325,
"step": 6090
},
{
"epoch": 2.763673423168384,
"grad_norm": 0.12319351732730865,
"learning_rate": 0.00015220214377302586,
"loss": 0.1346,
"step": 6100
},
{
"epoch": 2.768202921526441,
"grad_norm": 0.11745291203260422,
"learning_rate": 0.00015204559415633452,
"loss": 0.1358,
"step": 6110
},
{
"epoch": 2.772732419884498,
"grad_norm": 0.12627694010734558,
"learning_rate": 0.00015188886940513987,
"loss": 0.1314,
"step": 6120
},
{
"epoch": 2.7772619182425546,
"grad_norm": 0.12790648639202118,
"learning_rate": 0.0001517319700468239,
"loss": 0.1314,
"step": 6130
},
{
"epoch": 2.7817914166006115,
"grad_norm": 0.12807555496692657,
"learning_rate": 0.00015157489660935625,
"loss": 0.1368,
"step": 6140
},
{
"epoch": 2.7863209149586683,
"grad_norm": 0.114469513297081,
"learning_rate": 0.00015141764962129227,
"loss": 0.1364,
"step": 6150
},
{
"epoch": 2.790850413316725,
"grad_norm": 0.12749959528446198,
"learning_rate": 0.00015126022961177134,
"loss": 0.133,
"step": 6160
},
{
"epoch": 2.795379911674782,
"grad_norm": 0.12623634934425354,
"learning_rate": 0.00015110263711051505,
"loss": 0.1341,
"step": 6170
},
{
"epoch": 2.7999094100328388,
"grad_norm": 0.10407795011997223,
"learning_rate": 0.00015094487264782544,
"loss": 0.1373,
"step": 6180
},
{
"epoch": 2.8044389083908956,
"grad_norm": 0.11660348623991013,
"learning_rate": 0.0001507869367545832,
"loss": 0.1336,
"step": 6190
},
{
"epoch": 2.8089684067489524,
"grad_norm": 0.13876129686832428,
"learning_rate": 0.00015062882996224586,
"loss": 0.1282,
"step": 6200
},
{
"epoch": 2.8134979051070093,
"grad_norm": 0.12573808431625366,
"learning_rate": 0.0001504705528028461,
"loss": 0.1345,
"step": 6210
},
{
"epoch": 2.818027403465066,
"grad_norm": 0.12007986009120941,
"learning_rate": 0.0001503121058089898,
"loss": 0.1342,
"step": 6220
},
{
"epoch": 2.8225569018231234,
"grad_norm": 0.10775137692689896,
"learning_rate": 0.00015015348951385443,
"loss": 0.1352,
"step": 6230
},
{
"epoch": 2.82708640018118,
"grad_norm": 0.10959987342357635,
"learning_rate": 0.00014999470445118705,
"loss": 0.1299,
"step": 6240
},
{
"epoch": 2.831615898539237,
"grad_norm": 0.11662711948156357,
"learning_rate": 0.00014983575115530272,
"loss": 0.136,
"step": 6250
},
{
"epoch": 2.836145396897294,
"grad_norm": 0.11882171779870987,
"learning_rate": 0.00014967663016108258,
"loss": 0.1336,
"step": 6260
},
{
"epoch": 2.8406748952553507,
"grad_norm": 0.12361105531454086,
"learning_rate": 0.00014951734200397204,
"loss": 0.1363,
"step": 6270
},
{
"epoch": 2.8452043936134075,
"grad_norm": 0.11306975781917572,
"learning_rate": 0.0001493578872199791,
"loss": 0.1315,
"step": 6280
},
{
"epoch": 2.8497338919714643,
"grad_norm": 0.10558556020259857,
"learning_rate": 0.0001491982663456724,
"loss": 0.1293,
"step": 6290
},
{
"epoch": 2.854263390329521,
"grad_norm": 0.11685465276241302,
"learning_rate": 0.00014903847991817946,
"loss": 0.1309,
"step": 6300
},
{
"epoch": 2.858792888687578,
"grad_norm": 0.10772823542356491,
"learning_rate": 0.00014887852847518497,
"loss": 0.1306,
"step": 6310
},
{
"epoch": 2.863322387045635,
"grad_norm": 0.13630211353302002,
"learning_rate": 0.0001487184125549288,
"loss": 0.1301,
"step": 6320
},
{
"epoch": 2.8678518854036916,
"grad_norm": 0.11658801138401031,
"learning_rate": 0.0001485581326962044,
"loss": 0.1301,
"step": 6330
},
{
"epoch": 2.8723813837617485,
"grad_norm": 0.14447173476219177,
"learning_rate": 0.00014839768943835676,
"loss": 0.1364,
"step": 6340
},
{
"epoch": 2.8769108821198053,
"grad_norm": 0.10343156009912491,
"learning_rate": 0.00014823708332128077,
"loss": 0.1305,
"step": 6350
},
{
"epoch": 2.881440380477862,
"grad_norm": 0.14246292412281036,
"learning_rate": 0.00014807631488541938,
"loss": 0.1322,
"step": 6360
},
{
"epoch": 2.885969878835919,
"grad_norm": 0.13046808540821075,
"learning_rate": 0.00014791538467176174,
"loss": 0.1327,
"step": 6370
},
{
"epoch": 2.8904993771939758,
"grad_norm": 0.1174997016787529,
"learning_rate": 0.00014775429322184128,
"loss": 0.1319,
"step": 6380
},
{
"epoch": 2.8950288755520326,
"grad_norm": 0.11900872737169266,
"learning_rate": 0.0001475930410777341,
"loss": 0.1346,
"step": 6390
},
{
"epoch": 2.8995583739100894,
"grad_norm": 0.10685596615076065,
"learning_rate": 0.000147431628782057,
"loss": 0.1309,
"step": 6400
},
{
"epoch": 2.9040878722681462,
"grad_norm": 0.1201610341668129,
"learning_rate": 0.00014727005687796573,
"loss": 0.1334,
"step": 6410
},
{
"epoch": 2.908617370626203,
"grad_norm": 0.1042858362197876,
"learning_rate": 0.00014710832590915306,
"loss": 0.1305,
"step": 6420
},
{
"epoch": 2.91314686898426,
"grad_norm": 0.11404233425855637,
"learning_rate": 0.00014694643641984708,
"loss": 0.1264,
"step": 6430
},
{
"epoch": 2.9176763673423167,
"grad_norm": 0.09692881256341934,
"learning_rate": 0.0001467843889548093,
"loss": 0.1356,
"step": 6440
},
{
"epoch": 2.9222058657003736,
"grad_norm": 0.11369141191244125,
"learning_rate": 0.0001466221840593327,
"loss": 0.1281,
"step": 6450
},
{
"epoch": 2.9267353640584304,
"grad_norm": 0.12543022632598877,
"learning_rate": 0.0001464598222792402,
"loss": 0.1344,
"step": 6460
},
{
"epoch": 2.931264862416487,
"grad_norm": 0.09960107505321503,
"learning_rate": 0.00014629730416088256,
"loss": 0.1347,
"step": 6470
},
{
"epoch": 2.935794360774544,
"grad_norm": 0.11416647583246231,
"learning_rate": 0.00014613463025113662,
"loss": 0.128,
"step": 6480
},
{
"epoch": 2.940323859132601,
"grad_norm": 0.13363508880138397,
"learning_rate": 0.0001459718010974034,
"loss": 0.1362,
"step": 6490
},
{
"epoch": 2.9448533574906577,
"grad_norm": 0.12580367922782898,
"learning_rate": 0.00014580881724760638,
"loss": 0.1331,
"step": 6500
},
{
"epoch": 2.9493828558487145,
"grad_norm": 0.1310282200574875,
"learning_rate": 0.00014564567925018967,
"loss": 0.137,
"step": 6510
},
{
"epoch": 2.9539123542067713,
"grad_norm": 0.12097878754138947,
"learning_rate": 0.000145482387654116,
"loss": 0.1327,
"step": 6520
},
{
"epoch": 2.9584418525648286,
"grad_norm": 0.11536047607660294,
"learning_rate": 0.0001453189430088649,
"loss": 0.1383,
"step": 6530
},
{
"epoch": 2.9629713509228854,
"grad_norm": 0.11799097061157227,
"learning_rate": 0.00014515534586443104,
"loss": 0.1365,
"step": 6540
},
{
"epoch": 2.9675008492809423,
"grad_norm": 0.10550688207149506,
"learning_rate": 0.00014499159677132219,
"loss": 0.1304,
"step": 6550
},
{
"epoch": 2.972030347638999,
"grad_norm": 0.13376198709011078,
"learning_rate": 0.00014482769628055748,
"loss": 0.1317,
"step": 6560
},
{
"epoch": 2.976559845997056,
"grad_norm": 0.1147933304309845,
"learning_rate": 0.0001446636449436654,
"loss": 0.1317,
"step": 6570
},
{
"epoch": 2.9810893443551127,
"grad_norm": 0.12273435294628143,
"learning_rate": 0.00014449944331268216,
"loss": 0.1302,
"step": 6580
},
{
"epoch": 2.9856188427131696,
"grad_norm": 0.12308023869991302,
"learning_rate": 0.00014433509194014963,
"loss": 0.1284,
"step": 6590
},
{
"epoch": 2.9901483410712264,
"grad_norm": 0.11716390401124954,
"learning_rate": 0.00014417059137911356,
"loss": 0.1286,
"step": 6600
},
{
"epoch": 2.9946778394292832,
"grad_norm": 0.1330905556678772,
"learning_rate": 0.00014400594218312178,
"loss": 0.1321,
"step": 6610
},
{
"epoch": 2.99920733778734,
"grad_norm": 0.12336422502994537,
"learning_rate": 0.00014384114490622221,
"loss": 0.1327,
"step": 6620
},
{
"epoch": 2.9996602876231457,
"eval_loss": 0.16021211445331573,
"eval_runtime": 617.3452,
"eval_samples_per_second": 12.748,
"eval_steps_per_second": 1.594,
"step": 6621
},
{
"epoch": 3.004076548522251,
"grad_norm": 0.1117822602391243,
"learning_rate": 0.00014367620010296114,
"loss": 0.1199,
"step": 6630
},
{
"epoch": 3.008606046880308,
"grad_norm": 0.10662990808486938,
"learning_rate": 0.00014351110832838123,
"loss": 0.1082,
"step": 6640
},
{
"epoch": 3.013135545238365,
"grad_norm": 0.09254604578018188,
"learning_rate": 0.00014334587013801976,
"loss": 0.1106,
"step": 6650
},
{
"epoch": 3.0176650435964216,
"grad_norm": 0.10764751583337784,
"learning_rate": 0.00014318048608790663,
"loss": 0.1087,
"step": 6660
},
{
"epoch": 3.0221945419544785,
"grad_norm": 0.10320322960615158,
"learning_rate": 0.00014301495673456262,
"loss": 0.1072,
"step": 6670
},
{
"epoch": 3.0267240403125353,
"grad_norm": 0.09786458313465118,
"learning_rate": 0.00014284928263499742,
"loss": 0.1052,
"step": 6680
},
{
"epoch": 3.031253538670592,
"grad_norm": 0.0940663069486618,
"learning_rate": 0.00014268346434670782,
"loss": 0.1141,
"step": 6690
},
{
"epoch": 3.035783037028649,
"grad_norm": 0.12340737879276276,
"learning_rate": 0.0001425175024276758,
"loss": 0.1099,
"step": 6700
},
{
"epoch": 3.0403125353867058,
"grad_norm": 0.10877358913421631,
"learning_rate": 0.00014235139743636662,
"loss": 0.1066,
"step": 6710
},
{
"epoch": 3.0448420337447626,
"grad_norm": 0.09268616884946823,
"learning_rate": 0.00014218514993172705,
"loss": 0.105,
"step": 6720
},
{
"epoch": 3.0493715321028194,
"grad_norm": 0.09083138406276703,
"learning_rate": 0.00014201876047318342,
"loss": 0.1103,
"step": 6730
},
{
"epoch": 3.0539010304608762,
"grad_norm": 0.10291367769241333,
"learning_rate": 0.00014185222962063965,
"loss": 0.1072,
"step": 6740
},
{
"epoch": 3.0584305288189335,
"grad_norm": 0.10415250808000565,
"learning_rate": 0.00014168555793447554,
"loss": 0.1114,
"step": 6750
},
{
"epoch": 3.0629600271769903,
"grad_norm": 0.10135282576084137,
"learning_rate": 0.00014151874597554477,
"loss": 0.1086,
"step": 6760
},
{
"epoch": 3.067489525535047,
"grad_norm": 0.10510314255952835,
"learning_rate": 0.00014135179430517305,
"loss": 0.1117,
"step": 6770
},
{
"epoch": 3.072019023893104,
"grad_norm": 0.11414755135774612,
"learning_rate": 0.0001411847034851562,
"loss": 0.1102,
"step": 6780
},
{
"epoch": 3.076548522251161,
"grad_norm": 0.0981656014919281,
"learning_rate": 0.0001410174740777583,
"loss": 0.1112,
"step": 6790
},
{
"epoch": 3.0810780206092176,
"grad_norm": 0.09286178648471832,
"learning_rate": 0.00014085010664570974,
"loss": 0.1085,
"step": 6800
},
{
"epoch": 3.0856075189672745,
"grad_norm": 0.10993903875350952,
"learning_rate": 0.00014068260175220546,
"loss": 0.1121,
"step": 6810
},
{
"epoch": 3.0901370173253313,
"grad_norm": 0.10415517538785934,
"learning_rate": 0.00014051495996090285,
"loss": 0.109,
"step": 6820
},
{
"epoch": 3.094666515683388,
"grad_norm": 0.09917622059583664,
"learning_rate": 0.00014034718183592,
"loss": 0.1085,
"step": 6830
},
{
"epoch": 3.099196014041445,
"grad_norm": 0.09848062694072723,
"learning_rate": 0.00014017926794183383,
"loss": 0.1047,
"step": 6840
},
{
"epoch": 3.103725512399502,
"grad_norm": 0.12383636087179184,
"learning_rate": 0.00014001121884367804,
"loss": 0.1105,
"step": 6850
},
{
"epoch": 3.1082550107575586,
"grad_norm": 0.10345660895109177,
"learning_rate": 0.00013984303510694134,
"loss": 0.1108,
"step": 6860
},
{
"epoch": 3.1127845091156154,
"grad_norm": 0.08951733261346817,
"learning_rate": 0.0001396747172975655,
"loss": 0.1117,
"step": 6870
},
{
"epoch": 3.1173140074736723,
"grad_norm": 0.09321026504039764,
"learning_rate": 0.00013950626598194346,
"loss": 0.1095,
"step": 6880
},
{
"epoch": 3.121843505831729,
"grad_norm": 0.09075412154197693,
"learning_rate": 0.0001393376817269173,
"loss": 0.1111,
"step": 6890
},
{
"epoch": 3.126373004189786,
"grad_norm": 0.08038198202848434,
"learning_rate": 0.0001391689650997766,
"loss": 0.1085,
"step": 6900
},
{
"epoch": 3.1309025025478427,
"grad_norm": 0.09946314990520477,
"learning_rate": 0.00013900011666825632,
"loss": 0.1079,
"step": 6910
},
{
"epoch": 3.1354320009058996,
"grad_norm": 0.083831787109375,
"learning_rate": 0.00013883113700053493,
"loss": 0.108,
"step": 6920
},
{
"epoch": 3.1399614992639564,
"grad_norm": 0.09110364317893982,
"learning_rate": 0.00013866202666523245,
"loss": 0.1074,
"step": 6930
},
{
"epoch": 3.1444909976220132,
"grad_norm": 0.09342263638973236,
"learning_rate": 0.00013849278623140874,
"loss": 0.1102,
"step": 6940
},
{
"epoch": 3.14902049598007,
"grad_norm": 0.10097695142030716,
"learning_rate": 0.00013832341626856135,
"loss": 0.1091,
"step": 6950
},
{
"epoch": 3.153549994338127,
"grad_norm": 0.10724612325429916,
"learning_rate": 0.0001381539173466237,
"loss": 0.1095,
"step": 6960
},
{
"epoch": 3.1580794926961837,
"grad_norm": 0.113038569688797,
"learning_rate": 0.0001379842900359632,
"loss": 0.1101,
"step": 6970
},
{
"epoch": 3.1626089910542405,
"grad_norm": 0.10871588438749313,
"learning_rate": 0.00013781453490737918,
"loss": 0.1074,
"step": 6980
},
{
"epoch": 3.167138489412298,
"grad_norm": 0.09797286987304688,
"learning_rate": 0.0001376446525321013,
"loss": 0.1107,
"step": 6990
},
{
"epoch": 3.1716679877703546,
"grad_norm": 0.10018666833639145,
"learning_rate": 0.0001374746434817872,
"loss": 0.1112,
"step": 7000
},
{
"epoch": 3.1761974861284115,
"grad_norm": 0.09767764061689377,
"learning_rate": 0.00013730450832852086,
"loss": 0.1117,
"step": 7010
},
{
"epoch": 3.1807269844864683,
"grad_norm": 0.10807600617408752,
"learning_rate": 0.00013713424764481066,
"loss": 0.1069,
"step": 7020
},
{
"epoch": 3.185256482844525,
"grad_norm": 0.11085067689418793,
"learning_rate": 0.00013696386200358723,
"loss": 0.1098,
"step": 7030
},
{
"epoch": 3.189785981202582,
"grad_norm": 0.11777514964342117,
"learning_rate": 0.0001367933519782018,
"loss": 0.1095,
"step": 7040
},
{
"epoch": 3.1943154795606388,
"grad_norm": 0.08946658670902252,
"learning_rate": 0.00013662271814242422,
"loss": 0.1091,
"step": 7050
},
{
"epoch": 3.1988449779186956,
"grad_norm": 0.10264267772436142,
"learning_rate": 0.0001364519610704408,
"loss": 0.1116,
"step": 7060
},
{
"epoch": 3.2033744762767524,
"grad_norm": 0.0933040976524353,
"learning_rate": 0.00013628108133685273,
"loss": 0.1091,
"step": 7070
},
{
"epoch": 3.2079039746348093,
"grad_norm": 0.10949963331222534,
"learning_rate": 0.00013611007951667376,
"loss": 0.1122,
"step": 7080
},
{
"epoch": 3.212433472992866,
"grad_norm": 0.10518185049295425,
"learning_rate": 0.0001359389561853286,
"loss": 0.1112,
"step": 7090
},
{
"epoch": 3.216962971350923,
"grad_norm": 0.10346280038356781,
"learning_rate": 0.00013576771191865078,
"loss": 0.109,
"step": 7100
},
{
"epoch": 3.2214924697089797,
"grad_norm": 0.09324981272220612,
"learning_rate": 0.00013559634729288088,
"loss": 0.1092,
"step": 7110
},
{
"epoch": 3.2260219680670366,
"grad_norm": 0.10806597769260406,
"learning_rate": 0.00013542486288466428,
"loss": 0.1103,
"step": 7120
},
{
"epoch": 3.2305514664250934,
"grad_norm": 0.10441877692937851,
"learning_rate": 0.00013525325927104973,
"loss": 0.1095,
"step": 7130
},
{
"epoch": 3.23508096478315,
"grad_norm": 0.08796998858451843,
"learning_rate": 0.00013508153702948683,
"loss": 0.1104,
"step": 7140
},
{
"epoch": 3.239610463141207,
"grad_norm": 0.12072450667619705,
"learning_rate": 0.00013490969673782453,
"loss": 0.1095,
"step": 7150
},
{
"epoch": 3.244139961499264,
"grad_norm": 0.10589967668056488,
"learning_rate": 0.00013473773897430903,
"loss": 0.107,
"step": 7160
},
{
"epoch": 3.2486694598573207,
"grad_norm": 0.10880044102668762,
"learning_rate": 0.00013456566431758164,
"loss": 0.1101,
"step": 7170
},
{
"epoch": 3.2531989582153775,
"grad_norm": 0.10041461884975433,
"learning_rate": 0.00013439347334667722,
"loss": 0.1103,
"step": 7180
},
{
"epoch": 3.2577284565734344,
"grad_norm": 0.11079218983650208,
"learning_rate": 0.000134221166641022,
"loss": 0.1112,
"step": 7190
},
{
"epoch": 3.262257954931491,
"grad_norm": 0.10900229215621948,
"learning_rate": 0.00013404874478043153,
"loss": 0.1117,
"step": 7200
},
{
"epoch": 3.266787453289548,
"grad_norm": 0.10362094640731812,
"learning_rate": 0.000133876208345109,
"loss": 0.1114,
"step": 7210
},
{
"epoch": 3.271316951647605,
"grad_norm": 0.10555779188871384,
"learning_rate": 0.00013370355791564306,
"loss": 0.1123,
"step": 7220
},
{
"epoch": 3.2758464500056617,
"grad_norm": 0.09255950897932053,
"learning_rate": 0.00013353079407300603,
"loss": 0.1131,
"step": 7230
},
{
"epoch": 3.2803759483637185,
"grad_norm": 0.09914428740739822,
"learning_rate": 0.00013335791739855176,
"loss": 0.1113,
"step": 7240
},
{
"epoch": 3.2849054467217758,
"grad_norm": 0.10521331429481506,
"learning_rate": 0.0001331849284740139,
"loss": 0.11,
"step": 7250
},
{
"epoch": 3.2894349450798326,
"grad_norm": 0.09139056503772736,
"learning_rate": 0.00013301182788150374,
"loss": 0.1109,
"step": 7260
},
{
"epoch": 3.2939644434378894,
"grad_norm": 0.09516976028680801,
"learning_rate": 0.00013283861620350836,
"loss": 0.1096,
"step": 7270
},
{
"epoch": 3.2984939417959462,
"grad_norm": 0.09153826534748077,
"learning_rate": 0.00013266529402288866,
"loss": 0.1093,
"step": 7280
},
{
"epoch": 3.303023440154003,
"grad_norm": 0.11171313375234604,
"learning_rate": 0.00013249186192287735,
"loss": 0.113,
"step": 7290
},
{
"epoch": 3.30755293851206,
"grad_norm": 0.1110367551445961,
"learning_rate": 0.00013231832048707712,
"loss": 0.1146,
"step": 7300
},
{
"epoch": 3.3120824368701167,
"grad_norm": 0.10271560400724411,
"learning_rate": 0.00013214467029945835,
"loss": 0.1096,
"step": 7310
},
{
"epoch": 3.3166119352281735,
"grad_norm": 0.10005812346935272,
"learning_rate": 0.00013197091194435767,
"loss": 0.1089,
"step": 7320
},
{
"epoch": 3.3211414335862304,
"grad_norm": 0.09489379823207855,
"learning_rate": 0.00013179704600647547,
"loss": 0.1119,
"step": 7330
},
{
"epoch": 3.325670931944287,
"grad_norm": 0.10342545807361603,
"learning_rate": 0.00013162307307087423,
"loss": 0.1128,
"step": 7340
},
{
"epoch": 3.330200430302344,
"grad_norm": 0.10697804391384125,
"learning_rate": 0.0001314489937229765,
"loss": 0.1126,
"step": 7350
},
{
"epoch": 3.334729928660401,
"grad_norm": 0.11575332283973694,
"learning_rate": 0.00013127480854856295,
"loss": 0.1133,
"step": 7360
},
{
"epoch": 3.3392594270184577,
"grad_norm": 0.10017456859350204,
"learning_rate": 0.00013110051813377025,
"loss": 0.1091,
"step": 7370
},
{
"epoch": 3.3437889253765145,
"grad_norm": 0.11635085195302963,
"learning_rate": 0.00013092612306508922,
"loss": 0.1139,
"step": 7380
},
{
"epoch": 3.3483184237345713,
"grad_norm": 0.09450142085552216,
"learning_rate": 0.00013075162392936295,
"loss": 0.1119,
"step": 7390
},
{
"epoch": 3.352847922092628,
"grad_norm": 0.09203408658504486,
"learning_rate": 0.0001305770213137846,
"loss": 0.1088,
"step": 7400
},
{
"epoch": 3.357377420450685,
"grad_norm": 0.09736169874668121,
"learning_rate": 0.00013040231580589565,
"loss": 0.1099,
"step": 7410
},
{
"epoch": 3.361906918808742,
"grad_norm": 0.09759002178907394,
"learning_rate": 0.0001302275079935837,
"loss": 0.1149,
"step": 7420
},
{
"epoch": 3.3664364171667986,
"grad_norm": 0.09410129487514496,
"learning_rate": 0.00013005259846508068,
"loss": 0.1132,
"step": 7430
},
{
"epoch": 3.3709659155248555,
"grad_norm": 0.09184587746858597,
"learning_rate": 0.0001298775878089608,
"loss": 0.1099,
"step": 7440
},
{
"epoch": 3.3754954138829123,
"grad_norm": 0.10475565493106842,
"learning_rate": 0.00012970247661413855,
"loss": 0.1109,
"step": 7450
},
{
"epoch": 3.380024912240969,
"grad_norm": 0.10369405895471573,
"learning_rate": 0.00012952726546986668,
"loss": 0.1144,
"step": 7460
},
{
"epoch": 3.3845544105990264,
"grad_norm": 0.1000487357378006,
"learning_rate": 0.00012935195496573435,
"loss": 0.1093,
"step": 7470
},
{
"epoch": 3.3890839089570832,
"grad_norm": 0.1104254201054573,
"learning_rate": 0.00012917654569166503,
"loss": 0.1093,
"step": 7480
},
{
"epoch": 3.39361340731514,
"grad_norm": 0.10195254534482956,
"learning_rate": 0.0001290010382379146,
"loss": 0.1104,
"step": 7490
},
{
"epoch": 3.398142905673197,
"grad_norm": 0.10613837838172913,
"learning_rate": 0.00012882543319506925,
"loss": 0.115,
"step": 7500
},
{
"epoch": 3.4026724040312537,
"grad_norm": 0.10054861009120941,
"learning_rate": 0.0001286497311540436,
"loss": 0.1093,
"step": 7510
},
{
"epoch": 3.4072019023893105,
"grad_norm": 0.1072639673948288,
"learning_rate": 0.0001284739327060787,
"loss": 0.114,
"step": 7520
},
{
"epoch": 3.4117314007473674,
"grad_norm": 0.09658465534448624,
"learning_rate": 0.00012829803844273987,
"loss": 0.1088,
"step": 7530
},
{
"epoch": 3.416260899105424,
"grad_norm": 0.09596540778875351,
"learning_rate": 0.00012812204895591505,
"loss": 0.1124,
"step": 7540
},
{
"epoch": 3.420790397463481,
"grad_norm": 0.08748818188905716,
"learning_rate": 0.00012794596483781248,
"loss": 0.1125,
"step": 7550
},
{
"epoch": 3.425319895821538,
"grad_norm": 0.09352606534957886,
"learning_rate": 0.00012776978668095884,
"loss": 0.1134,
"step": 7560
},
{
"epoch": 3.4298493941795947,
"grad_norm": 0.11329905688762665,
"learning_rate": 0.0001275935150781973,
"loss": 0.1138,
"step": 7570
},
{
"epoch": 3.4343788925376515,
"grad_norm": 0.09285202622413635,
"learning_rate": 0.00012741715062268547,
"loss": 0.1096,
"step": 7580
},
{
"epoch": 3.4389083908957083,
"grad_norm": 0.10598818957805634,
"learning_rate": 0.00012724069390789342,
"loss": 0.113,
"step": 7590
},
{
"epoch": 3.443437889253765,
"grad_norm": 0.11264318972826004,
"learning_rate": 0.0001270641455276016,
"loss": 0.1135,
"step": 7600
},
{
"epoch": 3.447967387611822,
"grad_norm": 0.09473126381635666,
"learning_rate": 0.00012688750607589897,
"loss": 0.1106,
"step": 7610
},
{
"epoch": 3.452496885969879,
"grad_norm": 0.09131330251693726,
"learning_rate": 0.000126710776147181,
"loss": 0.1149,
"step": 7620
},
{
"epoch": 3.4570263843279356,
"grad_norm": 0.10694695264101028,
"learning_rate": 0.0001265339563361475,
"loss": 0.1126,
"step": 7630
},
{
"epoch": 3.4615558826859925,
"grad_norm": 0.1015838012099266,
"learning_rate": 0.00012635704723780087,
"loss": 0.1135,
"step": 7640
},
{
"epoch": 3.4660853810440493,
"grad_norm": 0.10224758833646774,
"learning_rate": 0.00012618004944744385,
"loss": 0.1155,
"step": 7650
},
{
"epoch": 3.470614879402106,
"grad_norm": 0.11169352382421494,
"learning_rate": 0.00012600296356067768,
"loss": 0.1092,
"step": 7660
},
{
"epoch": 3.475144377760163,
"grad_norm": 0.10369731485843658,
"learning_rate": 0.00012582579017340003,
"loss": 0.1107,
"step": 7670
},
{
"epoch": 3.4796738761182198,
"grad_norm": 0.09245746582746506,
"learning_rate": 0.00012564852988180305,
"loss": 0.1093,
"step": 7680
},
{
"epoch": 3.4842033744762766,
"grad_norm": 0.09676039218902588,
"learning_rate": 0.0001254711832823713,
"loss": 0.1117,
"step": 7690
},
{
"epoch": 3.4887328728343334,
"grad_norm": 0.10541850328445435,
"learning_rate": 0.0001252937509718797,
"loss": 0.1119,
"step": 7700
},
{
"epoch": 3.4932623711923902,
"grad_norm": 0.08481086790561676,
"learning_rate": 0.0001251162335473917,
"loss": 0.1103,
"step": 7710
},
{
"epoch": 3.497791869550447,
"grad_norm": 0.09966452419757843,
"learning_rate": 0.00012493863160625713,
"loss": 0.1147,
"step": 7720
},
{
"epoch": 3.502321367908504,
"grad_norm": 0.09558738023042679,
"learning_rate": 0.00012476094574611016,
"loss": 0.1123,
"step": 7730
},
{
"epoch": 3.5068508662665607,
"grad_norm": 0.10436621308326721,
"learning_rate": 0.00012458317656486746,
"loss": 0.1129,
"step": 7740
},
{
"epoch": 3.5113803646246176,
"grad_norm": 0.10191968828439713,
"learning_rate": 0.00012440532466072597,
"loss": 0.1099,
"step": 7750
},
{
"epoch": 3.515909862982675,
"grad_norm": 0.10766720771789551,
"learning_rate": 0.000124227390632161,
"loss": 0.1121,
"step": 7760
},
{
"epoch": 3.5204393613407317,
"grad_norm": 0.08841870725154877,
"learning_rate": 0.0001240493750779243,
"loss": 0.1103,
"step": 7770
},
{
"epoch": 3.5249688596987885,
"grad_norm": 0.1090930923819542,
"learning_rate": 0.00012387127859704187,
"loss": 0.1164,
"step": 7780
},
{
"epoch": 3.5294983580568453,
"grad_norm": 0.10451924055814743,
"learning_rate": 0.00012369310178881205,
"loss": 0.1112,
"step": 7790
},
{
"epoch": 3.534027856414902,
"grad_norm": 0.09721478819847107,
"learning_rate": 0.0001235148452528035,
"loss": 0.1135,
"step": 7800
},
{
"epoch": 3.538557354772959,
"grad_norm": 0.0975523293018341,
"learning_rate": 0.00012333650958885322,
"loss": 0.1105,
"step": 7810
},
{
"epoch": 3.543086853131016,
"grad_norm": 0.08713623881340027,
"learning_rate": 0.00012315809539706436,
"loss": 0.1103,
"step": 7820
},
{
"epoch": 3.5476163514890726,
"grad_norm": 0.09232752025127411,
"learning_rate": 0.00012297960327780437,
"loss": 0.1128,
"step": 7830
},
{
"epoch": 3.5521458498471294,
"grad_norm": 0.09094680100679398,
"learning_rate": 0.00012280103383170295,
"loss": 0.1104,
"step": 7840
},
{
"epoch": 3.5566753482051863,
"grad_norm": 0.09738276153802872,
"learning_rate": 0.00012262238765964995,
"loss": 0.1059,
"step": 7850
},
{
"epoch": 3.561204846563243,
"grad_norm": 0.0989813581109047,
"learning_rate": 0.0001224436653627935,
"loss": 0.112,
"step": 7860
},
{
"epoch": 3.5657343449213,
"grad_norm": 0.09522037208080292,
"learning_rate": 0.0001222648675425378,
"loss": 0.1081,
"step": 7870
},
{
"epoch": 3.5702638432793568,
"grad_norm": 0.10340669006109238,
"learning_rate": 0.00012208599480054125,
"loss": 0.1117,
"step": 7880
},
{
"epoch": 3.5747933416374136,
"grad_norm": 0.11090776324272156,
"learning_rate": 0.0001219070477387143,
"loss": 0.1097,
"step": 7890
},
{
"epoch": 3.5793228399954704,
"grad_norm": 0.08626790344715118,
"learning_rate": 0.00012172802695921754,
"loss": 0.1128,
"step": 7900
},
{
"epoch": 3.5838523383535272,
"grad_norm": 0.09012069553136826,
"learning_rate": 0.00012154893306445961,
"loss": 0.1137,
"step": 7910
},
{
"epoch": 3.588381836711584,
"grad_norm": 0.07982558012008667,
"learning_rate": 0.00012136976665709516,
"loss": 0.1117,
"step": 7920
},
{
"epoch": 3.592911335069641,
"grad_norm": 0.09850164502859116,
"learning_rate": 0.00012119052834002289,
"loss": 0.1088,
"step": 7930
},
{
"epoch": 3.597440833427698,
"grad_norm": 0.09800245612859726,
"learning_rate": 0.00012101121871638343,
"loss": 0.1153,
"step": 7940
},
{
"epoch": 3.601970331785755,
"grad_norm": 0.09477314352989197,
"learning_rate": 0.0001208318383895574,
"loss": 0.1104,
"step": 7950
},
{
"epoch": 3.606499830143812,
"grad_norm": 0.10447141528129578,
"learning_rate": 0.00012065238796316331,
"loss": 0.1115,
"step": 7960
},
{
"epoch": 3.6110293285018686,
"grad_norm": 0.10505667328834534,
"learning_rate": 0.00012047286804105557,
"loss": 0.1096,
"step": 7970
},
{
"epoch": 3.6155588268599255,
"grad_norm": 0.0925762876868248,
"learning_rate": 0.00012029327922732242,
"loss": 0.1146,
"step": 7980
},
{
"epoch": 3.6200883252179823,
"grad_norm": 0.12217893451452255,
"learning_rate": 0.00012011362212628397,
"loss": 0.1105,
"step": 7990
},
{
"epoch": 3.624617823576039,
"grad_norm": 0.09887892752885818,
"learning_rate": 0.00011993389734249006,
"loss": 0.1098,
"step": 8000
},
{
"epoch": 3.629147321934096,
"grad_norm": 0.10694731771945953,
"learning_rate": 0.00011975410548071832,
"loss": 0.1129,
"step": 8010
},
{
"epoch": 3.6336768202921528,
"grad_norm": 0.08971285820007324,
"learning_rate": 0.00011957424714597212,
"loss": 0.1084,
"step": 8020
},
{
"epoch": 3.6382063186502096,
"grad_norm": 0.08375135064125061,
"learning_rate": 0.00011939432294347848,
"loss": 0.1098,
"step": 8030
},
{
"epoch": 3.6427358170082664,
"grad_norm": 0.09610874205827713,
"learning_rate": 0.00011921433347868602,
"loss": 0.1109,
"step": 8040
},
{
"epoch": 3.6472653153663233,
"grad_norm": 0.09743242710828781,
"learning_rate": 0.00011903427935726308,
"loss": 0.1176,
"step": 8050
},
{
"epoch": 3.65179481372438,
"grad_norm": 0.09157928824424744,
"learning_rate": 0.00011885416118509549,
"loss": 0.1116,
"step": 8060
},
{
"epoch": 3.656324312082437,
"grad_norm": 0.10359596461057663,
"learning_rate": 0.00011867397956828463,
"loss": 0.1117,
"step": 8070
},
{
"epoch": 3.6608538104404937,
"grad_norm": 0.08667086809873581,
"learning_rate": 0.00011849373511314537,
"loss": 0.1126,
"step": 8080
},
{
"epoch": 3.6653833087985506,
"grad_norm": 0.0973113626241684,
"learning_rate": 0.00011831342842620405,
"loss": 0.1099,
"step": 8090
},
{
"epoch": 3.6699128071566074,
"grad_norm": 0.09472218155860901,
"learning_rate": 0.00011813306011419642,
"loss": 0.1117,
"step": 8100
},
{
"epoch": 3.674442305514664,
"grad_norm": 0.10071218013763428,
"learning_rate": 0.00011795263078406558,
"loss": 0.1096,
"step": 8110
},
{
"epoch": 3.678971803872721,
"grad_norm": 0.08343309164047241,
"learning_rate": 0.00011777214104295995,
"loss": 0.1118,
"step": 8120
},
{
"epoch": 3.683501302230778,
"grad_norm": 0.0963587686419487,
"learning_rate": 0.00011759159149823127,
"loss": 0.1099,
"step": 8130
},
{
"epoch": 3.6880308005888347,
"grad_norm": 0.09920413792133331,
"learning_rate": 0.00011741098275743247,
"loss": 0.1132,
"step": 8140
},
{
"epoch": 3.6925602989468915,
"grad_norm": 0.12149636447429657,
"learning_rate": 0.00011723031542831578,
"loss": 0.1146,
"step": 8150
},
{
"epoch": 3.6970897973049484,
"grad_norm": 0.09953594207763672,
"learning_rate": 0.00011704959011883043,
"loss": 0.1078,
"step": 8160
},
{
"epoch": 3.701619295663005,
"grad_norm": 0.11264549940824509,
"learning_rate": 0.0001168688074371209,
"loss": 0.1098,
"step": 8170
},
{
"epoch": 3.706148794021062,
"grad_norm": 0.10793278366327286,
"learning_rate": 0.00011668796799152457,
"loss": 0.1123,
"step": 8180
},
{
"epoch": 3.710678292379119,
"grad_norm": 0.10062643885612488,
"learning_rate": 0.00011650707239057,
"loss": 0.1136,
"step": 8190
},
{
"epoch": 3.7152077907371757,
"grad_norm": 0.09304151684045792,
"learning_rate": 0.00011632612124297461,
"loss": 0.1126,
"step": 8200
},
{
"epoch": 3.7197372890952325,
"grad_norm": 0.10045602172613144,
"learning_rate": 0.00011614511515764277,
"loss": 0.1092,
"step": 8210
},
{
"epoch": 3.7242667874532893,
"grad_norm": 0.09587648510932922,
"learning_rate": 0.00011596405474366372,
"loss": 0.1115,
"step": 8220
},
{
"epoch": 3.728796285811346,
"grad_norm": 0.10631423443555832,
"learning_rate": 0.00011578294061030947,
"loss": 0.111,
"step": 8230
},
{
"epoch": 3.733325784169403,
"grad_norm": 0.09861784428358078,
"learning_rate": 0.00011560177336703291,
"loss": 0.11,
"step": 8240
},
{
"epoch": 3.7378552825274602,
"grad_norm": 0.0921064168214798,
"learning_rate": 0.00011542055362346549,
"loss": 0.1109,
"step": 8250
},
{
"epoch": 3.742384780885517,
"grad_norm": 0.10424584895372391,
"learning_rate": 0.00011523928198941543,
"loss": 0.11,
"step": 8260
},
{
"epoch": 3.746914279243574,
"grad_norm": 0.10199391096830368,
"learning_rate": 0.00011505795907486551,
"loss": 0.112,
"step": 8270
},
{
"epoch": 3.7514437776016307,
"grad_norm": 0.09731689840555191,
"learning_rate": 0.00011487658548997115,
"loss": 0.1125,
"step": 8280
},
{
"epoch": 3.7559732759596876,
"grad_norm": 0.07730797678232193,
"learning_rate": 0.00011469516184505821,
"loss": 0.1096,
"step": 8290
},
{
"epoch": 3.7605027743177444,
"grad_norm": 0.09512131661176682,
"learning_rate": 0.00011451368875062101,
"loss": 0.1115,
"step": 8300
},
{
"epoch": 3.765032272675801,
"grad_norm": 0.08450417220592499,
"learning_rate": 0.00011433216681732027,
"loss": 0.1135,
"step": 8310
},
{
"epoch": 3.769561771033858,
"grad_norm": 0.08709891885519028,
"learning_rate": 0.00011415059665598105,
"loss": 0.111,
"step": 8320
},
{
"epoch": 3.774091269391915,
"grad_norm": 0.12575045228004456,
"learning_rate": 0.00011396897887759071,
"loss": 0.1145,
"step": 8330
},
{
"epoch": 3.7786207677499717,
"grad_norm": 0.09050168097019196,
"learning_rate": 0.00011378731409329684,
"loss": 0.1108,
"step": 8340
},
{
"epoch": 3.7831502661080285,
"grad_norm": 0.0824236199259758,
"learning_rate": 0.00011360560291440526,
"loss": 0.1137,
"step": 8350
},
{
"epoch": 3.7876797644660853,
"grad_norm": 0.10261125862598419,
"learning_rate": 0.00011342384595237776,
"loss": 0.1089,
"step": 8360
},
{
"epoch": 3.792209262824142,
"grad_norm": 0.08885115385055542,
"learning_rate": 0.00011324204381883033,
"loss": 0.1109,
"step": 8370
},
{
"epoch": 3.796738761182199,
"grad_norm": 0.10409918427467346,
"learning_rate": 0.00011306019712553094,
"loss": 0.1142,
"step": 8380
},
{
"epoch": 3.801268259540256,
"grad_norm": 0.0991046279668808,
"learning_rate": 0.00011287830648439746,
"loss": 0.115,
"step": 8390
},
{
"epoch": 3.8057977578983126,
"grad_norm": 0.10309819132089615,
"learning_rate": 0.00011269637250749565,
"loss": 0.1112,
"step": 8400
},
{
"epoch": 3.8103272562563695,
"grad_norm": 0.09360276162624359,
"learning_rate": 0.00011251439580703716,
"loss": 0.1115,
"step": 8410
},
{
"epoch": 3.8148567546144263,
"grad_norm": 0.09267252683639526,
"learning_rate": 0.0001123323769953773,
"loss": 0.1106,
"step": 8420
},
{
"epoch": 3.819386252972483,
"grad_norm": 0.11334355920553207,
"learning_rate": 0.00011215031668501322,
"loss": 0.1086,
"step": 8430
},
{
"epoch": 3.8239157513305404,
"grad_norm": 0.09532047063112259,
"learning_rate": 0.00011196821548858156,
"loss": 0.1091,
"step": 8440
},
{
"epoch": 3.8284452496885972,
"grad_norm": 0.08060566335916519,
"learning_rate": 0.00011178607401885668,
"loss": 0.1102,
"step": 8450
},
{
"epoch": 3.832974748046654,
"grad_norm": 0.09655016660690308,
"learning_rate": 0.0001116038928887484,
"loss": 0.1124,
"step": 8460
},
{
"epoch": 3.837504246404711,
"grad_norm": 0.10175477713346481,
"learning_rate": 0.00011142167271129996,
"loss": 0.1108,
"step": 8470
},
{
"epoch": 3.8420337447627677,
"grad_norm": 0.08714988827705383,
"learning_rate": 0.00011123941409968606,
"loss": 0.111,
"step": 8480
},
{
"epoch": 3.8465632431208245,
"grad_norm": 0.08987358957529068,
"learning_rate": 0.00011105711766721067,
"loss": 0.1096,
"step": 8490
},
{
"epoch": 3.8510927414788814,
"grad_norm": 0.10814320296049118,
"learning_rate": 0.00011087478402730514,
"loss": 0.1151,
"step": 8500
},
{
"epoch": 3.855622239836938,
"grad_norm": 0.09886670112609863,
"learning_rate": 0.00011069241379352588,
"loss": 0.1078,
"step": 8510
},
{
"epoch": 3.860151738194995,
"grad_norm": 0.09303957968950272,
"learning_rate": 0.00011051000757955257,
"loss": 0.113,
"step": 8520
},
{
"epoch": 3.864681236553052,
"grad_norm": 0.10088100284337997,
"learning_rate": 0.00011032756599918584,
"loss": 0.1112,
"step": 8530
},
{
"epoch": 3.8692107349111087,
"grad_norm": 0.11249160021543503,
"learning_rate": 0.0001101450896663454,
"loss": 0.1124,
"step": 8540
},
{
"epoch": 3.8737402332691655,
"grad_norm": 0.0930514931678772,
"learning_rate": 0.00010996257919506794,
"loss": 0.1115,
"step": 8550
},
{
"epoch": 3.8782697316272223,
"grad_norm": 0.09656676650047302,
"learning_rate": 0.00010978003519950493,
"loss": 0.1098,
"step": 8560
},
{
"epoch": 3.882799229985279,
"grad_norm": 0.091661736369133,
"learning_rate": 0.00010959745829392069,
"loss": 0.1135,
"step": 8570
},
{
"epoch": 3.887328728343336,
"grad_norm": 0.09262984991073608,
"learning_rate": 0.00010941484909269036,
"loss": 0.1115,
"step": 8580
},
{
"epoch": 3.891858226701393,
"grad_norm": 0.11751729995012283,
"learning_rate": 0.00010923220821029762,
"loss": 0.1132,
"step": 8590
},
{
"epoch": 3.8963877250594496,
"grad_norm": 0.10761595517396927,
"learning_rate": 0.00010904953626133287,
"loss": 0.1126,
"step": 8600
},
{
"epoch": 3.9009172234175065,
"grad_norm": 0.08337333053350449,
"learning_rate": 0.00010886683386049099,
"loss": 0.111,
"step": 8610
},
{
"epoch": 3.9054467217755633,
"grad_norm": 0.10421154648065567,
"learning_rate": 0.00010868410162256935,
"loss": 0.1108,
"step": 8620
},
{
"epoch": 3.90997622013362,
"grad_norm": 0.10565438121557236,
"learning_rate": 0.0001085013401624657,
"loss": 0.112,
"step": 8630
},
{
"epoch": 3.914505718491677,
"grad_norm": 0.08946827799081802,
"learning_rate": 0.00010831855009517613,
"loss": 0.1101,
"step": 8640
},
{
"epoch": 3.9190352168497338,
"grad_norm": 0.08507835865020752,
"learning_rate": 0.00010813573203579306,
"loss": 0.11,
"step": 8650
},
{
"epoch": 3.9235647152077906,
"grad_norm": 0.07897284626960754,
"learning_rate": 0.00010795288659950303,
"loss": 0.1111,
"step": 8660
},
{
"epoch": 3.9280942135658474,
"grad_norm": 0.09554194658994675,
"learning_rate": 0.00010777001440158472,
"loss": 0.1126,
"step": 8670
},
{
"epoch": 3.9326237119239043,
"grad_norm": 0.11981197446584702,
"learning_rate": 0.00010758711605740683,
"loss": 0.1105,
"step": 8680
},
{
"epoch": 3.937153210281961,
"grad_norm": 0.11121747642755508,
"learning_rate": 0.00010740419218242615,
"loss": 0.112,
"step": 8690
},
{
"epoch": 3.941682708640018,
"grad_norm": 0.10044469684362411,
"learning_rate": 0.00010722124339218524,
"loss": 0.1097,
"step": 8700
},
{
"epoch": 3.9462122069980747,
"grad_norm": 0.07444220036268234,
"learning_rate": 0.00010703827030231065,
"loss": 0.1096,
"step": 8710
},
{
"epoch": 3.9507417053561316,
"grad_norm": 0.08997642993927002,
"learning_rate": 0.00010685527352851054,
"loss": 0.1098,
"step": 8720
},
{
"epoch": 3.9552712037141884,
"grad_norm": 0.09852538257837296,
"learning_rate": 0.0001066722536865729,
"loss": 0.1112,
"step": 8730
},
{
"epoch": 3.9598007020722457,
"grad_norm": 0.0946199893951416,
"learning_rate": 0.00010648921139236328,
"loss": 0.113,
"step": 8740
},
{
"epoch": 3.9643302004303025,
"grad_norm": 0.10738665610551834,
"learning_rate": 0.0001063061472618228,
"loss": 0.1105,
"step": 8750
},
{
"epoch": 3.9688596987883593,
"grad_norm": 0.09911846369504929,
"learning_rate": 0.00010612306191096602,
"loss": 0.1092,
"step": 8760
},
{
"epoch": 3.973389197146416,
"grad_norm": 0.09100183844566345,
"learning_rate": 0.00010593995595587898,
"loss": 0.1075,
"step": 8770
},
{
"epoch": 3.977918695504473,
"grad_norm": 0.08540119975805283,
"learning_rate": 0.00010575683001271701,
"loss": 0.11,
"step": 8780
},
{
"epoch": 3.98244819386253,
"grad_norm": 0.1455107182264328,
"learning_rate": 0.00010557368469770268,
"loss": 0.1072,
"step": 8790
},
{
"epoch": 3.9869776922205866,
"grad_norm": 0.09040206670761108,
"learning_rate": 0.0001053905206271238,
"loss": 0.112,
"step": 8800
},
{
"epoch": 3.9915071905786434,
"grad_norm": 0.08172180503606796,
"learning_rate": 0.00010520733841733125,
"loss": 0.1128,
"step": 8810
},
{
"epoch": 3.9960366889367003,
"grad_norm": 0.09760237485170364,
"learning_rate": 0.000105024138684737,
"loss": 0.1119,
"step": 8820
},
{
"epoch": 3.9996602876231457,
"eval_loss": 0.15827356278896332,
"eval_runtime": 617.6968,
"eval_samples_per_second": 12.741,
"eval_steps_per_second": 1.593,
"step": 8828
},
{
"epoch": 4.000905899671611,
"grad_norm": 0.0798049345612526,
"learning_rate": 0.00010484092204581189,
"loss": 0.1153,
"step": 8830
},
{
"epoch": 4.005435398029668,
"grad_norm": 0.07974246889352798,
"learning_rate": 0.00010465768911708373,
"loss": 0.0957,
"step": 8840
},
{
"epoch": 4.009964896387725,
"grad_norm": 0.08676203340291977,
"learning_rate": 0.00010447444051513513,
"loss": 0.0962,
"step": 8850
},
{
"epoch": 4.014494394745782,
"grad_norm": 0.07175087183713913,
"learning_rate": 0.00010429117685660146,
"loss": 0.0961,
"step": 8860
},
{
"epoch": 4.019023893103839,
"grad_norm": 0.06814973056316376,
"learning_rate": 0.00010410789875816866,
"loss": 0.0963,
"step": 8870
},
{
"epoch": 4.0235533914618955,
"grad_norm": 0.09090814739465714,
"learning_rate": 0.00010392460683657142,
"loss": 0.0994,
"step": 8880
},
{
"epoch": 4.028082889819952,
"grad_norm": 0.08229593187570572,
"learning_rate": 0.0001037413017085908,
"loss": 0.0967,
"step": 8890
},
{
"epoch": 4.032612388178009,
"grad_norm": 0.07398311048746109,
"learning_rate": 0.00010355798399105235,
"loss": 0.096,
"step": 8900
},
{
"epoch": 4.037141886536066,
"grad_norm": 0.06932748854160309,
"learning_rate": 0.00010337465430082403,
"loss": 0.0969,
"step": 8910
},
{
"epoch": 4.041671384894123,
"grad_norm": 0.09156011044979095,
"learning_rate": 0.000103191313254814,
"loss": 0.098,
"step": 8920
},
{
"epoch": 4.04620088325218,
"grad_norm": 0.07946418970823288,
"learning_rate": 0.00010300796146996874,
"loss": 0.0962,
"step": 8930
},
{
"epoch": 4.0507303816102365,
"grad_norm": 0.08557803928852081,
"learning_rate": 0.00010282459956327073,
"loss": 0.0948,
"step": 8940
},
{
"epoch": 4.055259879968293,
"grad_norm": 0.0721755251288414,
"learning_rate": 0.00010264122815173665,
"loss": 0.0981,
"step": 8950
},
{
"epoch": 4.05978937832635,
"grad_norm": 0.069907546043396,
"learning_rate": 0.0001024578478524151,
"loss": 0.0973,
"step": 8960
},
{
"epoch": 4.064318876684407,
"grad_norm": 0.07597635686397552,
"learning_rate": 0.00010227445928238455,
"loss": 0.0985,
"step": 8970
},
{
"epoch": 4.068848375042464,
"grad_norm": 0.08416584879159927,
"learning_rate": 0.00010209106305875139,
"loss": 0.0954,
"step": 8980
},
{
"epoch": 4.073377873400521,
"grad_norm": 0.08617585897445679,
"learning_rate": 0.00010190765979864764,
"loss": 0.0977,
"step": 8990
},
{
"epoch": 4.077907371758577,
"grad_norm": 0.07779661566019058,
"learning_rate": 0.00010172425011922915,
"loss": 0.0968,
"step": 9000
},
{
"epoch": 4.082436870116634,
"grad_norm": 0.08647850900888443,
"learning_rate": 0.00010154083463767323,
"loss": 0.0964,
"step": 9010
},
{
"epoch": 4.086966368474691,
"grad_norm": 0.08829203248023987,
"learning_rate": 0.00010135741397117684,
"loss": 0.0992,
"step": 9020
},
{
"epoch": 4.091495866832748,
"grad_norm": 0.08579693734645844,
"learning_rate": 0.00010117398873695429,
"loss": 0.0987,
"step": 9030
},
{
"epoch": 4.096025365190805,
"grad_norm": 0.06886789947748184,
"learning_rate": 0.00010099055955223531,
"loss": 0.0983,
"step": 9040
},
{
"epoch": 4.100554863548862,
"grad_norm": 0.0997413694858551,
"learning_rate": 0.0001008071270342629,
"loss": 0.0956,
"step": 9050
},
{
"epoch": 4.105084361906918,
"grad_norm": 0.07166160643100739,
"learning_rate": 0.00010062369180029125,
"loss": 0.0968,
"step": 9060
},
{
"epoch": 4.109613860264975,
"grad_norm": 0.07676910609006882,
"learning_rate": 0.00010044025446758381,
"loss": 0.097,
"step": 9070
},
{
"epoch": 4.114143358623033,
"grad_norm": 0.08378776907920837,
"learning_rate": 0.00010025681565341091,
"loss": 0.0964,
"step": 9080
},
{
"epoch": 4.11867285698109,
"grad_norm": 0.0725962296128273,
"learning_rate": 0.00010007337597504804,
"loss": 0.0982,
"step": 9090
},
{
"epoch": 4.123202355339147,
"grad_norm": 0.0860457792878151,
"learning_rate": 9.988993604977352e-05,
"loss": 0.0974,
"step": 9100
},
{
"epoch": 4.127731853697203,
"grad_norm": 0.08629846572875977,
"learning_rate": 9.970649649486644e-05,
"loss": 0.0981,
"step": 9110
},
{
"epoch": 4.13226135205526,
"grad_norm": 0.08496873825788498,
"learning_rate": 9.952305792760475e-05,
"loss": 0.0991,
"step": 9120
},
{
"epoch": 4.136790850413317,
"grad_norm": 0.07953400164842606,
"learning_rate": 9.933962096526302e-05,
"loss": 0.0953,
"step": 9130
},
{
"epoch": 4.141320348771374,
"grad_norm": 0.08169267326593399,
"learning_rate": 9.915618622511044e-05,
"loss": 0.0985,
"step": 9140
},
{
"epoch": 4.145849847129431,
"grad_norm": 0.09323912113904953,
"learning_rate": 9.897275432440872e-05,
"loss": 0.0955,
"step": 9150
},
{
"epoch": 4.1503793454874875,
"grad_norm": 0.07836610078811646,
"learning_rate": 9.878932588040997e-05,
"loss": 0.0983,
"step": 9160
},
{
"epoch": 4.154908843845544,
"grad_norm": 0.06795407086610794,
"learning_rate": 9.860590151035473e-05,
"loss": 0.097,
"step": 9170
},
{
"epoch": 4.159438342203601,
"grad_norm": 0.082821324467659,
"learning_rate": 9.84224818314698e-05,
"loss": 0.0972,
"step": 9180
},
{
"epoch": 4.163967840561658,
"grad_norm": 0.06650907546281815,
"learning_rate": 9.823906746096622e-05,
"loss": 0.0973,
"step": 9190
},
{
"epoch": 4.168497338919715,
"grad_norm": 0.07272431999444962,
"learning_rate": 9.805565901603714e-05,
"loss": 0.0974,
"step": 9200
},
{
"epoch": 4.173026837277772,
"grad_norm": 0.07406030595302582,
"learning_rate": 9.78722571138558e-05,
"loss": 0.0968,
"step": 9210
},
{
"epoch": 4.1775563356358285,
"grad_norm": 0.06534506380558014,
"learning_rate": 9.768886237157337e-05,
"loss": 0.0977,
"step": 9220
},
{
"epoch": 4.182085833993885,
"grad_norm": 0.08346185088157654,
"learning_rate": 9.750547540631697e-05,
"loss": 0.0966,
"step": 9230
},
{
"epoch": 4.186615332351942,
"grad_norm": 0.0646069347858429,
"learning_rate": 9.732209683518753e-05,
"loss": 0.0957,
"step": 9240
},
{
"epoch": 4.191144830709999,
"grad_norm": 0.07642305642366409,
"learning_rate": 9.713872727525778e-05,
"loss": 0.0948,
"step": 9250
},
{
"epoch": 4.195674329068056,
"grad_norm": 0.07574049383401871,
"learning_rate": 9.695536734357005e-05,
"loss": 0.0977,
"step": 9260
},
{
"epoch": 4.200203827426113,
"grad_norm": 0.08899475634098053,
"learning_rate": 9.677201765713435e-05,
"loss": 0.0979,
"step": 9270
},
{
"epoch": 4.2047333257841695,
"grad_norm": 0.07823716104030609,
"learning_rate": 9.658867883292615e-05,
"loss": 0.0986,
"step": 9280
},
{
"epoch": 4.209262824142226,
"grad_norm": 0.07970847934484482,
"learning_rate": 9.640535148788443e-05,
"loss": 0.0965,
"step": 9290
},
{
"epoch": 4.213792322500283,
"grad_norm": 0.07121343910694122,
"learning_rate": 9.622203623890944e-05,
"loss": 0.098,
"step": 9300
},
{
"epoch": 4.21832182085834,
"grad_norm": 0.08438264578580856,
"learning_rate": 9.603873370286083e-05,
"loss": 0.0975,
"step": 9310
},
{
"epoch": 4.222851319216397,
"grad_norm": 0.07344311475753784,
"learning_rate": 9.585544449655543e-05,
"loss": 0.0995,
"step": 9320
},
{
"epoch": 4.227380817574454,
"grad_norm": 0.08449902385473251,
"learning_rate": 9.567216923676526e-05,
"loss": 0.1,
"step": 9330
},
{
"epoch": 4.23191031593251,
"grad_norm": 0.08021081984043121,
"learning_rate": 9.548890854021529e-05,
"loss": 0.0966,
"step": 9340
},
{
"epoch": 4.236439814290567,
"grad_norm": 0.08234046399593353,
"learning_rate": 9.530566302358162e-05,
"loss": 0.0948,
"step": 9350
},
{
"epoch": 4.240969312648624,
"grad_norm": 0.09645576030015945,
"learning_rate": 9.512243330348917e-05,
"loss": 0.0952,
"step": 9360
},
{
"epoch": 4.245498811006681,
"grad_norm": 0.07178854942321777,
"learning_rate": 9.493921999650981e-05,
"loss": 0.0928,
"step": 9370
},
{
"epoch": 4.250028309364738,
"grad_norm": 0.08183001726865768,
"learning_rate": 9.475602371916006e-05,
"loss": 0.0969,
"step": 9380
},
{
"epoch": 4.254557807722795,
"grad_norm": 0.07914981991052628,
"learning_rate": 9.457284508789922e-05,
"loss": 0.0967,
"step": 9390
},
{
"epoch": 4.259087306080851,
"grad_norm": 0.07766249775886536,
"learning_rate": 9.438968471912718e-05,
"loss": 0.0973,
"step": 9400
},
{
"epoch": 4.263616804438908,
"grad_norm": 0.06642225384712219,
"learning_rate": 9.420654322918234e-05,
"loss": 0.0972,
"step": 9410
},
{
"epoch": 4.268146302796965,
"grad_norm": 0.10396700352430344,
"learning_rate": 9.402342123433968e-05,
"loss": 0.0992,
"step": 9420
},
{
"epoch": 4.272675801155022,
"grad_norm": 0.0772017240524292,
"learning_rate": 9.384031935080849e-05,
"loss": 0.0955,
"step": 9430
},
{
"epoch": 4.277205299513079,
"grad_norm": 0.08579739928245544,
"learning_rate": 9.365723819473034e-05,
"loss": 0.0999,
"step": 9440
},
{
"epoch": 4.2817347978711355,
"grad_norm": 0.07170093059539795,
"learning_rate": 9.347417838217719e-05,
"loss": 0.0978,
"step": 9450
},
{
"epoch": 4.286264296229192,
"grad_norm": 0.09926804155111313,
"learning_rate": 9.329114052914905e-05,
"loss": 0.0975,
"step": 9460
},
{
"epoch": 4.290793794587249,
"grad_norm": 0.0870131105184555,
"learning_rate": 9.310812525157211e-05,
"loss": 0.0976,
"step": 9470
},
{
"epoch": 4.295323292945306,
"grad_norm": 0.09447421133518219,
"learning_rate": 9.29251331652966e-05,
"loss": 0.0978,
"step": 9480
},
{
"epoch": 4.299852791303363,
"grad_norm": 0.06886494159698486,
"learning_rate": 9.274216488609465e-05,
"loss": 0.0956,
"step": 9490
},
{
"epoch": 4.30438228966142,
"grad_norm": 0.06958340108394623,
"learning_rate": 9.255922102965835e-05,
"loss": 0.0978,
"step": 9500
},
{
"epoch": 4.3089117880194765,
"grad_norm": 0.09395691007375717,
"learning_rate": 9.237630221159751e-05,
"loss": 0.0999,
"step": 9510
},
{
"epoch": 4.313441286377533,
"grad_norm": 0.08615806698799133,
"learning_rate": 9.219340904743781e-05,
"loss": 0.0971,
"step": 9520
},
{
"epoch": 4.31797078473559,
"grad_norm": 0.09322655200958252,
"learning_rate": 9.201054215261849e-05,
"loss": 0.1008,
"step": 9530
},
{
"epoch": 4.322500283093647,
"grad_norm": 0.08992312103509903,
"learning_rate": 9.182770214249046e-05,
"loss": 0.0992,
"step": 9540
},
{
"epoch": 4.327029781451705,
"grad_norm": 0.08701404929161072,
"learning_rate": 9.164488963231415e-05,
"loss": 0.0969,
"step": 9550
},
{
"epoch": 4.3315592798097615,
"grad_norm": 0.07870589941740036,
"learning_rate": 9.146210523725744e-05,
"loss": 0.0989,
"step": 9560
},
{
"epoch": 4.336088778167818,
"grad_norm": 0.061097387224435806,
"learning_rate": 9.127934957239367e-05,
"loss": 0.0986,
"step": 9570
},
{
"epoch": 4.340618276525875,
"grad_norm": 0.08281367272138596,
"learning_rate": 9.109662325269932e-05,
"loss": 0.0988,
"step": 9580
},
{
"epoch": 4.345147774883932,
"grad_norm": 0.09463726729154587,
"learning_rate": 9.091392689305233e-05,
"loss": 0.0977,
"step": 9590
},
{
"epoch": 4.349677273241989,
"grad_norm": 0.07657352089881897,
"learning_rate": 9.073126110822969e-05,
"loss": 0.0995,
"step": 9600
},
{
"epoch": 4.354206771600046,
"grad_norm": 0.08821120113134384,
"learning_rate": 9.054862651290559e-05,
"loss": 0.0972,
"step": 9610
},
{
"epoch": 4.3587362699581025,
"grad_norm": 0.09997398406267166,
"learning_rate": 9.036602372164922e-05,
"loss": 0.0987,
"step": 9620
},
{
"epoch": 4.363265768316159,
"grad_norm": 0.08112788945436478,
"learning_rate": 9.018345334892275e-05,
"loss": 0.0974,
"step": 9630
},
{
"epoch": 4.367795266674216,
"grad_norm": 0.07112699747085571,
"learning_rate": 9.000091600907928e-05,
"loss": 0.0977,
"step": 9640
},
{
"epoch": 4.372324765032273,
"grad_norm": 0.09066987037658691,
"learning_rate": 8.981841231636073e-05,
"loss": 0.0989,
"step": 9650
},
{
"epoch": 4.37685426339033,
"grad_norm": 0.08122070878744125,
"learning_rate": 8.96359428848958e-05,
"loss": 0.0997,
"step": 9660
},
{
"epoch": 4.381383761748387,
"grad_norm": 0.08035853505134583,
"learning_rate": 8.945350832869795e-05,
"loss": 0.0979,
"step": 9670
},
{
"epoch": 4.3859132601064434,
"grad_norm": 0.07366472482681274,
"learning_rate": 8.927110926166324e-05,
"loss": 0.0969,
"step": 9680
},
{
"epoch": 4.3904427584645,
"grad_norm": 0.0794186070561409,
"learning_rate": 8.908874629756827e-05,
"loss": 0.0983,
"step": 9690
},
{
"epoch": 4.394972256822557,
"grad_norm": 0.06437776982784271,
"learning_rate": 8.890642005006822e-05,
"loss": 0.0984,
"step": 9700
},
{
"epoch": 4.399501755180614,
"grad_norm": 0.07162316143512726,
"learning_rate": 8.872413113269468e-05,
"loss": 0.0975,
"step": 9710
},
{
"epoch": 4.404031253538671,
"grad_norm": 0.07623278349637985,
"learning_rate": 8.854188015885368e-05,
"loss": 0.0998,
"step": 9720
},
{
"epoch": 4.408560751896728,
"grad_norm": 0.07586734741926193,
"learning_rate": 8.835966774182349e-05,
"loss": 0.0973,
"step": 9730
},
{
"epoch": 4.413090250254784,
"grad_norm": 0.0751037672162056,
"learning_rate": 8.817749449475266e-05,
"loss": 0.099,
"step": 9740
},
{
"epoch": 4.417619748612841,
"grad_norm": 0.07702226936817169,
"learning_rate": 8.799536103065794e-05,
"loss": 0.098,
"step": 9750
},
{
"epoch": 4.422149246970898,
"grad_norm": 0.07942003011703491,
"learning_rate": 8.781326796242222e-05,
"loss": 0.0982,
"step": 9760
},
{
"epoch": 4.426678745328955,
"grad_norm": 0.07305794209241867,
"learning_rate": 8.763121590279249e-05,
"loss": 0.0964,
"step": 9770
},
{
"epoch": 4.431208243687012,
"grad_norm": 0.07927001267671585,
"learning_rate": 8.744920546437764e-05,
"loss": 0.0985,
"step": 9780
},
{
"epoch": 4.4357377420450685,
"grad_norm": 0.08005883544683456,
"learning_rate": 8.726723725964662e-05,
"loss": 0.0996,
"step": 9790
},
{
"epoch": 4.440267240403125,
"grad_norm": 0.07482803612947464,
"learning_rate": 8.708531190092619e-05,
"loss": 0.1007,
"step": 9800
},
{
"epoch": 4.444796738761182,
"grad_norm": 0.08192785084247589,
"learning_rate": 8.690343000039895e-05,
"loss": 0.1008,
"step": 9810
},
{
"epoch": 4.449326237119239,
"grad_norm": 0.07693403214216232,
"learning_rate": 8.67215921701013e-05,
"loss": 0.0982,
"step": 9820
},
{
"epoch": 4.453855735477296,
"grad_norm": 0.0875929445028305,
"learning_rate": 8.653979902192125e-05,
"loss": 0.1003,
"step": 9830
},
{
"epoch": 4.458385233835353,
"grad_norm": 0.07676168531179428,
"learning_rate": 8.635805116759656e-05,
"loss": 0.0964,
"step": 9840
},
{
"epoch": 4.4629147321934095,
"grad_norm": 0.0706658735871315,
"learning_rate": 8.617634921871252e-05,
"loss": 0.0996,
"step": 9850
},
{
"epoch": 4.467444230551466,
"grad_norm": 0.08421318978071213,
"learning_rate": 8.599469378669997e-05,
"loss": 0.1004,
"step": 9860
},
{
"epoch": 4.471973728909523,
"grad_norm": 0.06626369804143906,
"learning_rate": 8.581308548283313e-05,
"loss": 0.0961,
"step": 9870
},
{
"epoch": 4.47650322726758,
"grad_norm": 0.10955769568681717,
"learning_rate": 8.563152491822777e-05,
"loss": 0.0989,
"step": 9880
},
{
"epoch": 4.481032725625637,
"grad_norm": 0.07062443345785141,
"learning_rate": 8.545001270383896e-05,
"loss": 0.0996,
"step": 9890
},
{
"epoch": 4.485562223983694,
"grad_norm": 0.09103110432624817,
"learning_rate": 8.526854945045903e-05,
"loss": 0.0969,
"step": 9900
},
{
"epoch": 4.4900917223417505,
"grad_norm": 0.08335482329130173,
"learning_rate": 8.508713576871564e-05,
"loss": 0.0988,
"step": 9910
},
{
"epoch": 4.494621220699807,
"grad_norm": 0.08251272886991501,
"learning_rate": 8.490577226906952e-05,
"loss": 0.1002,
"step": 9920
},
{
"epoch": 4.499150719057864,
"grad_norm": 0.0790376290678978,
"learning_rate": 8.472445956181266e-05,
"loss": 0.0959,
"step": 9930
},
{
"epoch": 4.503680217415921,
"grad_norm": 0.07596680521965027,
"learning_rate": 8.454319825706607e-05,
"loss": 0.0957,
"step": 9940
},
{
"epoch": 4.508209715773978,
"grad_norm": 0.07809595763683319,
"learning_rate": 8.436198896477777e-05,
"loss": 0.0966,
"step": 9950
},
{
"epoch": 4.512739214132035,
"grad_norm": 0.0959998071193695,
"learning_rate": 8.418083229472081e-05,
"loss": 0.0983,
"step": 9960
},
{
"epoch": 4.517268712490091,
"grad_norm": 0.0705457404255867,
"learning_rate": 8.399972885649115e-05,
"loss": 0.0985,
"step": 9970
},
{
"epoch": 4.521798210848148,
"grad_norm": 0.07132048159837723,
"learning_rate": 8.381867925950558e-05,
"loss": 0.0966,
"step": 9980
},
{
"epoch": 4.526327709206205,
"grad_norm": 0.08615089952945709,
"learning_rate": 8.363768411299978e-05,
"loss": 0.097,
"step": 9990
},
{
"epoch": 4.530857207564262,
"grad_norm": 0.07540059089660645,
"learning_rate": 8.345674402602617e-05,
"loss": 0.1016,
"step": 10000
},
{
"epoch": 4.535386705922319,
"grad_norm": 0.0691477432847023,
"learning_rate": 8.32758596074519e-05,
"loss": 0.1008,
"step": 10010
},
{
"epoch": 4.539916204280376,
"grad_norm": 0.07377701252698898,
"learning_rate": 8.309503146595674e-05,
"loss": 0.0995,
"step": 10020
},
{
"epoch": 4.544445702638432,
"grad_norm": 0.06582989543676376,
"learning_rate": 8.291426021003117e-05,
"loss": 0.0974,
"step": 10030
},
{
"epoch": 4.548975200996489,
"grad_norm": 0.07520575076341629,
"learning_rate": 8.273354644797421e-05,
"loss": 0.0995,
"step": 10040
},
{
"epoch": 4.553504699354546,
"grad_norm": 0.0851583182811737,
"learning_rate": 8.255289078789141e-05,
"loss": 0.097,
"step": 10050
},
{
"epoch": 4.558034197712603,
"grad_norm": 0.08124125748872757,
"learning_rate": 8.237229383769283e-05,
"loss": 0.1001,
"step": 10060
},
{
"epoch": 4.56256369607066,
"grad_norm": 0.08267924189567566,
"learning_rate": 8.219175620509092e-05,
"loss": 0.0969,
"step": 10070
},
{
"epoch": 4.5670931944287165,
"grad_norm": 0.07254312187433243,
"learning_rate": 8.201127849759861e-05,
"loss": 0.0993,
"step": 10080
},
{
"epoch": 4.571622692786774,
"grad_norm": 0.08983401954174042,
"learning_rate": 8.183086132252706e-05,
"loss": 0.1003,
"step": 10090
},
{
"epoch": 4.576152191144831,
"grad_norm": 0.06914500892162323,
"learning_rate": 8.165050528698385e-05,
"loss": 0.1002,
"step": 10100
},
{
"epoch": 4.580681689502888,
"grad_norm": 0.06419195234775543,
"learning_rate": 8.147021099787075e-05,
"loss": 0.099,
"step": 10110
},
{
"epoch": 4.585211187860945,
"grad_norm": 0.0637657642364502,
"learning_rate": 8.12899790618818e-05,
"loss": 0.0986,
"step": 10120
},
{
"epoch": 4.5897406862190016,
"grad_norm": 0.06946605443954468,
"learning_rate": 8.11098100855012e-05,
"loss": 0.1003,
"step": 10130
},
{
"epoch": 4.594270184577058,
"grad_norm": 0.06739254295825958,
"learning_rate": 8.092970467500129e-05,
"loss": 0.1002,
"step": 10140
},
{
"epoch": 4.598799682935115,
"grad_norm": 0.058849554508924484,
"learning_rate": 8.074966343644056e-05,
"loss": 0.0991,
"step": 10150
},
{
"epoch": 4.603329181293172,
"grad_norm": 0.07838159799575806,
"learning_rate": 8.056968697566141e-05,
"loss": 0.0986,
"step": 10160
},
{
"epoch": 4.607858679651229,
"grad_norm": 0.06857123970985413,
"learning_rate": 8.038977589828841e-05,
"loss": 0.0995,
"step": 10170
},
{
"epoch": 4.612388178009286,
"grad_norm": 0.06318482011556625,
"learning_rate": 8.020993080972607e-05,
"loss": 0.0993,
"step": 10180
},
{
"epoch": 4.6169176763673425,
"grad_norm": 0.06283606588840485,
"learning_rate": 8.003015231515683e-05,
"loss": 0.0986,
"step": 10190
},
{
"epoch": 4.621447174725399,
"grad_norm": 0.07274708896875381,
"learning_rate": 7.985044101953905e-05,
"loss": 0.0967,
"step": 10200
},
{
"epoch": 4.625976673083456,
"grad_norm": 0.0730716809630394,
"learning_rate": 7.967079752760498e-05,
"loss": 0.0998,
"step": 10210
},
{
"epoch": 4.630506171441513,
"grad_norm": 0.08666019141674042,
"learning_rate": 7.949122244385869e-05,
"loss": 0.0997,
"step": 10220
},
{
"epoch": 4.63503566979957,
"grad_norm": 0.07280432432889938,
"learning_rate": 7.931171637257407e-05,
"loss": 0.098,
"step": 10230
},
{
"epoch": 4.639565168157627,
"grad_norm": 0.07623490691184998,
"learning_rate": 7.913227991779275e-05,
"loss": 0.0972,
"step": 10240
},
{
"epoch": 4.6440946665156835,
"grad_norm": 0.08786217123270035,
"learning_rate": 7.895291368332213e-05,
"loss": 0.0984,
"step": 10250
},
{
"epoch": 4.64862416487374,
"grad_norm": 0.06460744142532349,
"learning_rate": 7.877361827273333e-05,
"loss": 0.1003,
"step": 10260
},
{
"epoch": 4.653153663231797,
"grad_norm": 0.0875258669257164,
"learning_rate": 7.859439428935907e-05,
"loss": 0.0973,
"step": 10270
},
{
"epoch": 4.657683161589854,
"grad_norm": 0.0640462338924408,
"learning_rate": 7.841524233629182e-05,
"loss": 0.097,
"step": 10280
},
{
"epoch": 4.662212659947911,
"grad_norm": 0.08805970847606659,
"learning_rate": 7.823616301638158e-05,
"loss": 0.0977,
"step": 10290
},
{
"epoch": 4.666742158305968,
"grad_norm": 0.08403537422418594,
"learning_rate": 7.805715693223403e-05,
"loss": 0.0974,
"step": 10300
},
{
"epoch": 4.671271656664024,
"grad_norm": 0.08450974524021149,
"learning_rate": 7.787822468620831e-05,
"loss": 0.0996,
"step": 10310
},
{
"epoch": 4.675801155022081,
"grad_norm": 0.06727894395589828,
"learning_rate": 7.76993668804151e-05,
"loss": 0.0968,
"step": 10320
},
{
"epoch": 4.680330653380138,
"grad_norm": 0.07860536128282547,
"learning_rate": 7.752058411671469e-05,
"loss": 0.098,
"step": 10330
},
{
"epoch": 4.684860151738195,
"grad_norm": 0.0783989354968071,
"learning_rate": 7.734187699671475e-05,
"loss": 0.1001,
"step": 10340
},
{
"epoch": 4.689389650096252,
"grad_norm": 0.09318368136882782,
"learning_rate": 7.716324612176848e-05,
"loss": 0.102,
"step": 10350
},
{
"epoch": 4.693919148454309,
"grad_norm": 0.06499195098876953,
"learning_rate": 7.698469209297243e-05,
"loss": 0.0972,
"step": 10360
},
{
"epoch": 4.698448646812365,
"grad_norm": 0.08642645180225372,
"learning_rate": 7.680621551116464e-05,
"loss": 0.0976,
"step": 10370
},
{
"epoch": 4.702978145170422,
"grad_norm": 0.08057048916816711,
"learning_rate": 7.662781697692251e-05,
"loss": 0.1001,
"step": 10380
},
{
"epoch": 4.707507643528479,
"grad_norm": 0.07037744671106339,
"learning_rate": 7.644949709056081e-05,
"loss": 0.0954,
"step": 10390
},
{
"epoch": 4.712037141886536,
"grad_norm": 0.07643935829401016,
"learning_rate": 7.627125645212962e-05,
"loss": 0.0988,
"step": 10400
},
{
"epoch": 4.716566640244593,
"grad_norm": 0.06035691127181053,
"learning_rate": 7.609309566141242e-05,
"loss": 0.0951,
"step": 10410
},
{
"epoch": 4.7210961386026495,
"grad_norm": 0.06654711812734604,
"learning_rate": 7.591501531792394e-05,
"loss": 0.0978,
"step": 10420
},
{
"epoch": 4.725625636960706,
"grad_norm": 0.0829191505908966,
"learning_rate": 7.573701602090826e-05,
"loss": 0.0974,
"step": 10430
},
{
"epoch": 4.730155135318763,
"grad_norm": 0.06532509624958038,
"learning_rate": 7.555909836933668e-05,
"loss": 0.1,
"step": 10440
},
{
"epoch": 4.73468463367682,
"grad_norm": 0.07426194101572037,
"learning_rate": 7.538126296190578e-05,
"loss": 0.0978,
"step": 10450
},
{
"epoch": 4.739214132034877,
"grad_norm": 0.07493621110916138,
"learning_rate": 7.520351039703539e-05,
"loss": 0.0982,
"step": 10460
},
{
"epoch": 4.743743630392934,
"grad_norm": 0.07495691627264023,
"learning_rate": 7.50258412728666e-05,
"loss": 0.0988,
"step": 10470
},
{
"epoch": 4.7482731287509905,
"grad_norm": 0.08136378973722458,
"learning_rate": 7.484825618725968e-05,
"loss": 0.097,
"step": 10480
},
{
"epoch": 4.752802627109047,
"grad_norm": 0.06776054948568344,
"learning_rate": 7.467075573779215e-05,
"loss": 0.099,
"step": 10490
},
{
"epoch": 4.757332125467104,
"grad_norm": 0.06532083451747894,
"learning_rate": 7.449334052175665e-05,
"loss": 0.1008,
"step": 10500
},
{
"epoch": 4.761861623825161,
"grad_norm": 0.08907100558280945,
"learning_rate": 7.431601113615909e-05,
"loss": 0.0995,
"step": 10510
},
{
"epoch": 4.766391122183219,
"grad_norm": 0.07240644842386246,
"learning_rate": 7.413876817771655e-05,
"loss": 0.0998,
"step": 10520
},
{
"epoch": 4.7709206205412755,
"grad_norm": 0.07485652714967728,
"learning_rate": 7.396161224285521e-05,
"loss": 0.0964,
"step": 10530
},
{
"epoch": 4.775450118899332,
"grad_norm": 0.07228762656450272,
"learning_rate": 7.378454392770851e-05,
"loss": 0.0999,
"step": 10540
},
{
"epoch": 4.779979617257389,
"grad_norm": 0.08463383466005325,
"learning_rate": 7.360756382811498e-05,
"loss": 0.0962,
"step": 10550
},
{
"epoch": 4.784509115615446,
"grad_norm": 0.08021671324968338,
"learning_rate": 7.343067253961633e-05,
"loss": 0.0982,
"step": 10560
},
{
"epoch": 4.789038613973503,
"grad_norm": 0.0640299916267395,
"learning_rate": 7.325387065745542e-05,
"loss": 0.0987,
"step": 10570
},
{
"epoch": 4.79356811233156,
"grad_norm": 0.08146077394485474,
"learning_rate": 7.307715877657428e-05,
"loss": 0.1004,
"step": 10580
},
{
"epoch": 4.7980976106896165,
"grad_norm": 0.0729324147105217,
"learning_rate": 7.290053749161197e-05,
"loss": 0.098,
"step": 10590
},
{
"epoch": 4.802627109047673,
"grad_norm": 0.08027558028697968,
"learning_rate": 7.272400739690281e-05,
"loss": 0.1003,
"step": 10600
},
{
"epoch": 4.80715660740573,
"grad_norm": 0.07233118265867233,
"learning_rate": 7.254756908647424e-05,
"loss": 0.0969,
"step": 10610
},
{
"epoch": 4.811686105763787,
"grad_norm": 0.08703139424324036,
"learning_rate": 7.237122315404483e-05,
"loss": 0.0978,
"step": 10620
},
{
"epoch": 4.816215604121844,
"grad_norm": 0.09773527085781097,
"learning_rate": 7.219497019302231e-05,
"loss": 0.1006,
"step": 10630
},
{
"epoch": 4.820745102479901,
"grad_norm": 0.07498451322317123,
"learning_rate": 7.201881079650153e-05,
"loss": 0.0953,
"step": 10640
},
{
"epoch": 4.8252746008379575,
"grad_norm": 0.08071410655975342,
"learning_rate": 7.184274555726251e-05,
"loss": 0.0997,
"step": 10650
},
{
"epoch": 4.829804099196014,
"grad_norm": 0.09239617735147476,
"learning_rate": 7.166677506776847e-05,
"loss": 0.0966,
"step": 10660
},
{
"epoch": 4.834333597554071,
"grad_norm": 0.06160885840654373,
"learning_rate": 7.149089992016369e-05,
"loss": 0.0996,
"step": 10670
},
{
"epoch": 4.838863095912128,
"grad_norm": 0.06242508441209793,
"learning_rate": 7.131512070627174e-05,
"loss": 0.0971,
"step": 10680
},
{
"epoch": 4.843392594270185,
"grad_norm": 0.07087717205286026,
"learning_rate": 7.113943801759328e-05,
"loss": 0.0981,
"step": 10690
},
{
"epoch": 4.847922092628242,
"grad_norm": 0.09145446121692657,
"learning_rate": 7.096385244530421e-05,
"loss": 0.1018,
"step": 10700
},
{
"epoch": 4.852451590986298,
"grad_norm": 0.06915028393268585,
"learning_rate": 7.078836458025367e-05,
"loss": 0.0975,
"step": 10710
},
{
"epoch": 4.856981089344355,
"grad_norm": 0.0731835886836052,
"learning_rate": 7.06129750129619e-05,
"loss": 0.0983,
"step": 10720
},
{
"epoch": 4.861510587702412,
"grad_norm": 0.07754811644554138,
"learning_rate": 7.043768433361848e-05,
"loss": 0.0987,
"step": 10730
},
{
"epoch": 4.866040086060469,
"grad_norm": 0.07234437018632889,
"learning_rate": 7.026249313208013e-05,
"loss": 0.0999,
"step": 10740
},
{
"epoch": 4.870569584418526,
"grad_norm": 0.06629019230604172,
"learning_rate": 7.008740199786891e-05,
"loss": 0.0982,
"step": 10750
},
{
"epoch": 4.8750990827765825,
"grad_norm": 0.07004278153181076,
"learning_rate": 6.991241152017009e-05,
"loss": 0.0984,
"step": 10760
},
{
"epoch": 4.879628581134639,
"grad_norm": 0.07674950361251831,
"learning_rate": 6.973752228783028e-05,
"loss": 0.0967,
"step": 10770
},
{
"epoch": 4.884158079492696,
"grad_norm": 0.08505762368440628,
"learning_rate": 6.956273488935537e-05,
"loss": 0.1013,
"step": 10780
},
{
"epoch": 4.888687577850753,
"grad_norm": 0.07949452847242355,
"learning_rate": 6.938804991290856e-05,
"loss": 0.0985,
"step": 10790
},
{
"epoch": 4.89321707620881,
"grad_norm": 0.08295728266239166,
"learning_rate": 6.921346794630843e-05,
"loss": 0.0989,
"step": 10800
},
{
"epoch": 4.897746574566867,
"grad_norm": 0.06370176374912262,
"learning_rate": 6.903898957702694e-05,
"loss": 0.0973,
"step": 10810
},
{
"epoch": 4.9022760729249235,
"grad_norm": 0.07928381115198135,
"learning_rate": 6.886461539218739e-05,
"loss": 0.0997,
"step": 10820
},
{
"epoch": 4.90680557128298,
"grad_norm": 0.07781045138835907,
"learning_rate": 6.870776818850459e-05,
"loss": 0.1002,
"step": 10830
},
{
"epoch": 4.911335069641037,
"grad_norm": 0.06968411058187485,
"learning_rate": 6.853359357037234e-05,
"loss": 0.0967,
"step": 10840
},
{
"epoch": 4.915864567999094,
"grad_norm": 0.08793435990810394,
"learning_rate": 6.835952483735004e-05,
"loss": 0.0985,
"step": 10850
},
{
"epoch": 4.920394066357151,
"grad_norm": 0.07273527979850769,
"learning_rate": 6.818556257518263e-05,
"loss": 0.1007,
"step": 10860
},
{
"epoch": 4.924923564715208,
"grad_norm": 0.0791454091668129,
"learning_rate": 6.80117073692567e-05,
"loss": 0.0966,
"step": 10870
},
{
"epoch": 4.9294530630732645,
"grad_norm": 0.07608039677143097,
"learning_rate": 6.783795980459867e-05,
"loss": 0.1012,
"step": 10880
},
{
"epoch": 4.933982561431321,
"grad_norm": 0.07776329666376114,
"learning_rate": 6.766432046587266e-05,
"loss": 0.1003,
"step": 10890
},
{
"epoch": 4.938512059789378,
"grad_norm": 0.0679519921541214,
"learning_rate": 6.749078993737871e-05,
"loss": 0.0991,
"step": 10900
},
{
"epoch": 4.943041558147435,
"grad_norm": 0.07100383937358856,
"learning_rate": 6.731736880305054e-05,
"loss": 0.0988,
"step": 10910
},
{
"epoch": 4.947571056505492,
"grad_norm": 0.0812440738081932,
"learning_rate": 6.714405764645391e-05,
"loss": 0.0998,
"step": 10920
},
{
"epoch": 4.952100554863549,
"grad_norm": 0.07612130790948868,
"learning_rate": 6.697085705078447e-05,
"loss": 0.1007,
"step": 10930
},
{
"epoch": 4.956630053221605,
"grad_norm": 0.112273670732975,
"learning_rate": 6.679776759886581e-05,
"loss": 0.0987,
"step": 10940
},
{
"epoch": 4.961159551579662,
"grad_norm": 0.07123211026191711,
"learning_rate": 6.662478987314751e-05,
"loss": 0.0987,
"step": 10950
},
{
"epoch": 4.965689049937719,
"grad_norm": 0.0752432569861412,
"learning_rate": 6.645192445570321e-05,
"loss": 0.0986,
"step": 10960
},
{
"epoch": 4.970218548295776,
"grad_norm": 0.08591726422309875,
"learning_rate": 6.627917192822862e-05,
"loss": 0.0987,
"step": 10970
},
{
"epoch": 4.974748046653833,
"grad_norm": 0.0789419561624527,
"learning_rate": 6.610653287203959e-05,
"loss": 0.1001,
"step": 10980
},
{
"epoch": 4.97927754501189,
"grad_norm": 0.07303869724273682,
"learning_rate": 6.593400786807011e-05,
"loss": 0.1005,
"step": 10990
},
{
"epoch": 4.983807043369946,
"grad_norm": 0.062059495598077774,
"learning_rate": 6.57615974968704e-05,
"loss": 0.0993,
"step": 11000
},
{
"epoch": 4.988336541728003,
"grad_norm": 0.07526618242263794,
"learning_rate": 6.558930233860497e-05,
"loss": 0.0994,
"step": 11010
},
{
"epoch": 4.99286604008606,
"grad_norm": 0.05961596965789795,
"learning_rate": 6.541712297305054e-05,
"loss": 0.0994,
"step": 11020
},
{
"epoch": 4.997395538444117,
"grad_norm": 0.08421042561531067,
"learning_rate": 6.524505997959425e-05,
"loss": 0.0992,
"step": 11030
},
{
"epoch": 4.999660287623145,
"eval_loss": 0.1612485647201538,
"eval_runtime": 617.4712,
"eval_samples_per_second": 12.746,
"eval_steps_per_second": 1.594,
"step": 11035
}
],
"logging_steps": 10,
"max_steps": 17656,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.476002265936691e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}