{ "best_metric": 0.15827356278896332, "best_model_checkpoint": "finetuned_models/selection/phi_mini/checkpoint-8828", "epoch": 4.999660287623145, "eval_steps": 500, "global_step": 11035, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004529498358056845, "grad_norm": 2.3706889152526855, "learning_rate": 3.7735849056603773e-06, "loss": 3.1411, "step": 10 }, { "epoch": 0.00905899671611369, "grad_norm": 1.1344068050384521, "learning_rate": 7.547169811320755e-06, "loss": 2.7341, "step": 20 }, { "epoch": 0.013588495074170535, "grad_norm": 0.5109199285507202, "learning_rate": 1.1320754716981132e-05, "loss": 2.3962, "step": 30 }, { "epoch": 0.01811799343222738, "grad_norm": 0.2340932935476303, "learning_rate": 1.509433962264151e-05, "loss": 2.1062, "step": 40 }, { "epoch": 0.022647491790284226, "grad_norm": 0.16189691424369812, "learning_rate": 1.8867924528301888e-05, "loss": 1.9221, "step": 50 }, { "epoch": 0.02717699014834107, "grad_norm": 0.14064399898052216, "learning_rate": 2.2641509433962265e-05, "loss": 1.7971, "step": 60 }, { "epoch": 0.031706488506397915, "grad_norm": 0.1196412444114685, "learning_rate": 2.641509433962264e-05, "loss": 1.7255, "step": 70 }, { "epoch": 0.03623598686445476, "grad_norm": 0.15146440267562866, "learning_rate": 3.018867924528302e-05, "loss": 1.6767, "step": 80 }, { "epoch": 0.040765485222511604, "grad_norm": 0.13450802862644196, "learning_rate": 3.39622641509434e-05, "loss": 1.5943, "step": 90 }, { "epoch": 0.04529498358056845, "grad_norm": 0.15073299407958984, "learning_rate": 3.7735849056603776e-05, "loss": 1.5428, "step": 100 }, { "epoch": 0.0498244819386253, "grad_norm": 0.13764727115631104, "learning_rate": 4.150943396226415e-05, "loss": 1.4956, "step": 110 }, { "epoch": 0.05435398029668214, "grad_norm": 0.23157894611358643, "learning_rate": 4.528301886792453e-05, "loss": 1.4492, "step": 120 }, { "epoch": 0.05888347865473899, "grad_norm": 0.1756928712129593, "learning_rate": 4.9056603773584906e-05, "loss": 1.4258, "step": 130 }, { "epoch": 0.06341297701279583, "grad_norm": 0.19877882301807404, "learning_rate": 5.283018867924528e-05, "loss": 1.3863, "step": 140 }, { "epoch": 0.06794247537085268, "grad_norm": 0.19395482540130615, "learning_rate": 5.660377358490566e-05, "loss": 1.3469, "step": 150 }, { "epoch": 0.07247197372890953, "grad_norm": 0.2622753083705902, "learning_rate": 6.037735849056604e-05, "loss": 1.3177, "step": 160 }, { "epoch": 0.07700147208696637, "grad_norm": 0.47893616557121277, "learning_rate": 6.415094339622641e-05, "loss": 1.2414, "step": 170 }, { "epoch": 0.08153097044502321, "grad_norm": 0.2570054233074188, "learning_rate": 6.79245283018868e-05, "loss": 1.2046, "step": 180 }, { "epoch": 0.08606046880308006, "grad_norm": 0.31944283843040466, "learning_rate": 7.169811320754717e-05, "loss": 1.2254, "step": 190 }, { "epoch": 0.0905899671611369, "grad_norm": 0.35244274139404297, "learning_rate": 7.547169811320755e-05, "loss": 1.1671, "step": 200 }, { "epoch": 0.09511946551919374, "grad_norm": 0.23283237218856812, "learning_rate": 7.924528301886794e-05, "loss": 1.2043, "step": 210 }, { "epoch": 0.0996489638772506, "grad_norm": 0.38952431082725525, "learning_rate": 8.30188679245283e-05, "loss": 1.202, "step": 220 }, { "epoch": 0.10417846223530744, "grad_norm": 0.28450387716293335, "learning_rate": 8.679245283018869e-05, "loss": 1.1323, "step": 230 }, { "epoch": 0.10870796059336428, "grad_norm": 0.30833789706230164, "learning_rate": 9.056603773584906e-05, "loss": 1.1101, "step": 240 }, { "epoch": 0.11323745895142114, "grad_norm": 0.31221917271614075, "learning_rate": 9.433962264150944e-05, "loss": 1.0949, "step": 250 }, { "epoch": 0.11776695730947798, "grad_norm": 0.3738393187522888, "learning_rate": 9.811320754716981e-05, "loss": 1.1372, "step": 260 }, { "epoch": 0.12229645566753482, "grad_norm": 0.2999807596206665, "learning_rate": 0.0001018867924528302, "loss": 1.1028, "step": 270 }, { "epoch": 0.12682595402559166, "grad_norm": 0.4104474186897278, "learning_rate": 0.00010566037735849057, "loss": 1.0796, "step": 280 }, { "epoch": 0.1313554523836485, "grad_norm": 0.2639298141002655, "learning_rate": 0.00010943396226415095, "loss": 1.0626, "step": 290 }, { "epoch": 0.13588495074170537, "grad_norm": 0.2657984495162964, "learning_rate": 0.00011320754716981132, "loss": 1.0418, "step": 300 }, { "epoch": 0.1404144490997622, "grad_norm": 0.2493669092655182, "learning_rate": 0.0001169811320754717, "loss": 1.0157, "step": 310 }, { "epoch": 0.14494394745781905, "grad_norm": 0.21642285585403442, "learning_rate": 0.00012075471698113207, "loss": 0.9852, "step": 320 }, { "epoch": 0.1494734458158759, "grad_norm": 0.2093484252691269, "learning_rate": 0.00012452830188679244, "loss": 0.9938, "step": 330 }, { "epoch": 0.15400294417393273, "grad_norm": 0.2212437391281128, "learning_rate": 0.00012830188679245283, "loss": 1.0289, "step": 340 }, { "epoch": 0.1585324425319896, "grad_norm": 0.22111104428768158, "learning_rate": 0.0001320754716981132, "loss": 0.9656, "step": 350 }, { "epoch": 0.16306194089004641, "grad_norm": 0.31839072704315186, "learning_rate": 0.0001358490566037736, "loss": 0.9723, "step": 360 }, { "epoch": 0.16759143924810327, "grad_norm": 0.26599910855293274, "learning_rate": 0.00013962264150943395, "loss": 0.9503, "step": 370 }, { "epoch": 0.17212093760616012, "grad_norm": 0.273809552192688, "learning_rate": 0.00014339622641509434, "loss": 0.9786, "step": 380 }, { "epoch": 0.17665043596421695, "grad_norm": 0.1905912607908249, "learning_rate": 0.00014716981132075472, "loss": 0.9271, "step": 390 }, { "epoch": 0.1811799343222738, "grad_norm": 0.21957655251026154, "learning_rate": 0.0001509433962264151, "loss": 0.911, "step": 400 }, { "epoch": 0.18570943268033066, "grad_norm": 0.21992002427577972, "learning_rate": 0.0001547169811320755, "loss": 0.9434, "step": 410 }, { "epoch": 0.1902389310383875, "grad_norm": 0.2033444494009018, "learning_rate": 0.00015849056603773587, "loss": 0.9189, "step": 420 }, { "epoch": 0.19476842939644434, "grad_norm": 0.2479432225227356, "learning_rate": 0.00016226415094339625, "loss": 0.9137, "step": 430 }, { "epoch": 0.1992979277545012, "grad_norm": 0.26578351855278015, "learning_rate": 0.0001660377358490566, "loss": 0.9172, "step": 440 }, { "epoch": 0.20382742611255802, "grad_norm": 0.17441338300704956, "learning_rate": 0.000169811320754717, "loss": 0.8783, "step": 450 }, { "epoch": 0.20835692447061488, "grad_norm": 0.18898604810237885, "learning_rate": 0.00017358490566037738, "loss": 0.874, "step": 460 }, { "epoch": 0.21288642282867173, "grad_norm": 0.18335719406604767, "learning_rate": 0.00017735849056603776, "loss": 0.8604, "step": 470 }, { "epoch": 0.21741592118672856, "grad_norm": 0.20873741805553436, "learning_rate": 0.00018113207547169812, "loss": 0.8368, "step": 480 }, { "epoch": 0.22194541954478542, "grad_norm": 0.2140520066022873, "learning_rate": 0.0001849056603773585, "loss": 0.8729, "step": 490 }, { "epoch": 0.22647491790284227, "grad_norm": 0.20203453302383423, "learning_rate": 0.00018867924528301889, "loss": 0.836, "step": 500 }, { "epoch": 0.2310044162608991, "grad_norm": 0.185277059674263, "learning_rate": 0.00019245283018867927, "loss": 0.8224, "step": 510 }, { "epoch": 0.23553391461895595, "grad_norm": 0.207021564245224, "learning_rate": 0.00019622641509433963, "loss": 0.8482, "step": 520 }, { "epoch": 0.2400634129770128, "grad_norm": 0.19016426801681519, "learning_rate": 0.0002, "loss": 0.8296, "step": 530 }, { "epoch": 0.24459291133506963, "grad_norm": 0.20634956657886505, "learning_rate": 0.00019999983174896345, "loss": 0.8294, "step": 540 }, { "epoch": 0.2491224096931265, "grad_norm": 0.16894035041332245, "learning_rate": 0.00019999932699641984, "loss": 0.7966, "step": 550 }, { "epoch": 0.2536519080511833, "grad_norm": 0.21543951332569122, "learning_rate": 0.00019999848574406778, "loss": 0.819, "step": 560 }, { "epoch": 0.2581814064092402, "grad_norm": 0.18474166095256805, "learning_rate": 0.000199997307994738, "loss": 0.8073, "step": 570 }, { "epoch": 0.262710904767297, "grad_norm": 0.1627601534128189, "learning_rate": 0.0001999957937523937, "loss": 0.798, "step": 580 }, { "epoch": 0.26724040312535385, "grad_norm": 0.16344527900218964, "learning_rate": 0.0001999939430221304, "loss": 0.7846, "step": 590 }, { "epoch": 0.27176990148341074, "grad_norm": 0.1784357726573944, "learning_rate": 0.00019999175581017573, "loss": 0.7892, "step": 600 }, { "epoch": 0.27629939984146756, "grad_norm": 0.1735469251871109, "learning_rate": 0.00019998923212388977, "loss": 0.7624, "step": 610 }, { "epoch": 0.2808288981995244, "grad_norm": 0.20232649147510529, "learning_rate": 0.00019998637197176478, "loss": 0.7754, "step": 620 }, { "epoch": 0.2853583965575813, "grad_norm": 0.21980105340480804, "learning_rate": 0.00019998317536342524, "loss": 0.7896, "step": 630 }, { "epoch": 0.2898878949156381, "grad_norm": 0.15072612464427948, "learning_rate": 0.00019997964230962774, "loss": 0.7451, "step": 640 }, { "epoch": 0.2944173932736949, "grad_norm": 0.17559681832790375, "learning_rate": 0.00019997577282226115, "loss": 0.719, "step": 650 }, { "epoch": 0.2989468916317518, "grad_norm": 0.17159104347229004, "learning_rate": 0.00019997156691434632, "loss": 0.7356, "step": 660 }, { "epoch": 0.30347638998980864, "grad_norm": 0.20724473893642426, "learning_rate": 0.00019996702460003623, "loss": 0.7257, "step": 670 }, { "epoch": 0.30800588834786546, "grad_norm": 0.15702813863754272, "learning_rate": 0.00019996214589461592, "loss": 0.7104, "step": 680 }, { "epoch": 0.31253538670592235, "grad_norm": 0.185310959815979, "learning_rate": 0.00019995693081450227, "loss": 0.7192, "step": 690 }, { "epoch": 0.3170648850639792, "grad_norm": 0.17659538984298706, "learning_rate": 0.00019995137937724413, "loss": 0.7084, "step": 700 }, { "epoch": 0.321594383422036, "grad_norm": 0.16541071236133575, "learning_rate": 0.00019994549160152225, "loss": 0.7179, "step": 710 }, { "epoch": 0.32612388178009283, "grad_norm": 0.16881656646728516, "learning_rate": 0.00019993926750714905, "loss": 0.7039, "step": 720 }, { "epoch": 0.3306533801381497, "grad_norm": 0.18213717639446259, "learning_rate": 0.0001999327071150688, "loss": 0.712, "step": 730 }, { "epoch": 0.33518287849620654, "grad_norm": 0.16946811974048615, "learning_rate": 0.00019992581044735736, "loss": 0.7041, "step": 740 }, { "epoch": 0.33971237685426336, "grad_norm": 0.20027601718902588, "learning_rate": 0.00019991857752722208, "loss": 0.6937, "step": 750 }, { "epoch": 0.34424187521232025, "grad_norm": 0.17900145053863525, "learning_rate": 0.000199911008379002, "loss": 0.689, "step": 760 }, { "epoch": 0.3487713735703771, "grad_norm": 0.1626042276620865, "learning_rate": 0.00019990310302816738, "loss": 0.6923, "step": 770 }, { "epoch": 0.3533008719284339, "grad_norm": 0.1776456981897354, "learning_rate": 0.00019989486150131987, "loss": 0.6725, "step": 780 }, { "epoch": 0.3578303702864908, "grad_norm": 0.16232900321483612, "learning_rate": 0.00019988628382619242, "loss": 0.6621, "step": 790 }, { "epoch": 0.3623598686445476, "grad_norm": 0.16653478145599365, "learning_rate": 0.00019987737003164912, "loss": 0.6825, "step": 800 }, { "epoch": 0.36688936700260444, "grad_norm": 0.16946111619472504, "learning_rate": 0.00019986812014768503, "loss": 0.6634, "step": 810 }, { "epoch": 0.3714188653606613, "grad_norm": 0.16169489920139313, "learning_rate": 0.00019985853420542617, "loss": 0.6592, "step": 820 }, { "epoch": 0.37594836371871815, "grad_norm": 0.1830553561449051, "learning_rate": 0.0001998486122371295, "loss": 0.6661, "step": 830 }, { "epoch": 0.380477862076775, "grad_norm": 0.18185435235500336, "learning_rate": 0.00019983835427618262, "loss": 0.6331, "step": 840 }, { "epoch": 0.38500736043483186, "grad_norm": 0.17038173973560333, "learning_rate": 0.0001998277603571038, "loss": 0.6274, "step": 850 }, { "epoch": 0.3895368587928887, "grad_norm": 0.15142400562763214, "learning_rate": 0.00019981683051554174, "loss": 0.6282, "step": 860 }, { "epoch": 0.3940663571509455, "grad_norm": 0.18170781433582306, "learning_rate": 0.00019980556478827564, "loss": 0.605, "step": 870 }, { "epoch": 0.3985958555090024, "grad_norm": 0.1576147973537445, "learning_rate": 0.0001997939632132149, "loss": 0.6393, "step": 880 }, { "epoch": 0.4031253538670592, "grad_norm": 0.17267905175685883, "learning_rate": 0.00019978202582939902, "loss": 0.6274, "step": 890 }, { "epoch": 0.40765485222511605, "grad_norm": 0.19358091056346893, "learning_rate": 0.00019976975267699758, "loss": 0.5976, "step": 900 }, { "epoch": 0.41218435058317293, "grad_norm": 0.20368127524852753, "learning_rate": 0.00019975714379730998, "loss": 0.637, "step": 910 }, { "epoch": 0.41671384894122976, "grad_norm": 0.17673739790916443, "learning_rate": 0.00019974419923276537, "loss": 0.6014, "step": 920 }, { "epoch": 0.4212433472992866, "grad_norm": 0.1759296953678131, "learning_rate": 0.0001997309190269225, "loss": 0.5822, "step": 930 }, { "epoch": 0.42577284565734347, "grad_norm": 0.15785963833332062, "learning_rate": 0.00019971730322446949, "loss": 0.5856, "step": 940 }, { "epoch": 0.4303023440154003, "grad_norm": 0.16193810105323792, "learning_rate": 0.00019970335187122383, "loss": 0.5854, "step": 950 }, { "epoch": 0.4348318423734571, "grad_norm": 0.1555752456188202, "learning_rate": 0.0001996890650141321, "loss": 0.5852, "step": 960 }, { "epoch": 0.439361340731514, "grad_norm": 0.17118428647518158, "learning_rate": 0.00019967444270126988, "loss": 0.5816, "step": 970 }, { "epoch": 0.44389083908957083, "grad_norm": 0.15966954827308655, "learning_rate": 0.00019965948498184153, "loss": 0.5641, "step": 980 }, { "epoch": 0.44842033744762766, "grad_norm": 0.20606863498687744, "learning_rate": 0.0001996441919061801, "loss": 0.588, "step": 990 }, { "epoch": 0.45294983580568454, "grad_norm": 0.17158259451389313, "learning_rate": 0.0001996285635257471, "loss": 0.5437, "step": 1000 }, { "epoch": 0.45747933416374137, "grad_norm": 0.1764381229877472, "learning_rate": 0.0001996125998931324, "loss": 0.5546, "step": 1010 }, { "epoch": 0.4620088325217982, "grad_norm": 0.17307806015014648, "learning_rate": 0.0001995963010620539, "loss": 0.5442, "step": 1020 }, { "epoch": 0.4665383308798551, "grad_norm": 0.17395785450935364, "learning_rate": 0.00019957966708735754, "loss": 0.5198, "step": 1030 }, { "epoch": 0.4710678292379119, "grad_norm": 0.17280320823192596, "learning_rate": 0.00019956269802501696, "loss": 0.5235, "step": 1040 }, { "epoch": 0.47559732759596873, "grad_norm": 0.1894276738166809, "learning_rate": 0.00019954539393213344, "loss": 0.539, "step": 1050 }, { "epoch": 0.4801268259540256, "grad_norm": 0.19094131886959076, "learning_rate": 0.0001995277548669356, "loss": 0.5445, "step": 1060 }, { "epoch": 0.48465632431208244, "grad_norm": 0.182444766163826, "learning_rate": 0.00019950978088877923, "loss": 0.526, "step": 1070 }, { "epoch": 0.48918582267013927, "grad_norm": 0.2150132805109024, "learning_rate": 0.00019949147205814715, "loss": 0.5334, "step": 1080 }, { "epoch": 0.49371532102819615, "grad_norm": 0.17609047889709473, "learning_rate": 0.000199472828436649, "loss": 0.5239, "step": 1090 }, { "epoch": 0.498244819386253, "grad_norm": 0.18994882702827454, "learning_rate": 0.0001994538500870209, "loss": 0.5163, "step": 1100 }, { "epoch": 0.5027743177443098, "grad_norm": 0.1678932011127472, "learning_rate": 0.00019943453707312544, "loss": 0.5379, "step": 1110 }, { "epoch": 0.5073038161023666, "grad_norm": 0.18330644071102142, "learning_rate": 0.00019941488945995125, "loss": 0.5037, "step": 1120 }, { "epoch": 0.5118333144604235, "grad_norm": 0.1946277767419815, "learning_rate": 0.00019939490731361298, "loss": 0.5169, "step": 1130 }, { "epoch": 0.5163628128184804, "grad_norm": 0.1769060641527176, "learning_rate": 0.00019937459070135097, "loss": 0.5016, "step": 1140 }, { "epoch": 0.5208923111765372, "grad_norm": 0.1812835931777954, "learning_rate": 0.00019935393969153106, "loss": 0.4974, "step": 1150 }, { "epoch": 0.525421809534594, "grad_norm": 0.17336933314800262, "learning_rate": 0.00019933295435364432, "loss": 0.4936, "step": 1160 }, { "epoch": 0.5299513078926509, "grad_norm": 0.19504410028457642, "learning_rate": 0.00019931163475830682, "loss": 0.4892, "step": 1170 }, { "epoch": 0.5344808062507077, "grad_norm": 0.17446300387382507, "learning_rate": 0.00019928998097725945, "loss": 0.4851, "step": 1180 }, { "epoch": 0.5390103046087645, "grad_norm": 0.2062528431415558, "learning_rate": 0.00019926799308336767, "loss": 0.4796, "step": 1190 }, { "epoch": 0.5435398029668215, "grad_norm": 0.17791499197483063, "learning_rate": 0.00019924567115062116, "loss": 0.4704, "step": 1200 }, { "epoch": 0.5480693013248783, "grad_norm": 0.20112474262714386, "learning_rate": 0.00019922301525413368, "loss": 0.4848, "step": 1210 }, { "epoch": 0.5525987996829351, "grad_norm": 0.1905170977115631, "learning_rate": 0.00019920002547014283, "loss": 0.4848, "step": 1220 }, { "epoch": 0.557128298040992, "grad_norm": 0.2167678326368332, "learning_rate": 0.00019917670187600967, "loss": 0.475, "step": 1230 }, { "epoch": 0.5616577963990488, "grad_norm": 0.1879906803369522, "learning_rate": 0.00019915304455021859, "loss": 0.4661, "step": 1240 }, { "epoch": 0.5661872947571056, "grad_norm": 0.17811033129692078, "learning_rate": 0.00019912905357237701, "loss": 0.4758, "step": 1250 }, { "epoch": 0.5707167931151625, "grad_norm": 0.18101903796195984, "learning_rate": 0.00019910472902321503, "loss": 0.4668, "step": 1260 }, { "epoch": 0.5752462914732194, "grad_norm": 0.1657211035490036, "learning_rate": 0.0001990800709845853, "loss": 0.4645, "step": 1270 }, { "epoch": 0.5797757898312762, "grad_norm": 0.32196566462516785, "learning_rate": 0.00019905507953946257, "loss": 0.4442, "step": 1280 }, { "epoch": 0.584305288189333, "grad_norm": 0.2010417878627777, "learning_rate": 0.00019902975477194363, "loss": 0.4633, "step": 1290 }, { "epoch": 0.5888347865473899, "grad_norm": 0.18759405612945557, "learning_rate": 0.00019900409676724682, "loss": 0.4642, "step": 1300 }, { "epoch": 0.5933642849054467, "grad_norm": 0.19315552711486816, "learning_rate": 0.00019897810561171189, "loss": 0.4308, "step": 1310 }, { "epoch": 0.5978937832635036, "grad_norm": 0.194192036986351, "learning_rate": 0.00019895178139279956, "loss": 0.4424, "step": 1320 }, { "epoch": 0.6024232816215604, "grad_norm": 0.17403574287891388, "learning_rate": 0.00019892512419909138, "loss": 0.4491, "step": 1330 }, { "epoch": 0.6069527799796173, "grad_norm": 0.20866619050502777, "learning_rate": 0.00019889813412028942, "loss": 0.4546, "step": 1340 }, { "epoch": 0.6114822783376741, "grad_norm": 0.1847338080406189, "learning_rate": 0.00019887081124721583, "loss": 0.4354, "step": 1350 }, { "epoch": 0.6160117766957309, "grad_norm": 0.20528827607631683, "learning_rate": 0.00019884315567181263, "loss": 0.432, "step": 1360 }, { "epoch": 0.6205412750537878, "grad_norm": 0.19688895344734192, "learning_rate": 0.00019881516748714137, "loss": 0.4256, "step": 1370 }, { "epoch": 0.6250707734118447, "grad_norm": 0.1834789514541626, "learning_rate": 0.00019878684678738295, "loss": 0.4142, "step": 1380 }, { "epoch": 0.6296002717699015, "grad_norm": 0.1904083490371704, "learning_rate": 0.00019875819366783705, "loss": 0.4072, "step": 1390 }, { "epoch": 0.6341297701279583, "grad_norm": 0.24558007717132568, "learning_rate": 0.00019872920822492206, "loss": 0.4168, "step": 1400 }, { "epoch": 0.6386592684860152, "grad_norm": 0.19825737178325653, "learning_rate": 0.0001986998905561745, "loss": 0.4102, "step": 1410 }, { "epoch": 0.643188766844072, "grad_norm": 0.2427905946969986, "learning_rate": 0.00019867024076024908, "loss": 0.4266, "step": 1420 }, { "epoch": 0.6477182652021288, "grad_norm": 0.20517700910568237, "learning_rate": 0.00019864025893691784, "loss": 0.4155, "step": 1430 }, { "epoch": 0.6522477635601857, "grad_norm": 0.19519874453544617, "learning_rate": 0.00019860994518707036, "loss": 0.4093, "step": 1440 }, { "epoch": 0.6567772619182426, "grad_norm": 0.17730577290058136, "learning_rate": 0.0001985792996127129, "loss": 0.3932, "step": 1450 }, { "epoch": 0.6613067602762994, "grad_norm": 0.1811046451330185, "learning_rate": 0.00019854832231696855, "loss": 0.3953, "step": 1460 }, { "epoch": 0.6658362586343562, "grad_norm": 0.18473340570926666, "learning_rate": 0.00019851701340407654, "loss": 0.3846, "step": 1470 }, { "epoch": 0.6703657569924131, "grad_norm": 0.1876707524061203, "learning_rate": 0.000198485372979392, "loss": 0.3947, "step": 1480 }, { "epoch": 0.6748952553504699, "grad_norm": 0.21453642845153809, "learning_rate": 0.00019845340114938562, "loss": 0.3893, "step": 1490 }, { "epoch": 0.6794247537085267, "grad_norm": 0.19314515590667725, "learning_rate": 0.00019842109802164327, "loss": 0.3857, "step": 1500 }, { "epoch": 0.6839542520665837, "grad_norm": 0.18713776767253876, "learning_rate": 0.0001983884637048656, "loss": 0.3945, "step": 1510 }, { "epoch": 0.6884837504246405, "grad_norm": 0.18545708060264587, "learning_rate": 0.00019835549830886785, "loss": 0.3829, "step": 1520 }, { "epoch": 0.6930132487826973, "grad_norm": 0.163354754447937, "learning_rate": 0.00019832220194457919, "loss": 0.3681, "step": 1530 }, { "epoch": 0.6975427471407541, "grad_norm": 0.19729359447956085, "learning_rate": 0.0001982885747240426, "loss": 0.376, "step": 1540 }, { "epoch": 0.702072245498811, "grad_norm": 0.19601188600063324, "learning_rate": 0.00019825461676041436, "loss": 0.3738, "step": 1550 }, { "epoch": 0.7066017438568678, "grad_norm": 0.184451162815094, "learning_rate": 0.00019822032816796376, "loss": 0.3689, "step": 1560 }, { "epoch": 0.7111312422149247, "grad_norm": 0.16905899345874786, "learning_rate": 0.0001981857090620726, "loss": 0.3667, "step": 1570 }, { "epoch": 0.7156607405729816, "grad_norm": 0.17829935252666473, "learning_rate": 0.0001981507595592349, "loss": 0.3718, "step": 1580 }, { "epoch": 0.7201902389310384, "grad_norm": 0.17314116656780243, "learning_rate": 0.0001981154797770564, "loss": 0.3711, "step": 1590 }, { "epoch": 0.7247197372890952, "grad_norm": 0.17752452194690704, "learning_rate": 0.0001980798698342544, "loss": 0.3711, "step": 1600 }, { "epoch": 0.729249235647152, "grad_norm": 0.16267523169517517, "learning_rate": 0.00019804392985065702, "loss": 0.3461, "step": 1610 }, { "epoch": 0.7337787340052089, "grad_norm": 0.1715889424085617, "learning_rate": 0.00019800765994720308, "loss": 0.3542, "step": 1620 }, { "epoch": 0.7383082323632658, "grad_norm": 0.2011169195175171, "learning_rate": 0.00019797106024594153, "loss": 0.3602, "step": 1630 }, { "epoch": 0.7428377307213226, "grad_norm": 0.16859227418899536, "learning_rate": 0.00019793413087003115, "loss": 0.3509, "step": 1640 }, { "epoch": 0.7473672290793795, "grad_norm": 0.18904465436935425, "learning_rate": 0.0001978968719437401, "loss": 0.3619, "step": 1650 }, { "epoch": 0.7518967274374363, "grad_norm": 0.1918095499277115, "learning_rate": 0.00019785928359244533, "loss": 0.3529, "step": 1660 }, { "epoch": 0.7564262257954931, "grad_norm": 0.16930030286312103, "learning_rate": 0.0001978213659426325, "loss": 0.3505, "step": 1670 }, { "epoch": 0.76095572415355, "grad_norm": 0.19345726072788239, "learning_rate": 0.00019778311912189528, "loss": 0.3548, "step": 1680 }, { "epoch": 0.7654852225116069, "grad_norm": 0.1755731701850891, "learning_rate": 0.000197744543258935, "loss": 0.3549, "step": 1690 }, { "epoch": 0.7700147208696637, "grad_norm": 0.17827914655208588, "learning_rate": 0.00019770563848356024, "loss": 0.3622, "step": 1700 }, { "epoch": 0.7745442192277205, "grad_norm": 0.1955813765525818, "learning_rate": 0.0001976664049266864, "loss": 0.3412, "step": 1710 }, { "epoch": 0.7790737175857774, "grad_norm": 0.18960636854171753, "learning_rate": 0.00019762684272033515, "loss": 0.3438, "step": 1720 }, { "epoch": 0.7836032159438342, "grad_norm": 0.20935559272766113, "learning_rate": 0.00019758695199763418, "loss": 0.3497, "step": 1730 }, { "epoch": 0.788132714301891, "grad_norm": 0.18760916590690613, "learning_rate": 0.00019754673289281663, "loss": 0.3299, "step": 1740 }, { "epoch": 0.792662212659948, "grad_norm": 0.2013741135597229, "learning_rate": 0.0001975061855412206, "loss": 0.3395, "step": 1750 }, { "epoch": 0.7971917110180048, "grad_norm": 0.18885807693004608, "learning_rate": 0.0001974653100792887, "loss": 0.3321, "step": 1760 }, { "epoch": 0.8017212093760616, "grad_norm": 0.18193817138671875, "learning_rate": 0.00019742410664456777, "loss": 0.3387, "step": 1770 }, { "epoch": 0.8062507077341184, "grad_norm": 0.16840125620365143, "learning_rate": 0.00019738257537570822, "loss": 0.3302, "step": 1780 }, { "epoch": 0.8107802060921753, "grad_norm": 0.1618867665529251, "learning_rate": 0.00019734071641246365, "loss": 0.3212, "step": 1790 }, { "epoch": 0.8153097044502321, "grad_norm": 0.20026183128356934, "learning_rate": 0.00019729852989569028, "loss": 0.3274, "step": 1800 }, { "epoch": 0.819839202808289, "grad_norm": 0.18741321563720703, "learning_rate": 0.00019725601596734668, "loss": 0.3267, "step": 1810 }, { "epoch": 0.8243687011663459, "grad_norm": 0.17450092732906342, "learning_rate": 0.000197213174770493, "loss": 0.3193, "step": 1820 }, { "epoch": 0.8288981995244027, "grad_norm": 0.1721801608800888, "learning_rate": 0.00019717000644929087, "loss": 0.3127, "step": 1830 }, { "epoch": 0.8334276978824595, "grad_norm": 0.18926140666007996, "learning_rate": 0.00019712651114900257, "loss": 0.3214, "step": 1840 }, { "epoch": 0.8379571962405163, "grad_norm": 0.17309771478176117, "learning_rate": 0.0001970826890159906, "loss": 0.318, "step": 1850 }, { "epoch": 0.8424866945985732, "grad_norm": 0.18818823993206024, "learning_rate": 0.00019703854019771742, "loss": 0.3154, "step": 1860 }, { "epoch": 0.84701619295663, "grad_norm": 0.18680931627750397, "learning_rate": 0.00019699406484274468, "loss": 0.3104, "step": 1870 }, { "epoch": 0.8515456913146869, "grad_norm": 0.16489103436470032, "learning_rate": 0.0001969492631007329, "loss": 0.3232, "step": 1880 }, { "epoch": 0.8560751896727438, "grad_norm": 0.17721644043922424, "learning_rate": 0.0001969041351224409, "loss": 0.3034, "step": 1890 }, { "epoch": 0.8606046880308006, "grad_norm": 0.19497451186180115, "learning_rate": 0.00019685868105972517, "loss": 0.3092, "step": 1900 }, { "epoch": 0.8651341863888574, "grad_norm": 0.20427413284778595, "learning_rate": 0.00019681290106553969, "loss": 0.3158, "step": 1910 }, { "epoch": 0.8696636847469142, "grad_norm": 0.18642422556877136, "learning_rate": 0.00019676679529393498, "loss": 0.3058, "step": 1920 }, { "epoch": 0.8741931831049711, "grad_norm": 0.16172035038471222, "learning_rate": 0.00019672036390005798, "loss": 0.3069, "step": 1930 }, { "epoch": 0.878722681463028, "grad_norm": 0.15888796746730804, "learning_rate": 0.00019667360704015127, "loss": 0.3075, "step": 1940 }, { "epoch": 0.8832521798210848, "grad_norm": 0.16608227789402008, "learning_rate": 0.0001966265248715527, "loss": 0.295, "step": 1950 }, { "epoch": 0.8877816781791417, "grad_norm": 0.18529315292835236, "learning_rate": 0.00019657911755269466, "loss": 0.3087, "step": 1960 }, { "epoch": 0.8923111765371985, "grad_norm": 0.1623723804950714, "learning_rate": 0.0001965313852431038, "loss": 0.318, "step": 1970 }, { "epoch": 0.8968406748952553, "grad_norm": 0.18999403715133667, "learning_rate": 0.0001964833281034004, "loss": 0.3013, "step": 1980 }, { "epoch": 0.9013701732533121, "grad_norm": 0.1742704212665558, "learning_rate": 0.0001964349462952976, "loss": 0.2906, "step": 1990 }, { "epoch": 0.9058996716113691, "grad_norm": 0.15007524192333221, "learning_rate": 0.00019638623998160127, "loss": 0.2909, "step": 2000 }, { "epoch": 0.9104291699694259, "grad_norm": 0.18087700009346008, "learning_rate": 0.00019633720932620916, "loss": 0.2852, "step": 2010 }, { "epoch": 0.9149586683274827, "grad_norm": 0.172203928232193, "learning_rate": 0.0001962878544941104, "loss": 0.2894, "step": 2020 }, { "epoch": 0.9194881666855396, "grad_norm": 0.1811007559299469, "learning_rate": 0.00019623817565138512, "loss": 0.2905, "step": 2030 }, { "epoch": 0.9240176650435964, "grad_norm": 0.17736268043518066, "learning_rate": 0.00019618817296520355, "loss": 0.2855, "step": 2040 }, { "epoch": 0.9285471634016532, "grad_norm": 0.1875537484884262, "learning_rate": 0.00019613784660382582, "loss": 0.3006, "step": 2050 }, { "epoch": 0.9330766617597102, "grad_norm": 0.16459111869335175, "learning_rate": 0.00019608719673660117, "loss": 0.2928, "step": 2060 }, { "epoch": 0.937606160117767, "grad_norm": 0.19852280616760254, "learning_rate": 0.00019603622353396745, "loss": 0.2877, "step": 2070 }, { "epoch": 0.9421356584758238, "grad_norm": 0.1441079080104828, "learning_rate": 0.00019598492716745055, "loss": 0.2722, "step": 2080 }, { "epoch": 0.9466651568338806, "grad_norm": 0.17091263830661774, "learning_rate": 0.00019593330780966377, "loss": 0.2845, "step": 2090 }, { "epoch": 0.9511946551919375, "grad_norm": 0.17907531559467316, "learning_rate": 0.00019588136563430735, "loss": 0.2881, "step": 2100 }, { "epoch": 0.9557241535499943, "grad_norm": 0.18411681056022644, "learning_rate": 0.00019582910081616782, "loss": 0.2906, "step": 2110 }, { "epoch": 0.9602536519080512, "grad_norm": 0.19341252744197845, "learning_rate": 0.00019577651353111733, "loss": 0.2926, "step": 2120 }, { "epoch": 0.9647831502661081, "grad_norm": 0.17022013664245605, "learning_rate": 0.00019572360395611317, "loss": 0.2728, "step": 2130 }, { "epoch": 0.9693126486241649, "grad_norm": 0.17077523469924927, "learning_rate": 0.00019567037226919721, "loss": 0.2754, "step": 2140 }, { "epoch": 0.9738421469822217, "grad_norm": 0.16188162565231323, "learning_rate": 0.00019561681864949514, "loss": 0.2761, "step": 2150 }, { "epoch": 0.9783716453402785, "grad_norm": 0.16258101165294647, "learning_rate": 0.00019556294327721603, "loss": 0.2724, "step": 2160 }, { "epoch": 0.9829011436983354, "grad_norm": 0.1751459836959839, "learning_rate": 0.00019550874633365162, "loss": 0.2844, "step": 2170 }, { "epoch": 0.9874306420563923, "grad_norm": 0.14674732089042664, "learning_rate": 0.0001954542280011757, "loss": 0.2818, "step": 2180 }, { "epoch": 0.9919601404144491, "grad_norm": 0.1843065619468689, "learning_rate": 0.00019539938846324363, "loss": 0.2736, "step": 2190 }, { "epoch": 0.996489638772506, "grad_norm": 0.18449115753173828, "learning_rate": 0.00019534422790439164, "loss": 0.2828, "step": 2200 }, { "epoch": 0.9996602876231457, "eval_loss": 0.26604515314102173, "eval_runtime": 617.1505, "eval_samples_per_second": 12.752, "eval_steps_per_second": 1.594, "step": 2207 }, { "epoch": 1.001358849507417, "grad_norm": 0.15234586596488953, "learning_rate": 0.00019528874651023606, "loss": 0.2608, "step": 2210 }, { "epoch": 1.0058883478654739, "grad_norm": 0.15887659788131714, "learning_rate": 0.00019523294446747297, "loss": 0.2417, "step": 2220 }, { "epoch": 1.0104178462235307, "grad_norm": 0.16629189252853394, "learning_rate": 0.00019517682196387744, "loss": 0.2306, "step": 2230 }, { "epoch": 1.0149473445815875, "grad_norm": 0.17960551381111145, "learning_rate": 0.00019512037918830282, "loss": 0.2279, "step": 2240 }, { "epoch": 1.0194768429396444, "grad_norm": 0.1671302169561386, "learning_rate": 0.0001950636163306802, "loss": 0.2181, "step": 2250 }, { "epoch": 1.0240063412977012, "grad_norm": 0.16400860249996185, "learning_rate": 0.0001950065335820178, "loss": 0.2333, "step": 2260 }, { "epoch": 1.0285358396557582, "grad_norm": 0.15259268879890442, "learning_rate": 0.00019494913113440022, "loss": 0.2307, "step": 2270 }, { "epoch": 1.033065338013815, "grad_norm": 0.1612786501646042, "learning_rate": 0.00019489140918098796, "loss": 0.2349, "step": 2280 }, { "epoch": 1.0375948363718719, "grad_norm": 0.15766066312789917, "learning_rate": 0.00019483336791601655, "loss": 0.23, "step": 2290 }, { "epoch": 1.0421243347299287, "grad_norm": 0.16044190526008606, "learning_rate": 0.00019477500753479603, "loss": 0.2234, "step": 2300 }, { "epoch": 1.0466538330879855, "grad_norm": 0.18357709050178528, "learning_rate": 0.00019471632823371028, "loss": 0.2208, "step": 2310 }, { "epoch": 1.0511833314460424, "grad_norm": 0.1702904850244522, "learning_rate": 0.00019465733021021645, "loss": 0.2248, "step": 2320 }, { "epoch": 1.0557128298040992, "grad_norm": 0.15621191263198853, "learning_rate": 0.00019459801366284403, "loss": 0.2286, "step": 2330 }, { "epoch": 1.060242328162156, "grad_norm": 0.1782391220331192, "learning_rate": 0.00019453837879119444, "loss": 0.2304, "step": 2340 }, { "epoch": 1.0647718265202128, "grad_norm": 0.16530479490756989, "learning_rate": 0.00019447842579594027, "loss": 0.2306, "step": 2350 }, { "epoch": 1.0693013248782697, "grad_norm": 0.16082873940467834, "learning_rate": 0.00019441815487882463, "loss": 0.2252, "step": 2360 }, { "epoch": 1.0738308232363265, "grad_norm": 0.15404802560806274, "learning_rate": 0.00019435756624266035, "loss": 0.216, "step": 2370 }, { "epoch": 1.0783603215943833, "grad_norm": 0.14842167496681213, "learning_rate": 0.00019429666009132944, "loss": 0.2218, "step": 2380 }, { "epoch": 1.0828898199524404, "grad_norm": 0.16312135756015778, "learning_rate": 0.00019423543662978245, "loss": 0.212, "step": 2390 }, { "epoch": 1.0874193183104972, "grad_norm": 0.17386338114738464, "learning_rate": 0.00019417389606403752, "loss": 0.2251, "step": 2400 }, { "epoch": 1.091948816668554, "grad_norm": 0.17737415432929993, "learning_rate": 0.00019411203860117995, "loss": 0.2304, "step": 2410 }, { "epoch": 1.0964783150266109, "grad_norm": 0.16693584620952606, "learning_rate": 0.00019404986444936136, "loss": 0.2175, "step": 2420 }, { "epoch": 1.1010078133846677, "grad_norm": 0.1775166392326355, "learning_rate": 0.00019398737381779913, "loss": 0.2209, "step": 2430 }, { "epoch": 1.1055373117427245, "grad_norm": 0.1629152148962021, "learning_rate": 0.00019392456691677546, "loss": 0.2113, "step": 2440 }, { "epoch": 1.1100668101007813, "grad_norm": 0.1428159475326538, "learning_rate": 0.0001938614439576369, "loss": 0.2141, "step": 2450 }, { "epoch": 1.1145963084588382, "grad_norm": 0.1580020934343338, "learning_rate": 0.0001937980051527935, "loss": 0.2193, "step": 2460 }, { "epoch": 1.119125806816895, "grad_norm": 0.13861976563930511, "learning_rate": 0.0001937342507157182, "loss": 0.2091, "step": 2470 }, { "epoch": 1.1236553051749518, "grad_norm": 0.16170430183410645, "learning_rate": 0.00019367018086094594, "loss": 0.2175, "step": 2480 }, { "epoch": 1.1281848035330087, "grad_norm": 0.15579678118228912, "learning_rate": 0.00019360579580407315, "loss": 0.2091, "step": 2490 }, { "epoch": 1.1327143018910655, "grad_norm": 0.15239351987838745, "learning_rate": 0.00019354109576175685, "loss": 0.2189, "step": 2500 }, { "epoch": 1.1372438002491223, "grad_norm": 0.16122813522815704, "learning_rate": 0.00019347608095171407, "loss": 0.2159, "step": 2510 }, { "epoch": 1.1417732986071791, "grad_norm": 0.14791563153266907, "learning_rate": 0.00019341075159272096, "loss": 0.2093, "step": 2520 }, { "epoch": 1.1463027969652362, "grad_norm": 0.138755664229393, "learning_rate": 0.0001933451079046122, "loss": 0.2231, "step": 2530 }, { "epoch": 1.150832295323293, "grad_norm": 0.15061049163341522, "learning_rate": 0.0001932791501082801, "loss": 0.2067, "step": 2540 }, { "epoch": 1.1553617936813498, "grad_norm": 0.17541393637657166, "learning_rate": 0.00019321287842567408, "loss": 0.2197, "step": 2550 }, { "epoch": 1.1598912920394067, "grad_norm": 0.17274054884910583, "learning_rate": 0.00019314629307979968, "loss": 0.2179, "step": 2560 }, { "epoch": 1.1644207903974635, "grad_norm": 0.16083642840385437, "learning_rate": 0.000193079394294718, "loss": 0.2139, "step": 2570 }, { "epoch": 1.1689502887555203, "grad_norm": 0.16815818846225739, "learning_rate": 0.00019301218229554482, "loss": 0.2158, "step": 2580 }, { "epoch": 1.1734797871135771, "grad_norm": 0.15939727425575256, "learning_rate": 0.0001929446573084499, "loss": 0.2139, "step": 2590 }, { "epoch": 1.178009285471634, "grad_norm": 0.14855942130088806, "learning_rate": 0.00019287681956065624, "loss": 0.2156, "step": 2600 }, { "epoch": 1.1825387838296908, "grad_norm": 0.16065727174282074, "learning_rate": 0.00019280866928043927, "loss": 0.2131, "step": 2610 }, { "epoch": 1.1870682821877476, "grad_norm": 0.15156914293766022, "learning_rate": 0.00019274020669712608, "loss": 0.2133, "step": 2620 }, { "epoch": 1.1915977805458047, "grad_norm": 0.15163294970989227, "learning_rate": 0.00019267143204109469, "loss": 0.2172, "step": 2630 }, { "epoch": 1.1961272789038615, "grad_norm": 0.14060626924037933, "learning_rate": 0.00019260234554377325, "loss": 0.2048, "step": 2640 }, { "epoch": 1.2006567772619183, "grad_norm": 0.16215626895427704, "learning_rate": 0.00019253294743763925, "loss": 0.2077, "step": 2650 }, { "epoch": 1.2051862756199752, "grad_norm": 0.13906173408031464, "learning_rate": 0.00019246323795621875, "loss": 0.2125, "step": 2660 }, { "epoch": 1.209715773978032, "grad_norm": 0.15761959552764893, "learning_rate": 0.0001923932173340856, "loss": 0.2104, "step": 2670 }, { "epoch": 1.2142452723360888, "grad_norm": 0.16140113770961761, "learning_rate": 0.00019232288580686068, "loss": 0.2131, "step": 2680 }, { "epoch": 1.2187747706941456, "grad_norm": 0.13611847162246704, "learning_rate": 0.000192252243611211, "loss": 0.2042, "step": 2690 }, { "epoch": 1.2233042690522025, "grad_norm": 0.14395853877067566, "learning_rate": 0.00019218129098484902, "loss": 0.2144, "step": 2700 }, { "epoch": 1.2278337674102593, "grad_norm": 0.14826107025146484, "learning_rate": 0.0001921100281665318, "loss": 0.2119, "step": 2710 }, { "epoch": 1.2323632657683161, "grad_norm": 0.1515769064426422, "learning_rate": 0.0001920384553960602, "loss": 0.2051, "step": 2720 }, { "epoch": 1.236892764126373, "grad_norm": 0.15898488461971283, "learning_rate": 0.00019196657291427807, "loss": 0.2127, "step": 2730 }, { "epoch": 1.2414222624844298, "grad_norm": 0.13833607733249664, "learning_rate": 0.00019189438096307146, "loss": 0.2097, "step": 2740 }, { "epoch": 1.2459517608424866, "grad_norm": 0.14516334235668182, "learning_rate": 0.0001918218797853678, "loss": 0.1958, "step": 2750 }, { "epoch": 1.2504812592005434, "grad_norm": 0.13684655725955963, "learning_rate": 0.00019174906962513504, "loss": 0.2196, "step": 2760 }, { "epoch": 1.2550107575586003, "grad_norm": 0.16645090281963348, "learning_rate": 0.00019167595072738084, "loss": 0.2095, "step": 2770 }, { "epoch": 1.2595402559166573, "grad_norm": 0.1568327695131302, "learning_rate": 0.00019160252333815187, "loss": 0.2046, "step": 2780 }, { "epoch": 1.2640697542747141, "grad_norm": 0.15349489450454712, "learning_rate": 0.00019152878770453279, "loss": 0.2124, "step": 2790 }, { "epoch": 1.268599252632771, "grad_norm": 0.16242361068725586, "learning_rate": 0.00019145474407464554, "loss": 0.2059, "step": 2800 }, { "epoch": 1.2731287509908278, "grad_norm": 0.15133287012577057, "learning_rate": 0.00019138039269764846, "loss": 0.2068, "step": 2810 }, { "epoch": 1.2776582493488846, "grad_norm": 0.1698140799999237, "learning_rate": 0.00019130573382373549, "loss": 0.2165, "step": 2820 }, { "epoch": 1.2821877477069414, "grad_norm": 0.16591964662075043, "learning_rate": 0.00019123076770413526, "loss": 0.2052, "step": 2830 }, { "epoch": 1.2867172460649983, "grad_norm": 0.14136140048503876, "learning_rate": 0.00019115549459111034, "loss": 0.1972, "step": 2840 }, { "epoch": 1.291246744423055, "grad_norm": 0.15886986255645752, "learning_rate": 0.0001910799147379563, "loss": 0.2178, "step": 2850 }, { "epoch": 1.295776242781112, "grad_norm": 0.143589586019516, "learning_rate": 0.00019100402839900097, "loss": 0.2139, "step": 2860 }, { "epoch": 1.300305741139169, "grad_norm": 0.16037988662719727, "learning_rate": 0.0001909278358296034, "loss": 0.2073, "step": 2870 }, { "epoch": 1.3048352394972258, "grad_norm": 0.1397211104631424, "learning_rate": 0.00019085133728615313, "loss": 0.2045, "step": 2880 }, { "epoch": 1.3093647378552826, "grad_norm": 0.1394536942243576, "learning_rate": 0.00019077453302606944, "loss": 0.194, "step": 2890 }, { "epoch": 1.3138942362133395, "grad_norm": 0.1598595380783081, "learning_rate": 0.00019069742330780014, "loss": 0.205, "step": 2900 }, { "epoch": 1.3184237345713963, "grad_norm": 0.16302059590816498, "learning_rate": 0.00019062000839082115, "loss": 0.2044, "step": 2910 }, { "epoch": 1.322953232929453, "grad_norm": 0.15237270295619965, "learning_rate": 0.0001905422885356352, "loss": 0.2061, "step": 2920 }, { "epoch": 1.32748273128751, "grad_norm": 0.16175110638141632, "learning_rate": 0.00019046426400377123, "loss": 0.2127, "step": 2930 }, { "epoch": 1.3320122296455668, "grad_norm": 0.17352445423603058, "learning_rate": 0.00019038593505778343, "loss": 0.2121, "step": 2940 }, { "epoch": 1.3365417280036236, "grad_norm": 0.15539845824241638, "learning_rate": 0.0001903073019612503, "loss": 0.1996, "step": 2950 }, { "epoch": 1.3410712263616804, "grad_norm": 0.1654234081506729, "learning_rate": 0.00019022836497877382, "loss": 0.1982, "step": 2960 }, { "epoch": 1.3456007247197372, "grad_norm": 0.15698087215423584, "learning_rate": 0.00019014912437597862, "loss": 0.2006, "step": 2970 }, { "epoch": 1.350130223077794, "grad_norm": 0.15171001851558685, "learning_rate": 0.00019006958041951094, "loss": 0.2066, "step": 2980 }, { "epoch": 1.354659721435851, "grad_norm": 0.15153132379055023, "learning_rate": 0.00018998973337703784, "loss": 0.1969, "step": 2990 }, { "epoch": 1.3591892197939077, "grad_norm": 0.14000695943832397, "learning_rate": 0.00018990958351724634, "loss": 0.2081, "step": 3000 }, { "epoch": 1.3637187181519645, "grad_norm": 0.14371009171009064, "learning_rate": 0.00018982913110984225, "loss": 0.1964, "step": 3010 }, { "epoch": 1.3682482165100216, "grad_norm": 0.1594901829957962, "learning_rate": 0.0001897483764255497, "loss": 0.2004, "step": 3020 }, { "epoch": 1.3727777148680784, "grad_norm": 0.15266938507556915, "learning_rate": 0.00018966731973610985, "loss": 0.2081, "step": 3030 }, { "epoch": 1.3773072132261353, "grad_norm": 0.17764367163181305, "learning_rate": 0.0001895859613142801, "loss": 0.2028, "step": 3040 }, { "epoch": 1.381836711584192, "grad_norm": 0.15086011588573456, "learning_rate": 0.0001895043014338333, "loss": 0.1984, "step": 3050 }, { "epoch": 1.386366209942249, "grad_norm": 0.1648501455783844, "learning_rate": 0.00018942234036955659, "loss": 0.2019, "step": 3060 }, { "epoch": 1.3908957083003057, "grad_norm": 0.1467510610818863, "learning_rate": 0.00018934007839725063, "loss": 0.1972, "step": 3070 }, { "epoch": 1.3954252066583626, "grad_norm": 0.17046092450618744, "learning_rate": 0.0001892575157937287, "loss": 0.2053, "step": 3080 }, { "epoch": 1.3999547050164194, "grad_norm": 0.14983297884464264, "learning_rate": 0.0001891746528368157, "loss": 0.1986, "step": 3090 }, { "epoch": 1.4044842033744762, "grad_norm": 0.16196715831756592, "learning_rate": 0.00018909148980534712, "loss": 0.1982, "step": 3100 }, { "epoch": 1.409013701732533, "grad_norm": 0.15527282655239105, "learning_rate": 0.00018900802697916836, "loss": 0.2028, "step": 3110 }, { "epoch": 1.41354320009059, "grad_norm": 0.1645379364490509, "learning_rate": 0.0001889242646391335, "loss": 0.1939, "step": 3120 }, { "epoch": 1.418072698448647, "grad_norm": 0.1684643030166626, "learning_rate": 0.0001888402030671046, "loss": 0.1931, "step": 3130 }, { "epoch": 1.4226021968067037, "grad_norm": 0.15814268589019775, "learning_rate": 0.00018875584254595055, "loss": 0.1951, "step": 3140 }, { "epoch": 1.4271316951647606, "grad_norm": 0.1520155966281891, "learning_rate": 0.00018867118335954625, "loss": 0.1886, "step": 3150 }, { "epoch": 1.4316611935228174, "grad_norm": 0.16438494622707367, "learning_rate": 0.0001885862257927717, "loss": 0.2015, "step": 3160 }, { "epoch": 1.4361906918808742, "grad_norm": 0.15568524599075317, "learning_rate": 0.00018850097013151077, "loss": 0.1898, "step": 3170 }, { "epoch": 1.440720190238931, "grad_norm": 0.15463340282440186, "learning_rate": 0.00018841541666265058, "loss": 0.1988, "step": 3180 }, { "epoch": 1.4452496885969879, "grad_norm": 0.14274995028972626, "learning_rate": 0.00018832956567408032, "loss": 0.1884, "step": 3190 }, { "epoch": 1.4497791869550447, "grad_norm": 0.17546044290065765, "learning_rate": 0.00018824341745469033, "loss": 0.1959, "step": 3200 }, { "epoch": 1.4543086853131015, "grad_norm": 0.14111734926700592, "learning_rate": 0.0001881569722943712, "loss": 0.1929, "step": 3210 }, { "epoch": 1.4588381836711584, "grad_norm": 0.1645372211933136, "learning_rate": 0.00018807023048401263, "loss": 0.1913, "step": 3220 }, { "epoch": 1.4633676820292152, "grad_norm": 0.16762864589691162, "learning_rate": 0.00018798319231550265, "loss": 0.1876, "step": 3230 }, { "epoch": 1.467897180387272, "grad_norm": 0.14765408635139465, "learning_rate": 0.00018789585808172649, "loss": 0.1935, "step": 3240 }, { "epoch": 1.4724266787453288, "grad_norm": 0.16272325813770294, "learning_rate": 0.0001878082280765656, "loss": 0.199, "step": 3250 }, { "epoch": 1.4769561771033857, "grad_norm": 0.14496152102947235, "learning_rate": 0.0001877203025948969, "loss": 0.1987, "step": 3260 }, { "epoch": 1.4814856754614427, "grad_norm": 0.1556200087070465, "learning_rate": 0.00018763208193259132, "loss": 0.1938, "step": 3270 }, { "epoch": 1.4860151738194995, "grad_norm": 0.14785943925380707, "learning_rate": 0.00018754356638651332, "loss": 0.1905, "step": 3280 }, { "epoch": 1.4905446721775564, "grad_norm": 0.14636161923408508, "learning_rate": 0.00018745475625451947, "loss": 0.1928, "step": 3290 }, { "epoch": 1.4950741705356132, "grad_norm": 0.16059593856334686, "learning_rate": 0.00018736565183545773, "loss": 0.1967, "step": 3300 }, { "epoch": 1.49960366889367, "grad_norm": 0.15864983201026917, "learning_rate": 0.00018727625342916633, "loss": 0.1984, "step": 3310 }, { "epoch": 1.5041331672517269, "grad_norm": 0.14578469097614288, "learning_rate": 0.00018718656133647277, "loss": 0.1848, "step": 3320 }, { "epoch": 1.5086626656097837, "grad_norm": 0.16975462436676025, "learning_rate": 0.00018709657585919275, "loss": 0.1914, "step": 3330 }, { "epoch": 1.5131921639678405, "grad_norm": 0.14356206357479095, "learning_rate": 0.00018700629730012934, "loss": 0.1978, "step": 3340 }, { "epoch": 1.5177216623258976, "grad_norm": 0.14980971813201904, "learning_rate": 0.00018691572596307173, "loss": 0.1993, "step": 3350 }, { "epoch": 1.5222511606839544, "grad_norm": 0.1422482430934906, "learning_rate": 0.00018682486215279435, "loss": 0.187, "step": 3360 }, { "epoch": 1.5267806590420112, "grad_norm": 0.1586323380470276, "learning_rate": 0.00018673370617505576, "loss": 0.1843, "step": 3370 }, { "epoch": 1.531310157400068, "grad_norm": 0.1464434564113617, "learning_rate": 0.00018664225833659777, "loss": 0.1973, "step": 3380 }, { "epoch": 1.5358396557581249, "grad_norm": 0.16265639662742615, "learning_rate": 0.00018655051894514424, "loss": 0.1873, "step": 3390 }, { "epoch": 1.5403691541161817, "grad_norm": 0.13967713713645935, "learning_rate": 0.00018645848830940013, "loss": 0.1834, "step": 3400 }, { "epoch": 1.5448986524742385, "grad_norm": 0.12256325781345367, "learning_rate": 0.0001863661667390504, "loss": 0.1849, "step": 3410 }, { "epoch": 1.5494281508322953, "grad_norm": 0.14708378911018372, "learning_rate": 0.00018627355454475908, "loss": 0.1921, "step": 3420 }, { "epoch": 1.5539576491903522, "grad_norm": 0.14427697658538818, "learning_rate": 0.00018618065203816812, "loss": 0.1863, "step": 3430 }, { "epoch": 1.558487147548409, "grad_norm": 0.1333187371492386, "learning_rate": 0.0001860874595318964, "loss": 0.1927, "step": 3440 }, { "epoch": 1.5630166459064658, "grad_norm": 0.15604457259178162, "learning_rate": 0.00018599397733953858, "loss": 0.1841, "step": 3450 }, { "epoch": 1.5675461442645227, "grad_norm": 0.147917240858078, "learning_rate": 0.00018590020577566424, "loss": 0.1886, "step": 3460 }, { "epoch": 1.5720756426225795, "grad_norm": 0.14821654558181763, "learning_rate": 0.0001858061451558166, "loss": 0.1833, "step": 3470 }, { "epoch": 1.5766051409806363, "grad_norm": 0.12086760997772217, "learning_rate": 0.00018571179579651159, "loss": 0.1918, "step": 3480 }, { "epoch": 1.5811346393386931, "grad_norm": 0.16424959897994995, "learning_rate": 0.0001856171580152368, "loss": 0.1792, "step": 3490 }, { "epoch": 1.58566413769675, "grad_norm": 0.14219975471496582, "learning_rate": 0.00018552223213045028, "loss": 0.1946, "step": 3500 }, { "epoch": 1.5901936360548068, "grad_norm": 0.1768968552350998, "learning_rate": 0.00018542701846157962, "loss": 0.1843, "step": 3510 }, { "epoch": 1.5947231344128636, "grad_norm": 0.12454737722873688, "learning_rate": 0.0001853315173290208, "loss": 0.1836, "step": 3520 }, { "epoch": 1.5992526327709207, "grad_norm": 0.14064136147499084, "learning_rate": 0.00018523572905413709, "loss": 0.1841, "step": 3530 }, { "epoch": 1.6037821311289775, "grad_norm": 0.15816141664981842, "learning_rate": 0.00018513965395925802, "loss": 0.1882, "step": 3540 }, { "epoch": 1.6083116294870343, "grad_norm": 0.15514902770519257, "learning_rate": 0.00018504329236767832, "loss": 0.1881, "step": 3550 }, { "epoch": 1.6128411278450911, "grad_norm": 0.15803417563438416, "learning_rate": 0.00018494664460365668, "loss": 0.1859, "step": 3560 }, { "epoch": 1.617370626203148, "grad_norm": 0.12781353294849396, "learning_rate": 0.00018484971099241485, "loss": 0.1832, "step": 3570 }, { "epoch": 1.6219001245612048, "grad_norm": 0.16309882700443268, "learning_rate": 0.0001847524918601365, "loss": 0.1962, "step": 3580 }, { "epoch": 1.6264296229192619, "grad_norm": 0.12590362131595612, "learning_rate": 0.00018465498753396595, "loss": 0.1928, "step": 3590 }, { "epoch": 1.6309591212773187, "grad_norm": 0.1451760232448578, "learning_rate": 0.00018455719834200728, "loss": 0.1837, "step": 3600 }, { "epoch": 1.6354886196353755, "grad_norm": 0.14908108115196228, "learning_rate": 0.0001844591246133232, "loss": 0.1866, "step": 3610 }, { "epoch": 1.6400181179934323, "grad_norm": 0.13437342643737793, "learning_rate": 0.00018436076667793382, "loss": 0.1886, "step": 3620 }, { "epoch": 1.6445476163514892, "grad_norm": 0.13465970754623413, "learning_rate": 0.00018426212486681562, "loss": 0.183, "step": 3630 }, { "epoch": 1.649077114709546, "grad_norm": 0.13650234043598175, "learning_rate": 0.00018416319951190032, "loss": 0.177, "step": 3640 }, { "epoch": 1.6536066130676028, "grad_norm": 0.1663140207529068, "learning_rate": 0.00018406399094607386, "loss": 0.187, "step": 3650 }, { "epoch": 1.6581361114256596, "grad_norm": 0.16565509140491486, "learning_rate": 0.00018396449950317504, "loss": 0.1837, "step": 3660 }, { "epoch": 1.6626656097837165, "grad_norm": 0.18802668154239655, "learning_rate": 0.0001838647255179947, "loss": 0.1814, "step": 3670 }, { "epoch": 1.6671951081417733, "grad_norm": 0.17005442082881927, "learning_rate": 0.0001837646693262743, "loss": 0.1871, "step": 3680 }, { "epoch": 1.6717246064998301, "grad_norm": 0.14796973764896393, "learning_rate": 0.00018366433126470506, "loss": 0.1781, "step": 3690 }, { "epoch": 1.676254104857887, "grad_norm": 0.1405303180217743, "learning_rate": 0.0001835637116709266, "loss": 0.1792, "step": 3700 }, { "epoch": 1.6807836032159438, "grad_norm": 0.1343483328819275, "learning_rate": 0.00018346281088352592, "loss": 0.1807, "step": 3710 }, { "epoch": 1.6853131015740006, "grad_norm": 0.14635176956653595, "learning_rate": 0.00018336162924203632, "loss": 0.176, "step": 3720 }, { "epoch": 1.6898425999320574, "grad_norm": 0.13452979922294617, "learning_rate": 0.0001832601670869361, "loss": 0.1822, "step": 3730 }, { "epoch": 1.6943720982901143, "grad_norm": 0.14736182987689972, "learning_rate": 0.00018315842475964748, "loss": 0.1828, "step": 3740 }, { "epoch": 1.698901596648171, "grad_norm": 0.13288873434066772, "learning_rate": 0.00018305640260253553, "loss": 0.1749, "step": 3750 }, { "epoch": 1.703431095006228, "grad_norm": 0.146206796169281, "learning_rate": 0.00018295410095890696, "loss": 0.191, "step": 3760 }, { "epoch": 1.7079605933642847, "grad_norm": 0.13878855109214783, "learning_rate": 0.00018285152017300885, "loss": 0.1827, "step": 3770 }, { "epoch": 1.7124900917223418, "grad_norm": 0.14912264049053192, "learning_rate": 0.00018275895908433733, "loss": 0.173, "step": 3780 }, { "epoch": 1.7170195900803986, "grad_norm": 0.14632469415664673, "learning_rate": 0.0001826558488798913, "loss": 0.1776, "step": 3790 }, { "epoch": 1.7215490884384554, "grad_norm": 0.14830105006694794, "learning_rate": 0.0001825524605368002, "loss": 0.1762, "step": 3800 }, { "epoch": 1.7260785867965123, "grad_norm": 0.15307176113128662, "learning_rate": 0.00018244879440296793, "loss": 0.1753, "step": 3810 }, { "epoch": 1.730608085154569, "grad_norm": 0.15168583393096924, "learning_rate": 0.0001823448508272332, "loss": 0.1774, "step": 3820 }, { "epoch": 1.735137583512626, "grad_norm": 0.14207693934440613, "learning_rate": 0.0001822406301593683, "loss": 0.1765, "step": 3830 }, { "epoch": 1.739667081870683, "grad_norm": 0.15022936463356018, "learning_rate": 0.0001821361327500779, "loss": 0.1852, "step": 3840 }, { "epoch": 1.7441965802287398, "grad_norm": 0.14267757534980774, "learning_rate": 0.00018203135895099797, "loss": 0.1788, "step": 3850 }, { "epoch": 1.7487260785867966, "grad_norm": 0.13068848848342896, "learning_rate": 0.00018192630911469454, "loss": 0.1834, "step": 3860 }, { "epoch": 1.7532555769448535, "grad_norm": 0.13527341187000275, "learning_rate": 0.00018182098359466244, "loss": 0.1878, "step": 3870 }, { "epoch": 1.7577850753029103, "grad_norm": 0.14090019464492798, "learning_rate": 0.00018171538274532428, "loss": 0.1825, "step": 3880 }, { "epoch": 1.762314573660967, "grad_norm": 0.16419830918312073, "learning_rate": 0.00018160950692202907, "loss": 0.1735, "step": 3890 }, { "epoch": 1.766844072019024, "grad_norm": 0.13737310469150543, "learning_rate": 0.00018150335648105118, "loss": 0.1798, "step": 3900 }, { "epoch": 1.7713735703770808, "grad_norm": 0.13491977751255035, "learning_rate": 0.00018139693177958902, "loss": 0.1814, "step": 3910 }, { "epoch": 1.7759030687351376, "grad_norm": 0.13069839775562286, "learning_rate": 0.0001812902331757639, "loss": 0.1795, "step": 3920 }, { "epoch": 1.7804325670931944, "grad_norm": 0.14693836867809296, "learning_rate": 0.0001811832610286189, "loss": 0.1798, "step": 3930 }, { "epoch": 1.7849620654512512, "grad_norm": 0.15298062562942505, "learning_rate": 0.00018107601569811746, "loss": 0.1717, "step": 3940 }, { "epoch": 1.789491563809308, "grad_norm": 0.1533603072166443, "learning_rate": 0.0001809684975451423, "loss": 0.1825, "step": 3950 }, { "epoch": 1.794021062167365, "grad_norm": 0.15522614121437073, "learning_rate": 0.00018086070693149435, "loss": 0.1843, "step": 3960 }, { "epoch": 1.7985505605254217, "grad_norm": 0.12531672418117523, "learning_rate": 0.00018075264421989117, "loss": 0.1839, "step": 3970 }, { "epoch": 1.8030800588834786, "grad_norm": 0.1647823303937912, "learning_rate": 0.00018064430977396607, "loss": 0.1842, "step": 3980 }, { "epoch": 1.8076095572415354, "grad_norm": 0.14417417347431183, "learning_rate": 0.00018053570395826666, "loss": 0.17, "step": 3990 }, { "epoch": 1.8121390555995922, "grad_norm": 0.14394541084766388, "learning_rate": 0.00018042682713825377, "loss": 0.181, "step": 4000 }, { "epoch": 1.816668553957649, "grad_norm": 0.13082464039325714, "learning_rate": 0.0001803176796803002, "loss": 0.1759, "step": 4010 }, { "epoch": 1.821198052315706, "grad_norm": 0.13551370799541473, "learning_rate": 0.00018020826195168938, "loss": 0.1737, "step": 4020 }, { "epoch": 1.825727550673763, "grad_norm": 0.16460978984832764, "learning_rate": 0.00018009857432061424, "loss": 0.1788, "step": 4030 }, { "epoch": 1.8302570490318197, "grad_norm": 0.1246340349316597, "learning_rate": 0.00017998861715617595, "loss": 0.1648, "step": 4040 }, { "epoch": 1.8347865473898766, "grad_norm": 0.14473074674606323, "learning_rate": 0.00017987839082838264, "loss": 0.1683, "step": 4050 }, { "epoch": 1.8393160457479334, "grad_norm": 0.13617068529129028, "learning_rate": 0.00017976789570814812, "loss": 0.1731, "step": 4060 }, { "epoch": 1.8438455441059902, "grad_norm": 0.1399005949497223, "learning_rate": 0.00017965713216729084, "loss": 0.1726, "step": 4070 }, { "epoch": 1.8483750424640473, "grad_norm": 0.15167087316513062, "learning_rate": 0.00017954610057853242, "loss": 0.1769, "step": 4080 }, { "epoch": 1.852904540822104, "grad_norm": 0.1486155092716217, "learning_rate": 0.00017943480131549637, "loss": 0.1735, "step": 4090 }, { "epoch": 1.857434039180161, "grad_norm": 0.12672476470470428, "learning_rate": 0.00017932323475270713, "loss": 0.1692, "step": 4100 }, { "epoch": 1.8619635375382178, "grad_norm": 0.12943005561828613, "learning_rate": 0.0001792114012655884, "loss": 0.1736, "step": 4110 }, { "epoch": 1.8664930358962746, "grad_norm": 0.1305234730243683, "learning_rate": 0.00017909930123046226, "loss": 0.1693, "step": 4120 }, { "epoch": 1.8710225342543314, "grad_norm": 0.15078797936439514, "learning_rate": 0.00017898693502454757, "loss": 0.1714, "step": 4130 }, { "epoch": 1.8755520326123882, "grad_norm": 0.13605743646621704, "learning_rate": 0.00017887430302595902, "loss": 0.1742, "step": 4140 }, { "epoch": 1.880081530970445, "grad_norm": 0.15072084963321686, "learning_rate": 0.0001787614056137056, "loss": 0.1761, "step": 4150 }, { "epoch": 1.8846110293285019, "grad_norm": 0.12788626551628113, "learning_rate": 0.0001786482431676894, "loss": 0.1698, "step": 4160 }, { "epoch": 1.8891405276865587, "grad_norm": 0.11104808747768402, "learning_rate": 0.00017853481606870447, "loss": 0.1673, "step": 4170 }, { "epoch": 1.8936700260446155, "grad_norm": 0.15082287788391113, "learning_rate": 0.00017842112469843526, "loss": 0.1771, "step": 4180 }, { "epoch": 1.8981995244026724, "grad_norm": 0.13250093162059784, "learning_rate": 0.00017830716943945566, "loss": 0.1693, "step": 4190 }, { "epoch": 1.9027290227607292, "grad_norm": 0.1345834881067276, "learning_rate": 0.00017819295067522746, "loss": 0.1657, "step": 4200 }, { "epoch": 1.907258521118786, "grad_norm": 0.12472589313983917, "learning_rate": 0.00017807846879009916, "loss": 0.1673, "step": 4210 }, { "epoch": 1.9117880194768428, "grad_norm": 0.14480777084827423, "learning_rate": 0.00017796372416930466, "loss": 0.1617, "step": 4220 }, { "epoch": 1.9163175178348997, "grad_norm": 0.12188120186328888, "learning_rate": 0.00017784871719896207, "loss": 0.1697, "step": 4230 }, { "epoch": 1.9208470161929565, "grad_norm": 0.13561968505382538, "learning_rate": 0.0001777334482660721, "loss": 0.1675, "step": 4240 }, { "epoch": 1.9253765145510133, "grad_norm": 0.1565788984298706, "learning_rate": 0.0001776179177585172, "loss": 0.1695, "step": 4250 }, { "epoch": 1.9299060129090702, "grad_norm": 0.13274118304252625, "learning_rate": 0.00017750212606505988, "loss": 0.173, "step": 4260 }, { "epoch": 1.9344355112671272, "grad_norm": 0.13509687781333923, "learning_rate": 0.0001773860735753416, "loss": 0.1711, "step": 4270 }, { "epoch": 1.938965009625184, "grad_norm": 0.14307665824890137, "learning_rate": 0.0001772697606798814, "loss": 0.1752, "step": 4280 }, { "epoch": 1.9434945079832409, "grad_norm": 0.14142999053001404, "learning_rate": 0.0001771531877700746, "loss": 0.1746, "step": 4290 }, { "epoch": 1.9480240063412977, "grad_norm": 0.13015881180763245, "learning_rate": 0.0001770363552381914, "loss": 0.1624, "step": 4300 }, { "epoch": 1.9525535046993545, "grad_norm": 0.15056206285953522, "learning_rate": 0.00017691926347737573, "loss": 0.1683, "step": 4310 }, { "epoch": 1.9570830030574113, "grad_norm": 0.1449085772037506, "learning_rate": 0.00017680191288164382, "loss": 0.1652, "step": 4320 }, { "epoch": 1.9616125014154684, "grad_norm": 0.13363459706306458, "learning_rate": 0.00017668430384588278, "loss": 0.1755, "step": 4330 }, { "epoch": 1.9661419997735252, "grad_norm": 0.11182225495576859, "learning_rate": 0.00017656643676584955, "loss": 0.1649, "step": 4340 }, { "epoch": 1.970671498131582, "grad_norm": 0.1344953030347824, "learning_rate": 0.00017644831203816926, "loss": 0.1699, "step": 4350 }, { "epoch": 1.9752009964896389, "grad_norm": 0.14654122292995453, "learning_rate": 0.000176329930060334, "loss": 0.1646, "step": 4360 }, { "epoch": 1.9797304948476957, "grad_norm": 0.12001664191484451, "learning_rate": 0.00017621129123070167, "loss": 0.1732, "step": 4370 }, { "epoch": 1.9842599932057525, "grad_norm": 0.12289103865623474, "learning_rate": 0.00017609239594849435, "loss": 0.1665, "step": 4380 }, { "epoch": 1.9887894915638094, "grad_norm": 0.15383568406105042, "learning_rate": 0.00017597324461379716, "loss": 0.1668, "step": 4390 }, { "epoch": 1.9933189899218662, "grad_norm": 0.11333877593278885, "learning_rate": 0.0001758538376275568, "loss": 0.1699, "step": 4400 }, { "epoch": 1.997848488279923, "grad_norm": 0.13718217611312866, "learning_rate": 0.00017573417539158017, "loss": 0.1674, "step": 4410 }, { "epoch": 1.9996602876231457, "eval_loss": 0.17693181335926056, "eval_runtime": 617.1958, "eval_samples_per_second": 12.751, "eval_steps_per_second": 1.594, "step": 4414 }, { "epoch": 2.002717699014834, "grad_norm": 0.12558519840240479, "learning_rate": 0.0001756142583085333, "loss": 0.1601, "step": 4420 }, { "epoch": 2.007247197372891, "grad_norm": 0.171942800283432, "learning_rate": 0.00017549408678193962, "loss": 0.1325, "step": 4430 }, { "epoch": 2.0117766957309478, "grad_norm": 0.12557823956012726, "learning_rate": 0.0001753736612161788, "loss": 0.1337, "step": 4440 }, { "epoch": 2.0163061940890046, "grad_norm": 0.1112385243177414, "learning_rate": 0.00017525298201648534, "loss": 0.1353, "step": 4450 }, { "epoch": 2.0208356924470614, "grad_norm": 0.10396666824817657, "learning_rate": 0.00017513204958894728, "loss": 0.1344, "step": 4460 }, { "epoch": 2.0253651908051182, "grad_norm": 0.11958423256874084, "learning_rate": 0.0001750108643405047, "loss": 0.1325, "step": 4470 }, { "epoch": 2.029894689163175, "grad_norm": 0.13883349299430847, "learning_rate": 0.00017488942667894856, "loss": 0.1308, "step": 4480 }, { "epoch": 2.034424187521232, "grad_norm": 0.12778469920158386, "learning_rate": 0.00017476773701291905, "loss": 0.1285, "step": 4490 }, { "epoch": 2.0389536858792887, "grad_norm": 0.12921588122844696, "learning_rate": 0.00017464579575190444, "loss": 0.1286, "step": 4500 }, { "epoch": 2.0434831842373455, "grad_norm": 0.14378762245178223, "learning_rate": 0.00017452360330623957, "loss": 0.1389, "step": 4510 }, { "epoch": 2.0480126825954024, "grad_norm": 0.13812440633773804, "learning_rate": 0.00017440116008710457, "loss": 0.1342, "step": 4520 }, { "epoch": 2.052542180953459, "grad_norm": 0.15414589643478394, "learning_rate": 0.00017427846650652342, "loss": 0.1381, "step": 4530 }, { "epoch": 2.0570716793115165, "grad_norm": 0.11771693825721741, "learning_rate": 0.00017415552297736256, "loss": 0.1344, "step": 4540 }, { "epoch": 2.0616011776695733, "grad_norm": 0.13729040324687958, "learning_rate": 0.00017403232991332953, "loss": 0.1323, "step": 4550 }, { "epoch": 2.06613067602763, "grad_norm": 0.11777821183204651, "learning_rate": 0.00017390888772897148, "loss": 0.1354, "step": 4560 }, { "epoch": 2.070660174385687, "grad_norm": 0.11759165674448013, "learning_rate": 0.00017378519683967399, "loss": 0.1359, "step": 4570 }, { "epoch": 2.0751896727437438, "grad_norm": 0.14665256440639496, "learning_rate": 0.00017366125766165943, "loss": 0.1295, "step": 4580 }, { "epoch": 2.0797191711018006, "grad_norm": 0.12388816475868225, "learning_rate": 0.00017353707061198574, "loss": 0.1366, "step": 4590 }, { "epoch": 2.0842486694598574, "grad_norm": 0.12518715858459473, "learning_rate": 0.00017341263610854487, "loss": 0.1372, "step": 4600 }, { "epoch": 2.0887781678179143, "grad_norm": 0.1429567039012909, "learning_rate": 0.00017328795457006153, "loss": 0.1326, "step": 4610 }, { "epoch": 2.093307666175971, "grad_norm": 0.11989770084619522, "learning_rate": 0.00017316302641609167, "loss": 0.134, "step": 4620 }, { "epoch": 2.097837164534028, "grad_norm": 0.11995401233434677, "learning_rate": 0.00017303785206702115, "loss": 0.136, "step": 4630 }, { "epoch": 2.1023666628920847, "grad_norm": 0.11321832239627838, "learning_rate": 0.0001729124319440642, "loss": 0.1371, "step": 4640 }, { "epoch": 2.1068961612501416, "grad_norm": 0.11317916214466095, "learning_rate": 0.00017278676646926219, "loss": 0.1303, "step": 4650 }, { "epoch": 2.1114256596081984, "grad_norm": 0.11971450597047806, "learning_rate": 0.00017266085606548197, "loss": 0.1363, "step": 4660 }, { "epoch": 2.115955157966255, "grad_norm": 0.12779143452644348, "learning_rate": 0.00017253470115641473, "loss": 0.1395, "step": 4670 }, { "epoch": 2.120484656324312, "grad_norm": 0.12094374746084213, "learning_rate": 0.00017240830216657432, "loss": 0.1337, "step": 4680 }, { "epoch": 2.125014154682369, "grad_norm": 0.11902227252721786, "learning_rate": 0.00017228165952129601, "loss": 0.1342, "step": 4690 }, { "epoch": 2.1295436530404257, "grad_norm": 0.12663759291172028, "learning_rate": 0.00017215477364673486, "loss": 0.1356, "step": 4700 }, { "epoch": 2.1340731513984825, "grad_norm": 0.12311159074306488, "learning_rate": 0.0001720276449698645, "loss": 0.1364, "step": 4710 }, { "epoch": 2.1386026497565394, "grad_norm": 0.134132981300354, "learning_rate": 0.00017190027391847555, "loss": 0.1352, "step": 4720 }, { "epoch": 2.143132148114596, "grad_norm": 0.1177242249250412, "learning_rate": 0.00017177266092117428, "loss": 0.132, "step": 4730 }, { "epoch": 2.147661646472653, "grad_norm": 0.11641071736812592, "learning_rate": 0.00017164480640738101, "loss": 0.1359, "step": 4740 }, { "epoch": 2.15219114483071, "grad_norm": 0.1303935945034027, "learning_rate": 0.00017151671080732888, "loss": 0.1354, "step": 4750 }, { "epoch": 2.1567206431887667, "grad_norm": 0.13929632306098938, "learning_rate": 0.0001713883745520622, "loss": 0.1303, "step": 4760 }, { "epoch": 2.1612501415468235, "grad_norm": 0.13775485754013062, "learning_rate": 0.00017125979807343519, "loss": 0.1379, "step": 4770 }, { "epoch": 2.1657796399048808, "grad_norm": 0.10667065531015396, "learning_rate": 0.00017113098180411026, "loss": 0.1323, "step": 4780 }, { "epoch": 2.1703091382629376, "grad_norm": 0.12592215836048126, "learning_rate": 0.00017100192617755693, "loss": 0.1326, "step": 4790 }, { "epoch": 2.1748386366209944, "grad_norm": 0.12523461878299713, "learning_rate": 0.00017087263162805, "loss": 0.1361, "step": 4800 }, { "epoch": 2.1793681349790512, "grad_norm": 0.13614587485790253, "learning_rate": 0.00017074309859066837, "loss": 0.136, "step": 4810 }, { "epoch": 2.183897633337108, "grad_norm": 0.13419945538043976, "learning_rate": 0.00017061332750129332, "loss": 0.1299, "step": 4820 }, { "epoch": 2.188427131695165, "grad_norm": 0.10393204540014267, "learning_rate": 0.00017048331879660733, "loss": 0.1334, "step": 4830 }, { "epoch": 2.1929566300532217, "grad_norm": 0.12654437124729156, "learning_rate": 0.00017035307291409234, "loss": 0.138, "step": 4840 }, { "epoch": 2.1974861284112785, "grad_norm": 0.12029164284467697, "learning_rate": 0.00017022259029202843, "loss": 0.1329, "step": 4850 }, { "epoch": 2.2020156267693354, "grad_norm": 0.1427529752254486, "learning_rate": 0.00017009187136949238, "loss": 0.1314, "step": 4860 }, { "epoch": 2.206545125127392, "grad_norm": 0.10956190526485443, "learning_rate": 0.00016996091658635603, "loss": 0.1324, "step": 4870 }, { "epoch": 2.211074623485449, "grad_norm": 0.12758436799049377, "learning_rate": 0.00016982972638328496, "loss": 0.1326, "step": 4880 }, { "epoch": 2.215604121843506, "grad_norm": 0.10729292035102844, "learning_rate": 0.00016969830120173692, "loss": 0.1317, "step": 4890 }, { "epoch": 2.2201336202015627, "grad_norm": 0.14230488240718842, "learning_rate": 0.0001695666414839604, "loss": 0.1387, "step": 4900 }, { "epoch": 2.2246631185596195, "grad_norm": 0.13682898879051208, "learning_rate": 0.00016943474767299298, "loss": 0.1341, "step": 4910 }, { "epoch": 2.2291926169176763, "grad_norm": 0.14022116363048553, "learning_rate": 0.0001693026202126602, "loss": 0.1345, "step": 4920 }, { "epoch": 2.233722115275733, "grad_norm": 0.12787717580795288, "learning_rate": 0.00016917025954757365, "loss": 0.138, "step": 4930 }, { "epoch": 2.23825161363379, "grad_norm": 0.12592186033725739, "learning_rate": 0.00016903766612312967, "loss": 0.135, "step": 4940 }, { "epoch": 2.242781111991847, "grad_norm": 0.12485472112894058, "learning_rate": 0.00016890484038550792, "loss": 0.1305, "step": 4950 }, { "epoch": 2.2473106103499036, "grad_norm": 0.12487582862377167, "learning_rate": 0.0001687717827816698, "loss": 0.1352, "step": 4960 }, { "epoch": 2.2518401087079605, "grad_norm": 0.1367800235748291, "learning_rate": 0.0001686384937593568, "loss": 0.1377, "step": 4970 }, { "epoch": 2.2563696070660173, "grad_norm": 0.12008614093065262, "learning_rate": 0.00016850497376708935, "loss": 0.1399, "step": 4980 }, { "epoch": 2.260899105424074, "grad_norm": 0.1453281044960022, "learning_rate": 0.00016837122325416494, "loss": 0.134, "step": 4990 }, { "epoch": 2.265428603782131, "grad_norm": 0.1182338148355484, "learning_rate": 0.00016823724267065683, "loss": 0.1386, "step": 5000 }, { "epoch": 2.269958102140188, "grad_norm": 0.1372307538986206, "learning_rate": 0.00016810303246741245, "loss": 0.1336, "step": 5010 }, { "epoch": 2.2744876004982446, "grad_norm": 0.1213153526186943, "learning_rate": 0.00016796859309605195, "loss": 0.1345, "step": 5020 }, { "epoch": 2.2790170988563014, "grad_norm": 0.12057512998580933, "learning_rate": 0.00016783392500896652, "loss": 0.1324, "step": 5030 }, { "epoch": 2.2835465972143583, "grad_norm": 0.13681593537330627, "learning_rate": 0.00016769902865931718, "loss": 0.1377, "step": 5040 }, { "epoch": 2.2880760955724155, "grad_norm": 0.12073809653520584, "learning_rate": 0.00016756390450103285, "loss": 0.1358, "step": 5050 }, { "epoch": 2.2926055939304724, "grad_norm": 0.1260959357023239, "learning_rate": 0.00016742855298880916, "loss": 0.1327, "step": 5060 }, { "epoch": 2.297135092288529, "grad_norm": 0.12705475091934204, "learning_rate": 0.0001672929745781068, "loss": 0.1326, "step": 5070 }, { "epoch": 2.301664590646586, "grad_norm": 0.12451212108135223, "learning_rate": 0.00016715716972514984, "loss": 0.1357, "step": 5080 }, { "epoch": 2.306194089004643, "grad_norm": 0.10446886718273163, "learning_rate": 0.00016702113888692448, "loss": 0.1346, "step": 5090 }, { "epoch": 2.3107235873626997, "grad_norm": 0.1240820363163948, "learning_rate": 0.0001668848825211773, "loss": 0.1376, "step": 5100 }, { "epoch": 2.3152530857207565, "grad_norm": 0.11466921865940094, "learning_rate": 0.00016674840108641382, "loss": 0.1347, "step": 5110 }, { "epoch": 2.3197825840788133, "grad_norm": 0.12086183577775955, "learning_rate": 0.00016661169504189686, "loss": 0.1392, "step": 5120 }, { "epoch": 2.32431208243687, "grad_norm": 0.12020442634820938, "learning_rate": 0.0001664747648476451, "loss": 0.1326, "step": 5130 }, { "epoch": 2.328841580794927, "grad_norm": 0.1300458312034607, "learning_rate": 0.0001663376109644315, "loss": 0.1382, "step": 5140 }, { "epoch": 2.333371079152984, "grad_norm": 0.11588041484355927, "learning_rate": 0.00016620023385378172, "loss": 0.1348, "step": 5150 }, { "epoch": 2.3379005775110406, "grad_norm": 0.11398044973611832, "learning_rate": 0.0001660626339779726, "loss": 0.1335, "step": 5160 }, { "epoch": 2.3424300758690975, "grad_norm": 0.10993365198373795, "learning_rate": 0.0001659248118000305, "loss": 0.1314, "step": 5170 }, { "epoch": 2.3469595742271543, "grad_norm": 0.11220837384462357, "learning_rate": 0.00016578676778373, "loss": 0.1376, "step": 5180 }, { "epoch": 2.351489072585211, "grad_norm": 0.12188950926065445, "learning_rate": 0.000165648502393592, "loss": 0.1371, "step": 5190 }, { "epoch": 2.356018570943268, "grad_norm": 0.11867307126522064, "learning_rate": 0.00016551001609488246, "loss": 0.1335, "step": 5200 }, { "epoch": 2.3605480693013248, "grad_norm": 0.14046625792980194, "learning_rate": 0.00016537130935361064, "loss": 0.1392, "step": 5210 }, { "epoch": 2.3650775676593816, "grad_norm": 0.11454641073942184, "learning_rate": 0.00016523238263652757, "loss": 0.139, "step": 5220 }, { "epoch": 2.3696070660174384, "grad_norm": 0.1256382018327713, "learning_rate": 0.00016509323641112456, "loss": 0.1366, "step": 5230 }, { "epoch": 2.3741365643754953, "grad_norm": 0.11187759041786194, "learning_rate": 0.00016495387114563153, "loss": 0.1338, "step": 5240 }, { "epoch": 2.378666062733552, "grad_norm": 0.14559686183929443, "learning_rate": 0.0001648142873090155, "loss": 0.136, "step": 5250 }, { "epoch": 2.3831955610916093, "grad_norm": 0.12695267796516418, "learning_rate": 0.00016467448537097894, "loss": 0.1365, "step": 5260 }, { "epoch": 2.387725059449666, "grad_norm": 0.1341744363307953, "learning_rate": 0.0001645344658019583, "loss": 0.1354, "step": 5270 }, { "epoch": 2.392254557807723, "grad_norm": 0.12615807354450226, "learning_rate": 0.0001643942290731223, "loss": 0.1317, "step": 5280 }, { "epoch": 2.39678405616578, "grad_norm": 0.1132565289735794, "learning_rate": 0.00016425377565637054, "loss": 0.1322, "step": 5290 }, { "epoch": 2.4013135545238367, "grad_norm": 0.11671450734138489, "learning_rate": 0.00016411310602433156, "loss": 0.1296, "step": 5300 }, { "epoch": 2.4058430528818935, "grad_norm": 0.1351209580898285, "learning_rate": 0.00016397222065036164, "loss": 0.1304, "step": 5310 }, { "epoch": 2.4103725512399503, "grad_norm": 0.1276492178440094, "learning_rate": 0.000163831120008543, "loss": 0.1361, "step": 5320 }, { "epoch": 2.414902049598007, "grad_norm": 0.13524995744228363, "learning_rate": 0.00016368980457368216, "loss": 0.133, "step": 5330 }, { "epoch": 2.419431547956064, "grad_norm": 0.1324642449617386, "learning_rate": 0.00016354827482130855, "loss": 0.1373, "step": 5340 }, { "epoch": 2.423961046314121, "grad_norm": 0.13200613856315613, "learning_rate": 0.0001634065312276727, "loss": 0.1367, "step": 5350 }, { "epoch": 2.4284905446721776, "grad_norm": 0.12052213400602341, "learning_rate": 0.00016326457426974475, "loss": 0.1335, "step": 5360 }, { "epoch": 2.4330200430302344, "grad_norm": 0.1289413571357727, "learning_rate": 0.00016312240442521278, "loss": 0.1358, "step": 5370 }, { "epoch": 2.4375495413882913, "grad_norm": 0.11921897530555725, "learning_rate": 0.00016298002217248131, "loss": 0.1322, "step": 5380 }, { "epoch": 2.442079039746348, "grad_norm": 0.14872752130031586, "learning_rate": 0.00016283742799066953, "loss": 0.1385, "step": 5390 }, { "epoch": 2.446608538104405, "grad_norm": 0.11772260814905167, "learning_rate": 0.00016269462235960985, "loss": 0.1336, "step": 5400 }, { "epoch": 2.4511380364624618, "grad_norm": 0.13925409317016602, "learning_rate": 0.00016255160575984616, "loss": 0.137, "step": 5410 }, { "epoch": 2.4556675348205186, "grad_norm": 0.1357075273990631, "learning_rate": 0.00016240837867263227, "loss": 0.1349, "step": 5420 }, { "epoch": 2.4601970331785754, "grad_norm": 0.1274648904800415, "learning_rate": 0.00016226494157993036, "loss": 0.1307, "step": 5430 }, { "epoch": 2.4647265315366322, "grad_norm": 0.1424674391746521, "learning_rate": 0.00016212129496440914, "loss": 0.1359, "step": 5440 }, { "epoch": 2.469256029894689, "grad_norm": 0.1157744899392128, "learning_rate": 0.00016197743930944247, "loss": 0.1371, "step": 5450 }, { "epoch": 2.473785528252746, "grad_norm": 0.1353282928466797, "learning_rate": 0.00016183337509910762, "loss": 0.1399, "step": 5460 }, { "epoch": 2.4783150266108027, "grad_norm": 0.11779867857694626, "learning_rate": 0.00016168910281818367, "loss": 0.1348, "step": 5470 }, { "epoch": 2.4828445249688595, "grad_norm": 0.11190491169691086, "learning_rate": 0.00016154462295214984, "loss": 0.1341, "step": 5480 }, { "epoch": 2.4873740233269164, "grad_norm": 0.1286158561706543, "learning_rate": 0.0001613999359871838, "loss": 0.1323, "step": 5490 }, { "epoch": 2.491903521684973, "grad_norm": 0.12542322278022766, "learning_rate": 0.0001612550424101603, "loss": 0.1365, "step": 5500 }, { "epoch": 2.49643302004303, "grad_norm": 0.12170036882162094, "learning_rate": 0.00016110994270864912, "loss": 0.1344, "step": 5510 }, { "epoch": 2.500962518401087, "grad_norm": 0.13724590837955475, "learning_rate": 0.00016096463737091382, "loss": 0.1325, "step": 5520 }, { "epoch": 2.5054920167591437, "grad_norm": 0.11381508409976959, "learning_rate": 0.00016081912688590988, "loss": 0.1339, "step": 5530 }, { "epoch": 2.5100215151172005, "grad_norm": 0.12289192527532578, "learning_rate": 0.00016067341174328306, "loss": 0.1302, "step": 5540 }, { "epoch": 2.514551013475258, "grad_norm": 0.12465256452560425, "learning_rate": 0.00016052749243336786, "loss": 0.1354, "step": 5550 }, { "epoch": 2.5190805118333146, "grad_norm": 0.12437895685434341, "learning_rate": 0.0001603813694471858, "loss": 0.1321, "step": 5560 }, { "epoch": 2.5236100101913714, "grad_norm": 0.12177952378988266, "learning_rate": 0.00016023504327644376, "loss": 0.1387, "step": 5570 }, { "epoch": 2.5281395085494283, "grad_norm": 0.12667645514011383, "learning_rate": 0.00016008851441353232, "loss": 0.1383, "step": 5580 }, { "epoch": 2.532669006907485, "grad_norm": 0.13816499710083008, "learning_rate": 0.00015994178335152412, "loss": 0.1419, "step": 5590 }, { "epoch": 2.537198505265542, "grad_norm": 0.13884486258029938, "learning_rate": 0.00015979485058417226, "loss": 0.1345, "step": 5600 }, { "epoch": 2.5417280036235987, "grad_norm": 0.13231264054775238, "learning_rate": 0.0001596477166059085, "loss": 0.1386, "step": 5610 }, { "epoch": 2.5462575019816556, "grad_norm": 0.10923223942518234, "learning_rate": 0.00015950038191184178, "loss": 0.1382, "step": 5620 }, { "epoch": 2.5507870003397124, "grad_norm": 0.1239657923579216, "learning_rate": 0.00015935284699775638, "loss": 0.1345, "step": 5630 }, { "epoch": 2.5553164986977692, "grad_norm": 0.11910531669855118, "learning_rate": 0.00015920511236011038, "loss": 0.1321, "step": 5640 }, { "epoch": 2.559845997055826, "grad_norm": 0.1176079511642456, "learning_rate": 0.00015905717849603384, "loss": 0.1379, "step": 5650 }, { "epoch": 2.564375495413883, "grad_norm": 0.10820971429347992, "learning_rate": 0.0001589090459033273, "loss": 0.1353, "step": 5660 }, { "epoch": 2.5689049937719397, "grad_norm": 0.11455655097961426, "learning_rate": 0.00015876071508046002, "loss": 0.1375, "step": 5670 }, { "epoch": 2.5734344921299965, "grad_norm": 0.13477309048175812, "learning_rate": 0.00015861218652656826, "loss": 0.1345, "step": 5680 }, { "epoch": 2.5779639904880534, "grad_norm": 0.1447640061378479, "learning_rate": 0.00015846346074145374, "loss": 0.1398, "step": 5690 }, { "epoch": 2.58249348884611, "grad_norm": 0.11953482776880264, "learning_rate": 0.00015831453822558178, "loss": 0.1323, "step": 5700 }, { "epoch": 2.587022987204167, "grad_norm": 0.11846103519201279, "learning_rate": 0.00015816541948007967, "loss": 0.1359, "step": 5710 }, { "epoch": 2.591552485562224, "grad_norm": 0.1382216066122055, "learning_rate": 0.00015801610500673524, "loss": 0.1406, "step": 5720 }, { "epoch": 2.5960819839202807, "grad_norm": 0.12505120038986206, "learning_rate": 0.0001578665953079946, "loss": 0.1315, "step": 5730 }, { "epoch": 2.600611482278338, "grad_norm": 0.13036322593688965, "learning_rate": 0.00015771689088696112, "loss": 0.1322, "step": 5740 }, { "epoch": 2.6051409806363948, "grad_norm": 0.10827736556529999, "learning_rate": 0.00015756699224739323, "loss": 0.1346, "step": 5750 }, { "epoch": 2.6096704789944516, "grad_norm": 0.12595966458320618, "learning_rate": 0.00015741689989370294, "loss": 0.1318, "step": 5760 }, { "epoch": 2.6141999773525084, "grad_norm": 0.12824150919914246, "learning_rate": 0.0001572666143309542, "loss": 0.1287, "step": 5770 }, { "epoch": 2.6187294757105652, "grad_norm": 0.12415400892496109, "learning_rate": 0.00015711613606486096, "loss": 0.1329, "step": 5780 }, { "epoch": 2.623258974068622, "grad_norm": 0.1439315378665924, "learning_rate": 0.0001569654656017858, "loss": 0.1307, "step": 5790 }, { "epoch": 2.627788472426679, "grad_norm": 0.11085296422243118, "learning_rate": 0.00015681460344873786, "loss": 0.1343, "step": 5800 }, { "epoch": 2.6323179707847357, "grad_norm": 0.12394888699054718, "learning_rate": 0.00015666355011337147, "loss": 0.132, "step": 5810 }, { "epoch": 2.6368474691427926, "grad_norm": 0.1326746642589569, "learning_rate": 0.0001565123061039842, "loss": 0.1354, "step": 5820 }, { "epoch": 2.6413769675008494, "grad_norm": 0.11657778173685074, "learning_rate": 0.00015636087192951527, "loss": 0.1354, "step": 5830 }, { "epoch": 2.645906465858906, "grad_norm": 0.12350430339574814, "learning_rate": 0.0001562092480995439, "loss": 0.137, "step": 5840 }, { "epoch": 2.650435964216963, "grad_norm": 0.1291380524635315, "learning_rate": 0.0001560574351242873, "loss": 0.1332, "step": 5850 }, { "epoch": 2.65496546257502, "grad_norm": 0.13578584790229797, "learning_rate": 0.00015590543351459937, "loss": 0.1338, "step": 5860 }, { "epoch": 2.6594949609330767, "grad_norm": 0.11825544387102127, "learning_rate": 0.00015575324378196866, "loss": 0.1304, "step": 5870 }, { "epoch": 2.6640244592911335, "grad_norm": 0.11767857521772385, "learning_rate": 0.00015560086643851676, "loss": 0.1346, "step": 5880 }, { "epoch": 2.6685539576491903, "grad_norm": 0.12600229680538177, "learning_rate": 0.00015544830199699662, "loss": 0.1335, "step": 5890 }, { "epoch": 2.673083456007247, "grad_norm": 0.11990875750780106, "learning_rate": 0.00015529555097079065, "loss": 0.1341, "step": 5900 }, { "epoch": 2.677612954365304, "grad_norm": 0.10967559367418289, "learning_rate": 0.00015514261387390935, "loss": 0.1305, "step": 5910 }, { "epoch": 2.682142452723361, "grad_norm": 0.1208115667104721, "learning_rate": 0.00015498949122098914, "loss": 0.1329, "step": 5920 }, { "epoch": 2.6866719510814177, "grad_norm": 0.12302912771701813, "learning_rate": 0.00015483618352729093, "loss": 0.141, "step": 5930 }, { "epoch": 2.6912014494394745, "grad_norm": 0.14282426238059998, "learning_rate": 0.00015468269130869834, "loss": 0.1312, "step": 5940 }, { "epoch": 2.6957309477975313, "grad_norm": 0.1203923374414444, "learning_rate": 0.0001545290150817158, "loss": 0.1327, "step": 5950 }, { "epoch": 2.700260446155588, "grad_norm": 0.141504168510437, "learning_rate": 0.00015437515536346704, "loss": 0.1307, "step": 5960 }, { "epoch": 2.704789944513645, "grad_norm": 0.12170039117336273, "learning_rate": 0.00015422111267169322, "loss": 0.139, "step": 5970 }, { "epoch": 2.709319442871702, "grad_norm": 0.13064149022102356, "learning_rate": 0.0001540668875247511, "loss": 0.1358, "step": 5980 }, { "epoch": 2.7138489412297586, "grad_norm": 0.11947247385978699, "learning_rate": 0.00015391248044161162, "loss": 0.1301, "step": 5990 }, { "epoch": 2.7183784395878154, "grad_norm": 0.10719356685876846, "learning_rate": 0.00015375789194185772, "loss": 0.1296, "step": 6000 }, { "epoch": 2.7229079379458723, "grad_norm": 0.11288373172283173, "learning_rate": 0.00015360312254568295, "loss": 0.1336, "step": 6010 }, { "epoch": 2.727437436303929, "grad_norm": 0.12122143059968948, "learning_rate": 0.00015344817277388955, "loss": 0.1293, "step": 6020 }, { "epoch": 2.731966934661986, "grad_norm": 0.11723847687244415, "learning_rate": 0.0001532930431478867, "loss": 0.133, "step": 6030 }, { "epoch": 2.736496433020043, "grad_norm": 0.11670687049627304, "learning_rate": 0.00015313773418968878, "loss": 0.127, "step": 6040 }, { "epoch": 2.7410259313781, "grad_norm": 0.13267673552036285, "learning_rate": 0.00015298224642191368, "loss": 0.1287, "step": 6050 }, { "epoch": 2.745555429736157, "grad_norm": 0.12557269632816315, "learning_rate": 0.00015282658036778094, "loss": 0.1371, "step": 6060 }, { "epoch": 2.7500849280942137, "grad_norm": 0.12416243553161621, "learning_rate": 0.0001526707365511101, "loss": 0.1339, "step": 6070 }, { "epoch": 2.7546144264522705, "grad_norm": 0.13237670063972473, "learning_rate": 0.00015251471549631882, "loss": 0.1307, "step": 6080 }, { "epoch": 2.7591439248103273, "grad_norm": 0.10942938178777695, "learning_rate": 0.00015235851772842115, "loss": 0.1325, "step": 6090 }, { "epoch": 2.763673423168384, "grad_norm": 0.12319351732730865, "learning_rate": 0.00015220214377302586, "loss": 0.1346, "step": 6100 }, { "epoch": 2.768202921526441, "grad_norm": 0.11745291203260422, "learning_rate": 0.00015204559415633452, "loss": 0.1358, "step": 6110 }, { "epoch": 2.772732419884498, "grad_norm": 0.12627694010734558, "learning_rate": 0.00015188886940513987, "loss": 0.1314, "step": 6120 }, { "epoch": 2.7772619182425546, "grad_norm": 0.12790648639202118, "learning_rate": 0.0001517319700468239, "loss": 0.1314, "step": 6130 }, { "epoch": 2.7817914166006115, "grad_norm": 0.12807555496692657, "learning_rate": 0.00015157489660935625, "loss": 0.1368, "step": 6140 }, { "epoch": 2.7863209149586683, "grad_norm": 0.114469513297081, "learning_rate": 0.00015141764962129227, "loss": 0.1364, "step": 6150 }, { "epoch": 2.790850413316725, "grad_norm": 0.12749959528446198, "learning_rate": 0.00015126022961177134, "loss": 0.133, "step": 6160 }, { "epoch": 2.795379911674782, "grad_norm": 0.12623634934425354, "learning_rate": 0.00015110263711051505, "loss": 0.1341, "step": 6170 }, { "epoch": 2.7999094100328388, "grad_norm": 0.10407795011997223, "learning_rate": 0.00015094487264782544, "loss": 0.1373, "step": 6180 }, { "epoch": 2.8044389083908956, "grad_norm": 0.11660348623991013, "learning_rate": 0.0001507869367545832, "loss": 0.1336, "step": 6190 }, { "epoch": 2.8089684067489524, "grad_norm": 0.13876129686832428, "learning_rate": 0.00015062882996224586, "loss": 0.1282, "step": 6200 }, { "epoch": 2.8134979051070093, "grad_norm": 0.12573808431625366, "learning_rate": 0.0001504705528028461, "loss": 0.1345, "step": 6210 }, { "epoch": 2.818027403465066, "grad_norm": 0.12007986009120941, "learning_rate": 0.0001503121058089898, "loss": 0.1342, "step": 6220 }, { "epoch": 2.8225569018231234, "grad_norm": 0.10775137692689896, "learning_rate": 0.00015015348951385443, "loss": 0.1352, "step": 6230 }, { "epoch": 2.82708640018118, "grad_norm": 0.10959987342357635, "learning_rate": 0.00014999470445118705, "loss": 0.1299, "step": 6240 }, { "epoch": 2.831615898539237, "grad_norm": 0.11662711948156357, "learning_rate": 0.00014983575115530272, "loss": 0.136, "step": 6250 }, { "epoch": 2.836145396897294, "grad_norm": 0.11882171779870987, "learning_rate": 0.00014967663016108258, "loss": 0.1336, "step": 6260 }, { "epoch": 2.8406748952553507, "grad_norm": 0.12361105531454086, "learning_rate": 0.00014951734200397204, "loss": 0.1363, "step": 6270 }, { "epoch": 2.8452043936134075, "grad_norm": 0.11306975781917572, "learning_rate": 0.0001493578872199791, "loss": 0.1315, "step": 6280 }, { "epoch": 2.8497338919714643, "grad_norm": 0.10558556020259857, "learning_rate": 0.0001491982663456724, "loss": 0.1293, "step": 6290 }, { "epoch": 2.854263390329521, "grad_norm": 0.11685465276241302, "learning_rate": 0.00014903847991817946, "loss": 0.1309, "step": 6300 }, { "epoch": 2.858792888687578, "grad_norm": 0.10772823542356491, "learning_rate": 0.00014887852847518497, "loss": 0.1306, "step": 6310 }, { "epoch": 2.863322387045635, "grad_norm": 0.13630211353302002, "learning_rate": 0.0001487184125549288, "loss": 0.1301, "step": 6320 }, { "epoch": 2.8678518854036916, "grad_norm": 0.11658801138401031, "learning_rate": 0.0001485581326962044, "loss": 0.1301, "step": 6330 }, { "epoch": 2.8723813837617485, "grad_norm": 0.14447173476219177, "learning_rate": 0.00014839768943835676, "loss": 0.1364, "step": 6340 }, { "epoch": 2.8769108821198053, "grad_norm": 0.10343156009912491, "learning_rate": 0.00014823708332128077, "loss": 0.1305, "step": 6350 }, { "epoch": 2.881440380477862, "grad_norm": 0.14246292412281036, "learning_rate": 0.00014807631488541938, "loss": 0.1322, "step": 6360 }, { "epoch": 2.885969878835919, "grad_norm": 0.13046808540821075, "learning_rate": 0.00014791538467176174, "loss": 0.1327, "step": 6370 }, { "epoch": 2.8904993771939758, "grad_norm": 0.1174997016787529, "learning_rate": 0.00014775429322184128, "loss": 0.1319, "step": 6380 }, { "epoch": 2.8950288755520326, "grad_norm": 0.11900872737169266, "learning_rate": 0.0001475930410777341, "loss": 0.1346, "step": 6390 }, { "epoch": 2.8995583739100894, "grad_norm": 0.10685596615076065, "learning_rate": 0.000147431628782057, "loss": 0.1309, "step": 6400 }, { "epoch": 2.9040878722681462, "grad_norm": 0.1201610341668129, "learning_rate": 0.00014727005687796573, "loss": 0.1334, "step": 6410 }, { "epoch": 2.908617370626203, "grad_norm": 0.1042858362197876, "learning_rate": 0.00014710832590915306, "loss": 0.1305, "step": 6420 }, { "epoch": 2.91314686898426, "grad_norm": 0.11404233425855637, "learning_rate": 0.00014694643641984708, "loss": 0.1264, "step": 6430 }, { "epoch": 2.9176763673423167, "grad_norm": 0.09692881256341934, "learning_rate": 0.0001467843889548093, "loss": 0.1356, "step": 6440 }, { "epoch": 2.9222058657003736, "grad_norm": 0.11369141191244125, "learning_rate": 0.0001466221840593327, "loss": 0.1281, "step": 6450 }, { "epoch": 2.9267353640584304, "grad_norm": 0.12543022632598877, "learning_rate": 0.0001464598222792402, "loss": 0.1344, "step": 6460 }, { "epoch": 2.931264862416487, "grad_norm": 0.09960107505321503, "learning_rate": 0.00014629730416088256, "loss": 0.1347, "step": 6470 }, { "epoch": 2.935794360774544, "grad_norm": 0.11416647583246231, "learning_rate": 0.00014613463025113662, "loss": 0.128, "step": 6480 }, { "epoch": 2.940323859132601, "grad_norm": 0.13363508880138397, "learning_rate": 0.0001459718010974034, "loss": 0.1362, "step": 6490 }, { "epoch": 2.9448533574906577, "grad_norm": 0.12580367922782898, "learning_rate": 0.00014580881724760638, "loss": 0.1331, "step": 6500 }, { "epoch": 2.9493828558487145, "grad_norm": 0.1310282200574875, "learning_rate": 0.00014564567925018967, "loss": 0.137, "step": 6510 }, { "epoch": 2.9539123542067713, "grad_norm": 0.12097878754138947, "learning_rate": 0.000145482387654116, "loss": 0.1327, "step": 6520 }, { "epoch": 2.9584418525648286, "grad_norm": 0.11536047607660294, "learning_rate": 0.0001453189430088649, "loss": 0.1383, "step": 6530 }, { "epoch": 2.9629713509228854, "grad_norm": 0.11799097061157227, "learning_rate": 0.00014515534586443104, "loss": 0.1365, "step": 6540 }, { "epoch": 2.9675008492809423, "grad_norm": 0.10550688207149506, "learning_rate": 0.00014499159677132219, "loss": 0.1304, "step": 6550 }, { "epoch": 2.972030347638999, "grad_norm": 0.13376198709011078, "learning_rate": 0.00014482769628055748, "loss": 0.1317, "step": 6560 }, { "epoch": 2.976559845997056, "grad_norm": 0.1147933304309845, "learning_rate": 0.0001446636449436654, "loss": 0.1317, "step": 6570 }, { "epoch": 2.9810893443551127, "grad_norm": 0.12273435294628143, "learning_rate": 0.00014449944331268216, "loss": 0.1302, "step": 6580 }, { "epoch": 2.9856188427131696, "grad_norm": 0.12308023869991302, "learning_rate": 0.00014433509194014963, "loss": 0.1284, "step": 6590 }, { "epoch": 2.9901483410712264, "grad_norm": 0.11716390401124954, "learning_rate": 0.00014417059137911356, "loss": 0.1286, "step": 6600 }, { "epoch": 2.9946778394292832, "grad_norm": 0.1330905556678772, "learning_rate": 0.00014400594218312178, "loss": 0.1321, "step": 6610 }, { "epoch": 2.99920733778734, "grad_norm": 0.12336422502994537, "learning_rate": 0.00014384114490622221, "loss": 0.1327, "step": 6620 }, { "epoch": 2.9996602876231457, "eval_loss": 0.16021211445331573, "eval_runtime": 617.3452, "eval_samples_per_second": 12.748, "eval_steps_per_second": 1.594, "step": 6621 }, { "epoch": 3.004076548522251, "grad_norm": 0.1117822602391243, "learning_rate": 0.00014367620010296114, "loss": 0.1199, "step": 6630 }, { "epoch": 3.008606046880308, "grad_norm": 0.10662990808486938, "learning_rate": 0.00014351110832838123, "loss": 0.1082, "step": 6640 }, { "epoch": 3.013135545238365, "grad_norm": 0.09254604578018188, "learning_rate": 0.00014334587013801976, "loss": 0.1106, "step": 6650 }, { "epoch": 3.0176650435964216, "grad_norm": 0.10764751583337784, "learning_rate": 0.00014318048608790663, "loss": 0.1087, "step": 6660 }, { "epoch": 3.0221945419544785, "grad_norm": 0.10320322960615158, "learning_rate": 0.00014301495673456262, "loss": 0.1072, "step": 6670 }, { "epoch": 3.0267240403125353, "grad_norm": 0.09786458313465118, "learning_rate": 0.00014284928263499742, "loss": 0.1052, "step": 6680 }, { "epoch": 3.031253538670592, "grad_norm": 0.0940663069486618, "learning_rate": 0.00014268346434670782, "loss": 0.1141, "step": 6690 }, { "epoch": 3.035783037028649, "grad_norm": 0.12340737879276276, "learning_rate": 0.0001425175024276758, "loss": 0.1099, "step": 6700 }, { "epoch": 3.0403125353867058, "grad_norm": 0.10877358913421631, "learning_rate": 0.00014235139743636662, "loss": 0.1066, "step": 6710 }, { "epoch": 3.0448420337447626, "grad_norm": 0.09268616884946823, "learning_rate": 0.00014218514993172705, "loss": 0.105, "step": 6720 }, { "epoch": 3.0493715321028194, "grad_norm": 0.09083138406276703, "learning_rate": 0.00014201876047318342, "loss": 0.1103, "step": 6730 }, { "epoch": 3.0539010304608762, "grad_norm": 0.10291367769241333, "learning_rate": 0.00014185222962063965, "loss": 0.1072, "step": 6740 }, { "epoch": 3.0584305288189335, "grad_norm": 0.10415250808000565, "learning_rate": 0.00014168555793447554, "loss": 0.1114, "step": 6750 }, { "epoch": 3.0629600271769903, "grad_norm": 0.10135282576084137, "learning_rate": 0.00014151874597554477, "loss": 0.1086, "step": 6760 }, { "epoch": 3.067489525535047, "grad_norm": 0.10510314255952835, "learning_rate": 0.00014135179430517305, "loss": 0.1117, "step": 6770 }, { "epoch": 3.072019023893104, "grad_norm": 0.11414755135774612, "learning_rate": 0.0001411847034851562, "loss": 0.1102, "step": 6780 }, { "epoch": 3.076548522251161, "grad_norm": 0.0981656014919281, "learning_rate": 0.0001410174740777583, "loss": 0.1112, "step": 6790 }, { "epoch": 3.0810780206092176, "grad_norm": 0.09286178648471832, "learning_rate": 0.00014085010664570974, "loss": 0.1085, "step": 6800 }, { "epoch": 3.0856075189672745, "grad_norm": 0.10993903875350952, "learning_rate": 0.00014068260175220546, "loss": 0.1121, "step": 6810 }, { "epoch": 3.0901370173253313, "grad_norm": 0.10415517538785934, "learning_rate": 0.00014051495996090285, "loss": 0.109, "step": 6820 }, { "epoch": 3.094666515683388, "grad_norm": 0.09917622059583664, "learning_rate": 0.00014034718183592, "loss": 0.1085, "step": 6830 }, { "epoch": 3.099196014041445, "grad_norm": 0.09848062694072723, "learning_rate": 0.00014017926794183383, "loss": 0.1047, "step": 6840 }, { "epoch": 3.103725512399502, "grad_norm": 0.12383636087179184, "learning_rate": 0.00014001121884367804, "loss": 0.1105, "step": 6850 }, { "epoch": 3.1082550107575586, "grad_norm": 0.10345660895109177, "learning_rate": 0.00013984303510694134, "loss": 0.1108, "step": 6860 }, { "epoch": 3.1127845091156154, "grad_norm": 0.08951733261346817, "learning_rate": 0.0001396747172975655, "loss": 0.1117, "step": 6870 }, { "epoch": 3.1173140074736723, "grad_norm": 0.09321026504039764, "learning_rate": 0.00013950626598194346, "loss": 0.1095, "step": 6880 }, { "epoch": 3.121843505831729, "grad_norm": 0.09075412154197693, "learning_rate": 0.0001393376817269173, "loss": 0.1111, "step": 6890 }, { "epoch": 3.126373004189786, "grad_norm": 0.08038198202848434, "learning_rate": 0.0001391689650997766, "loss": 0.1085, "step": 6900 }, { "epoch": 3.1309025025478427, "grad_norm": 0.09946314990520477, "learning_rate": 0.00013900011666825632, "loss": 0.1079, "step": 6910 }, { "epoch": 3.1354320009058996, "grad_norm": 0.083831787109375, "learning_rate": 0.00013883113700053493, "loss": 0.108, "step": 6920 }, { "epoch": 3.1399614992639564, "grad_norm": 0.09110364317893982, "learning_rate": 0.00013866202666523245, "loss": 0.1074, "step": 6930 }, { "epoch": 3.1444909976220132, "grad_norm": 0.09342263638973236, "learning_rate": 0.00013849278623140874, "loss": 0.1102, "step": 6940 }, { "epoch": 3.14902049598007, "grad_norm": 0.10097695142030716, "learning_rate": 0.00013832341626856135, "loss": 0.1091, "step": 6950 }, { "epoch": 3.153549994338127, "grad_norm": 0.10724612325429916, "learning_rate": 0.0001381539173466237, "loss": 0.1095, "step": 6960 }, { "epoch": 3.1580794926961837, "grad_norm": 0.113038569688797, "learning_rate": 0.0001379842900359632, "loss": 0.1101, "step": 6970 }, { "epoch": 3.1626089910542405, "grad_norm": 0.10871588438749313, "learning_rate": 0.00013781453490737918, "loss": 0.1074, "step": 6980 }, { "epoch": 3.167138489412298, "grad_norm": 0.09797286987304688, "learning_rate": 0.0001376446525321013, "loss": 0.1107, "step": 6990 }, { "epoch": 3.1716679877703546, "grad_norm": 0.10018666833639145, "learning_rate": 0.0001374746434817872, "loss": 0.1112, "step": 7000 }, { "epoch": 3.1761974861284115, "grad_norm": 0.09767764061689377, "learning_rate": 0.00013730450832852086, "loss": 0.1117, "step": 7010 }, { "epoch": 3.1807269844864683, "grad_norm": 0.10807600617408752, "learning_rate": 0.00013713424764481066, "loss": 0.1069, "step": 7020 }, { "epoch": 3.185256482844525, "grad_norm": 0.11085067689418793, "learning_rate": 0.00013696386200358723, "loss": 0.1098, "step": 7030 }, { "epoch": 3.189785981202582, "grad_norm": 0.11777514964342117, "learning_rate": 0.0001367933519782018, "loss": 0.1095, "step": 7040 }, { "epoch": 3.1943154795606388, "grad_norm": 0.08946658670902252, "learning_rate": 0.00013662271814242422, "loss": 0.1091, "step": 7050 }, { "epoch": 3.1988449779186956, "grad_norm": 0.10264267772436142, "learning_rate": 0.0001364519610704408, "loss": 0.1116, "step": 7060 }, { "epoch": 3.2033744762767524, "grad_norm": 0.0933040976524353, "learning_rate": 0.00013628108133685273, "loss": 0.1091, "step": 7070 }, { "epoch": 3.2079039746348093, "grad_norm": 0.10949963331222534, "learning_rate": 0.00013611007951667376, "loss": 0.1122, "step": 7080 }, { "epoch": 3.212433472992866, "grad_norm": 0.10518185049295425, "learning_rate": 0.0001359389561853286, "loss": 0.1112, "step": 7090 }, { "epoch": 3.216962971350923, "grad_norm": 0.10346280038356781, "learning_rate": 0.00013576771191865078, "loss": 0.109, "step": 7100 }, { "epoch": 3.2214924697089797, "grad_norm": 0.09324981272220612, "learning_rate": 0.00013559634729288088, "loss": 0.1092, "step": 7110 }, { "epoch": 3.2260219680670366, "grad_norm": 0.10806597769260406, "learning_rate": 0.00013542486288466428, "loss": 0.1103, "step": 7120 }, { "epoch": 3.2305514664250934, "grad_norm": 0.10441877692937851, "learning_rate": 0.00013525325927104973, "loss": 0.1095, "step": 7130 }, { "epoch": 3.23508096478315, "grad_norm": 0.08796998858451843, "learning_rate": 0.00013508153702948683, "loss": 0.1104, "step": 7140 }, { "epoch": 3.239610463141207, "grad_norm": 0.12072450667619705, "learning_rate": 0.00013490969673782453, "loss": 0.1095, "step": 7150 }, { "epoch": 3.244139961499264, "grad_norm": 0.10589967668056488, "learning_rate": 0.00013473773897430903, "loss": 0.107, "step": 7160 }, { "epoch": 3.2486694598573207, "grad_norm": 0.10880044102668762, "learning_rate": 0.00013456566431758164, "loss": 0.1101, "step": 7170 }, { "epoch": 3.2531989582153775, "grad_norm": 0.10041461884975433, "learning_rate": 0.00013439347334667722, "loss": 0.1103, "step": 7180 }, { "epoch": 3.2577284565734344, "grad_norm": 0.11079218983650208, "learning_rate": 0.000134221166641022, "loss": 0.1112, "step": 7190 }, { "epoch": 3.262257954931491, "grad_norm": 0.10900229215621948, "learning_rate": 0.00013404874478043153, "loss": 0.1117, "step": 7200 }, { "epoch": 3.266787453289548, "grad_norm": 0.10362094640731812, "learning_rate": 0.000133876208345109, "loss": 0.1114, "step": 7210 }, { "epoch": 3.271316951647605, "grad_norm": 0.10555779188871384, "learning_rate": 0.00013370355791564306, "loss": 0.1123, "step": 7220 }, { "epoch": 3.2758464500056617, "grad_norm": 0.09255950897932053, "learning_rate": 0.00013353079407300603, "loss": 0.1131, "step": 7230 }, { "epoch": 3.2803759483637185, "grad_norm": 0.09914428740739822, "learning_rate": 0.00013335791739855176, "loss": 0.1113, "step": 7240 }, { "epoch": 3.2849054467217758, "grad_norm": 0.10521331429481506, "learning_rate": 0.0001331849284740139, "loss": 0.11, "step": 7250 }, { "epoch": 3.2894349450798326, "grad_norm": 0.09139056503772736, "learning_rate": 0.00013301182788150374, "loss": 0.1109, "step": 7260 }, { "epoch": 3.2939644434378894, "grad_norm": 0.09516976028680801, "learning_rate": 0.00013283861620350836, "loss": 0.1096, "step": 7270 }, { "epoch": 3.2984939417959462, "grad_norm": 0.09153826534748077, "learning_rate": 0.00013266529402288866, "loss": 0.1093, "step": 7280 }, { "epoch": 3.303023440154003, "grad_norm": 0.11171313375234604, "learning_rate": 0.00013249186192287735, "loss": 0.113, "step": 7290 }, { "epoch": 3.30755293851206, "grad_norm": 0.1110367551445961, "learning_rate": 0.00013231832048707712, "loss": 0.1146, "step": 7300 }, { "epoch": 3.3120824368701167, "grad_norm": 0.10271560400724411, "learning_rate": 0.00013214467029945835, "loss": 0.1096, "step": 7310 }, { "epoch": 3.3166119352281735, "grad_norm": 0.10005812346935272, "learning_rate": 0.00013197091194435767, "loss": 0.1089, "step": 7320 }, { "epoch": 3.3211414335862304, "grad_norm": 0.09489379823207855, "learning_rate": 0.00013179704600647547, "loss": 0.1119, "step": 7330 }, { "epoch": 3.325670931944287, "grad_norm": 0.10342545807361603, "learning_rate": 0.00013162307307087423, "loss": 0.1128, "step": 7340 }, { "epoch": 3.330200430302344, "grad_norm": 0.10697804391384125, "learning_rate": 0.0001314489937229765, "loss": 0.1126, "step": 7350 }, { "epoch": 3.334729928660401, "grad_norm": 0.11575332283973694, "learning_rate": 0.00013127480854856295, "loss": 0.1133, "step": 7360 }, { "epoch": 3.3392594270184577, "grad_norm": 0.10017456859350204, "learning_rate": 0.00013110051813377025, "loss": 0.1091, "step": 7370 }, { "epoch": 3.3437889253765145, "grad_norm": 0.11635085195302963, "learning_rate": 0.00013092612306508922, "loss": 0.1139, "step": 7380 }, { "epoch": 3.3483184237345713, "grad_norm": 0.09450142085552216, "learning_rate": 0.00013075162392936295, "loss": 0.1119, "step": 7390 }, { "epoch": 3.352847922092628, "grad_norm": 0.09203408658504486, "learning_rate": 0.0001305770213137846, "loss": 0.1088, "step": 7400 }, { "epoch": 3.357377420450685, "grad_norm": 0.09736169874668121, "learning_rate": 0.00013040231580589565, "loss": 0.1099, "step": 7410 }, { "epoch": 3.361906918808742, "grad_norm": 0.09759002178907394, "learning_rate": 0.0001302275079935837, "loss": 0.1149, "step": 7420 }, { "epoch": 3.3664364171667986, "grad_norm": 0.09410129487514496, "learning_rate": 0.00013005259846508068, "loss": 0.1132, "step": 7430 }, { "epoch": 3.3709659155248555, "grad_norm": 0.09184587746858597, "learning_rate": 0.0001298775878089608, "loss": 0.1099, "step": 7440 }, { "epoch": 3.3754954138829123, "grad_norm": 0.10475565493106842, "learning_rate": 0.00012970247661413855, "loss": 0.1109, "step": 7450 }, { "epoch": 3.380024912240969, "grad_norm": 0.10369405895471573, "learning_rate": 0.00012952726546986668, "loss": 0.1144, "step": 7460 }, { "epoch": 3.3845544105990264, "grad_norm": 0.1000487357378006, "learning_rate": 0.00012935195496573435, "loss": 0.1093, "step": 7470 }, { "epoch": 3.3890839089570832, "grad_norm": 0.1104254201054573, "learning_rate": 0.00012917654569166503, "loss": 0.1093, "step": 7480 }, { "epoch": 3.39361340731514, "grad_norm": 0.10195254534482956, "learning_rate": 0.0001290010382379146, "loss": 0.1104, "step": 7490 }, { "epoch": 3.398142905673197, "grad_norm": 0.10613837838172913, "learning_rate": 0.00012882543319506925, "loss": 0.115, "step": 7500 }, { "epoch": 3.4026724040312537, "grad_norm": 0.10054861009120941, "learning_rate": 0.0001286497311540436, "loss": 0.1093, "step": 7510 }, { "epoch": 3.4072019023893105, "grad_norm": 0.1072639673948288, "learning_rate": 0.0001284739327060787, "loss": 0.114, "step": 7520 }, { "epoch": 3.4117314007473674, "grad_norm": 0.09658465534448624, "learning_rate": 0.00012829803844273987, "loss": 0.1088, "step": 7530 }, { "epoch": 3.416260899105424, "grad_norm": 0.09596540778875351, "learning_rate": 0.00012812204895591505, "loss": 0.1124, "step": 7540 }, { "epoch": 3.420790397463481, "grad_norm": 0.08748818188905716, "learning_rate": 0.00012794596483781248, "loss": 0.1125, "step": 7550 }, { "epoch": 3.425319895821538, "grad_norm": 0.09352606534957886, "learning_rate": 0.00012776978668095884, "loss": 0.1134, "step": 7560 }, { "epoch": 3.4298493941795947, "grad_norm": 0.11329905688762665, "learning_rate": 0.0001275935150781973, "loss": 0.1138, "step": 7570 }, { "epoch": 3.4343788925376515, "grad_norm": 0.09285202622413635, "learning_rate": 0.00012741715062268547, "loss": 0.1096, "step": 7580 }, { "epoch": 3.4389083908957083, "grad_norm": 0.10598818957805634, "learning_rate": 0.00012724069390789342, "loss": 0.113, "step": 7590 }, { "epoch": 3.443437889253765, "grad_norm": 0.11264318972826004, "learning_rate": 0.0001270641455276016, "loss": 0.1135, "step": 7600 }, { "epoch": 3.447967387611822, "grad_norm": 0.09473126381635666, "learning_rate": 0.00012688750607589897, "loss": 0.1106, "step": 7610 }, { "epoch": 3.452496885969879, "grad_norm": 0.09131330251693726, "learning_rate": 0.000126710776147181, "loss": 0.1149, "step": 7620 }, { "epoch": 3.4570263843279356, "grad_norm": 0.10694695264101028, "learning_rate": 0.0001265339563361475, "loss": 0.1126, "step": 7630 }, { "epoch": 3.4615558826859925, "grad_norm": 0.1015838012099266, "learning_rate": 0.00012635704723780087, "loss": 0.1135, "step": 7640 }, { "epoch": 3.4660853810440493, "grad_norm": 0.10224758833646774, "learning_rate": 0.00012618004944744385, "loss": 0.1155, "step": 7650 }, { "epoch": 3.470614879402106, "grad_norm": 0.11169352382421494, "learning_rate": 0.00012600296356067768, "loss": 0.1092, "step": 7660 }, { "epoch": 3.475144377760163, "grad_norm": 0.10369731485843658, "learning_rate": 0.00012582579017340003, "loss": 0.1107, "step": 7670 }, { "epoch": 3.4796738761182198, "grad_norm": 0.09245746582746506, "learning_rate": 0.00012564852988180305, "loss": 0.1093, "step": 7680 }, { "epoch": 3.4842033744762766, "grad_norm": 0.09676039218902588, "learning_rate": 0.0001254711832823713, "loss": 0.1117, "step": 7690 }, { "epoch": 3.4887328728343334, "grad_norm": 0.10541850328445435, "learning_rate": 0.0001252937509718797, "loss": 0.1119, "step": 7700 }, { "epoch": 3.4932623711923902, "grad_norm": 0.08481086790561676, "learning_rate": 0.0001251162335473917, "loss": 0.1103, "step": 7710 }, { "epoch": 3.497791869550447, "grad_norm": 0.09966452419757843, "learning_rate": 0.00012493863160625713, "loss": 0.1147, "step": 7720 }, { "epoch": 3.502321367908504, "grad_norm": 0.09558738023042679, "learning_rate": 0.00012476094574611016, "loss": 0.1123, "step": 7730 }, { "epoch": 3.5068508662665607, "grad_norm": 0.10436621308326721, "learning_rate": 0.00012458317656486746, "loss": 0.1129, "step": 7740 }, { "epoch": 3.5113803646246176, "grad_norm": 0.10191968828439713, "learning_rate": 0.00012440532466072597, "loss": 0.1099, "step": 7750 }, { "epoch": 3.515909862982675, "grad_norm": 0.10766720771789551, "learning_rate": 0.000124227390632161, "loss": 0.1121, "step": 7760 }, { "epoch": 3.5204393613407317, "grad_norm": 0.08841870725154877, "learning_rate": 0.0001240493750779243, "loss": 0.1103, "step": 7770 }, { "epoch": 3.5249688596987885, "grad_norm": 0.1090930923819542, "learning_rate": 0.00012387127859704187, "loss": 0.1164, "step": 7780 }, { "epoch": 3.5294983580568453, "grad_norm": 0.10451924055814743, "learning_rate": 0.00012369310178881205, "loss": 0.1112, "step": 7790 }, { "epoch": 3.534027856414902, "grad_norm": 0.09721478819847107, "learning_rate": 0.0001235148452528035, "loss": 0.1135, "step": 7800 }, { "epoch": 3.538557354772959, "grad_norm": 0.0975523293018341, "learning_rate": 0.00012333650958885322, "loss": 0.1105, "step": 7810 }, { "epoch": 3.543086853131016, "grad_norm": 0.08713623881340027, "learning_rate": 0.00012315809539706436, "loss": 0.1103, "step": 7820 }, { "epoch": 3.5476163514890726, "grad_norm": 0.09232752025127411, "learning_rate": 0.00012297960327780437, "loss": 0.1128, "step": 7830 }, { "epoch": 3.5521458498471294, "grad_norm": 0.09094680100679398, "learning_rate": 0.00012280103383170295, "loss": 0.1104, "step": 7840 }, { "epoch": 3.5566753482051863, "grad_norm": 0.09738276153802872, "learning_rate": 0.00012262238765964995, "loss": 0.1059, "step": 7850 }, { "epoch": 3.561204846563243, "grad_norm": 0.0989813581109047, "learning_rate": 0.0001224436653627935, "loss": 0.112, "step": 7860 }, { "epoch": 3.5657343449213, "grad_norm": 0.09522037208080292, "learning_rate": 0.0001222648675425378, "loss": 0.1081, "step": 7870 }, { "epoch": 3.5702638432793568, "grad_norm": 0.10340669006109238, "learning_rate": 0.00012208599480054125, "loss": 0.1117, "step": 7880 }, { "epoch": 3.5747933416374136, "grad_norm": 0.11090776324272156, "learning_rate": 0.0001219070477387143, "loss": 0.1097, "step": 7890 }, { "epoch": 3.5793228399954704, "grad_norm": 0.08626790344715118, "learning_rate": 0.00012172802695921754, "loss": 0.1128, "step": 7900 }, { "epoch": 3.5838523383535272, "grad_norm": 0.09012069553136826, "learning_rate": 0.00012154893306445961, "loss": 0.1137, "step": 7910 }, { "epoch": 3.588381836711584, "grad_norm": 0.07982558012008667, "learning_rate": 0.00012136976665709516, "loss": 0.1117, "step": 7920 }, { "epoch": 3.592911335069641, "grad_norm": 0.09850164502859116, "learning_rate": 0.00012119052834002289, "loss": 0.1088, "step": 7930 }, { "epoch": 3.597440833427698, "grad_norm": 0.09800245612859726, "learning_rate": 0.00012101121871638343, "loss": 0.1153, "step": 7940 }, { "epoch": 3.601970331785755, "grad_norm": 0.09477314352989197, "learning_rate": 0.0001208318383895574, "loss": 0.1104, "step": 7950 }, { "epoch": 3.606499830143812, "grad_norm": 0.10447141528129578, "learning_rate": 0.00012065238796316331, "loss": 0.1115, "step": 7960 }, { "epoch": 3.6110293285018686, "grad_norm": 0.10505667328834534, "learning_rate": 0.00012047286804105557, "loss": 0.1096, "step": 7970 }, { "epoch": 3.6155588268599255, "grad_norm": 0.0925762876868248, "learning_rate": 0.00012029327922732242, "loss": 0.1146, "step": 7980 }, { "epoch": 3.6200883252179823, "grad_norm": 0.12217893451452255, "learning_rate": 0.00012011362212628397, "loss": 0.1105, "step": 7990 }, { "epoch": 3.624617823576039, "grad_norm": 0.09887892752885818, "learning_rate": 0.00011993389734249006, "loss": 0.1098, "step": 8000 }, { "epoch": 3.629147321934096, "grad_norm": 0.10694731771945953, "learning_rate": 0.00011975410548071832, "loss": 0.1129, "step": 8010 }, { "epoch": 3.6336768202921528, "grad_norm": 0.08971285820007324, "learning_rate": 0.00011957424714597212, "loss": 0.1084, "step": 8020 }, { "epoch": 3.6382063186502096, "grad_norm": 0.08375135064125061, "learning_rate": 0.00011939432294347848, "loss": 0.1098, "step": 8030 }, { "epoch": 3.6427358170082664, "grad_norm": 0.09610874205827713, "learning_rate": 0.00011921433347868602, "loss": 0.1109, "step": 8040 }, { "epoch": 3.6472653153663233, "grad_norm": 0.09743242710828781, "learning_rate": 0.00011903427935726308, "loss": 0.1176, "step": 8050 }, { "epoch": 3.65179481372438, "grad_norm": 0.09157928824424744, "learning_rate": 0.00011885416118509549, "loss": 0.1116, "step": 8060 }, { "epoch": 3.656324312082437, "grad_norm": 0.10359596461057663, "learning_rate": 0.00011867397956828463, "loss": 0.1117, "step": 8070 }, { "epoch": 3.6608538104404937, "grad_norm": 0.08667086809873581, "learning_rate": 0.00011849373511314537, "loss": 0.1126, "step": 8080 }, { "epoch": 3.6653833087985506, "grad_norm": 0.0973113626241684, "learning_rate": 0.00011831342842620405, "loss": 0.1099, "step": 8090 }, { "epoch": 3.6699128071566074, "grad_norm": 0.09472218155860901, "learning_rate": 0.00011813306011419642, "loss": 0.1117, "step": 8100 }, { "epoch": 3.674442305514664, "grad_norm": 0.10071218013763428, "learning_rate": 0.00011795263078406558, "loss": 0.1096, "step": 8110 }, { "epoch": 3.678971803872721, "grad_norm": 0.08343309164047241, "learning_rate": 0.00011777214104295995, "loss": 0.1118, "step": 8120 }, { "epoch": 3.683501302230778, "grad_norm": 0.0963587686419487, "learning_rate": 0.00011759159149823127, "loss": 0.1099, "step": 8130 }, { "epoch": 3.6880308005888347, "grad_norm": 0.09920413792133331, "learning_rate": 0.00011741098275743247, "loss": 0.1132, "step": 8140 }, { "epoch": 3.6925602989468915, "grad_norm": 0.12149636447429657, "learning_rate": 0.00011723031542831578, "loss": 0.1146, "step": 8150 }, { "epoch": 3.6970897973049484, "grad_norm": 0.09953594207763672, "learning_rate": 0.00011704959011883043, "loss": 0.1078, "step": 8160 }, { "epoch": 3.701619295663005, "grad_norm": 0.11264549940824509, "learning_rate": 0.0001168688074371209, "loss": 0.1098, "step": 8170 }, { "epoch": 3.706148794021062, "grad_norm": 0.10793278366327286, "learning_rate": 0.00011668796799152457, "loss": 0.1123, "step": 8180 }, { "epoch": 3.710678292379119, "grad_norm": 0.10062643885612488, "learning_rate": 0.00011650707239057, "loss": 0.1136, "step": 8190 }, { "epoch": 3.7152077907371757, "grad_norm": 0.09304151684045792, "learning_rate": 0.00011632612124297461, "loss": 0.1126, "step": 8200 }, { "epoch": 3.7197372890952325, "grad_norm": 0.10045602172613144, "learning_rate": 0.00011614511515764277, "loss": 0.1092, "step": 8210 }, { "epoch": 3.7242667874532893, "grad_norm": 0.09587648510932922, "learning_rate": 0.00011596405474366372, "loss": 0.1115, "step": 8220 }, { "epoch": 3.728796285811346, "grad_norm": 0.10631423443555832, "learning_rate": 0.00011578294061030947, "loss": 0.111, "step": 8230 }, { "epoch": 3.733325784169403, "grad_norm": 0.09861784428358078, "learning_rate": 0.00011560177336703291, "loss": 0.11, "step": 8240 }, { "epoch": 3.7378552825274602, "grad_norm": 0.0921064168214798, "learning_rate": 0.00011542055362346549, "loss": 0.1109, "step": 8250 }, { "epoch": 3.742384780885517, "grad_norm": 0.10424584895372391, "learning_rate": 0.00011523928198941543, "loss": 0.11, "step": 8260 }, { "epoch": 3.746914279243574, "grad_norm": 0.10199391096830368, "learning_rate": 0.00011505795907486551, "loss": 0.112, "step": 8270 }, { "epoch": 3.7514437776016307, "grad_norm": 0.09731689840555191, "learning_rate": 0.00011487658548997115, "loss": 0.1125, "step": 8280 }, { "epoch": 3.7559732759596876, "grad_norm": 0.07730797678232193, "learning_rate": 0.00011469516184505821, "loss": 0.1096, "step": 8290 }, { "epoch": 3.7605027743177444, "grad_norm": 0.09512131661176682, "learning_rate": 0.00011451368875062101, "loss": 0.1115, "step": 8300 }, { "epoch": 3.765032272675801, "grad_norm": 0.08450417220592499, "learning_rate": 0.00011433216681732027, "loss": 0.1135, "step": 8310 }, { "epoch": 3.769561771033858, "grad_norm": 0.08709891885519028, "learning_rate": 0.00011415059665598105, "loss": 0.111, "step": 8320 }, { "epoch": 3.774091269391915, "grad_norm": 0.12575045228004456, "learning_rate": 0.00011396897887759071, "loss": 0.1145, "step": 8330 }, { "epoch": 3.7786207677499717, "grad_norm": 0.09050168097019196, "learning_rate": 0.00011378731409329684, "loss": 0.1108, "step": 8340 }, { "epoch": 3.7831502661080285, "grad_norm": 0.0824236199259758, "learning_rate": 0.00011360560291440526, "loss": 0.1137, "step": 8350 }, { "epoch": 3.7876797644660853, "grad_norm": 0.10261125862598419, "learning_rate": 0.00011342384595237776, "loss": 0.1089, "step": 8360 }, { "epoch": 3.792209262824142, "grad_norm": 0.08885115385055542, "learning_rate": 0.00011324204381883033, "loss": 0.1109, "step": 8370 }, { "epoch": 3.796738761182199, "grad_norm": 0.10409918427467346, "learning_rate": 0.00011306019712553094, "loss": 0.1142, "step": 8380 }, { "epoch": 3.801268259540256, "grad_norm": 0.0991046279668808, "learning_rate": 0.00011287830648439746, "loss": 0.115, "step": 8390 }, { "epoch": 3.8057977578983126, "grad_norm": 0.10309819132089615, "learning_rate": 0.00011269637250749565, "loss": 0.1112, "step": 8400 }, { "epoch": 3.8103272562563695, "grad_norm": 0.09360276162624359, "learning_rate": 0.00011251439580703716, "loss": 0.1115, "step": 8410 }, { "epoch": 3.8148567546144263, "grad_norm": 0.09267252683639526, "learning_rate": 0.0001123323769953773, "loss": 0.1106, "step": 8420 }, { "epoch": 3.819386252972483, "grad_norm": 0.11334355920553207, "learning_rate": 0.00011215031668501322, "loss": 0.1086, "step": 8430 }, { "epoch": 3.8239157513305404, "grad_norm": 0.09532047063112259, "learning_rate": 0.00011196821548858156, "loss": 0.1091, "step": 8440 }, { "epoch": 3.8284452496885972, "grad_norm": 0.08060566335916519, "learning_rate": 0.00011178607401885668, "loss": 0.1102, "step": 8450 }, { "epoch": 3.832974748046654, "grad_norm": 0.09655016660690308, "learning_rate": 0.0001116038928887484, "loss": 0.1124, "step": 8460 }, { "epoch": 3.837504246404711, "grad_norm": 0.10175477713346481, "learning_rate": 0.00011142167271129996, "loss": 0.1108, "step": 8470 }, { "epoch": 3.8420337447627677, "grad_norm": 0.08714988827705383, "learning_rate": 0.00011123941409968606, "loss": 0.111, "step": 8480 }, { "epoch": 3.8465632431208245, "grad_norm": 0.08987358957529068, "learning_rate": 0.00011105711766721067, "loss": 0.1096, "step": 8490 }, { "epoch": 3.8510927414788814, "grad_norm": 0.10814320296049118, "learning_rate": 0.00011087478402730514, "loss": 0.1151, "step": 8500 }, { "epoch": 3.855622239836938, "grad_norm": 0.09886670112609863, "learning_rate": 0.00011069241379352588, "loss": 0.1078, "step": 8510 }, { "epoch": 3.860151738194995, "grad_norm": 0.09303957968950272, "learning_rate": 0.00011051000757955257, "loss": 0.113, "step": 8520 }, { "epoch": 3.864681236553052, "grad_norm": 0.10088100284337997, "learning_rate": 0.00011032756599918584, "loss": 0.1112, "step": 8530 }, { "epoch": 3.8692107349111087, "grad_norm": 0.11249160021543503, "learning_rate": 0.0001101450896663454, "loss": 0.1124, "step": 8540 }, { "epoch": 3.8737402332691655, "grad_norm": 0.0930514931678772, "learning_rate": 0.00010996257919506794, "loss": 0.1115, "step": 8550 }, { "epoch": 3.8782697316272223, "grad_norm": 0.09656676650047302, "learning_rate": 0.00010978003519950493, "loss": 0.1098, "step": 8560 }, { "epoch": 3.882799229985279, "grad_norm": 0.091661736369133, "learning_rate": 0.00010959745829392069, "loss": 0.1135, "step": 8570 }, { "epoch": 3.887328728343336, "grad_norm": 0.09262984991073608, "learning_rate": 0.00010941484909269036, "loss": 0.1115, "step": 8580 }, { "epoch": 3.891858226701393, "grad_norm": 0.11751729995012283, "learning_rate": 0.00010923220821029762, "loss": 0.1132, "step": 8590 }, { "epoch": 3.8963877250594496, "grad_norm": 0.10761595517396927, "learning_rate": 0.00010904953626133287, "loss": 0.1126, "step": 8600 }, { "epoch": 3.9009172234175065, "grad_norm": 0.08337333053350449, "learning_rate": 0.00010886683386049099, "loss": 0.111, "step": 8610 }, { "epoch": 3.9054467217755633, "grad_norm": 0.10421154648065567, "learning_rate": 0.00010868410162256935, "loss": 0.1108, "step": 8620 }, { "epoch": 3.90997622013362, "grad_norm": 0.10565438121557236, "learning_rate": 0.0001085013401624657, "loss": 0.112, "step": 8630 }, { "epoch": 3.914505718491677, "grad_norm": 0.08946827799081802, "learning_rate": 0.00010831855009517613, "loss": 0.1101, "step": 8640 }, { "epoch": 3.9190352168497338, "grad_norm": 0.08507835865020752, "learning_rate": 0.00010813573203579306, "loss": 0.11, "step": 8650 }, { "epoch": 3.9235647152077906, "grad_norm": 0.07897284626960754, "learning_rate": 0.00010795288659950303, "loss": 0.1111, "step": 8660 }, { "epoch": 3.9280942135658474, "grad_norm": 0.09554194658994675, "learning_rate": 0.00010777001440158472, "loss": 0.1126, "step": 8670 }, { "epoch": 3.9326237119239043, "grad_norm": 0.11981197446584702, "learning_rate": 0.00010758711605740683, "loss": 0.1105, "step": 8680 }, { "epoch": 3.937153210281961, "grad_norm": 0.11121747642755508, "learning_rate": 0.00010740419218242615, "loss": 0.112, "step": 8690 }, { "epoch": 3.941682708640018, "grad_norm": 0.10044469684362411, "learning_rate": 0.00010722124339218524, "loss": 0.1097, "step": 8700 }, { "epoch": 3.9462122069980747, "grad_norm": 0.07444220036268234, "learning_rate": 0.00010703827030231065, "loss": 0.1096, "step": 8710 }, { "epoch": 3.9507417053561316, "grad_norm": 0.08997642993927002, "learning_rate": 0.00010685527352851054, "loss": 0.1098, "step": 8720 }, { "epoch": 3.9552712037141884, "grad_norm": 0.09852538257837296, "learning_rate": 0.0001066722536865729, "loss": 0.1112, "step": 8730 }, { "epoch": 3.9598007020722457, "grad_norm": 0.0946199893951416, "learning_rate": 0.00010648921139236328, "loss": 0.113, "step": 8740 }, { "epoch": 3.9643302004303025, "grad_norm": 0.10738665610551834, "learning_rate": 0.0001063061472618228, "loss": 0.1105, "step": 8750 }, { "epoch": 3.9688596987883593, "grad_norm": 0.09911846369504929, "learning_rate": 0.00010612306191096602, "loss": 0.1092, "step": 8760 }, { "epoch": 3.973389197146416, "grad_norm": 0.09100183844566345, "learning_rate": 0.00010593995595587898, "loss": 0.1075, "step": 8770 }, { "epoch": 3.977918695504473, "grad_norm": 0.08540119975805283, "learning_rate": 0.00010575683001271701, "loss": 0.11, "step": 8780 }, { "epoch": 3.98244819386253, "grad_norm": 0.1455107182264328, "learning_rate": 0.00010557368469770268, "loss": 0.1072, "step": 8790 }, { "epoch": 3.9869776922205866, "grad_norm": 0.09040206670761108, "learning_rate": 0.0001053905206271238, "loss": 0.112, "step": 8800 }, { "epoch": 3.9915071905786434, "grad_norm": 0.08172180503606796, "learning_rate": 0.00010520733841733125, "loss": 0.1128, "step": 8810 }, { "epoch": 3.9960366889367003, "grad_norm": 0.09760237485170364, "learning_rate": 0.000105024138684737, "loss": 0.1119, "step": 8820 }, { "epoch": 3.9996602876231457, "eval_loss": 0.15827356278896332, "eval_runtime": 617.6968, "eval_samples_per_second": 12.741, "eval_steps_per_second": 1.593, "step": 8828 }, { "epoch": 4.000905899671611, "grad_norm": 0.0798049345612526, "learning_rate": 0.00010484092204581189, "loss": 0.1153, "step": 8830 }, { "epoch": 4.005435398029668, "grad_norm": 0.07974246889352798, "learning_rate": 0.00010465768911708373, "loss": 0.0957, "step": 8840 }, { "epoch": 4.009964896387725, "grad_norm": 0.08676203340291977, "learning_rate": 0.00010447444051513513, "loss": 0.0962, "step": 8850 }, { "epoch": 4.014494394745782, "grad_norm": 0.07175087183713913, "learning_rate": 0.00010429117685660146, "loss": 0.0961, "step": 8860 }, { "epoch": 4.019023893103839, "grad_norm": 0.06814973056316376, "learning_rate": 0.00010410789875816866, "loss": 0.0963, "step": 8870 }, { "epoch": 4.0235533914618955, "grad_norm": 0.09090814739465714, "learning_rate": 0.00010392460683657142, "loss": 0.0994, "step": 8880 }, { "epoch": 4.028082889819952, "grad_norm": 0.08229593187570572, "learning_rate": 0.0001037413017085908, "loss": 0.0967, "step": 8890 }, { "epoch": 4.032612388178009, "grad_norm": 0.07398311048746109, "learning_rate": 0.00010355798399105235, "loss": 0.096, "step": 8900 }, { "epoch": 4.037141886536066, "grad_norm": 0.06932748854160309, "learning_rate": 0.00010337465430082403, "loss": 0.0969, "step": 8910 }, { "epoch": 4.041671384894123, "grad_norm": 0.09156011044979095, "learning_rate": 0.000103191313254814, "loss": 0.098, "step": 8920 }, { "epoch": 4.04620088325218, "grad_norm": 0.07946418970823288, "learning_rate": 0.00010300796146996874, "loss": 0.0962, "step": 8930 }, { "epoch": 4.0507303816102365, "grad_norm": 0.08557803928852081, "learning_rate": 0.00010282459956327073, "loss": 0.0948, "step": 8940 }, { "epoch": 4.055259879968293, "grad_norm": 0.0721755251288414, "learning_rate": 0.00010264122815173665, "loss": 0.0981, "step": 8950 }, { "epoch": 4.05978937832635, "grad_norm": 0.069907546043396, "learning_rate": 0.0001024578478524151, "loss": 0.0973, "step": 8960 }, { "epoch": 4.064318876684407, "grad_norm": 0.07597635686397552, "learning_rate": 0.00010227445928238455, "loss": 0.0985, "step": 8970 }, { "epoch": 4.068848375042464, "grad_norm": 0.08416584879159927, "learning_rate": 0.00010209106305875139, "loss": 0.0954, "step": 8980 }, { "epoch": 4.073377873400521, "grad_norm": 0.08617585897445679, "learning_rate": 0.00010190765979864764, "loss": 0.0977, "step": 8990 }, { "epoch": 4.077907371758577, "grad_norm": 0.07779661566019058, "learning_rate": 0.00010172425011922915, "loss": 0.0968, "step": 9000 }, { "epoch": 4.082436870116634, "grad_norm": 0.08647850900888443, "learning_rate": 0.00010154083463767323, "loss": 0.0964, "step": 9010 }, { "epoch": 4.086966368474691, "grad_norm": 0.08829203248023987, "learning_rate": 0.00010135741397117684, "loss": 0.0992, "step": 9020 }, { "epoch": 4.091495866832748, "grad_norm": 0.08579693734645844, "learning_rate": 0.00010117398873695429, "loss": 0.0987, "step": 9030 }, { "epoch": 4.096025365190805, "grad_norm": 0.06886789947748184, "learning_rate": 0.00010099055955223531, "loss": 0.0983, "step": 9040 }, { "epoch": 4.100554863548862, "grad_norm": 0.0997413694858551, "learning_rate": 0.0001008071270342629, "loss": 0.0956, "step": 9050 }, { "epoch": 4.105084361906918, "grad_norm": 0.07166160643100739, "learning_rate": 0.00010062369180029125, "loss": 0.0968, "step": 9060 }, { "epoch": 4.109613860264975, "grad_norm": 0.07676910609006882, "learning_rate": 0.00010044025446758381, "loss": 0.097, "step": 9070 }, { "epoch": 4.114143358623033, "grad_norm": 0.08378776907920837, "learning_rate": 0.00010025681565341091, "loss": 0.0964, "step": 9080 }, { "epoch": 4.11867285698109, "grad_norm": 0.0725962296128273, "learning_rate": 0.00010007337597504804, "loss": 0.0982, "step": 9090 }, { "epoch": 4.123202355339147, "grad_norm": 0.0860457792878151, "learning_rate": 9.988993604977352e-05, "loss": 0.0974, "step": 9100 }, { "epoch": 4.127731853697203, "grad_norm": 0.08629846572875977, "learning_rate": 9.970649649486644e-05, "loss": 0.0981, "step": 9110 }, { "epoch": 4.13226135205526, "grad_norm": 0.08496873825788498, "learning_rate": 9.952305792760475e-05, "loss": 0.0991, "step": 9120 }, { "epoch": 4.136790850413317, "grad_norm": 0.07953400164842606, "learning_rate": 9.933962096526302e-05, "loss": 0.0953, "step": 9130 }, { "epoch": 4.141320348771374, "grad_norm": 0.08169267326593399, "learning_rate": 9.915618622511044e-05, "loss": 0.0985, "step": 9140 }, { "epoch": 4.145849847129431, "grad_norm": 0.09323912113904953, "learning_rate": 9.897275432440872e-05, "loss": 0.0955, "step": 9150 }, { "epoch": 4.1503793454874875, "grad_norm": 0.07836610078811646, "learning_rate": 9.878932588040997e-05, "loss": 0.0983, "step": 9160 }, { "epoch": 4.154908843845544, "grad_norm": 0.06795407086610794, "learning_rate": 9.860590151035473e-05, "loss": 0.097, "step": 9170 }, { "epoch": 4.159438342203601, "grad_norm": 0.082821324467659, "learning_rate": 9.84224818314698e-05, "loss": 0.0972, "step": 9180 }, { "epoch": 4.163967840561658, "grad_norm": 0.06650907546281815, "learning_rate": 9.823906746096622e-05, "loss": 0.0973, "step": 9190 }, { "epoch": 4.168497338919715, "grad_norm": 0.07272431999444962, "learning_rate": 9.805565901603714e-05, "loss": 0.0974, "step": 9200 }, { "epoch": 4.173026837277772, "grad_norm": 0.07406030595302582, "learning_rate": 9.78722571138558e-05, "loss": 0.0968, "step": 9210 }, { "epoch": 4.1775563356358285, "grad_norm": 0.06534506380558014, "learning_rate": 9.768886237157337e-05, "loss": 0.0977, "step": 9220 }, { "epoch": 4.182085833993885, "grad_norm": 0.08346185088157654, "learning_rate": 9.750547540631697e-05, "loss": 0.0966, "step": 9230 }, { "epoch": 4.186615332351942, "grad_norm": 0.0646069347858429, "learning_rate": 9.732209683518753e-05, "loss": 0.0957, "step": 9240 }, { "epoch": 4.191144830709999, "grad_norm": 0.07642305642366409, "learning_rate": 9.713872727525778e-05, "loss": 0.0948, "step": 9250 }, { "epoch": 4.195674329068056, "grad_norm": 0.07574049383401871, "learning_rate": 9.695536734357005e-05, "loss": 0.0977, "step": 9260 }, { "epoch": 4.200203827426113, "grad_norm": 0.08899475634098053, "learning_rate": 9.677201765713435e-05, "loss": 0.0979, "step": 9270 }, { "epoch": 4.2047333257841695, "grad_norm": 0.07823716104030609, "learning_rate": 9.658867883292615e-05, "loss": 0.0986, "step": 9280 }, { "epoch": 4.209262824142226, "grad_norm": 0.07970847934484482, "learning_rate": 9.640535148788443e-05, "loss": 0.0965, "step": 9290 }, { "epoch": 4.213792322500283, "grad_norm": 0.07121343910694122, "learning_rate": 9.622203623890944e-05, "loss": 0.098, "step": 9300 }, { "epoch": 4.21832182085834, "grad_norm": 0.08438264578580856, "learning_rate": 9.603873370286083e-05, "loss": 0.0975, "step": 9310 }, { "epoch": 4.222851319216397, "grad_norm": 0.07344311475753784, "learning_rate": 9.585544449655543e-05, "loss": 0.0995, "step": 9320 }, { "epoch": 4.227380817574454, "grad_norm": 0.08449902385473251, "learning_rate": 9.567216923676526e-05, "loss": 0.1, "step": 9330 }, { "epoch": 4.23191031593251, "grad_norm": 0.08021081984043121, "learning_rate": 9.548890854021529e-05, "loss": 0.0966, "step": 9340 }, { "epoch": 4.236439814290567, "grad_norm": 0.08234046399593353, "learning_rate": 9.530566302358162e-05, "loss": 0.0948, "step": 9350 }, { "epoch": 4.240969312648624, "grad_norm": 0.09645576030015945, "learning_rate": 9.512243330348917e-05, "loss": 0.0952, "step": 9360 }, { "epoch": 4.245498811006681, "grad_norm": 0.07178854942321777, "learning_rate": 9.493921999650981e-05, "loss": 0.0928, "step": 9370 }, { "epoch": 4.250028309364738, "grad_norm": 0.08183001726865768, "learning_rate": 9.475602371916006e-05, "loss": 0.0969, "step": 9380 }, { "epoch": 4.254557807722795, "grad_norm": 0.07914981991052628, "learning_rate": 9.457284508789922e-05, "loss": 0.0967, "step": 9390 }, { "epoch": 4.259087306080851, "grad_norm": 0.07766249775886536, "learning_rate": 9.438968471912718e-05, "loss": 0.0973, "step": 9400 }, { "epoch": 4.263616804438908, "grad_norm": 0.06642225384712219, "learning_rate": 9.420654322918234e-05, "loss": 0.0972, "step": 9410 }, { "epoch": 4.268146302796965, "grad_norm": 0.10396700352430344, "learning_rate": 9.402342123433968e-05, "loss": 0.0992, "step": 9420 }, { "epoch": 4.272675801155022, "grad_norm": 0.0772017240524292, "learning_rate": 9.384031935080849e-05, "loss": 0.0955, "step": 9430 }, { "epoch": 4.277205299513079, "grad_norm": 0.08579739928245544, "learning_rate": 9.365723819473034e-05, "loss": 0.0999, "step": 9440 }, { "epoch": 4.2817347978711355, "grad_norm": 0.07170093059539795, "learning_rate": 9.347417838217719e-05, "loss": 0.0978, "step": 9450 }, { "epoch": 4.286264296229192, "grad_norm": 0.09926804155111313, "learning_rate": 9.329114052914905e-05, "loss": 0.0975, "step": 9460 }, { "epoch": 4.290793794587249, "grad_norm": 0.0870131105184555, "learning_rate": 9.310812525157211e-05, "loss": 0.0976, "step": 9470 }, { "epoch": 4.295323292945306, "grad_norm": 0.09447421133518219, "learning_rate": 9.29251331652966e-05, "loss": 0.0978, "step": 9480 }, { "epoch": 4.299852791303363, "grad_norm": 0.06886494159698486, "learning_rate": 9.274216488609465e-05, "loss": 0.0956, "step": 9490 }, { "epoch": 4.30438228966142, "grad_norm": 0.06958340108394623, "learning_rate": 9.255922102965835e-05, "loss": 0.0978, "step": 9500 }, { "epoch": 4.3089117880194765, "grad_norm": 0.09395691007375717, "learning_rate": 9.237630221159751e-05, "loss": 0.0999, "step": 9510 }, { "epoch": 4.313441286377533, "grad_norm": 0.08615806698799133, "learning_rate": 9.219340904743781e-05, "loss": 0.0971, "step": 9520 }, { "epoch": 4.31797078473559, "grad_norm": 0.09322655200958252, "learning_rate": 9.201054215261849e-05, "loss": 0.1008, "step": 9530 }, { "epoch": 4.322500283093647, "grad_norm": 0.08992312103509903, "learning_rate": 9.182770214249046e-05, "loss": 0.0992, "step": 9540 }, { "epoch": 4.327029781451705, "grad_norm": 0.08701404929161072, "learning_rate": 9.164488963231415e-05, "loss": 0.0969, "step": 9550 }, { "epoch": 4.3315592798097615, "grad_norm": 0.07870589941740036, "learning_rate": 9.146210523725744e-05, "loss": 0.0989, "step": 9560 }, { "epoch": 4.336088778167818, "grad_norm": 0.061097387224435806, "learning_rate": 9.127934957239367e-05, "loss": 0.0986, "step": 9570 }, { "epoch": 4.340618276525875, "grad_norm": 0.08281367272138596, "learning_rate": 9.109662325269932e-05, "loss": 0.0988, "step": 9580 }, { "epoch": 4.345147774883932, "grad_norm": 0.09463726729154587, "learning_rate": 9.091392689305233e-05, "loss": 0.0977, "step": 9590 }, { "epoch": 4.349677273241989, "grad_norm": 0.07657352089881897, "learning_rate": 9.073126110822969e-05, "loss": 0.0995, "step": 9600 }, { "epoch": 4.354206771600046, "grad_norm": 0.08821120113134384, "learning_rate": 9.054862651290559e-05, "loss": 0.0972, "step": 9610 }, { "epoch": 4.3587362699581025, "grad_norm": 0.09997398406267166, "learning_rate": 9.036602372164922e-05, "loss": 0.0987, "step": 9620 }, { "epoch": 4.363265768316159, "grad_norm": 0.08112788945436478, "learning_rate": 9.018345334892275e-05, "loss": 0.0974, "step": 9630 }, { "epoch": 4.367795266674216, "grad_norm": 0.07112699747085571, "learning_rate": 9.000091600907928e-05, "loss": 0.0977, "step": 9640 }, { "epoch": 4.372324765032273, "grad_norm": 0.09066987037658691, "learning_rate": 8.981841231636073e-05, "loss": 0.0989, "step": 9650 }, { "epoch": 4.37685426339033, "grad_norm": 0.08122070878744125, "learning_rate": 8.96359428848958e-05, "loss": 0.0997, "step": 9660 }, { "epoch": 4.381383761748387, "grad_norm": 0.08035853505134583, "learning_rate": 8.945350832869795e-05, "loss": 0.0979, "step": 9670 }, { "epoch": 4.3859132601064434, "grad_norm": 0.07366472482681274, "learning_rate": 8.927110926166324e-05, "loss": 0.0969, "step": 9680 }, { "epoch": 4.3904427584645, "grad_norm": 0.0794186070561409, "learning_rate": 8.908874629756827e-05, "loss": 0.0983, "step": 9690 }, { "epoch": 4.394972256822557, "grad_norm": 0.06437776982784271, "learning_rate": 8.890642005006822e-05, "loss": 0.0984, "step": 9700 }, { "epoch": 4.399501755180614, "grad_norm": 0.07162316143512726, "learning_rate": 8.872413113269468e-05, "loss": 0.0975, "step": 9710 }, { "epoch": 4.404031253538671, "grad_norm": 0.07623278349637985, "learning_rate": 8.854188015885368e-05, "loss": 0.0998, "step": 9720 }, { "epoch": 4.408560751896728, "grad_norm": 0.07586734741926193, "learning_rate": 8.835966774182349e-05, "loss": 0.0973, "step": 9730 }, { "epoch": 4.413090250254784, "grad_norm": 0.0751037672162056, "learning_rate": 8.817749449475266e-05, "loss": 0.099, "step": 9740 }, { "epoch": 4.417619748612841, "grad_norm": 0.07702226936817169, "learning_rate": 8.799536103065794e-05, "loss": 0.098, "step": 9750 }, { "epoch": 4.422149246970898, "grad_norm": 0.07942003011703491, "learning_rate": 8.781326796242222e-05, "loss": 0.0982, "step": 9760 }, { "epoch": 4.426678745328955, "grad_norm": 0.07305794209241867, "learning_rate": 8.763121590279249e-05, "loss": 0.0964, "step": 9770 }, { "epoch": 4.431208243687012, "grad_norm": 0.07927001267671585, "learning_rate": 8.744920546437764e-05, "loss": 0.0985, "step": 9780 }, { "epoch": 4.4357377420450685, "grad_norm": 0.08005883544683456, "learning_rate": 8.726723725964662e-05, "loss": 0.0996, "step": 9790 }, { "epoch": 4.440267240403125, "grad_norm": 0.07482803612947464, "learning_rate": 8.708531190092619e-05, "loss": 0.1007, "step": 9800 }, { "epoch": 4.444796738761182, "grad_norm": 0.08192785084247589, "learning_rate": 8.690343000039895e-05, "loss": 0.1008, "step": 9810 }, { "epoch": 4.449326237119239, "grad_norm": 0.07693403214216232, "learning_rate": 8.67215921701013e-05, "loss": 0.0982, "step": 9820 }, { "epoch": 4.453855735477296, "grad_norm": 0.0875929445028305, "learning_rate": 8.653979902192125e-05, "loss": 0.1003, "step": 9830 }, { "epoch": 4.458385233835353, "grad_norm": 0.07676168531179428, "learning_rate": 8.635805116759656e-05, "loss": 0.0964, "step": 9840 }, { "epoch": 4.4629147321934095, "grad_norm": 0.0706658735871315, "learning_rate": 8.617634921871252e-05, "loss": 0.0996, "step": 9850 }, { "epoch": 4.467444230551466, "grad_norm": 0.08421318978071213, "learning_rate": 8.599469378669997e-05, "loss": 0.1004, "step": 9860 }, { "epoch": 4.471973728909523, "grad_norm": 0.06626369804143906, "learning_rate": 8.581308548283313e-05, "loss": 0.0961, "step": 9870 }, { "epoch": 4.47650322726758, "grad_norm": 0.10955769568681717, "learning_rate": 8.563152491822777e-05, "loss": 0.0989, "step": 9880 }, { "epoch": 4.481032725625637, "grad_norm": 0.07062443345785141, "learning_rate": 8.545001270383896e-05, "loss": 0.0996, "step": 9890 }, { "epoch": 4.485562223983694, "grad_norm": 0.09103110432624817, "learning_rate": 8.526854945045903e-05, "loss": 0.0969, "step": 9900 }, { "epoch": 4.4900917223417505, "grad_norm": 0.08335482329130173, "learning_rate": 8.508713576871564e-05, "loss": 0.0988, "step": 9910 }, { "epoch": 4.494621220699807, "grad_norm": 0.08251272886991501, "learning_rate": 8.490577226906952e-05, "loss": 0.1002, "step": 9920 }, { "epoch": 4.499150719057864, "grad_norm": 0.0790376290678978, "learning_rate": 8.472445956181266e-05, "loss": 0.0959, "step": 9930 }, { "epoch": 4.503680217415921, "grad_norm": 0.07596680521965027, "learning_rate": 8.454319825706607e-05, "loss": 0.0957, "step": 9940 }, { "epoch": 4.508209715773978, "grad_norm": 0.07809595763683319, "learning_rate": 8.436198896477777e-05, "loss": 0.0966, "step": 9950 }, { "epoch": 4.512739214132035, "grad_norm": 0.0959998071193695, "learning_rate": 8.418083229472081e-05, "loss": 0.0983, "step": 9960 }, { "epoch": 4.517268712490091, "grad_norm": 0.0705457404255867, "learning_rate": 8.399972885649115e-05, "loss": 0.0985, "step": 9970 }, { "epoch": 4.521798210848148, "grad_norm": 0.07132048159837723, "learning_rate": 8.381867925950558e-05, "loss": 0.0966, "step": 9980 }, { "epoch": 4.526327709206205, "grad_norm": 0.08615089952945709, "learning_rate": 8.363768411299978e-05, "loss": 0.097, "step": 9990 }, { "epoch": 4.530857207564262, "grad_norm": 0.07540059089660645, "learning_rate": 8.345674402602617e-05, "loss": 0.1016, "step": 10000 }, { "epoch": 4.535386705922319, "grad_norm": 0.0691477432847023, "learning_rate": 8.32758596074519e-05, "loss": 0.1008, "step": 10010 }, { "epoch": 4.539916204280376, "grad_norm": 0.07377701252698898, "learning_rate": 8.309503146595674e-05, "loss": 0.0995, "step": 10020 }, { "epoch": 4.544445702638432, "grad_norm": 0.06582989543676376, "learning_rate": 8.291426021003117e-05, "loss": 0.0974, "step": 10030 }, { "epoch": 4.548975200996489, "grad_norm": 0.07520575076341629, "learning_rate": 8.273354644797421e-05, "loss": 0.0995, "step": 10040 }, { "epoch": 4.553504699354546, "grad_norm": 0.0851583182811737, "learning_rate": 8.255289078789141e-05, "loss": 0.097, "step": 10050 }, { "epoch": 4.558034197712603, "grad_norm": 0.08124125748872757, "learning_rate": 8.237229383769283e-05, "loss": 0.1001, "step": 10060 }, { "epoch": 4.56256369607066, "grad_norm": 0.08267924189567566, "learning_rate": 8.219175620509092e-05, "loss": 0.0969, "step": 10070 }, { "epoch": 4.5670931944287165, "grad_norm": 0.07254312187433243, "learning_rate": 8.201127849759861e-05, "loss": 0.0993, "step": 10080 }, { "epoch": 4.571622692786774, "grad_norm": 0.08983401954174042, "learning_rate": 8.183086132252706e-05, "loss": 0.1003, "step": 10090 }, { "epoch": 4.576152191144831, "grad_norm": 0.06914500892162323, "learning_rate": 8.165050528698385e-05, "loss": 0.1002, "step": 10100 }, { "epoch": 4.580681689502888, "grad_norm": 0.06419195234775543, "learning_rate": 8.147021099787075e-05, "loss": 0.099, "step": 10110 }, { "epoch": 4.585211187860945, "grad_norm": 0.0637657642364502, "learning_rate": 8.12899790618818e-05, "loss": 0.0986, "step": 10120 }, { "epoch": 4.5897406862190016, "grad_norm": 0.06946605443954468, "learning_rate": 8.11098100855012e-05, "loss": 0.1003, "step": 10130 }, { "epoch": 4.594270184577058, "grad_norm": 0.06739254295825958, "learning_rate": 8.092970467500129e-05, "loss": 0.1002, "step": 10140 }, { "epoch": 4.598799682935115, "grad_norm": 0.058849554508924484, "learning_rate": 8.074966343644056e-05, "loss": 0.0991, "step": 10150 }, { "epoch": 4.603329181293172, "grad_norm": 0.07838159799575806, "learning_rate": 8.056968697566141e-05, "loss": 0.0986, "step": 10160 }, { "epoch": 4.607858679651229, "grad_norm": 0.06857123970985413, "learning_rate": 8.038977589828841e-05, "loss": 0.0995, "step": 10170 }, { "epoch": 4.612388178009286, "grad_norm": 0.06318482011556625, "learning_rate": 8.020993080972607e-05, "loss": 0.0993, "step": 10180 }, { "epoch": 4.6169176763673425, "grad_norm": 0.06283606588840485, "learning_rate": 8.003015231515683e-05, "loss": 0.0986, "step": 10190 }, { "epoch": 4.621447174725399, "grad_norm": 0.07274708896875381, "learning_rate": 7.985044101953905e-05, "loss": 0.0967, "step": 10200 }, { "epoch": 4.625976673083456, "grad_norm": 0.0730716809630394, "learning_rate": 7.967079752760498e-05, "loss": 0.0998, "step": 10210 }, { "epoch": 4.630506171441513, "grad_norm": 0.08666019141674042, "learning_rate": 7.949122244385869e-05, "loss": 0.0997, "step": 10220 }, { "epoch": 4.63503566979957, "grad_norm": 0.07280432432889938, "learning_rate": 7.931171637257407e-05, "loss": 0.098, "step": 10230 }, { "epoch": 4.639565168157627, "grad_norm": 0.07623490691184998, "learning_rate": 7.913227991779275e-05, "loss": 0.0972, "step": 10240 }, { "epoch": 4.6440946665156835, "grad_norm": 0.08786217123270035, "learning_rate": 7.895291368332213e-05, "loss": 0.0984, "step": 10250 }, { "epoch": 4.64862416487374, "grad_norm": 0.06460744142532349, "learning_rate": 7.877361827273333e-05, "loss": 0.1003, "step": 10260 }, { "epoch": 4.653153663231797, "grad_norm": 0.0875258669257164, "learning_rate": 7.859439428935907e-05, "loss": 0.0973, "step": 10270 }, { "epoch": 4.657683161589854, "grad_norm": 0.0640462338924408, "learning_rate": 7.841524233629182e-05, "loss": 0.097, "step": 10280 }, { "epoch": 4.662212659947911, "grad_norm": 0.08805970847606659, "learning_rate": 7.823616301638158e-05, "loss": 0.0977, "step": 10290 }, { "epoch": 4.666742158305968, "grad_norm": 0.08403537422418594, "learning_rate": 7.805715693223403e-05, "loss": 0.0974, "step": 10300 }, { "epoch": 4.671271656664024, "grad_norm": 0.08450974524021149, "learning_rate": 7.787822468620831e-05, "loss": 0.0996, "step": 10310 }, { "epoch": 4.675801155022081, "grad_norm": 0.06727894395589828, "learning_rate": 7.76993668804151e-05, "loss": 0.0968, "step": 10320 }, { "epoch": 4.680330653380138, "grad_norm": 0.07860536128282547, "learning_rate": 7.752058411671469e-05, "loss": 0.098, "step": 10330 }, { "epoch": 4.684860151738195, "grad_norm": 0.0783989354968071, "learning_rate": 7.734187699671475e-05, "loss": 0.1001, "step": 10340 }, { "epoch": 4.689389650096252, "grad_norm": 0.09318368136882782, "learning_rate": 7.716324612176848e-05, "loss": 0.102, "step": 10350 }, { "epoch": 4.693919148454309, "grad_norm": 0.06499195098876953, "learning_rate": 7.698469209297243e-05, "loss": 0.0972, "step": 10360 }, { "epoch": 4.698448646812365, "grad_norm": 0.08642645180225372, "learning_rate": 7.680621551116464e-05, "loss": 0.0976, "step": 10370 }, { "epoch": 4.702978145170422, "grad_norm": 0.08057048916816711, "learning_rate": 7.662781697692251e-05, "loss": 0.1001, "step": 10380 }, { "epoch": 4.707507643528479, "grad_norm": 0.07037744671106339, "learning_rate": 7.644949709056081e-05, "loss": 0.0954, "step": 10390 }, { "epoch": 4.712037141886536, "grad_norm": 0.07643935829401016, "learning_rate": 7.627125645212962e-05, "loss": 0.0988, "step": 10400 }, { "epoch": 4.716566640244593, "grad_norm": 0.06035691127181053, "learning_rate": 7.609309566141242e-05, "loss": 0.0951, "step": 10410 }, { "epoch": 4.7210961386026495, "grad_norm": 0.06654711812734604, "learning_rate": 7.591501531792394e-05, "loss": 0.0978, "step": 10420 }, { "epoch": 4.725625636960706, "grad_norm": 0.0829191505908966, "learning_rate": 7.573701602090826e-05, "loss": 0.0974, "step": 10430 }, { "epoch": 4.730155135318763, "grad_norm": 0.06532509624958038, "learning_rate": 7.555909836933668e-05, "loss": 0.1, "step": 10440 }, { "epoch": 4.73468463367682, "grad_norm": 0.07426194101572037, "learning_rate": 7.538126296190578e-05, "loss": 0.0978, "step": 10450 }, { "epoch": 4.739214132034877, "grad_norm": 0.07493621110916138, "learning_rate": 7.520351039703539e-05, "loss": 0.0982, "step": 10460 }, { "epoch": 4.743743630392934, "grad_norm": 0.07495691627264023, "learning_rate": 7.50258412728666e-05, "loss": 0.0988, "step": 10470 }, { "epoch": 4.7482731287509905, "grad_norm": 0.08136378973722458, "learning_rate": 7.484825618725968e-05, "loss": 0.097, "step": 10480 }, { "epoch": 4.752802627109047, "grad_norm": 0.06776054948568344, "learning_rate": 7.467075573779215e-05, "loss": 0.099, "step": 10490 }, { "epoch": 4.757332125467104, "grad_norm": 0.06532083451747894, "learning_rate": 7.449334052175665e-05, "loss": 0.1008, "step": 10500 }, { "epoch": 4.761861623825161, "grad_norm": 0.08907100558280945, "learning_rate": 7.431601113615909e-05, "loss": 0.0995, "step": 10510 }, { "epoch": 4.766391122183219, "grad_norm": 0.07240644842386246, "learning_rate": 7.413876817771655e-05, "loss": 0.0998, "step": 10520 }, { "epoch": 4.7709206205412755, "grad_norm": 0.07485652714967728, "learning_rate": 7.396161224285521e-05, "loss": 0.0964, "step": 10530 }, { "epoch": 4.775450118899332, "grad_norm": 0.07228762656450272, "learning_rate": 7.378454392770851e-05, "loss": 0.0999, "step": 10540 }, { "epoch": 4.779979617257389, "grad_norm": 0.08463383466005325, "learning_rate": 7.360756382811498e-05, "loss": 0.0962, "step": 10550 }, { "epoch": 4.784509115615446, "grad_norm": 0.08021671324968338, "learning_rate": 7.343067253961633e-05, "loss": 0.0982, "step": 10560 }, { "epoch": 4.789038613973503, "grad_norm": 0.0640299916267395, "learning_rate": 7.325387065745542e-05, "loss": 0.0987, "step": 10570 }, { "epoch": 4.79356811233156, "grad_norm": 0.08146077394485474, "learning_rate": 7.307715877657428e-05, "loss": 0.1004, "step": 10580 }, { "epoch": 4.7980976106896165, "grad_norm": 0.0729324147105217, "learning_rate": 7.290053749161197e-05, "loss": 0.098, "step": 10590 }, { "epoch": 4.802627109047673, "grad_norm": 0.08027558028697968, "learning_rate": 7.272400739690281e-05, "loss": 0.1003, "step": 10600 }, { "epoch": 4.80715660740573, "grad_norm": 0.07233118265867233, "learning_rate": 7.254756908647424e-05, "loss": 0.0969, "step": 10610 }, { "epoch": 4.811686105763787, "grad_norm": 0.08703139424324036, "learning_rate": 7.237122315404483e-05, "loss": 0.0978, "step": 10620 }, { "epoch": 4.816215604121844, "grad_norm": 0.09773527085781097, "learning_rate": 7.219497019302231e-05, "loss": 0.1006, "step": 10630 }, { "epoch": 4.820745102479901, "grad_norm": 0.07498451322317123, "learning_rate": 7.201881079650153e-05, "loss": 0.0953, "step": 10640 }, { "epoch": 4.8252746008379575, "grad_norm": 0.08071410655975342, "learning_rate": 7.184274555726251e-05, "loss": 0.0997, "step": 10650 }, { "epoch": 4.829804099196014, "grad_norm": 0.09239617735147476, "learning_rate": 7.166677506776847e-05, "loss": 0.0966, "step": 10660 }, { "epoch": 4.834333597554071, "grad_norm": 0.06160885840654373, "learning_rate": 7.149089992016369e-05, "loss": 0.0996, "step": 10670 }, { "epoch": 4.838863095912128, "grad_norm": 0.06242508441209793, "learning_rate": 7.131512070627174e-05, "loss": 0.0971, "step": 10680 }, { "epoch": 4.843392594270185, "grad_norm": 0.07087717205286026, "learning_rate": 7.113943801759328e-05, "loss": 0.0981, "step": 10690 }, { "epoch": 4.847922092628242, "grad_norm": 0.09145446121692657, "learning_rate": 7.096385244530421e-05, "loss": 0.1018, "step": 10700 }, { "epoch": 4.852451590986298, "grad_norm": 0.06915028393268585, "learning_rate": 7.078836458025367e-05, "loss": 0.0975, "step": 10710 }, { "epoch": 4.856981089344355, "grad_norm": 0.0731835886836052, "learning_rate": 7.06129750129619e-05, "loss": 0.0983, "step": 10720 }, { "epoch": 4.861510587702412, "grad_norm": 0.07754811644554138, "learning_rate": 7.043768433361848e-05, "loss": 0.0987, "step": 10730 }, { "epoch": 4.866040086060469, "grad_norm": 0.07234437018632889, "learning_rate": 7.026249313208013e-05, "loss": 0.0999, "step": 10740 }, { "epoch": 4.870569584418526, "grad_norm": 0.06629019230604172, "learning_rate": 7.008740199786891e-05, "loss": 0.0982, "step": 10750 }, { "epoch": 4.8750990827765825, "grad_norm": 0.07004278153181076, "learning_rate": 6.991241152017009e-05, "loss": 0.0984, "step": 10760 }, { "epoch": 4.879628581134639, "grad_norm": 0.07674950361251831, "learning_rate": 6.973752228783028e-05, "loss": 0.0967, "step": 10770 }, { "epoch": 4.884158079492696, "grad_norm": 0.08505762368440628, "learning_rate": 6.956273488935537e-05, "loss": 0.1013, "step": 10780 }, { "epoch": 4.888687577850753, "grad_norm": 0.07949452847242355, "learning_rate": 6.938804991290856e-05, "loss": 0.0985, "step": 10790 }, { "epoch": 4.89321707620881, "grad_norm": 0.08295728266239166, "learning_rate": 6.921346794630843e-05, "loss": 0.0989, "step": 10800 }, { "epoch": 4.897746574566867, "grad_norm": 0.06370176374912262, "learning_rate": 6.903898957702694e-05, "loss": 0.0973, "step": 10810 }, { "epoch": 4.9022760729249235, "grad_norm": 0.07928381115198135, "learning_rate": 6.886461539218739e-05, "loss": 0.0997, "step": 10820 }, { "epoch": 4.90680557128298, "grad_norm": 0.07781045138835907, "learning_rate": 6.870776818850459e-05, "loss": 0.1002, "step": 10830 }, { "epoch": 4.911335069641037, "grad_norm": 0.06968411058187485, "learning_rate": 6.853359357037234e-05, "loss": 0.0967, "step": 10840 }, { "epoch": 4.915864567999094, "grad_norm": 0.08793435990810394, "learning_rate": 6.835952483735004e-05, "loss": 0.0985, "step": 10850 }, { "epoch": 4.920394066357151, "grad_norm": 0.07273527979850769, "learning_rate": 6.818556257518263e-05, "loss": 0.1007, "step": 10860 }, { "epoch": 4.924923564715208, "grad_norm": 0.0791454091668129, "learning_rate": 6.80117073692567e-05, "loss": 0.0966, "step": 10870 }, { "epoch": 4.9294530630732645, "grad_norm": 0.07608039677143097, "learning_rate": 6.783795980459867e-05, "loss": 0.1012, "step": 10880 }, { "epoch": 4.933982561431321, "grad_norm": 0.07776329666376114, "learning_rate": 6.766432046587266e-05, "loss": 0.1003, "step": 10890 }, { "epoch": 4.938512059789378, "grad_norm": 0.0679519921541214, "learning_rate": 6.749078993737871e-05, "loss": 0.0991, "step": 10900 }, { "epoch": 4.943041558147435, "grad_norm": 0.07100383937358856, "learning_rate": 6.731736880305054e-05, "loss": 0.0988, "step": 10910 }, { "epoch": 4.947571056505492, "grad_norm": 0.0812440738081932, "learning_rate": 6.714405764645391e-05, "loss": 0.0998, "step": 10920 }, { "epoch": 4.952100554863549, "grad_norm": 0.07612130790948868, "learning_rate": 6.697085705078447e-05, "loss": 0.1007, "step": 10930 }, { "epoch": 4.956630053221605, "grad_norm": 0.112273670732975, "learning_rate": 6.679776759886581e-05, "loss": 0.0987, "step": 10940 }, { "epoch": 4.961159551579662, "grad_norm": 0.07123211026191711, "learning_rate": 6.662478987314751e-05, "loss": 0.0987, "step": 10950 }, { "epoch": 4.965689049937719, "grad_norm": 0.0752432569861412, "learning_rate": 6.645192445570321e-05, "loss": 0.0986, "step": 10960 }, { "epoch": 4.970218548295776, "grad_norm": 0.08591726422309875, "learning_rate": 6.627917192822862e-05, "loss": 0.0987, "step": 10970 }, { "epoch": 4.974748046653833, "grad_norm": 0.0789419561624527, "learning_rate": 6.610653287203959e-05, "loss": 0.1001, "step": 10980 }, { "epoch": 4.97927754501189, "grad_norm": 0.07303869724273682, "learning_rate": 6.593400786807011e-05, "loss": 0.1005, "step": 10990 }, { "epoch": 4.983807043369946, "grad_norm": 0.062059495598077774, "learning_rate": 6.57615974968704e-05, "loss": 0.0993, "step": 11000 }, { "epoch": 4.988336541728003, "grad_norm": 0.07526618242263794, "learning_rate": 6.558930233860497e-05, "loss": 0.0994, "step": 11010 }, { "epoch": 4.99286604008606, "grad_norm": 0.05961596965789795, "learning_rate": 6.541712297305054e-05, "loss": 0.0994, "step": 11020 }, { "epoch": 4.997395538444117, "grad_norm": 0.08421042561531067, "learning_rate": 6.524505997959425e-05, "loss": 0.0992, "step": 11030 }, { "epoch": 4.999660287623145, "eval_loss": 0.1612485647201538, "eval_runtime": 617.4712, "eval_samples_per_second": 12.746, "eval_steps_per_second": 1.594, "step": 11035 } ], "logging_steps": 10, "max_steps": 17656, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.476002265936691e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }