{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9958391123439667, "eval_steps": 500, "global_step": 1620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.380993430479339, "learning_rate": 1.234567901234568e-07, "loss": 1.2271, "step": 1 }, { "epoch": 0.01, "grad_norm": 5.348501093359578, "learning_rate": 6.17283950617284e-07, "loss": 1.2058, "step": 5 }, { "epoch": 0.02, "grad_norm": 3.168916677113339, "learning_rate": 1.234567901234568e-06, "loss": 1.1892, "step": 10 }, { "epoch": 0.03, "grad_norm": 1.858629179983347, "learning_rate": 1.8518518518518519e-06, "loss": 1.1398, "step": 15 }, { "epoch": 0.04, "grad_norm": 1.8628353409602443, "learning_rate": 2.469135802469136e-06, "loss": 1.14, "step": 20 }, { "epoch": 0.05, "grad_norm": 1.3320646229754611, "learning_rate": 3.08641975308642e-06, "loss": 1.1307, "step": 25 }, { "epoch": 0.06, "grad_norm": 1.37142743263814, "learning_rate": 3.7037037037037037e-06, "loss": 1.1099, "step": 30 }, { "epoch": 0.06, "grad_norm": 1.0893052711432178, "learning_rate": 4.3209876543209875e-06, "loss": 1.0872, "step": 35 }, { "epoch": 0.07, "grad_norm": 1.0420541911865238, "learning_rate": 4.938271604938272e-06, "loss": 1.0793, "step": 40 }, { "epoch": 0.08, "grad_norm": 0.9751244819319205, "learning_rate": 5.555555555555557e-06, "loss": 1.0845, "step": 45 }, { "epoch": 0.09, "grad_norm": 0.9394712444469778, "learning_rate": 6.17283950617284e-06, "loss": 1.0581, "step": 50 }, { "epoch": 0.1, "grad_norm": 0.9446819107089788, "learning_rate": 6.790123456790124e-06, "loss": 1.0711, "step": 55 }, { "epoch": 0.11, "grad_norm": 0.9219248821961198, "learning_rate": 7.4074074074074075e-06, "loss": 1.0224, "step": 60 }, { "epoch": 0.12, "grad_norm": 0.9248035052434173, "learning_rate": 8.024691358024692e-06, "loss": 1.0185, "step": 65 }, { "epoch": 0.13, "grad_norm": 0.9767921779181035, "learning_rate": 8.641975308641975e-06, "loss": 1.0403, "step": 70 }, { "epoch": 0.14, "grad_norm": 0.900831325888046, "learning_rate": 9.25925925925926e-06, "loss": 0.9946, "step": 75 }, { "epoch": 0.15, "grad_norm": 1.0329827196383643, "learning_rate": 9.876543209876543e-06, "loss": 0.9826, "step": 80 }, { "epoch": 0.16, "grad_norm": 1.028916413396711, "learning_rate": 1.0493827160493827e-05, "loss": 1.0026, "step": 85 }, { "epoch": 0.17, "grad_norm": 0.9986410221150659, "learning_rate": 1.1111111111111113e-05, "loss": 1.0196, "step": 90 }, { "epoch": 0.18, "grad_norm": 1.118007030382272, "learning_rate": 1.1728395061728398e-05, "loss": 1.0301, "step": 95 }, { "epoch": 0.18, "grad_norm": 1.0306686260018971, "learning_rate": 1.234567901234568e-05, "loss": 0.991, "step": 100 }, { "epoch": 0.19, "grad_norm": 1.1783877074338627, "learning_rate": 1.2962962962962964e-05, "loss": 0.9863, "step": 105 }, { "epoch": 0.2, "grad_norm": 1.3347304199894814, "learning_rate": 1.3580246913580248e-05, "loss": 1.0055, "step": 110 }, { "epoch": 0.21, "grad_norm": 1.1567825461094403, "learning_rate": 1.4197530864197532e-05, "loss": 0.9907, "step": 115 }, { "epoch": 0.22, "grad_norm": 1.401145384918854, "learning_rate": 1.4814814814814815e-05, "loss": 1.0195, "step": 120 }, { "epoch": 0.23, "grad_norm": 1.0619364925921315, "learning_rate": 1.54320987654321e-05, "loss": 0.9925, "step": 125 }, { "epoch": 0.24, "grad_norm": 1.1224187517180184, "learning_rate": 1.6049382716049385e-05, "loss": 0.9748, "step": 130 }, { "epoch": 0.25, "grad_norm": 1.058015781823731, "learning_rate": 1.6666666666666667e-05, "loss": 0.9642, "step": 135 }, { "epoch": 0.26, "grad_norm": 1.0057166388176237, "learning_rate": 1.728395061728395e-05, "loss": 0.9794, "step": 140 }, { "epoch": 0.27, "grad_norm": 1.219336306872448, "learning_rate": 1.7901234567901236e-05, "loss": 0.9848, "step": 145 }, { "epoch": 0.28, "grad_norm": 1.1778812615671694, "learning_rate": 1.851851851851852e-05, "loss": 0.9406, "step": 150 }, { "epoch": 0.29, "grad_norm": 1.4369525808973123, "learning_rate": 1.9135802469135804e-05, "loss": 0.9926, "step": 155 }, { "epoch": 0.3, "grad_norm": 1.267670821149399, "learning_rate": 1.9753086419753087e-05, "loss": 0.9552, "step": 160 }, { "epoch": 0.31, "grad_norm": 1.324502025754122, "learning_rate": 1.9999791072456062e-05, "loss": 0.9594, "step": 165 }, { "epoch": 0.31, "grad_norm": 1.6398800307247399, "learning_rate": 1.9998514324636053e-05, "loss": 0.9742, "step": 170 }, { "epoch": 0.32, "grad_norm": 1.282151405513815, "learning_rate": 1.999607704786645e-05, "loss": 0.9782, "step": 175 }, { "epoch": 0.33, "grad_norm": 1.368937000876658, "learning_rate": 1.9992479525042305e-05, "loss": 0.97, "step": 180 }, { "epoch": 0.34, "grad_norm": 1.1429149098857512, "learning_rate": 1.9987722173728587e-05, "loss": 0.9263, "step": 185 }, { "epoch": 0.35, "grad_norm": 1.142494542354463, "learning_rate": 1.9981805546111736e-05, "loss": 0.9612, "step": 190 }, { "epoch": 0.36, "grad_norm": 0.9661133733007405, "learning_rate": 1.9974730328935534e-05, "loss": 0.9564, "step": 195 }, { "epoch": 0.37, "grad_norm": 1.0956422654374287, "learning_rate": 1.996649734342143e-05, "loss": 0.957, "step": 200 }, { "epoch": 0.38, "grad_norm": 1.065395255714296, "learning_rate": 1.995710754517319e-05, "loss": 0.9428, "step": 205 }, { "epoch": 0.39, "grad_norm": 1.0850009958268814, "learning_rate": 1.9946562024066018e-05, "loss": 0.9515, "step": 210 }, { "epoch": 0.4, "grad_norm": 1.3698267000140338, "learning_rate": 1.9934862004120003e-05, "loss": 0.9441, "step": 215 }, { "epoch": 0.41, "grad_norm": 1.2180243717222954, "learning_rate": 1.9922008843358097e-05, "loss": 0.9484, "step": 220 }, { "epoch": 0.42, "grad_norm": 1.05421684125154, "learning_rate": 1.9908004033648452e-05, "loss": 0.9672, "step": 225 }, { "epoch": 0.43, "grad_norm": 1.2382950773680732, "learning_rate": 1.9892849200531294e-05, "loss": 0.9375, "step": 230 }, { "epoch": 0.43, "grad_norm": 1.1542981178332155, "learning_rate": 1.9876546103030195e-05, "loss": 0.9401, "step": 235 }, { "epoch": 0.44, "grad_norm": 1.0395211592834745, "learning_rate": 1.9859096633447965e-05, "loss": 0.965, "step": 240 }, { "epoch": 0.45, "grad_norm": 0.9620854176466995, "learning_rate": 1.9840502817146965e-05, "loss": 0.9254, "step": 245 }, { "epoch": 0.46, "grad_norm": 1.0602273541577587, "learning_rate": 1.9820766812314038e-05, "loss": 0.9307, "step": 250 }, { "epoch": 0.47, "grad_norm": 1.142745002321171, "learning_rate": 1.9799890909710013e-05, "loss": 0.9344, "step": 255 }, { "epoch": 0.48, "grad_norm": 1.0822420018459202, "learning_rate": 1.9777877532403816e-05, "loss": 0.9345, "step": 260 }, { "epoch": 0.49, "grad_norm": 1.165254793190554, "learning_rate": 1.9754729235491207e-05, "loss": 0.9148, "step": 265 }, { "epoch": 0.5, "grad_norm": 0.9461032408631161, "learning_rate": 1.973044870579824e-05, "loss": 0.9248, "step": 270 }, { "epoch": 0.51, "grad_norm": 1.0161807611793272, "learning_rate": 1.9705038761569372e-05, "loss": 0.9229, "step": 275 }, { "epoch": 0.52, "grad_norm": 1.0682171583928652, "learning_rate": 1.967850235214037e-05, "loss": 0.9133, "step": 280 }, { "epoch": 0.53, "grad_norm": 1.0674171129419805, "learning_rate": 1.9650842557595968e-05, "loss": 0.9281, "step": 285 }, { "epoch": 0.54, "grad_norm": 1.0578949150556498, "learning_rate": 1.9622062588412368e-05, "loss": 0.9192, "step": 290 }, { "epoch": 0.55, "grad_norm": 0.9900599023327956, "learning_rate": 1.9592165785084606e-05, "loss": 0.9312, "step": 295 }, { "epoch": 0.55, "grad_norm": 1.0090102752673205, "learning_rate": 1.95611556177388e-05, "loss": 0.9201, "step": 300 }, { "epoch": 0.56, "grad_norm": 1.0481187571731347, "learning_rate": 1.952903568572939e-05, "loss": 0.97, "step": 305 }, { "epoch": 0.57, "grad_norm": 1.0175192665589157, "learning_rate": 1.9495809717221362e-05, "loss": 0.9164, "step": 310 }, { "epoch": 0.58, "grad_norm": 1.1537464954378314, "learning_rate": 1.946148156875751e-05, "loss": 0.8919, "step": 315 }, { "epoch": 0.59, "grad_norm": 1.0953888335358495, "learning_rate": 1.9426055224810788e-05, "loss": 0.9372, "step": 320 }, { "epoch": 0.6, "grad_norm": 1.109481850979543, "learning_rate": 1.9389534797321886e-05, "loss": 0.9099, "step": 325 }, { "epoch": 0.61, "grad_norm": 0.9506715246162528, "learning_rate": 1.93519245252219e-05, "loss": 0.9136, "step": 330 }, { "epoch": 0.62, "grad_norm": 1.0986476835482393, "learning_rate": 1.9313228773940346e-05, "loss": 0.9082, "step": 335 }, { "epoch": 0.63, "grad_norm": 1.2893613575243312, "learning_rate": 1.9273452034898472e-05, "loss": 0.9263, "step": 340 }, { "epoch": 0.64, "grad_norm": 0.9810983163054904, "learning_rate": 1.9232598924987904e-05, "loss": 0.9067, "step": 345 }, { "epoch": 0.65, "grad_norm": 1.0414864956692913, "learning_rate": 1.9190674186034806e-05, "loss": 0.9325, "step": 350 }, { "epoch": 0.66, "grad_norm": 1.0303257386302838, "learning_rate": 1.914768268424946e-05, "loss": 0.9144, "step": 355 }, { "epoch": 0.67, "grad_norm": 0.8564282775083322, "learning_rate": 1.9103629409661468e-05, "loss": 0.922, "step": 360 }, { "epoch": 0.67, "grad_norm": 1.0162068121952108, "learning_rate": 1.905851947554054e-05, "loss": 0.9117, "step": 365 }, { "epoch": 0.68, "grad_norm": 0.9620441168199088, "learning_rate": 1.9012358117803007e-05, "loss": 0.9093, "step": 370 }, { "epoch": 0.69, "grad_norm": 0.9267690533225905, "learning_rate": 1.8965150694404094e-05, "loss": 0.8991, "step": 375 }, { "epoch": 0.7, "grad_norm": 0.8836709911598063, "learning_rate": 1.8916902684716004e-05, "loss": 0.875, "step": 380 }, { "epoch": 0.71, "grad_norm": 0.9323672084536093, "learning_rate": 1.8867619688891937e-05, "loss": 0.9045, "step": 385 }, { "epoch": 0.72, "grad_norm": 0.9390789515929351, "learning_rate": 1.881730742721608e-05, "loss": 0.8889, "step": 390 }, { "epoch": 0.73, "grad_norm": 0.8833432467724566, "learning_rate": 1.876597173943965e-05, "loss": 0.8833, "step": 395 }, { "epoch": 0.74, "grad_norm": 0.9323098875612197, "learning_rate": 1.871361858410308e-05, "loss": 0.9008, "step": 400 }, { "epoch": 0.75, "grad_norm": 0.9297018536651566, "learning_rate": 1.866025403784439e-05, "loss": 0.8825, "step": 405 }, { "epoch": 0.76, "grad_norm": 0.8582511160272933, "learning_rate": 1.8605884294693893e-05, "loss": 0.8854, "step": 410 }, { "epoch": 0.77, "grad_norm": 0.9765404816133791, "learning_rate": 1.8550515665355248e-05, "loss": 0.8704, "step": 415 }, { "epoch": 0.78, "grad_norm": 0.9316200284118313, "learning_rate": 1.8494154576472976e-05, "loss": 0.8944, "step": 420 }, { "epoch": 0.79, "grad_norm": 1.072766513059026, "learning_rate": 1.84368075698865e-05, "loss": 0.8842, "step": 425 }, { "epoch": 0.8, "grad_norm": 1.543811244450574, "learning_rate": 1.8378481301870865e-05, "loss": 0.8928, "step": 430 }, { "epoch": 0.8, "grad_norm": 1.233415241676456, "learning_rate": 1.8319182542364117e-05, "loss": 0.8876, "step": 435 }, { "epoch": 0.81, "grad_norm": 1.0407331796481731, "learning_rate": 1.825891817418153e-05, "loss": 0.8898, "step": 440 }, { "epoch": 0.82, "grad_norm": 0.9852677786684936, "learning_rate": 1.8197695192216702e-05, "loss": 0.8884, "step": 445 }, { "epoch": 0.83, "grad_norm": 0.9027726854768967, "learning_rate": 1.8135520702629677e-05, "loss": 0.8775, "step": 450 }, { "epoch": 0.84, "grad_norm": 1.0941452436773764, "learning_rate": 1.807240192202212e-05, "loss": 0.884, "step": 455 }, { "epoch": 0.85, "grad_norm": 1.107113850508076, "learning_rate": 1.8008346176599675e-05, "loss": 0.8746, "step": 460 }, { "epoch": 0.86, "grad_norm": 1.060890008040459, "learning_rate": 1.794336090132164e-05, "loss": 0.8952, "step": 465 }, { "epoch": 0.87, "grad_norm": 0.884966732457173, "learning_rate": 1.7877453639037957e-05, "loss": 0.8754, "step": 470 }, { "epoch": 0.88, "grad_norm": 1.0704754541130148, "learning_rate": 1.7810632039613735e-05, "loss": 0.8764, "step": 475 }, { "epoch": 0.89, "grad_norm": 0.9919332562506876, "learning_rate": 1.7742903859041324e-05, "loss": 0.9005, "step": 480 }, { "epoch": 0.9, "grad_norm": 0.9210227886163259, "learning_rate": 1.7674276958540073e-05, "loss": 0.8959, "step": 485 }, { "epoch": 0.91, "grad_norm": 1.00396374488261, "learning_rate": 1.7604759303643875e-05, "loss": 0.8801, "step": 490 }, { "epoch": 0.92, "grad_norm": 0.9278723416677115, "learning_rate": 1.7534358963276606e-05, "loss": 0.868, "step": 495 }, { "epoch": 0.92, "grad_norm": 0.9760092293453418, "learning_rate": 1.7463084108815587e-05, "loss": 0.8713, "step": 500 }, { "epoch": 0.93, "grad_norm": 0.9315652245047339, "learning_rate": 1.7390943013143086e-05, "loss": 0.8864, "step": 505 }, { "epoch": 0.94, "grad_norm": 0.9213276756309196, "learning_rate": 1.7317944049686125e-05, "loss": 0.8705, "step": 510 }, { "epoch": 0.95, "grad_norm": 0.873314757040681, "learning_rate": 1.7244095691444548e-05, "loss": 0.8566, "step": 515 }, { "epoch": 0.96, "grad_norm": 0.925108959774349, "learning_rate": 1.7169406510007592e-05, "loss": 0.8742, "step": 520 }, { "epoch": 0.97, "grad_norm": 0.9349095138962109, "learning_rate": 1.709388517455893e-05, "loss": 0.8792, "step": 525 }, { "epoch": 0.98, "grad_norm": 0.879571628524229, "learning_rate": 1.7017540450870488e-05, "loss": 0.8503, "step": 530 }, { "epoch": 0.99, "grad_norm": 0.9345436064226522, "learning_rate": 1.694038120028497e-05, "loss": 0.8655, "step": 535 }, { "epoch": 1.0, "grad_norm": 0.9685720075998935, "learning_rate": 1.686241637868734e-05, "loss": 0.8593, "step": 540 }, { "epoch": 1.01, "grad_norm": 1.3414804136782559, "learning_rate": 1.6783655035465283e-05, "loss": 0.7483, "step": 545 }, { "epoch": 1.02, "grad_norm": 0.9769664465584709, "learning_rate": 1.6704106312458878e-05, "loss": 0.7202, "step": 550 }, { "epoch": 1.03, "grad_norm": 0.9644265725777674, "learning_rate": 1.662377944289948e-05, "loss": 0.6971, "step": 555 }, { "epoch": 1.04, "grad_norm": 0.8362096432716036, "learning_rate": 1.654268375033802e-05, "loss": 0.6859, "step": 560 }, { "epoch": 1.04, "grad_norm": 0.8545500966881254, "learning_rate": 1.646082864756282e-05, "loss": 0.7126, "step": 565 }, { "epoch": 1.05, "grad_norm": 1.010020002670418, "learning_rate": 1.637822363550706e-05, "loss": 0.738, "step": 570 }, { "epoch": 1.06, "grad_norm": 0.9376974945477026, "learning_rate": 1.6294878302145985e-05, "loss": 0.7112, "step": 575 }, { "epoch": 1.07, "grad_norm": 0.980172490577581, "learning_rate": 1.6210802321384046e-05, "loss": 0.7048, "step": 580 }, { "epoch": 1.08, "grad_norm": 0.9692742001606004, "learning_rate": 1.6126005451932028e-05, "loss": 0.715, "step": 585 }, { "epoch": 1.09, "grad_norm": 0.8503062561620759, "learning_rate": 1.6040497536174378e-05, "loss": 0.7068, "step": 590 }, { "epoch": 1.1, "grad_norm": 1.0254807480179138, "learning_rate": 1.5954288499026782e-05, "loss": 0.7268, "step": 595 }, { "epoch": 1.11, "grad_norm": 0.9171158338661931, "learning_rate": 1.586738834678418e-05, "loss": 0.7273, "step": 600 }, { "epoch": 1.12, "grad_norm": 0.9117407742494892, "learning_rate": 1.577980716595934e-05, "loss": 0.7143, "step": 605 }, { "epoch": 1.13, "grad_norm": 0.9418369956367377, "learning_rate": 1.569155512211212e-05, "loss": 0.7151, "step": 610 }, { "epoch": 1.14, "grad_norm": 0.8973669405681762, "learning_rate": 1.5602642458669527e-05, "loss": 0.7096, "step": 615 }, { "epoch": 1.15, "grad_norm": 0.8881508172475038, "learning_rate": 1.5513079495736788e-05, "loss": 0.7104, "step": 620 }, { "epoch": 1.16, "grad_norm": 0.8876361729972202, "learning_rate": 1.5422876628899482e-05, "loss": 0.7195, "step": 625 }, { "epoch": 1.17, "grad_norm": 0.9036094377090472, "learning_rate": 1.5332044328016916e-05, "loss": 0.7092, "step": 630 }, { "epoch": 1.17, "grad_norm": 0.8281239604850335, "learning_rate": 1.5240593136006898e-05, "loss": 0.6929, "step": 635 }, { "epoch": 1.18, "grad_norm": 0.8790456122568177, "learning_rate": 1.5148533667622019e-05, "loss": 0.6925, "step": 640 }, { "epoch": 1.19, "grad_norm": 0.8177639630770596, "learning_rate": 1.505587660821759e-05, "loss": 0.6895, "step": 645 }, { "epoch": 1.2, "grad_norm": 0.8927483767429891, "learning_rate": 1.4962632712511395e-05, "loss": 0.7031, "step": 650 }, { "epoch": 1.21, "grad_norm": 0.8254512048587952, "learning_rate": 1.486881280333539e-05, "loss": 0.699, "step": 655 }, { "epoch": 1.22, "grad_norm": 0.8724637574634844, "learning_rate": 1.4774427770379492e-05, "loss": 0.7107, "step": 660 }, { "epoch": 1.23, "grad_norm": 0.8288821089286669, "learning_rate": 1.4679488568927615e-05, "loss": 0.7087, "step": 665 }, { "epoch": 1.24, "grad_norm": 0.8486744655635069, "learning_rate": 1.4584006218586096e-05, "loss": 0.6947, "step": 670 }, { "epoch": 1.25, "grad_norm": 0.957175575662007, "learning_rate": 1.4487991802004625e-05, "loss": 0.708, "step": 675 }, { "epoch": 1.26, "grad_norm": 0.8860372365894343, "learning_rate": 1.43914564635899e-05, "loss": 0.7133, "step": 680 }, { "epoch": 1.27, "grad_norm": 0.8519026179322932, "learning_rate": 1.4294411408212093e-05, "loss": 0.6917, "step": 685 }, { "epoch": 1.28, "grad_norm": 0.8436704959136121, "learning_rate": 1.4196867899904292e-05, "loss": 0.7086, "step": 690 }, { "epoch": 1.29, "grad_norm": 0.9215284562010572, "learning_rate": 1.4098837260555086e-05, "loss": 0.706, "step": 695 }, { "epoch": 1.29, "grad_norm": 0.8726092038996182, "learning_rate": 1.4000330868594428e-05, "loss": 0.715, "step": 700 }, { "epoch": 1.3, "grad_norm": 0.8335655205275098, "learning_rate": 1.390136015767295e-05, "loss": 0.7033, "step": 705 }, { "epoch": 1.31, "grad_norm": 0.8308170289028934, "learning_rate": 1.3801936615334848e-05, "loss": 0.7017, "step": 710 }, { "epoch": 1.32, "grad_norm": 0.8488389190620426, "learning_rate": 1.370207178168452e-05, "loss": 0.7011, "step": 715 }, { "epoch": 1.33, "grad_norm": 0.9139523556398262, "learning_rate": 1.3601777248047105e-05, "loss": 0.7066, "step": 720 }, { "epoch": 1.34, "grad_norm": 0.9381426787661671, "learning_rate": 1.3501064655623095e-05, "loss": 0.7173, "step": 725 }, { "epoch": 1.35, "grad_norm": 0.822817225044632, "learning_rate": 1.3399945694137109e-05, "loss": 0.6979, "step": 730 }, { "epoch": 1.36, "grad_norm": 0.916057907116943, "learning_rate": 1.3298432100481078e-05, "loss": 0.6942, "step": 735 }, { "epoch": 1.37, "grad_norm": 0.9293994240579861, "learning_rate": 1.3196535657351959e-05, "loss": 0.7102, "step": 740 }, { "epoch": 1.38, "grad_norm": 0.8531666161772917, "learning_rate": 1.309426819188409e-05, "loss": 0.6963, "step": 745 }, { "epoch": 1.39, "grad_norm": 0.8969051991163846, "learning_rate": 1.2991641574276419e-05, "loss": 0.7058, "step": 750 }, { "epoch": 1.4, "grad_norm": 0.8758153277433447, "learning_rate": 1.288866771641474e-05, "loss": 0.7033, "step": 755 }, { "epoch": 1.41, "grad_norm": 0.8800800572010911, "learning_rate": 1.2785358570489077e-05, "loss": 0.6951, "step": 760 }, { "epoch": 1.41, "grad_norm": 0.7903117710072121, "learning_rate": 1.2681726127606374e-05, "loss": 0.6992, "step": 765 }, { "epoch": 1.42, "grad_norm": 0.9044773533349648, "learning_rate": 1.2577782416398708e-05, "loss": 0.7204, "step": 770 }, { "epoch": 1.43, "grad_norm": 0.8539428431672353, "learning_rate": 1.2473539501627101e-05, "loss": 0.7212, "step": 775 }, { "epoch": 1.44, "grad_norm": 0.8455375361267863, "learning_rate": 1.2369009482781191e-05, "loss": 0.702, "step": 780 }, { "epoch": 1.45, "grad_norm": 0.8134470805856595, "learning_rate": 1.2264204492674816e-05, "loss": 0.7134, "step": 785 }, { "epoch": 1.46, "grad_norm": 0.9426558956565058, "learning_rate": 1.2159136696037763e-05, "loss": 0.6899, "step": 790 }, { "epoch": 1.47, "grad_norm": 1.0712082472086517, "learning_rate": 1.205381828810382e-05, "loss": 0.6867, "step": 795 }, { "epoch": 1.48, "grad_norm": 0.8650015131089676, "learning_rate": 1.1948261493195256e-05, "loss": 0.6862, "step": 800 }, { "epoch": 1.49, "grad_norm": 0.8092140890169357, "learning_rate": 1.1842478563303953e-05, "loss": 0.692, "step": 805 }, { "epoch": 1.5, "grad_norm": 0.884259378052905, "learning_rate": 1.1736481776669307e-05, "loss": 0.6956, "step": 810 }, { "epoch": 1.51, "grad_norm": 0.88390160177522, "learning_rate": 1.1630283436353098e-05, "loss": 0.6867, "step": 815 }, { "epoch": 1.52, "grad_norm": 0.8296289744950246, "learning_rate": 1.1523895868811472e-05, "loss": 0.7104, "step": 820 }, { "epoch": 1.53, "grad_norm": 0.8817584934930776, "learning_rate": 1.1417331422464206e-05, "loss": 0.7035, "step": 825 }, { "epoch": 1.53, "grad_norm": 0.8714688770326949, "learning_rate": 1.1310602466261422e-05, "loss": 0.6852, "step": 830 }, { "epoch": 1.54, "grad_norm": 0.8625649509092821, "learning_rate": 1.1203721388247924e-05, "loss": 0.6965, "step": 835 }, { "epoch": 1.55, "grad_norm": 0.8201921799899131, "learning_rate": 1.1096700594125318e-05, "loss": 0.682, "step": 840 }, { "epoch": 1.56, "grad_norm": 0.8435657261904709, "learning_rate": 1.0989552505812073e-05, "loss": 0.6954, "step": 845 }, { "epoch": 1.57, "grad_norm": 0.7847155172741475, "learning_rate": 1.088228956000172e-05, "loss": 0.6988, "step": 850 }, { "epoch": 1.58, "grad_norm": 0.8407369975482921, "learning_rate": 1.077492420671931e-05, "loss": 0.6852, "step": 855 }, { "epoch": 1.59, "grad_norm": 0.8516693758714967, "learning_rate": 1.0667468907876349e-05, "loss": 0.7018, "step": 860 }, { "epoch": 1.6, "grad_norm": 0.8428393845492996, "learning_rate": 1.0559936135824322e-05, "loss": 0.7013, "step": 865 }, { "epoch": 1.61, "grad_norm": 0.8598951874688993, "learning_rate": 1.0452338371907065e-05, "loss": 0.7056, "step": 870 }, { "epoch": 1.62, "grad_norm": 0.9089941816299408, "learning_rate": 1.0344688105012006e-05, "loss": 0.7008, "step": 875 }, { "epoch": 1.63, "grad_norm": 0.8025939134141212, "learning_rate": 1.0236997830120614e-05, "loss": 0.6995, "step": 880 }, { "epoch": 1.64, "grad_norm": 0.8322635796721002, "learning_rate": 1.0129280046858085e-05, "loss": 0.7119, "step": 885 }, { "epoch": 1.65, "grad_norm": 0.9168431530161661, "learning_rate": 1.0021547258042522e-05, "loss": 0.7166, "step": 890 }, { "epoch": 1.66, "grad_norm": 0.8719580027768455, "learning_rate": 9.913811968233716e-06, "loss": 0.6971, "step": 895 }, { "epoch": 1.66, "grad_norm": 0.8754135160174089, "learning_rate": 9.806086682281759e-06, "loss": 0.6926, "step": 900 }, { "epoch": 1.67, "grad_norm": 0.7545254920889488, "learning_rate": 9.69838390387558e-06, "loss": 0.6881, "step": 905 }, { "epoch": 1.68, "grad_norm": 0.8170094862170797, "learning_rate": 9.59071613409167e-06, "loss": 0.6986, "step": 910 }, { "epoch": 1.69, "grad_norm": 0.8727115323209758, "learning_rate": 9.483095869943056e-06, "loss": 0.701, "step": 915 }, { "epoch": 1.7, "grad_norm": 0.8217893100447828, "learning_rate": 9.375535602928776e-06, "loss": 0.7125, "step": 920 }, { "epoch": 1.71, "grad_norm": 0.7669081950681641, "learning_rate": 9.268047817583997e-06, "loss": 0.6951, "step": 925 }, { "epoch": 1.72, "grad_norm": 0.8691713855735111, "learning_rate": 9.160644990030932e-06, "loss": 0.7053, "step": 930 }, { "epoch": 1.73, "grad_norm": 0.8465735557284857, "learning_rate": 9.053339586530724e-06, "loss": 0.7101, "step": 935 }, { "epoch": 1.74, "grad_norm": 0.8556436244244698, "learning_rate": 8.946144062036496e-06, "loss": 0.6859, "step": 940 }, { "epoch": 1.75, "grad_norm": 0.8013250456726765, "learning_rate": 8.839070858747697e-06, "loss": 0.7001, "step": 945 }, { "epoch": 1.76, "grad_norm": 0.8848341169666858, "learning_rate": 8.732132404665947e-06, "loss": 0.67, "step": 950 }, { "epoch": 1.77, "grad_norm": 0.79454106581273, "learning_rate": 8.625341112152487e-06, "loss": 0.6821, "step": 955 }, { "epoch": 1.78, "grad_norm": 0.8304144336068819, "learning_rate": 8.518709376487515e-06, "loss": 0.6831, "step": 960 }, { "epoch": 1.78, "grad_norm": 0.7913540202550371, "learning_rate": 8.412249574431429e-06, "loss": 0.7086, "step": 965 }, { "epoch": 1.79, "grad_norm": 0.8064141920037301, "learning_rate": 8.305974062788278e-06, "loss": 0.6953, "step": 970 }, { "epoch": 1.8, "grad_norm": 0.7828524157109749, "learning_rate": 8.199895176971489e-06, "loss": 0.6983, "step": 975 }, { "epoch": 1.81, "grad_norm": 0.8098706298944635, "learning_rate": 8.094025229572111e-06, "loss": 0.6966, "step": 980 }, { "epoch": 1.82, "grad_norm": 0.849544093718342, "learning_rate": 7.988376508929676e-06, "loss": 0.6727, "step": 985 }, { "epoch": 1.83, "grad_norm": 0.7975592097506601, "learning_rate": 7.882961277705897e-06, "loss": 0.6856, "step": 990 }, { "epoch": 1.84, "grad_norm": 0.8523673745310055, "learning_rate": 7.777791771461332e-06, "loss": 0.7029, "step": 995 }, { "epoch": 1.85, "grad_norm": 0.8204875573442207, "learning_rate": 7.672880197235223e-06, "loss": 0.6864, "step": 1000 }, { "epoch": 1.86, "grad_norm": 0.8248700076350647, "learning_rate": 7.568238732128586e-06, "loss": 0.6835, "step": 1005 }, { "epoch": 1.87, "grad_norm": 0.8191493692115818, "learning_rate": 7.463879521890847e-06, "loss": 0.6943, "step": 1010 }, { "epoch": 1.88, "grad_norm": 0.7791309739864153, "learning_rate": 7.359814679510065e-06, "loss": 0.7092, "step": 1015 }, { "epoch": 1.89, "grad_norm": 0.7843289733720519, "learning_rate": 7.256056283806987e-06, "loss": 0.6937, "step": 1020 }, { "epoch": 1.9, "grad_norm": 0.7842615760396254, "learning_rate": 7.152616378033043e-06, "loss": 0.7117, "step": 1025 }, { "epoch": 1.9, "grad_norm": 0.7829943694050134, "learning_rate": 7.049506968472497e-06, "loss": 0.7065, "step": 1030 }, { "epoch": 1.91, "grad_norm": 0.7561205508975692, "learning_rate": 6.94674002304887e-06, "loss": 0.6857, "step": 1035 }, { "epoch": 1.92, "grad_norm": 0.7512481608435314, "learning_rate": 6.844327469935826e-06, "loss": 0.6766, "step": 1040 }, { "epoch": 1.93, "grad_norm": 0.7532241014261504, "learning_rate": 6.742281196172663e-06, "loss": 0.6696, "step": 1045 }, { "epoch": 1.94, "grad_norm": 0.723095218170628, "learning_rate": 6.640613046284581e-06, "loss": 0.6695, "step": 1050 }, { "epoch": 1.95, "grad_norm": 0.7252212718824871, "learning_rate": 6.539334820907889e-06, "loss": 0.667, "step": 1055 }, { "epoch": 1.96, "grad_norm": 0.7830880287688797, "learning_rate": 6.438458275420309e-06, "loss": 0.683, "step": 1060 }, { "epoch": 1.97, "grad_norm": 0.8130835872824888, "learning_rate": 6.337995118576521e-06, "loss": 0.692, "step": 1065 }, { "epoch": 1.98, "grad_norm": 0.7894458277368371, "learning_rate": 6.23795701114912e-06, "loss": 0.7021, "step": 1070 }, { "epoch": 1.99, "grad_norm": 0.8367949367657657, "learning_rate": 6.138355564575169e-06, "loss": 0.6949, "step": 1075 }, { "epoch": 2.0, "grad_norm": 0.7786700756632423, "learning_rate": 6.039202339608432e-06, "loss": 0.6967, "step": 1080 }, { "epoch": 2.01, "grad_norm": 0.9633228870304994, "learning_rate": 5.9405088449775375e-06, "loss": 0.5819, "step": 1085 }, { "epoch": 2.02, "grad_norm": 1.0344070619967278, "learning_rate": 5.842286536050144e-06, "loss": 0.5011, "step": 1090 }, { "epoch": 2.02, "grad_norm": 1.0538641350381497, "learning_rate": 5.744546813503327e-06, "loss": 0.5319, "step": 1095 }, { "epoch": 2.03, "grad_norm": 0.8864489388413449, "learning_rate": 5.647301022000284e-06, "loss": 0.4926, "step": 1100 }, { "epoch": 2.04, "grad_norm": 0.8474505807265904, "learning_rate": 5.550560448873575e-06, "loss": 0.509, "step": 1105 }, { "epoch": 2.05, "grad_norm": 0.8267823766957728, "learning_rate": 5.454336322814995e-06, "loss": 0.5094, "step": 1110 }, { "epoch": 2.06, "grad_norm": 0.8086042498258639, "learning_rate": 5.358639812572244e-06, "loss": 0.4892, "step": 1115 }, { "epoch": 2.07, "grad_norm": 0.7975013388182653, "learning_rate": 5.263482025652591e-06, "loss": 0.501, "step": 1120 }, { "epoch": 2.08, "grad_norm": 0.8115736728203711, "learning_rate": 5.168874007033615e-06, "loss": 0.4934, "step": 1125 }, { "epoch": 2.09, "grad_norm": 0.7794798018780561, "learning_rate": 5.074826737881202e-06, "loss": 0.5055, "step": 1130 }, { "epoch": 2.1, "grad_norm": 0.795406266403286, "learning_rate": 4.981351134274981e-06, "loss": 0.5062, "step": 1135 }, { "epoch": 2.11, "grad_norm": 0.7805878111888855, "learning_rate": 4.888458045941269e-06, "loss": 0.5016, "step": 1140 }, { "epoch": 2.12, "grad_norm": 0.7875089983056415, "learning_rate": 4.796158254993768e-06, "loss": 0.4976, "step": 1145 }, { "epoch": 2.13, "grad_norm": 0.8049951869389942, "learning_rate": 4.704462474682055e-06, "loss": 0.4987, "step": 1150 }, { "epoch": 2.14, "grad_norm": 0.8009379405913797, "learning_rate": 4.613381348148125e-06, "loss": 0.5092, "step": 1155 }, { "epoch": 2.15, "grad_norm": 0.7841732209811066, "learning_rate": 4.522925447191006e-06, "loss": 0.5061, "step": 1160 }, { "epoch": 2.15, "grad_norm": 0.8055813206538527, "learning_rate": 4.433105271039722e-06, "loss": 0.5303, "step": 1165 }, { "epoch": 2.16, "grad_norm": 0.8150894233057778, "learning_rate": 4.343931245134616e-06, "loss": 0.5078, "step": 1170 }, { "epoch": 2.17, "grad_norm": 0.8052165058822955, "learning_rate": 4.255413719917294e-06, "loss": 0.5124, "step": 1175 }, { "epoch": 2.18, "grad_norm": 0.810378912782989, "learning_rate": 4.167562969629233e-06, "loss": 0.5159, "step": 1180 }, { "epoch": 2.19, "grad_norm": 0.7874419439695609, "learning_rate": 4.080389191119241e-06, "loss": 0.4961, "step": 1185 }, { "epoch": 2.2, "grad_norm": 0.8205952521678742, "learning_rate": 3.9939025026599335e-06, "loss": 0.5109, "step": 1190 }, { "epoch": 2.21, "grad_norm": 0.7961082784450934, "learning_rate": 3.908112942773278e-06, "loss": 0.5106, "step": 1195 }, { "epoch": 2.22, "grad_norm": 0.7902941678889169, "learning_rate": 3.823030469065431e-06, "loss": 0.5087, "step": 1200 }, { "epoch": 2.23, "grad_norm": 0.7993525374229726, "learning_rate": 3.738664957070964e-06, "loss": 0.492, "step": 1205 }, { "epoch": 2.24, "grad_norm": 0.809158748195732, "learning_rate": 3.655026199106595e-06, "loss": 0.4983, "step": 1210 }, { "epoch": 2.25, "grad_norm": 0.7673626425314425, "learning_rate": 3.5721239031346067e-06, "loss": 0.4887, "step": 1215 }, { "epoch": 2.26, "grad_norm": 0.7999177297746718, "learning_rate": 3.489967691636038e-06, "loss": 0.5028, "step": 1220 }, { "epoch": 2.27, "grad_norm": 0.7945863024103252, "learning_rate": 3.408567100493787e-06, "loss": 0.4927, "step": 1225 }, { "epoch": 2.27, "grad_norm": 0.786058386688595, "learning_rate": 3.3279315778858034e-06, "loss": 0.4974, "step": 1230 }, { "epoch": 2.28, "grad_norm": 0.7807385510973233, "learning_rate": 3.248070483188426e-06, "loss": 0.4974, "step": 1235 }, { "epoch": 2.29, "grad_norm": 0.7775262685810439, "learning_rate": 3.1689930858900265e-06, "loss": 0.5117, "step": 1240 }, { "epoch": 2.3, "grad_norm": 0.8435777447984063, "learning_rate": 3.090708564515125e-06, "loss": 0.496, "step": 1245 }, { "epoch": 2.31, "grad_norm": 0.7632453857565443, "learning_rate": 3.013226005559009e-06, "loss": 0.4912, "step": 1250 }, { "epoch": 2.32, "grad_norm": 0.7845597758185894, "learning_rate": 2.936554402433087e-06, "loss": 0.4909, "step": 1255 }, { "epoch": 2.33, "grad_norm": 0.7970687413033447, "learning_rate": 2.8607026544210115e-06, "loss": 0.488, "step": 1260 }, { "epoch": 2.34, "grad_norm": 0.7705687660032146, "learning_rate": 2.785679565645726e-06, "loss": 0.492, "step": 1265 }, { "epoch": 2.35, "grad_norm": 0.7871629482025935, "learning_rate": 2.7114938440475926e-06, "loss": 0.4977, "step": 1270 }, { "epoch": 2.36, "grad_norm": 0.7626742896975243, "learning_rate": 2.6381541003736486e-06, "loss": 0.5035, "step": 1275 }, { "epoch": 2.37, "grad_norm": 0.7747493631843794, "learning_rate": 2.5656688471781455e-06, "loss": 0.4865, "step": 1280 }, { "epoch": 2.38, "grad_norm": 0.7768979283390781, "learning_rate": 2.4940464978345182e-06, "loss": 0.4915, "step": 1285 }, { "epoch": 2.39, "grad_norm": 0.770753571366999, "learning_rate": 2.423295365558821e-06, "loss": 0.4877, "step": 1290 }, { "epoch": 2.39, "grad_norm": 0.796821433348244, "learning_rate": 2.3534236624448302e-06, "loss": 0.5015, "step": 1295 }, { "epoch": 2.4, "grad_norm": 0.7686662201911335, "learning_rate": 2.284439498510854e-06, "loss": 0.5033, "step": 1300 }, { "epoch": 2.41, "grad_norm": 0.7995269080777666, "learning_rate": 2.2163508807584e-06, "loss": 0.5032, "step": 1305 }, { "epoch": 2.42, "grad_norm": 0.7798024100681243, "learning_rate": 2.149165712242811e-06, "loss": 0.508, "step": 1310 }, { "epoch": 2.43, "grad_norm": 0.7952549320908302, "learning_rate": 2.0828917911559544e-06, "loss": 0.4991, "step": 1315 }, { "epoch": 2.44, "grad_norm": 0.7562949546743744, "learning_rate": 2.01753680992107e-06, "loss": 0.479, "step": 1320 }, { "epoch": 2.45, "grad_norm": 0.7738313454477512, "learning_rate": 1.953108354299932e-06, "loss": 0.5101, "step": 1325 }, { "epoch": 2.46, "grad_norm": 0.7632280693175711, "learning_rate": 1.8896139025123549e-06, "loss": 0.5029, "step": 1330 }, { "epoch": 2.47, "grad_norm": 0.7782927279263342, "learning_rate": 1.8270608243681953e-06, "loss": 0.4915, "step": 1335 }, { "epoch": 2.48, "grad_norm": 0.7721170069306357, "learning_rate": 1.7654563804119395e-06, "loss": 0.4949, "step": 1340 }, { "epoch": 2.49, "grad_norm": 0.7569293553310351, "learning_rate": 1.7048077210799774e-06, "loss": 0.4952, "step": 1345 }, { "epoch": 2.5, "grad_norm": 0.7713352384960788, "learning_rate": 1.6451218858706374e-06, "loss": 0.4915, "step": 1350 }, { "epoch": 2.51, "grad_norm": 0.8073489982779348, "learning_rate": 1.5864058025271245e-06, "loss": 0.5019, "step": 1355 }, { "epoch": 2.52, "grad_norm": 0.7532642687412483, "learning_rate": 1.5286662862334035e-06, "loss": 0.4817, "step": 1360 }, { "epoch": 2.52, "grad_norm": 0.7619263721625219, "learning_rate": 1.47191003882317e-06, "loss": 0.4926, "step": 1365 }, { "epoch": 2.53, "grad_norm": 0.7887732532731833, "learning_rate": 1.4161436480019651e-06, "loss": 0.51, "step": 1370 }, { "epoch": 2.54, "grad_norm": 0.756428418923445, "learning_rate": 1.3613735865825305e-06, "loss": 0.5113, "step": 1375 }, { "epoch": 2.55, "grad_norm": 0.7908916608851263, "learning_rate": 1.307606211733522e-06, "loss": 0.4872, "step": 1380 }, { "epoch": 2.56, "grad_norm": 0.7849304184738213, "learning_rate": 1.2548477642416258e-06, "loss": 0.4885, "step": 1385 }, { "epoch": 2.57, "grad_norm": 0.7425717403698943, "learning_rate": 1.2031043677871812e-06, "loss": 0.483, "step": 1390 }, { "epoch": 2.58, "grad_norm": 0.7449369354911503, "learning_rate": 1.152382028233422e-06, "loss": 0.4898, "step": 1395 }, { "epoch": 2.59, "grad_norm": 0.7711579503984141, "learning_rate": 1.102686632929363e-06, "loss": 0.5056, "step": 1400 }, { "epoch": 2.6, "grad_norm": 0.7859105451086833, "learning_rate": 1.0540239500264516e-06, "loss": 0.4914, "step": 1405 }, { "epoch": 2.61, "grad_norm": 0.7798228268418401, "learning_rate": 1.0063996278090704e-06, "loss": 0.5189, "step": 1410 }, { "epoch": 2.62, "grad_norm": 0.7615368434177107, "learning_rate": 9.598191940389257e-07, "loss": 0.5048, "step": 1415 }, { "epoch": 2.63, "grad_norm": 0.7767400021550324, "learning_rate": 9.142880553134515e-07, "loss": 0.4886, "step": 1420 }, { "epoch": 2.64, "grad_norm": 0.7671973600539562, "learning_rate": 8.698114964382598e-07, "loss": 0.4944, "step": 1425 }, { "epoch": 2.64, "grad_norm": 0.7912890267916621, "learning_rate": 8.26394679813729e-07, "loss": 0.5031, "step": 1430 }, { "epoch": 2.65, "grad_norm": 0.7411737851307407, "learning_rate": 7.840426448358085e-07, "loss": 0.4921, "step": 1435 }, { "epoch": 2.66, "grad_norm": 0.7523097552132488, "learning_rate": 7.427603073110967e-07, "loss": 0.4976, "step": 1440 }, { "epoch": 2.67, "grad_norm": 0.7706118040011497, "learning_rate": 7.025524588862542e-07, "loss": 0.483, "step": 1445 }, { "epoch": 2.68, "grad_norm": 0.7825700623120088, "learning_rate": 6.634237664918486e-07, "loss": 0.504, "step": 1450 }, { "epoch": 2.69, "grad_norm": 0.7420108171750802, "learning_rate": 6.253787718006499e-07, "loss": 0.4842, "step": 1455 }, { "epoch": 2.7, "grad_norm": 0.7658665779082047, "learning_rate": 5.884218907004902e-07, "loss": 0.4949, "step": 1460 }, { "epoch": 2.71, "grad_norm": 0.75083219136469, "learning_rate": 5.525574127817046e-07, "loss": 0.4824, "step": 1465 }, { "epoch": 2.72, "grad_norm": 0.7586917678605787, "learning_rate": 5.177895008392353e-07, "loss": 0.4926, "step": 1470 }, { "epoch": 2.73, "grad_norm": 0.7470158525401197, "learning_rate": 4.841221903894633e-07, "loss": 0.4882, "step": 1475 }, { "epoch": 2.74, "grad_norm": 0.7772763452878679, "learning_rate": 4.515593892018e-07, "loss": 0.5021, "step": 1480 }, { "epoch": 2.75, "grad_norm": 0.7477051621945912, "learning_rate": 4.2010487684511105e-07, "loss": 0.5065, "step": 1485 }, { "epoch": 2.76, "grad_norm": 0.7566118062090599, "learning_rate": 3.8976230424902616e-07, "loss": 0.5088, "step": 1490 }, { "epoch": 2.76, "grad_norm": 0.766254582563817, "learning_rate": 3.605351932801693e-07, "loss": 0.4975, "step": 1495 }, { "epoch": 2.77, "grad_norm": 0.7839374958680789, "learning_rate": 3.3242693633337986e-07, "loss": 0.4922, "step": 1500 }, { "epoch": 2.78, "grad_norm": 0.7509582193755757, "learning_rate": 3.054407959379557e-07, "loss": 0.4836, "step": 1505 }, { "epoch": 2.79, "grad_norm": 0.7549781441743341, "learning_rate": 2.795799043789682e-07, "loss": 0.4913, "step": 1510 }, { "epoch": 2.8, "grad_norm": 0.7280618469591504, "learning_rate": 2.548472633337007e-07, "loss": 0.5008, "step": 1515 }, { "epoch": 2.81, "grad_norm": 0.7426209450041604, "learning_rate": 2.3124574352324401e-07, "loss": 0.4859, "step": 1520 }, { "epoch": 2.82, "grad_norm": 0.730840640795566, "learning_rate": 2.0877808437928638e-07, "loss": 0.5015, "step": 1525 }, { "epoch": 2.83, "grad_norm": 0.7454219518836528, "learning_rate": 1.874468937261531e-07, "loss": 0.511, "step": 1530 }, { "epoch": 2.84, "grad_norm": 0.7526874859698496, "learning_rate": 1.6725464747811448e-07, "loss": 0.4833, "step": 1535 }, { "epoch": 2.85, "grad_norm": 0.7749898887857551, "learning_rate": 1.4820368935200003e-07, "loss": 0.5081, "step": 1540 }, { "epoch": 2.86, "grad_norm": 0.7520293477111654, "learning_rate": 1.3029623059517493e-07, "loss": 0.4863, "step": 1545 }, { "epoch": 2.87, "grad_norm": 0.7551486931155532, "learning_rate": 1.1353434972886879e-07, "loss": 0.477, "step": 1550 }, { "epoch": 2.88, "grad_norm": 0.7517623874328504, "learning_rate": 9.791999230692628e-08, "loss": 0.5004, "step": 1555 }, { "epoch": 2.88, "grad_norm": 0.7473196143337467, "learning_rate": 8.345497068998897e-08, "loss": 0.4991, "step": 1560 }, { "epoch": 2.89, "grad_norm": 0.7371970090752852, "learning_rate": 7.014096383512802e-08, "loss": 0.502, "step": 1565 }, { "epoch": 2.9, "grad_norm": 0.7598898724156786, "learning_rate": 5.797951710097449e-08, "loss": 0.4858, "step": 1570 }, { "epoch": 2.91, "grad_norm": 0.7632140847196374, "learning_rate": 4.6972042068341714e-08, "loss": 0.494, "step": 1575 }, { "epoch": 2.92, "grad_norm": 0.7741502455508422, "learning_rate": 3.711981637639084e-08, "loss": 0.4858, "step": 1580 }, { "epoch": 2.93, "grad_norm": 0.7521002538915811, "learning_rate": 2.8423983574328296e-08, "loss": 0.4953, "step": 1585 }, { "epoch": 2.94, "grad_norm": 0.7791912328389965, "learning_rate": 2.088555298867978e-08, "loss": 0.5056, "step": 1590 }, { "epoch": 2.95, "grad_norm": 0.7663549047032114, "learning_rate": 1.4505399606130621e-08, "loss": 0.4958, "step": 1595 }, { "epoch": 2.96, "grad_norm": 0.7692872828036317, "learning_rate": 9.284263971972574e-09, "loss": 0.4986, "step": 1600 }, { "epoch": 2.97, "grad_norm": 0.782988216493056, "learning_rate": 5.222752104147022e-09, "loss": 0.5066, "step": 1605 }, { "epoch": 2.98, "grad_norm": 0.7892002209997082, "learning_rate": 2.3213354229001395e-09, "loss": 0.5093, "step": 1610 }, { "epoch": 2.99, "grad_norm": 0.7748663851832998, "learning_rate": 5.803506960722072e-10, "loss": 0.5038, "step": 1615 }, { "epoch": 3.0, "grad_norm": 0.7392896696894132, "learning_rate": 0.0, "loss": 0.4834, "step": 1620 }, { "epoch": 3.0, "step": 1620, "total_flos": 1470053153243136.0, "train_loss": 0.7158041251294407, "train_runtime": 19809.7521, "train_samples_per_second": 9.171, "train_steps_per_second": 0.082 } ], "logging_steps": 5, "max_steps": 1620, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1470053153243136.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }