{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00032, "grad_norm": 20.05617380625413, "learning_rate": 6.389776357827476e-08, "loss": 1.817, "step": 1 }, { "epoch": 0.0016, "grad_norm": 24.170327240578573, "learning_rate": 3.194888178913738e-07, "loss": 1.8628, "step": 5 }, { "epoch": 0.0032, "grad_norm": 17.63201705481064, "learning_rate": 6.389776357827476e-07, "loss": 1.7731, "step": 10 }, { "epoch": 0.0048, "grad_norm": 12.659034102299168, "learning_rate": 9.584664536741215e-07, "loss": 1.6969, "step": 15 }, { "epoch": 0.0064, "grad_norm": 8.71204128591294, "learning_rate": 1.2779552715654952e-06, "loss": 1.5956, "step": 20 }, { "epoch": 0.008, "grad_norm": 9.482900190556371, "learning_rate": 1.5974440894568691e-06, "loss": 1.5189, "step": 25 }, { "epoch": 0.0096, "grad_norm": 9.082892230070819, "learning_rate": 1.916932907348243e-06, "loss": 1.5307, "step": 30 }, { "epoch": 0.0112, "grad_norm": 8.71715912913066, "learning_rate": 2.2364217252396165e-06, "loss": 1.4863, "step": 35 }, { "epoch": 0.0128, "grad_norm": 8.266982509072658, "learning_rate": 2.5559105431309904e-06, "loss": 1.4411, "step": 40 }, { "epoch": 0.0144, "grad_norm": 7.43964847984838, "learning_rate": 2.8753993610223648e-06, "loss": 1.4772, "step": 45 }, { "epoch": 0.016, "grad_norm": 8.285226877478248, "learning_rate": 3.1948881789137383e-06, "loss": 1.4813, "step": 50 }, { "epoch": 0.0176, "grad_norm": 8.144708422975583, "learning_rate": 3.514376996805112e-06, "loss": 1.4048, "step": 55 }, { "epoch": 0.0192, "grad_norm": 7.597036261894608, "learning_rate": 3.833865814696486e-06, "loss": 1.3852, "step": 60 }, { "epoch": 0.0208, "grad_norm": 7.424686782409782, "learning_rate": 4.15335463258786e-06, "loss": 1.4124, "step": 65 }, { "epoch": 0.0224, "grad_norm": 7.01187059061448, "learning_rate": 4.472843450479233e-06, "loss": 1.3525, "step": 70 }, { "epoch": 0.024, "grad_norm": 8.203285756665721, "learning_rate": 4.792332268370608e-06, "loss": 1.4641, "step": 75 }, { "epoch": 0.0256, "grad_norm": 8.91120559492055, "learning_rate": 5.111821086261981e-06, "loss": 1.4319, "step": 80 }, { "epoch": 0.0272, "grad_norm": 7.95728896500941, "learning_rate": 5.431309904153355e-06, "loss": 1.4198, "step": 85 }, { "epoch": 0.0288, "grad_norm": 8.93733979889198, "learning_rate": 5.7507987220447296e-06, "loss": 1.4431, "step": 90 }, { "epoch": 0.0304, "grad_norm": 7.821680056160703, "learning_rate": 6.070287539936103e-06, "loss": 1.45, "step": 95 }, { "epoch": 0.032, "grad_norm": 8.599771427970811, "learning_rate": 6.3897763578274765e-06, "loss": 1.4271, "step": 100 }, { "epoch": 0.0336, "grad_norm": 8.069041724261744, "learning_rate": 6.709265175718851e-06, "loss": 1.4014, "step": 105 }, { "epoch": 0.0352, "grad_norm": 8.444189736987067, "learning_rate": 7.028753993610224e-06, "loss": 1.4432, "step": 110 }, { "epoch": 0.0368, "grad_norm": 8.440082783021184, "learning_rate": 7.348242811501598e-06, "loss": 1.4234, "step": 115 }, { "epoch": 0.0384, "grad_norm": 9.19834025523869, "learning_rate": 7.667731629392972e-06, "loss": 1.4825, "step": 120 }, { "epoch": 0.04, "grad_norm": 9.44008152483298, "learning_rate": 7.987220447284347e-06, "loss": 1.4536, "step": 125 }, { "epoch": 0.0416, "grad_norm": 9.137864717981165, "learning_rate": 8.30670926517572e-06, "loss": 1.4832, "step": 130 }, { "epoch": 0.0432, "grad_norm": 7.259296421594005, "learning_rate": 8.626198083067093e-06, "loss": 1.4355, "step": 135 }, { "epoch": 0.0448, "grad_norm": 9.129262436644341, "learning_rate": 8.945686900958466e-06, "loss": 1.48, "step": 140 }, { "epoch": 0.0464, "grad_norm": 7.342624679137957, "learning_rate": 9.265175718849841e-06, "loss": 1.4388, "step": 145 }, { "epoch": 0.048, "grad_norm": 9.409291604105896, "learning_rate": 9.584664536741216e-06, "loss": 1.506, "step": 150 }, { "epoch": 0.0496, "grad_norm": 7.195158336471212, "learning_rate": 9.904153354632589e-06, "loss": 1.4852, "step": 155 }, { "epoch": 0.0512, "grad_norm": 7.894560573501637, "learning_rate": 1.0223642172523962e-05, "loss": 1.4228, "step": 160 }, { "epoch": 0.0528, "grad_norm": 9.3990419995412, "learning_rate": 1.0543130990415335e-05, "loss": 1.4624, "step": 165 }, { "epoch": 0.0544, "grad_norm": 6.764474220048631, "learning_rate": 1.086261980830671e-05, "loss": 1.529, "step": 170 }, { "epoch": 0.056, "grad_norm": 8.762621410789437, "learning_rate": 1.1182108626198084e-05, "loss": 1.4889, "step": 175 }, { "epoch": 0.0576, "grad_norm": 9.026712826027282, "learning_rate": 1.1501597444089459e-05, "loss": 1.4889, "step": 180 }, { "epoch": 0.0592, "grad_norm": 10.327681032497772, "learning_rate": 1.1821086261980832e-05, "loss": 1.4632, "step": 185 }, { "epoch": 0.0608, "grad_norm": 8.916460811078778, "learning_rate": 1.2140575079872205e-05, "loss": 1.4957, "step": 190 }, { "epoch": 0.0624, "grad_norm": 9.570316165134905, "learning_rate": 1.2460063897763578e-05, "loss": 1.4974, "step": 195 }, { "epoch": 0.064, "grad_norm": 7.931027623078715, "learning_rate": 1.2779552715654953e-05, "loss": 1.4986, "step": 200 }, { "epoch": 0.0656, "grad_norm": 9.001437919455842, "learning_rate": 1.3099041533546326e-05, "loss": 1.4583, "step": 205 }, { "epoch": 0.0672, "grad_norm": 8.357858455204125, "learning_rate": 1.3418530351437703e-05, "loss": 1.5086, "step": 210 }, { "epoch": 0.0688, "grad_norm": 7.4570481772053325, "learning_rate": 1.3738019169329076e-05, "loss": 1.5331, "step": 215 }, { "epoch": 0.0704, "grad_norm": 9.286290584701554, "learning_rate": 1.4057507987220449e-05, "loss": 1.5065, "step": 220 }, { "epoch": 0.072, "grad_norm": 7.764076796357793, "learning_rate": 1.4376996805111822e-05, "loss": 1.489, "step": 225 }, { "epoch": 0.0736, "grad_norm": 10.09652114646815, "learning_rate": 1.4696485623003197e-05, "loss": 1.5768, "step": 230 }, { "epoch": 0.0752, "grad_norm": 8.809821488359987, "learning_rate": 1.501597444089457e-05, "loss": 1.5554, "step": 235 }, { "epoch": 0.0768, "grad_norm": 8.262499421512615, "learning_rate": 1.5335463258785944e-05, "loss": 1.5407, "step": 240 }, { "epoch": 0.0784, "grad_norm": 8.979598030923686, "learning_rate": 1.5654952076677316e-05, "loss": 1.5761, "step": 245 }, { "epoch": 0.08, "grad_norm": 7.996608590588298, "learning_rate": 1.5974440894568694e-05, "loss": 1.5723, "step": 250 }, { "epoch": 0.0816, "grad_norm": 8.28398837332719, "learning_rate": 1.6293929712460065e-05, "loss": 1.511, "step": 255 }, { "epoch": 0.0832, "grad_norm": 10.050176837076577, "learning_rate": 1.661341853035144e-05, "loss": 1.62, "step": 260 }, { "epoch": 0.0848, "grad_norm": 8.625153292163999, "learning_rate": 1.693290734824281e-05, "loss": 1.5696, "step": 265 }, { "epoch": 0.0864, "grad_norm": 7.036269960407658, "learning_rate": 1.7252396166134186e-05, "loss": 1.6274, "step": 270 }, { "epoch": 0.088, "grad_norm": 9.06080432840939, "learning_rate": 1.757188498402556e-05, "loss": 1.5238, "step": 275 }, { "epoch": 0.0896, "grad_norm": 7.954340307347254, "learning_rate": 1.7891373801916932e-05, "loss": 1.6112, "step": 280 }, { "epoch": 0.0912, "grad_norm": 8.326902382980357, "learning_rate": 1.8210862619808307e-05, "loss": 1.5639, "step": 285 }, { "epoch": 0.0928, "grad_norm": 8.465352754166233, "learning_rate": 1.8530351437699682e-05, "loss": 1.6031, "step": 290 }, { "epoch": 0.0944, "grad_norm": 7.805713490581876, "learning_rate": 1.8849840255591057e-05, "loss": 1.5684, "step": 295 }, { "epoch": 0.096, "grad_norm": 7.8032797694553, "learning_rate": 1.916932907348243e-05, "loss": 1.5322, "step": 300 }, { "epoch": 0.0976, "grad_norm": 7.502438683575227, "learning_rate": 1.9488817891373803e-05, "loss": 1.578, "step": 305 }, { "epoch": 0.0992, "grad_norm": 7.42748507278892, "learning_rate": 1.9808306709265177e-05, "loss": 1.6214, "step": 310 }, { "epoch": 0.1008, "grad_norm": 8.776137139385414, "learning_rate": 1.9999975036876365e-05, "loss": 1.5879, "step": 315 }, { "epoch": 0.1024, "grad_norm": 7.577899989222258, "learning_rate": 1.9999694203166786e-05, "loss": 1.5625, "step": 320 }, { "epoch": 0.104, "grad_norm": 9.304934816231782, "learning_rate": 1.999910134063538e-05, "loss": 1.5502, "step": 325 }, { "epoch": 0.1056, "grad_norm": 8.943302127293093, "learning_rate": 1.9998196467781738e-05, "loss": 1.6447, "step": 330 }, { "epoch": 0.1072, "grad_norm": 10.263653381131993, "learning_rate": 1.999697961284136e-05, "loss": 1.6365, "step": 335 }, { "epoch": 0.1088, "grad_norm": 7.259185945143334, "learning_rate": 1.9995450813784785e-05, "loss": 1.6016, "step": 340 }, { "epoch": 0.1104, "grad_norm": 9.483550631073872, "learning_rate": 1.9993610118316417e-05, "loss": 1.6867, "step": 345 }, { "epoch": 0.112, "grad_norm": 9.069753200477997, "learning_rate": 1.999145758387301e-05, "loss": 1.6832, "step": 350 }, { "epoch": 0.1136, "grad_norm": 9.791327883527874, "learning_rate": 1.99889932776219e-05, "loss": 1.6604, "step": 355 }, { "epoch": 0.1152, "grad_norm": 10.288995257955278, "learning_rate": 1.9986217276458898e-05, "loss": 1.5407, "step": 360 }, { "epoch": 0.1168, "grad_norm": 7.343516596486213, "learning_rate": 1.9983129667005887e-05, "loss": 1.704, "step": 365 }, { "epoch": 0.1184, "grad_norm": 8.414567171107853, "learning_rate": 1.9979730545608128e-05, "loss": 1.6423, "step": 370 }, { "epoch": 0.12, "grad_norm": 31.248624844046244, "learning_rate": 1.9976020018331244e-05, "loss": 1.6615, "step": 375 }, { "epoch": 0.1216, "grad_norm": 7.805253897850886, "learning_rate": 1.997199820095793e-05, "loss": 1.7482, "step": 380 }, { "epoch": 0.1232, "grad_norm": 12.941755164396426, "learning_rate": 1.9967665218984308e-05, "loss": 1.6302, "step": 385 }, { "epoch": 0.1248, "grad_norm": 10.256908147942266, "learning_rate": 1.996302120761605e-05, "loss": 1.6527, "step": 390 }, { "epoch": 0.1264, "grad_norm": 10.323844537641683, "learning_rate": 1.9958066311764115e-05, "loss": 1.6225, "step": 395 }, { "epoch": 0.128, "grad_norm": 8.17544314720727, "learning_rate": 1.9952800686040268e-05, "loss": 1.6112, "step": 400 }, { "epoch": 0.1296, "grad_norm": 9.155308331409806, "learning_rate": 1.9947224494752236e-05, "loss": 1.645, "step": 405 }, { "epoch": 0.1312, "grad_norm": 8.093102355577809, "learning_rate": 1.994133791189857e-05, "loss": 1.6012, "step": 410 }, { "epoch": 0.1328, "grad_norm": 8.539319454896912, "learning_rate": 1.993514112116325e-05, "loss": 1.6163, "step": 415 }, { "epoch": 0.1344, "grad_norm": 9.376291632603513, "learning_rate": 1.992863431590991e-05, "loss": 1.7232, "step": 420 }, { "epoch": 0.136, "grad_norm": 8.427174660902148, "learning_rate": 1.9921817699175844e-05, "loss": 1.6041, "step": 425 }, { "epoch": 0.1376, "grad_norm": 11.225741174921657, "learning_rate": 1.991469148366564e-05, "loss": 1.5705, "step": 430 }, { "epoch": 0.1392, "grad_norm": 7.973508937531369, "learning_rate": 1.9907255891744562e-05, "loss": 1.6436, "step": 435 }, { "epoch": 0.1408, "grad_norm": 9.589627221263973, "learning_rate": 1.989951115543161e-05, "loss": 1.6753, "step": 440 }, { "epoch": 0.1424, "grad_norm": 6.8530374739159345, "learning_rate": 1.9891457516392257e-05, "loss": 1.6441, "step": 445 }, { "epoch": 0.144, "grad_norm": 8.969880080037996, "learning_rate": 1.988309522593095e-05, "loss": 1.632, "step": 450 }, { "epoch": 0.1456, "grad_norm": 8.147836497286043, "learning_rate": 1.9874424544983224e-05, "loss": 1.7166, "step": 455 }, { "epoch": 0.1472, "grad_norm": 10.804136842902594, "learning_rate": 1.9865445744107593e-05, "loss": 1.6071, "step": 460 }, { "epoch": 0.1488, "grad_norm": 7.770558071875093, "learning_rate": 1.9856159103477085e-05, "loss": 1.7466, "step": 465 }, { "epoch": 0.1504, "grad_norm": 11.053317637390126, "learning_rate": 1.9846564912870523e-05, "loss": 1.6221, "step": 470 }, { "epoch": 0.152, "grad_norm": 8.025748690782159, "learning_rate": 1.9836663471663454e-05, "loss": 1.7206, "step": 475 }, { "epoch": 0.1536, "grad_norm": 8.94152560502608, "learning_rate": 1.9826455088818832e-05, "loss": 1.6794, "step": 480 }, { "epoch": 0.1552, "grad_norm": 8.723262455381919, "learning_rate": 1.9815940082877367e-05, "loss": 1.6909, "step": 485 }, { "epoch": 0.1568, "grad_norm": 11.572838719884283, "learning_rate": 1.980511878194758e-05, "loss": 1.7067, "step": 490 }, { "epoch": 0.1584, "grad_norm": 7.98682940495796, "learning_rate": 1.9793991523695578e-05, "loss": 1.7714, "step": 495 }, { "epoch": 0.16, "grad_norm": 7.447864398870841, "learning_rate": 1.9782558655334505e-05, "loss": 1.632, "step": 500 }, { "epoch": 0.1616, "grad_norm": 8.942126665322432, "learning_rate": 1.9770820533613716e-05, "loss": 1.6463, "step": 505 }, { "epoch": 0.1632, "grad_norm": 8.629146616823142, "learning_rate": 1.9758777524807636e-05, "loss": 1.6367, "step": 510 }, { "epoch": 0.1648, "grad_norm": 7.603172313414887, "learning_rate": 1.9746430004704353e-05, "loss": 1.6558, "step": 515 }, { "epoch": 0.1664, "grad_norm": 11.152682711750428, "learning_rate": 1.9733778358593852e-05, "loss": 1.6537, "step": 520 }, { "epoch": 0.168, "grad_norm": 9.887507696721732, "learning_rate": 1.9720822981256034e-05, "loss": 1.6715, "step": 525 }, { "epoch": 0.1696, "grad_norm": 9.158423868070782, "learning_rate": 1.970756427694837e-05, "loss": 1.6244, "step": 530 }, { "epoch": 0.1712, "grad_norm": 8.442009066532837, "learning_rate": 1.9694002659393306e-05, "loss": 1.7073, "step": 535 }, { "epoch": 0.1728, "grad_norm": 7.50714623452273, "learning_rate": 1.9680138551765335e-05, "loss": 1.6241, "step": 540 }, { "epoch": 0.1744, "grad_norm": 8.325913790000977, "learning_rate": 1.9665972386677796e-05, "loss": 1.5779, "step": 545 }, { "epoch": 0.176, "grad_norm": 13.09863126345551, "learning_rate": 1.9651504606169395e-05, "loss": 1.6549, "step": 550 }, { "epoch": 0.1776, "grad_norm": 7.5020724745885925, "learning_rate": 1.9636735661690385e-05, "loss": 1.683, "step": 555 }, { "epoch": 0.1792, "grad_norm": 8.857991440508771, "learning_rate": 1.9621666014088495e-05, "loss": 1.5727, "step": 560 }, { "epoch": 0.1808, "grad_norm": 10.518277008681551, "learning_rate": 1.960629613359454e-05, "loss": 1.5516, "step": 565 }, { "epoch": 0.1824, "grad_norm": 9.763384545927902, "learning_rate": 1.959062649980776e-05, "loss": 1.7654, "step": 570 }, { "epoch": 0.184, "grad_norm": 14.784685999297329, "learning_rate": 1.957465760168084e-05, "loss": 1.6173, "step": 575 }, { "epoch": 0.1856, "grad_norm": 9.47338828693639, "learning_rate": 1.9558389937504664e-05, "loss": 1.6706, "step": 580 }, { "epoch": 0.1872, "grad_norm": 8.255320111181135, "learning_rate": 1.954182401489277e-05, "loss": 1.6829, "step": 585 }, { "epoch": 0.1888, "grad_norm": 9.748611896295628, "learning_rate": 1.952496035076549e-05, "loss": 1.6963, "step": 590 }, { "epoch": 0.1904, "grad_norm": 10.593417080535822, "learning_rate": 1.9507799471333842e-05, "loss": 1.5429, "step": 595 }, { "epoch": 0.192, "grad_norm": 8.495208947428331, "learning_rate": 1.9490341912083103e-05, "loss": 1.5996, "step": 600 }, { "epoch": 0.1936, "grad_norm": 8.116738913737667, "learning_rate": 1.947258821775609e-05, "loss": 1.6509, "step": 605 }, { "epoch": 0.1952, "grad_norm": 8.843543056646068, "learning_rate": 1.945453894233618e-05, "loss": 1.6621, "step": 610 }, { "epoch": 0.1968, "grad_norm": 8.429840678269503, "learning_rate": 1.9436194649030006e-05, "loss": 1.6238, "step": 615 }, { "epoch": 0.1984, "grad_norm": 7.060618638753134, "learning_rate": 1.9417555910249905e-05, "loss": 1.5455, "step": 620 }, { "epoch": 0.2, "grad_norm": 8.735661880336002, "learning_rate": 1.939862330759602e-05, "loss": 1.7242, "step": 625 }, { "epoch": 0.2016, "grad_norm": 7.728994210262772, "learning_rate": 1.9379397431838194e-05, "loss": 1.6244, "step": 630 }, { "epoch": 0.2032, "grad_norm": 8.142561603441006, "learning_rate": 1.935987888289751e-05, "loss": 1.6033, "step": 635 }, { "epoch": 0.2048, "grad_norm": 7.087122461619784, "learning_rate": 1.9340068269827567e-05, "loss": 1.6478, "step": 640 }, { "epoch": 0.2064, "grad_norm": 8.796044551707043, "learning_rate": 1.93199662107955e-05, "loss": 1.622, "step": 645 }, { "epoch": 0.208, "grad_norm": 7.637980213604651, "learning_rate": 1.929957333306267e-05, "loss": 1.6378, "step": 650 }, { "epoch": 0.2096, "grad_norm": 8.176443820248227, "learning_rate": 1.9278890272965097e-05, "loss": 1.603, "step": 655 }, { "epoch": 0.2112, "grad_norm": 8.821331818781406, "learning_rate": 1.92579176758936e-05, "loss": 1.6285, "step": 660 }, { "epoch": 0.2128, "grad_norm": 8.751441102706378, "learning_rate": 1.9236656196273676e-05, "loss": 1.6783, "step": 665 }, { "epoch": 0.2144, "grad_norm": 7.729648207326943, "learning_rate": 1.9215106497545047e-05, "loss": 1.7185, "step": 670 }, { "epoch": 0.216, "grad_norm": 9.632019880573054, "learning_rate": 1.919326925214099e-05, "loss": 1.7044, "step": 675 }, { "epoch": 0.2176, "grad_norm": 7.864267682010497, "learning_rate": 1.9171145141467336e-05, "loss": 1.6628, "step": 680 }, { "epoch": 0.2192, "grad_norm": 7.961635669857337, "learning_rate": 1.9148734855881218e-05, "loss": 1.6602, "step": 685 }, { "epoch": 0.2208, "grad_norm": 10.292993487639919, "learning_rate": 1.912603909466952e-05, "loss": 1.5731, "step": 690 }, { "epoch": 0.2224, "grad_norm": 8.162330383972137, "learning_rate": 1.9103058566027062e-05, "loss": 1.6291, "step": 695 }, { "epoch": 0.224, "grad_norm": 7.554506895298207, "learning_rate": 1.9079793987034497e-05, "loss": 1.6665, "step": 700 }, { "epoch": 0.2256, "grad_norm": 8.135535319158393, "learning_rate": 1.9056246083635943e-05, "loss": 1.6404, "step": 705 }, { "epoch": 0.2272, "grad_norm": 9.74804156354211, "learning_rate": 1.9032415590616323e-05, "loss": 1.5687, "step": 710 }, { "epoch": 0.2288, "grad_norm": 8.104975985879758, "learning_rate": 1.9008303251578445e-05, "loss": 1.6728, "step": 715 }, { "epoch": 0.2304, "grad_norm": 8.720184023804999, "learning_rate": 1.898390981891979e-05, "loss": 1.634, "step": 720 }, { "epoch": 0.232, "grad_norm": 9.210829756244634, "learning_rate": 1.895923605380904e-05, "loss": 1.6422, "step": 725 }, { "epoch": 0.2336, "grad_norm": 6.632734132335917, "learning_rate": 1.8934282726162325e-05, "loss": 1.5709, "step": 730 }, { "epoch": 0.2352, "grad_norm": 7.931181213386623, "learning_rate": 1.8909050614619197e-05, "loss": 1.6244, "step": 735 }, { "epoch": 0.2368, "grad_norm": 8.05038767649013, "learning_rate": 1.8883540506518336e-05, "loss": 1.6756, "step": 740 }, { "epoch": 0.2384, "grad_norm": 9.138124074487578, "learning_rate": 1.885775319787298e-05, "loss": 1.5782, "step": 745 }, { "epoch": 0.24, "grad_norm": 9.052424132753602, "learning_rate": 1.8831689493346095e-05, "loss": 1.6486, "step": 750 }, { "epoch": 0.2416, "grad_norm": 12.11857562509831, "learning_rate": 1.880535020622525e-05, "loss": 1.5488, "step": 755 }, { "epoch": 0.2432, "grad_norm": 8.346673787548578, "learning_rate": 1.8778736158397244e-05, "loss": 1.5675, "step": 760 }, { "epoch": 0.2448, "grad_norm": 7.922560429887788, "learning_rate": 1.8751848180322476e-05, "loss": 1.5814, "step": 765 }, { "epoch": 0.2464, "grad_norm": 7.448862840223704, "learning_rate": 1.872468711100902e-05, "loss": 1.696, "step": 770 }, { "epoch": 0.248, "grad_norm": 7.381712408385049, "learning_rate": 1.869725379798643e-05, "loss": 1.5801, "step": 775 }, { "epoch": 0.2496, "grad_norm": 7.520192699988718, "learning_rate": 1.866954909727932e-05, "loss": 1.5583, "step": 780 }, { "epoch": 0.2512, "grad_norm": 8.458007153827033, "learning_rate": 1.864157387338064e-05, "loss": 1.5893, "step": 785 }, { "epoch": 0.2528, "grad_norm": 11.023935513739714, "learning_rate": 1.86133289992247e-05, "loss": 1.6194, "step": 790 }, { "epoch": 0.2544, "grad_norm": 7.279897256975987, "learning_rate": 1.8584815356159932e-05, "loss": 1.6186, "step": 795 }, { "epoch": 0.256, "grad_norm": 7.807795169814656, "learning_rate": 1.8556033833921386e-05, "loss": 1.7446, "step": 800 }, { "epoch": 0.2576, "grad_norm": 7.782885718886869, "learning_rate": 1.8526985330602973e-05, "loss": 1.6365, "step": 805 }, { "epoch": 0.2592, "grad_norm": 7.881283617531603, "learning_rate": 1.8497670752629437e-05, "loss": 1.7161, "step": 810 }, { "epoch": 0.2608, "grad_norm": 7.32964730840398, "learning_rate": 1.8468091014728076e-05, "loss": 1.6361, "step": 815 }, { "epoch": 0.2624, "grad_norm": 7.4867507136615075, "learning_rate": 1.843824703990019e-05, "loss": 1.7187, "step": 820 }, { "epoch": 0.264, "grad_norm": 7.878583656595656, "learning_rate": 1.840813975939229e-05, "loss": 1.6064, "step": 825 }, { "epoch": 0.2656, "grad_norm": 7.641254628614174, "learning_rate": 1.8377770112667024e-05, "loss": 1.6189, "step": 830 }, { "epoch": 0.2672, "grad_norm": 7.869146750924817, "learning_rate": 1.8347139047373885e-05, "loss": 1.6925, "step": 835 }, { "epoch": 0.2688, "grad_norm": 7.488977024706195, "learning_rate": 1.8316247519319625e-05, "loss": 1.6646, "step": 840 }, { "epoch": 0.2704, "grad_norm": 7.519265775038011, "learning_rate": 1.8285096492438424e-05, "loss": 1.6267, "step": 845 }, { "epoch": 0.272, "grad_norm": 9.052319258879985, "learning_rate": 1.825368693876183e-05, "loss": 1.6113, "step": 850 }, { "epoch": 0.2736, "grad_norm": 7.425912485447768, "learning_rate": 1.8222019838388422e-05, "loss": 1.6341, "step": 855 }, { "epoch": 0.2752, "grad_norm": 7.616489466474544, "learning_rate": 1.8190096179453213e-05, "loss": 1.6195, "step": 860 }, { "epoch": 0.2768, "grad_norm": 8.57212898020857, "learning_rate": 1.8157916958096837e-05, "loss": 1.5484, "step": 865 }, { "epoch": 0.2784, "grad_norm": 8.058170481678996, "learning_rate": 1.8125483178434448e-05, "loss": 1.6392, "step": 870 }, { "epoch": 0.28, "grad_norm": 7.759361204922903, "learning_rate": 1.8092795852524404e-05, "loss": 1.572, "step": 875 }, { "epoch": 0.2816, "grad_norm": 7.814935898355, "learning_rate": 1.8059856000336675e-05, "loss": 1.5755, "step": 880 }, { "epoch": 0.2832, "grad_norm": 8.862683471146346, "learning_rate": 1.8026664649721016e-05, "loss": 1.5711, "step": 885 }, { "epoch": 0.2848, "grad_norm": 9.253730250566822, "learning_rate": 1.7993222836374904e-05, "loss": 1.6258, "step": 890 }, { "epoch": 0.2864, "grad_norm": 8.877026192983683, "learning_rate": 1.795953160381121e-05, "loss": 1.6478, "step": 895 }, { "epoch": 0.288, "grad_norm": 9.101400243172016, "learning_rate": 1.792559200332564e-05, "loss": 1.6171, "step": 900 }, { "epoch": 0.2896, "grad_norm": 7.136286110367266, "learning_rate": 1.789140509396394e-05, "loss": 1.5754, "step": 905 }, { "epoch": 0.2912, "grad_norm": 7.912268476013011, "learning_rate": 1.7856971942488826e-05, "loss": 1.5975, "step": 910 }, { "epoch": 0.2928, "grad_norm": 7.20611974657277, "learning_rate": 1.7822293623346736e-05, "loss": 1.658, "step": 915 }, { "epoch": 0.2944, "grad_norm": 10.223106004235314, "learning_rate": 1.7787371218634263e-05, "loss": 1.6974, "step": 920 }, { "epoch": 0.296, "grad_norm": 7.6171324527749, "learning_rate": 1.77522058180644e-05, "loss": 1.692, "step": 925 }, { "epoch": 0.2976, "grad_norm": 9.169763628140181, "learning_rate": 1.7716798518932564e-05, "loss": 1.623, "step": 930 }, { "epoch": 0.2992, "grad_norm": 7.322606018864152, "learning_rate": 1.7681150426082322e-05, "loss": 1.6117, "step": 935 }, { "epoch": 0.3008, "grad_norm": 7.081332396019932, "learning_rate": 1.7645262651870926e-05, "loss": 1.7026, "step": 940 }, { "epoch": 0.3024, "grad_norm": 7.141747828380935, "learning_rate": 1.7609136316134616e-05, "loss": 1.5313, "step": 945 }, { "epoch": 0.304, "grad_norm": 8.73987298901696, "learning_rate": 1.7572772546153657e-05, "loss": 1.6529, "step": 950 }, { "epoch": 0.3056, "grad_norm": 8.397556088140782, "learning_rate": 1.7536172476617183e-05, "loss": 1.6707, "step": 955 }, { "epoch": 0.3072, "grad_norm": 7.447729626374141, "learning_rate": 1.749933724958777e-05, "loss": 1.6343, "step": 960 }, { "epoch": 0.3088, "grad_norm": 6.907736776247304, "learning_rate": 1.746226801446582e-05, "loss": 1.6028, "step": 965 }, { "epoch": 0.3104, "grad_norm": 8.374764423916597, "learning_rate": 1.742496592795368e-05, "loss": 1.5854, "step": 970 }, { "epoch": 0.312, "grad_norm": 6.656458694402831, "learning_rate": 1.738743215401955e-05, "loss": 1.5937, "step": 975 }, { "epoch": 0.3136, "grad_norm": 8.119235048754572, "learning_rate": 1.7349667863861175e-05, "loss": 1.6238, "step": 980 }, { "epoch": 0.3152, "grad_norm": 10.758877616976827, "learning_rate": 1.7311674235869285e-05, "loss": 1.6329, "step": 985 }, { "epoch": 0.3168, "grad_norm": 6.855294162792502, "learning_rate": 1.7273452455590835e-05, "loss": 1.5509, "step": 990 }, { "epoch": 0.3184, "grad_norm": 7.436276047950917, "learning_rate": 1.7235003715691996e-05, "loss": 1.6302, "step": 995 }, { "epoch": 0.32, "grad_norm": 9.112788481739514, "learning_rate": 1.7196329215920963e-05, "loss": 1.5555, "step": 1000 }, { "epoch": 0.3216, "grad_norm": 7.707360400350489, "learning_rate": 1.71574301630705e-05, "loss": 1.5711, "step": 1005 }, { "epoch": 0.3232, "grad_norm": 8.985392901674105, "learning_rate": 1.711830777094028e-05, "loss": 1.5719, "step": 1010 }, { "epoch": 0.3248, "grad_norm": 7.854703662416667, "learning_rate": 1.707896326029903e-05, "loss": 1.687, "step": 1015 }, { "epoch": 0.3264, "grad_norm": 7.991591880545251, "learning_rate": 1.7039397858846428e-05, "loss": 1.5265, "step": 1020 }, { "epoch": 0.328, "grad_norm": 8.38735368829394, "learning_rate": 1.6999612801174782e-05, "loss": 1.5071, "step": 1025 }, { "epoch": 0.3296, "grad_norm": 7.088683318490433, "learning_rate": 1.6959609328730526e-05, "loss": 1.5594, "step": 1030 }, { "epoch": 0.3312, "grad_norm": 7.732979117471935, "learning_rate": 1.6919388689775463e-05, "loss": 1.5789, "step": 1035 }, { "epoch": 0.3328, "grad_norm": 7.480645400348206, "learning_rate": 1.6878952139347834e-05, "loss": 1.5923, "step": 1040 }, { "epoch": 0.3344, "grad_norm": 7.858191371747195, "learning_rate": 1.6838300939223144e-05, "loss": 1.5487, "step": 1045 }, { "epoch": 0.336, "grad_norm": 8.85580923674649, "learning_rate": 1.679743635787479e-05, "loss": 1.5857, "step": 1050 }, { "epoch": 0.3376, "grad_norm": 7.901857553601464, "learning_rate": 1.6756359670434478e-05, "loss": 1.593, "step": 1055 }, { "epoch": 0.3392, "grad_norm": 9.026231263712882, "learning_rate": 1.6715072158652444e-05, "loss": 1.6723, "step": 1060 }, { "epoch": 0.3408, "grad_norm": 13.504122295950328, "learning_rate": 1.6673575110857457e-05, "loss": 1.6176, "step": 1065 }, { "epoch": 0.3424, "grad_norm": 10.243586542935969, "learning_rate": 1.6631869821916602e-05, "loss": 1.5779, "step": 1070 }, { "epoch": 0.344, "grad_norm": 7.489622112226259, "learning_rate": 1.6589957593194887e-05, "loss": 1.6419, "step": 1075 }, { "epoch": 0.3456, "grad_norm": 7.573818239927097, "learning_rate": 1.6547839732514646e-05, "loss": 1.5614, "step": 1080 }, { "epoch": 0.3472, "grad_norm": 10.0936611445675, "learning_rate": 1.650551755411471e-05, "loss": 1.5626, "step": 1085 }, { "epoch": 0.3488, "grad_norm": 7.874542575121299, "learning_rate": 1.646299237860941e-05, "loss": 1.6234, "step": 1090 }, { "epoch": 0.3504, "grad_norm": 8.356085115655546, "learning_rate": 1.6420265532947364e-05, "loss": 1.4703, "step": 1095 }, { "epoch": 0.352, "grad_norm": 9.024857376590765, "learning_rate": 1.6377338350370077e-05, "loss": 1.5622, "step": 1100 }, { "epoch": 0.3536, "grad_norm": 7.46614009546006, "learning_rate": 1.6334212170370323e-05, "loss": 1.6042, "step": 1105 }, { "epoch": 0.3552, "grad_norm": 8.565479511807924, "learning_rate": 1.6290888338650373e-05, "loss": 1.4699, "step": 1110 }, { "epoch": 0.3568, "grad_norm": 7.2168157043192025, "learning_rate": 1.624736820707998e-05, "loss": 1.5334, "step": 1115 }, { "epoch": 0.3584, "grad_norm": 6.584169033723296, "learning_rate": 1.6203653133654213e-05, "loss": 1.6832, "step": 1120 }, { "epoch": 0.36, "grad_norm": 6.225227952156274, "learning_rate": 1.615974448245107e-05, "loss": 1.5323, "step": 1125 }, { "epoch": 0.3616, "grad_norm": 10.601001964167809, "learning_rate": 1.6115643623588915e-05, "loss": 1.6198, "step": 1130 }, { "epoch": 0.3632, "grad_norm": 10.795794042349174, "learning_rate": 1.6071351933183736e-05, "loss": 1.5853, "step": 1135 }, { "epoch": 0.3648, "grad_norm": 7.346429916746835, "learning_rate": 1.602687079330619e-05, "loss": 1.5795, "step": 1140 }, { "epoch": 0.3664, "grad_norm": 6.659543911557079, "learning_rate": 1.5982201591938496e-05, "loss": 1.5804, "step": 1145 }, { "epoch": 0.368, "grad_norm": 7.221758133107811, "learning_rate": 1.5937345722931098e-05, "loss": 1.6121, "step": 1150 }, { "epoch": 0.3696, "grad_norm": 9.731811312205782, "learning_rate": 1.5892304585959193e-05, "loss": 1.6606, "step": 1155 }, { "epoch": 0.3712, "grad_norm": 8.589776786846603, "learning_rate": 1.5847079586479052e-05, "loss": 1.6185, "step": 1160 }, { "epoch": 0.3728, "grad_norm": 7.386876191799319, "learning_rate": 1.580167213568416e-05, "loss": 1.556, "step": 1165 }, { "epoch": 0.3744, "grad_norm": 8.140967833939259, "learning_rate": 1.575608365046118e-05, "loss": 1.5469, "step": 1170 }, { "epoch": 0.376, "grad_norm": 12.874290112872854, "learning_rate": 1.571031555334575e-05, "loss": 1.5306, "step": 1175 }, { "epoch": 0.3776, "grad_norm": 7.6631355790039395, "learning_rate": 1.566436927247808e-05, "loss": 1.5181, "step": 1180 }, { "epoch": 0.3792, "grad_norm": 6.840987993796332, "learning_rate": 1.5618246241558402e-05, "loss": 1.6786, "step": 1185 }, { "epoch": 0.3808, "grad_norm": 10.867647495390518, "learning_rate": 1.5571947899802227e-05, "loss": 1.6774, "step": 1190 }, { "epoch": 0.3824, "grad_norm": 7.370178174067006, "learning_rate": 1.5525475691895438e-05, "loss": 1.5014, "step": 1195 }, { "epoch": 0.384, "grad_norm": 7.247801379833239, "learning_rate": 1.5478831067949203e-05, "loss": 1.5683, "step": 1200 }, { "epoch": 0.3856, "grad_norm": 7.548129806153257, "learning_rate": 1.5432015483454736e-05, "loss": 1.5433, "step": 1205 }, { "epoch": 0.3872, "grad_norm": 7.403487909131467, "learning_rate": 1.5385030399237878e-05, "loss": 1.617, "step": 1210 }, { "epoch": 0.3888, "grad_norm": 7.085555289389217, "learning_rate": 1.533787728141351e-05, "loss": 1.5484, "step": 1215 }, { "epoch": 0.3904, "grad_norm": 6.688525472576119, "learning_rate": 1.5290557601339807e-05, "loss": 1.603, "step": 1220 }, { "epoch": 0.392, "grad_norm": 7.062961511805713, "learning_rate": 1.5243072835572319e-05, "loss": 1.6592, "step": 1225 }, { "epoch": 0.3936, "grad_norm": 7.338639370748799, "learning_rate": 1.5195424465817911e-05, "loss": 1.5468, "step": 1230 }, { "epoch": 0.3952, "grad_norm": 7.404342838798467, "learning_rate": 1.5147613978888514e-05, "loss": 1.4803, "step": 1235 }, { "epoch": 0.3968, "grad_norm": 7.815793838467086, "learning_rate": 1.5099642866654747e-05, "loss": 1.6002, "step": 1240 }, { "epoch": 0.3984, "grad_norm": 6.983719434750113, "learning_rate": 1.505151262599934e-05, "loss": 1.4679, "step": 1245 }, { "epoch": 0.4, "grad_norm": 7.689132853147396, "learning_rate": 1.5003224758770447e-05, "loss": 1.5426, "step": 1250 }, { "epoch": 0.4016, "grad_norm": 7.99779675336406, "learning_rate": 1.4954780771734783e-05, "loss": 1.6113, "step": 1255 }, { "epoch": 0.4032, "grad_norm": 7.956111242399946, "learning_rate": 1.4906182176530588e-05, "loss": 1.6137, "step": 1260 }, { "epoch": 0.4048, "grad_norm": 8.34462398939598, "learning_rate": 1.4857430489620476e-05, "loss": 1.6686, "step": 1265 }, { "epoch": 0.4064, "grad_norm": 6.87092130604808, "learning_rate": 1.4808527232244113e-05, "loss": 1.5328, "step": 1270 }, { "epoch": 0.408, "grad_norm": 7.069072459949788, "learning_rate": 1.4759473930370738e-05, "loss": 1.5685, "step": 1275 }, { "epoch": 0.4096, "grad_norm": 9.020054344062027, "learning_rate": 1.4710272114651555e-05, "loss": 1.5414, "step": 1280 }, { "epoch": 0.4112, "grad_norm": 9.445393180818964, "learning_rate": 1.4660923320371974e-05, "loss": 1.5297, "step": 1285 }, { "epoch": 0.4128, "grad_norm": 7.558351219070578, "learning_rate": 1.4611429087403695e-05, "loss": 1.5524, "step": 1290 }, { "epoch": 0.4144, "grad_norm": 7.092500501081219, "learning_rate": 1.456179096015667e-05, "loss": 1.686, "step": 1295 }, { "epoch": 0.416, "grad_norm": 7.207957772603401, "learning_rate": 1.4512010487530899e-05, "loss": 1.5716, "step": 1300 }, { "epoch": 0.4176, "grad_norm": 7.34637827560702, "learning_rate": 1.4462089222868099e-05, "loss": 1.469, "step": 1305 }, { "epoch": 0.4192, "grad_norm": 9.334037108256902, "learning_rate": 1.4412028723903251e-05, "loss": 1.5605, "step": 1310 }, { "epoch": 0.4208, "grad_norm": 7.693732678205702, "learning_rate": 1.4361830552715973e-05, "loss": 1.6241, "step": 1315 }, { "epoch": 0.4224, "grad_norm": 8.129038938122736, "learning_rate": 1.4311496275681785e-05, "loss": 1.4064, "step": 1320 }, { "epoch": 0.424, "grad_norm": 6.237348905953311, "learning_rate": 1.4261027463423232e-05, "loss": 1.4584, "step": 1325 }, { "epoch": 0.4256, "grad_norm": 6.031996271045834, "learning_rate": 1.4210425690760876e-05, "loss": 1.5006, "step": 1330 }, { "epoch": 0.4272, "grad_norm": 7.539819121652781, "learning_rate": 1.4159692536664147e-05, "loss": 1.5603, "step": 1335 }, { "epoch": 0.4288, "grad_norm": 7.523103920232781, "learning_rate": 1.410882958420209e-05, "loss": 1.5839, "step": 1340 }, { "epoch": 0.4304, "grad_norm": 7.666216752529669, "learning_rate": 1.405783842049395e-05, "loss": 1.5192, "step": 1345 }, { "epoch": 0.432, "grad_norm": 7.32146225643351, "learning_rate": 1.4006720636659656e-05, "loss": 1.5088, "step": 1350 }, { "epoch": 0.4336, "grad_norm": 8.854549575824441, "learning_rate": 1.3955477827770174e-05, "loss": 1.5657, "step": 1355 }, { "epoch": 0.4352, "grad_norm": 8.471943810603415, "learning_rate": 1.3904111592797724e-05, "loss": 1.4975, "step": 1360 }, { "epoch": 0.4368, "grad_norm": 7.881715416808106, "learning_rate": 1.3852623534565901e-05, "loss": 1.4941, "step": 1365 }, { "epoch": 0.4384, "grad_norm": 7.441858474366784, "learning_rate": 1.3801015259699648e-05, "loss": 1.5424, "step": 1370 }, { "epoch": 0.44, "grad_norm": 8.277844600049658, "learning_rate": 1.3749288378575133e-05, "loss": 1.5154, "step": 1375 }, { "epoch": 0.4416, "grad_norm": 7.1055378423472835, "learning_rate": 1.3697444505269489e-05, "loss": 1.5948, "step": 1380 }, { "epoch": 0.4432, "grad_norm": 7.146958460316305, "learning_rate": 1.3645485257510456e-05, "loss": 1.5807, "step": 1385 }, { "epoch": 0.4448, "grad_norm": 6.5302083293339965, "learning_rate": 1.3593412256625898e-05, "loss": 1.4385, "step": 1390 }, { "epoch": 0.4464, "grad_norm": 7.24352130131181, "learning_rate": 1.3541227127493218e-05, "loss": 1.4763, "step": 1395 }, { "epoch": 0.448, "grad_norm": 7.695971362233706, "learning_rate": 1.348893149848865e-05, "loss": 1.4818, "step": 1400 }, { "epoch": 0.4496, "grad_norm": 7.454125385613667, "learning_rate": 1.3436527001436437e-05, "loss": 1.4103, "step": 1405 }, { "epoch": 0.4512, "grad_norm": 7.82334723806768, "learning_rate": 1.3384015271557938e-05, "loss": 1.4473, "step": 1410 }, { "epoch": 0.4528, "grad_norm": 7.813760357009132, "learning_rate": 1.3331397947420578e-05, "loss": 1.4581, "step": 1415 }, { "epoch": 0.4544, "grad_norm": 6.911969016129946, "learning_rate": 1.3278676670886728e-05, "loss": 1.4735, "step": 1420 }, { "epoch": 0.456, "grad_norm": 7.885416119688343, "learning_rate": 1.3225853087062481e-05, "loss": 1.4771, "step": 1425 }, { "epoch": 0.4576, "grad_norm": 7.171521274701965, "learning_rate": 1.3172928844246297e-05, "loss": 1.4909, "step": 1430 }, { "epoch": 0.4592, "grad_norm": 6.038998548284215, "learning_rate": 1.3119905593877593e-05, "loss": 1.4862, "step": 1435 }, { "epoch": 0.4608, "grad_norm": 7.850277653518824, "learning_rate": 1.3066784990485202e-05, "loss": 1.4361, "step": 1440 }, { "epoch": 0.4624, "grad_norm": 8.240535964327846, "learning_rate": 1.3013568691635733e-05, "loss": 1.4437, "step": 1445 }, { "epoch": 0.464, "grad_norm": 6.764355409884025, "learning_rate": 1.2960258357881875e-05, "loss": 1.5192, "step": 1450 }, { "epoch": 0.4656, "grad_norm": 7.159838600132645, "learning_rate": 1.2906855652710557e-05, "loss": 1.5827, "step": 1455 }, { "epoch": 0.4672, "grad_norm": 8.306859326063204, "learning_rate": 1.2853362242491054e-05, "loss": 1.5463, "step": 1460 }, { "epoch": 0.4688, "grad_norm": 7.019302747818116, "learning_rate": 1.279977979642299e-05, "loss": 1.4193, "step": 1465 }, { "epoch": 0.4704, "grad_norm": 7.622978792348708, "learning_rate": 1.2746109986484236e-05, "loss": 1.422, "step": 1470 }, { "epoch": 0.472, "grad_norm": 6.56284738301426, "learning_rate": 1.2692354487378768e-05, "loss": 1.5312, "step": 1475 }, { "epoch": 0.4736, "grad_norm": 6.387578487452188, "learning_rate": 1.2638514976484384e-05, "loss": 1.4795, "step": 1480 }, { "epoch": 0.4752, "grad_norm": 6.050045115060615, "learning_rate": 1.2584593133800374e-05, "loss": 1.4694, "step": 1485 }, { "epoch": 0.4768, "grad_norm": 7.023845599021495, "learning_rate": 1.2530590641895089e-05, "loss": 1.5678, "step": 1490 }, { "epoch": 0.4784, "grad_norm": 6.600644496789812, "learning_rate": 1.2476509185853456e-05, "loss": 1.5587, "step": 1495 }, { "epoch": 0.48, "grad_norm": 6.483920132677383, "learning_rate": 1.242235045322438e-05, "loss": 1.4022, "step": 1500 }, { "epoch": 0.4816, "grad_norm": 7.393698504360439, "learning_rate": 1.2368116133968091e-05, "loss": 1.4542, "step": 1505 }, { "epoch": 0.4832, "grad_norm": 6.59515492702019, "learning_rate": 1.2313807920403419e-05, "loss": 1.4563, "step": 1510 }, { "epoch": 0.4848, "grad_norm": 11.46310019096542, "learning_rate": 1.2259427507154964e-05, "loss": 1.4436, "step": 1515 }, { "epoch": 0.4864, "grad_norm": 7.492782202626292, "learning_rate": 1.2204976591100253e-05, "loss": 1.5193, "step": 1520 }, { "epoch": 0.488, "grad_norm": 7.7736261505421025, "learning_rate": 1.2150456871316758e-05, "loss": 1.4168, "step": 1525 }, { "epoch": 0.4896, "grad_norm": 7.9354251057792515, "learning_rate": 1.2095870049028898e-05, "loss": 1.4414, "step": 1530 }, { "epoch": 0.4912, "grad_norm": 16.059395970692933, "learning_rate": 1.2041217827554939e-05, "loss": 1.4695, "step": 1535 }, { "epoch": 0.4928, "grad_norm": 6.808900956376759, "learning_rate": 1.1986501912253863e-05, "loss": 1.4531, "step": 1540 }, { "epoch": 0.4944, "grad_norm": 7.736857454242817, "learning_rate": 1.1931724010472135e-05, "loss": 1.4924, "step": 1545 }, { "epoch": 0.496, "grad_norm": 8.3359924924688, "learning_rate": 1.1876885831490442e-05, "loss": 1.555, "step": 1550 }, { "epoch": 0.4976, "grad_norm": 7.120202493727813, "learning_rate": 1.1821989086470349e-05, "loss": 1.4645, "step": 1555 }, { "epoch": 0.4992, "grad_norm": 7.599913279445224, "learning_rate": 1.1767035488400903e-05, "loss": 1.4863, "step": 1560 }, { "epoch": 0.5008, "grad_norm": 7.125989534788617, "learning_rate": 1.1712026752045189e-05, "loss": 1.4652, "step": 1565 }, { "epoch": 0.5024, "grad_norm": 8.088722450627488, "learning_rate": 1.1656964593886819e-05, "loss": 1.3223, "step": 1570 }, { "epoch": 0.504, "grad_norm": 7.516194041566506, "learning_rate": 1.1601850732076361e-05, "loss": 1.5801, "step": 1575 }, { "epoch": 0.5056, "grad_norm": 7.239851442002106, "learning_rate": 1.1546686886377745e-05, "loss": 1.4731, "step": 1580 }, { "epoch": 0.5072, "grad_norm": 7.520103127972609, "learning_rate": 1.1491474778114588e-05, "loss": 1.3683, "step": 1585 }, { "epoch": 0.5088, "grad_norm": 9.131577917600616, "learning_rate": 1.143621613011648e-05, "loss": 1.562, "step": 1590 }, { "epoch": 0.5104, "grad_norm": 7.486194993220994, "learning_rate": 1.1380912666665234e-05, "loss": 1.4111, "step": 1595 }, { "epoch": 0.512, "grad_norm": 6.496362336710197, "learning_rate": 1.1325566113441074e-05, "loss": 1.5706, "step": 1600 }, { "epoch": 0.5136, "grad_norm": 6.823989380990125, "learning_rate": 1.1270178197468788e-05, "loss": 1.4575, "step": 1605 }, { "epoch": 0.5152, "grad_norm": 7.5663657895225125, "learning_rate": 1.121475064706385e-05, "loss": 1.3982, "step": 1610 }, { "epoch": 0.5168, "grad_norm": 6.5005524818168965, "learning_rate": 1.1159285191778473e-05, "loss": 1.4888, "step": 1615 }, { "epoch": 0.5184, "grad_norm": 7.387470263764186, "learning_rate": 1.1103783562347642e-05, "loss": 1.359, "step": 1620 }, { "epoch": 0.52, "grad_norm": 7.505035445167394, "learning_rate": 1.1048247490635133e-05, "loss": 1.4775, "step": 1625 }, { "epoch": 0.5216, "grad_norm": 6.645939875937268, "learning_rate": 1.099267870957943e-05, "loss": 1.5036, "step": 1630 }, { "epoch": 0.5232, "grad_norm": 6.913899992846711, "learning_rate": 1.0937078953139691e-05, "loss": 1.4273, "step": 1635 }, { "epoch": 0.5248, "grad_norm": 7.88033348199259, "learning_rate": 1.0881449956241616e-05, "loss": 1.2594, "step": 1640 }, { "epoch": 0.5264, "grad_norm": 7.305128083335553, "learning_rate": 1.0825793454723325e-05, "loss": 1.4711, "step": 1645 }, { "epoch": 0.528, "grad_norm": 8.41830449904285, "learning_rate": 1.0770111185281182e-05, "loss": 1.4567, "step": 1650 }, { "epoch": 0.5296, "grad_norm": 8.793536036546655, "learning_rate": 1.071440488541562e-05, "loss": 1.4354, "step": 1655 }, { "epoch": 0.5312, "grad_norm": 7.225955221122391, "learning_rate": 1.0658676293376894e-05, "loss": 1.4268, "step": 1660 }, { "epoch": 0.5328, "grad_norm": 7.296610012317386, "learning_rate": 1.0602927148110882e-05, "loss": 1.355, "step": 1665 }, { "epoch": 0.5344, "grad_norm": 7.368094614876765, "learning_rate": 1.0547159189204788e-05, "loss": 1.3505, "step": 1670 }, { "epoch": 0.536, "grad_norm": 6.381999606829369, "learning_rate": 1.0491374156832875e-05, "loss": 1.3813, "step": 1675 }, { "epoch": 0.5376, "grad_norm": 7.047380620438761, "learning_rate": 1.043557379170217e-05, "loss": 1.4362, "step": 1680 }, { "epoch": 0.5392, "grad_norm": 6.040591759821475, "learning_rate": 1.0379759834998133e-05, "loss": 1.4112, "step": 1685 }, { "epoch": 0.5408, "grad_norm": 6.915219064223463, "learning_rate": 1.0323934028330337e-05, "loss": 1.5057, "step": 1690 }, { "epoch": 0.5424, "grad_norm": 7.271768400559689, "learning_rate": 1.0268098113678124e-05, "loss": 1.4705, "step": 1695 }, { "epoch": 0.544, "grad_norm": 6.053160359266474, "learning_rate": 1.0212253833336237e-05, "loss": 1.477, "step": 1700 }, { "epoch": 0.5456, "grad_norm": 6.534057764124044, "learning_rate": 1.015640292986046e-05, "loss": 1.4277, "step": 1705 }, { "epoch": 0.5472, "grad_norm": 8.2582991694942, "learning_rate": 1.0100547146013252e-05, "loss": 1.4827, "step": 1710 }, { "epoch": 0.5488, "grad_norm": 6.985572611193969, "learning_rate": 1.0044688224709346e-05, "loss": 1.3615, "step": 1715 }, { "epoch": 0.5504, "grad_norm": 6.902453232669348, "learning_rate": 9.988827908961392e-06, "loss": 1.4461, "step": 1720 }, { "epoch": 0.552, "grad_norm": 6.996523125729606, "learning_rate": 9.932967941825539e-06, "loss": 1.3792, "step": 1725 }, { "epoch": 0.5536, "grad_norm": 6.402198205444748, "learning_rate": 9.87711006634706e-06, "loss": 1.5359, "step": 1730 }, { "epoch": 0.5552, "grad_norm": 6.398796292025464, "learning_rate": 9.821256025505964e-06, "loss": 1.3898, "step": 1735 }, { "epoch": 0.5568, "grad_norm": 6.940842238375595, "learning_rate": 9.765407562162606e-06, "loss": 1.4426, "step": 1740 }, { "epoch": 0.5584, "grad_norm": 8.063938708721054, "learning_rate": 9.709566419003292e-06, "loss": 1.4324, "step": 1745 }, { "epoch": 0.56, "grad_norm": 7.113579365707868, "learning_rate": 9.653734338485924e-06, "loss": 1.3696, "step": 1750 }, { "epoch": 0.5616, "grad_norm": 9.305625046247098, "learning_rate": 9.597913062785603e-06, "loss": 1.3767, "step": 1755 }, { "epoch": 0.5632, "grad_norm": 7.713589183858762, "learning_rate": 9.54210433374028e-06, "loss": 1.3993, "step": 1760 }, { "epoch": 0.5648, "grad_norm": 6.882609362707895, "learning_rate": 9.486309892796413e-06, "loss": 1.2881, "step": 1765 }, { "epoch": 0.5664, "grad_norm": 7.382583057213062, "learning_rate": 9.430531480954605e-06, "loss": 1.3868, "step": 1770 }, { "epoch": 0.568, "grad_norm": 7.060465219449526, "learning_rate": 9.374770838715289e-06, "loss": 1.3008, "step": 1775 }, { "epoch": 0.5696, "grad_norm": 6.548055523692506, "learning_rate": 9.319029706024428e-06, "loss": 1.4179, "step": 1780 }, { "epoch": 0.5712, "grad_norm": 7.296597392978876, "learning_rate": 9.2633098222192e-06, "loss": 1.3648, "step": 1785 }, { "epoch": 0.5728, "grad_norm": 7.6566983852007695, "learning_rate": 9.20761292597375e-06, "loss": 1.48, "step": 1790 }, { "epoch": 0.5744, "grad_norm": 6.346223117268573, "learning_rate": 9.151940755244912e-06, "loss": 1.4082, "step": 1795 }, { "epoch": 0.576, "grad_norm": 7.11841923298027, "learning_rate": 9.096295047217988e-06, "loss": 1.3294, "step": 1800 }, { "epoch": 0.5776, "grad_norm": 8.50296732146637, "learning_rate": 9.040677538252555e-06, "loss": 1.4083, "step": 1805 }, { "epoch": 0.5792, "grad_norm": 7.014419702757138, "learning_rate": 8.985089963828262e-06, "loss": 1.4773, "step": 1810 }, { "epoch": 0.5808, "grad_norm": 7.176240770998421, "learning_rate": 8.929534058490682e-06, "loss": 1.2781, "step": 1815 }, { "epoch": 0.5824, "grad_norm": 7.803766989451665, "learning_rate": 8.8740115557972e-06, "loss": 1.4017, "step": 1820 }, { "epoch": 0.584, "grad_norm": 7.779698071769793, "learning_rate": 8.8185241882629e-06, "loss": 1.4537, "step": 1825 }, { "epoch": 0.5856, "grad_norm": 6.415990926093912, "learning_rate": 8.763073687306523e-06, "loss": 1.2469, "step": 1830 }, { "epoch": 0.5872, "grad_norm": 8.66418584105546, "learning_rate": 8.707661783196432e-06, "loss": 1.352, "step": 1835 }, { "epoch": 0.5888, "grad_norm": 7.910777048810527, "learning_rate": 8.652290204996613e-06, "loss": 1.4686, "step": 1840 }, { "epoch": 0.5904, "grad_norm": 6.3176543086259995, "learning_rate": 8.59696068051273e-06, "loss": 1.4047, "step": 1845 }, { "epoch": 0.592, "grad_norm": 7.512370939346078, "learning_rate": 8.541674936238219e-06, "loss": 1.422, "step": 1850 }, { "epoch": 0.5936, "grad_norm": 7.8096285750182215, "learning_rate": 8.486434697300394e-06, "loss": 1.4087, "step": 1855 }, { "epoch": 0.5952, "grad_norm": 6.178330789312308, "learning_rate": 8.431241687406631e-06, "loss": 1.3726, "step": 1860 }, { "epoch": 0.5968, "grad_norm": 6.624672273010726, "learning_rate": 8.376097628790586e-06, "loss": 1.3789, "step": 1865 }, { "epoch": 0.5984, "grad_norm": 7.3241310271547295, "learning_rate": 8.321004242158439e-06, "loss": 1.3609, "step": 1870 }, { "epoch": 0.6, "grad_norm": 6.04309140021108, "learning_rate": 8.265963246635212e-06, "loss": 1.288, "step": 1875 }, { "epoch": 0.6016, "grad_norm": 6.865692791476556, "learning_rate": 8.210976359711124e-06, "loss": 1.3762, "step": 1880 }, { "epoch": 0.6032, "grad_norm": 6.771762456175988, "learning_rate": 8.156045297187994e-06, "loss": 1.2717, "step": 1885 }, { "epoch": 0.6048, "grad_norm": 6.154571636328125, "learning_rate": 8.101171773125716e-06, "loss": 1.3669, "step": 1890 }, { "epoch": 0.6064, "grad_norm": 6.572968900317892, "learning_rate": 8.046357499788757e-06, "loss": 1.38, "step": 1895 }, { "epoch": 0.608, "grad_norm": 8.531193639514733, "learning_rate": 7.991604187592732e-06, "loss": 1.5369, "step": 1900 }, { "epoch": 0.6096, "grad_norm": 6.698720683326016, "learning_rate": 7.93691354505103e-06, "loss": 1.4231, "step": 1905 }, { "epoch": 0.6112, "grad_norm": 7.521329845585438, "learning_rate": 7.882287278721523e-06, "loss": 1.4031, "step": 1910 }, { "epoch": 0.6128, "grad_norm": 7.6211310302707505, "learning_rate": 7.82772709315328e-06, "loss": 1.3337, "step": 1915 }, { "epoch": 0.6144, "grad_norm": 6.018003479238453, "learning_rate": 7.77323469083341e-06, "loss": 1.3658, "step": 1920 }, { "epoch": 0.616, "grad_norm": 6.464569749783369, "learning_rate": 7.718811772133918e-06, "loss": 1.2912, "step": 1925 }, { "epoch": 0.6176, "grad_norm": 6.514713737748858, "learning_rate": 7.664460035258651e-06, "loss": 1.5244, "step": 1930 }, { "epoch": 0.6192, "grad_norm": 6.205352658391803, "learning_rate": 7.610181176190318e-06, "loss": 1.2526, "step": 1935 }, { "epoch": 0.6208, "grad_norm": 7.014357158748993, "learning_rate": 7.555976888637556e-06, "loss": 1.3358, "step": 1940 }, { "epoch": 0.6224, "grad_norm": 7.288009471718571, "learning_rate": 7.501848863982082e-06, "loss": 1.419, "step": 1945 }, { "epoch": 0.624, "grad_norm": 7.468562712854107, "learning_rate": 7.447798791225925e-06, "loss": 1.2961, "step": 1950 }, { "epoch": 0.6256, "grad_norm": 7.203677073348877, "learning_rate": 7.393828356938709e-06, "loss": 1.3685, "step": 1955 }, { "epoch": 0.6272, "grad_norm": 7.047746758873238, "learning_rate": 7.3399392452050385e-06, "loss": 1.4267, "step": 1960 }, { "epoch": 0.6288, "grad_norm": 7.352486974428156, "learning_rate": 7.286133137571938e-06, "loss": 1.284, "step": 1965 }, { "epoch": 0.6304, "grad_norm": 6.913404968326128, "learning_rate": 7.2324117129963815e-06, "loss": 1.3034, "step": 1970 }, { "epoch": 0.632, "grad_norm": 7.129552158427298, "learning_rate": 7.178776647792918e-06, "loss": 1.3451, "step": 1975 }, { "epoch": 0.6336, "grad_norm": 7.337355717351341, "learning_rate": 7.125229615581346e-06, "loss": 1.2403, "step": 1980 }, { "epoch": 0.6352, "grad_norm": 7.212501447309288, "learning_rate": 7.071772287234497e-06, "loss": 1.2539, "step": 1985 }, { "epoch": 0.6368, "grad_norm": 7.268649220162584, "learning_rate": 7.018406330826096e-06, "loss": 1.3539, "step": 1990 }, { "epoch": 0.6384, "grad_norm": 7.2645693935162905, "learning_rate": 6.96513341157872e-06, "loss": 1.3748, "step": 1995 }, { "epoch": 0.64, "grad_norm": 8.342662156660523, "learning_rate": 6.911955191811819e-06, "loss": 1.3324, "step": 2000 }, { "epoch": 0.6416, "grad_norm": 7.283104811608162, "learning_rate": 6.858873330889868e-06, "loss": 1.3388, "step": 2005 }, { "epoch": 0.6432, "grad_norm": 6.514509652191098, "learning_rate": 6.8058894851705655e-06, "loss": 1.3698, "step": 2010 }, { "epoch": 0.6448, "grad_norm": 7.590737838567425, "learning_rate": 6.7530053079531664e-06, "loss": 1.3399, "step": 2015 }, { "epoch": 0.6464, "grad_norm": 10.218109612754096, "learning_rate": 6.700222449426885e-06, "loss": 1.3822, "step": 2020 }, { "epoch": 0.648, "grad_norm": 7.9917404827520055, "learning_rate": 6.6475425566194006e-06, "loss": 1.406, "step": 2025 }, { "epoch": 0.6496, "grad_norm": 5.976353190616539, "learning_rate": 6.59496727334547e-06, "loss": 1.3219, "step": 2030 }, { "epoch": 0.6512, "grad_norm": 8.101871551936691, "learning_rate": 6.5424982401556305e-06, "loss": 1.3451, "step": 2035 }, { "epoch": 0.6528, "grad_norm": 6.374533499924878, "learning_rate": 6.490137094285008e-06, "loss": 1.3402, "step": 2040 }, { "epoch": 0.6544, "grad_norm": 6.459403952059301, "learning_rate": 6.437885469602235e-06, "loss": 1.3804, "step": 2045 }, { "epoch": 0.656, "grad_norm": 7.0688192727901935, "learning_rate": 6.385744996558456e-06, "loss": 1.3406, "step": 2050 }, { "epoch": 0.6576, "grad_norm": 6.7313792745199486, "learning_rate": 6.333717302136457e-06, "loss": 1.3536, "step": 2055 }, { "epoch": 0.6592, "grad_norm": 6.882985114725421, "learning_rate": 6.28180400979991e-06, "loss": 1.2695, "step": 2060 }, { "epoch": 0.6608, "grad_norm": 6.533077979959867, "learning_rate": 6.230006739442692e-06, "loss": 1.2323, "step": 2065 }, { "epoch": 0.6624, "grad_norm": 6.540348895029834, "learning_rate": 6.178327107338353e-06, "loss": 1.4556, "step": 2070 }, { "epoch": 0.664, "grad_norm": 8.070984765040714, "learning_rate": 6.1267667260896755e-06, "loss": 1.3297, "step": 2075 }, { "epoch": 0.6656, "grad_norm": 10.638827242283833, "learning_rate": 6.075327204578363e-06, "loss": 1.3589, "step": 2080 }, { "epoch": 0.6672, "grad_norm": 6.9949360978136434, "learning_rate": 6.024010147914826e-06, "loss": 1.2704, "step": 2085 }, { "epoch": 0.6688, "grad_norm": 7.6982660712041255, "learning_rate": 5.972817157388106e-06, "loss": 1.3201, "step": 2090 }, { "epoch": 0.6704, "grad_norm": 7.526592127466844, "learning_rate": 5.921749830415905e-06, "loss": 1.3338, "step": 2095 }, { "epoch": 0.672, "grad_norm": 7.590856683299185, "learning_rate": 5.870809760494734e-06, "loss": 1.2958, "step": 2100 }, { "epoch": 0.6736, "grad_norm": 6.9630835241832125, "learning_rate": 5.819998537150203e-06, "loss": 1.2639, "step": 2105 }, { "epoch": 0.6752, "grad_norm": 6.253694167385759, "learning_rate": 5.769317745887413e-06, "loss": 1.324, "step": 2110 }, { "epoch": 0.6768, "grad_norm": 6.682246580824873, "learning_rate": 5.718768968141482e-06, "loss": 1.33, "step": 2115 }, { "epoch": 0.6784, "grad_norm": 7.24085307136985, "learning_rate": 5.668353781228193e-06, "loss": 1.3596, "step": 2120 }, { "epoch": 0.68, "grad_norm": 6.889515698119121, "learning_rate": 5.618073758294802e-06, "loss": 1.244, "step": 2125 }, { "epoch": 0.6816, "grad_norm": 7.707218870893382, "learning_rate": 5.567930468270911e-06, "loss": 1.3282, "step": 2130 }, { "epoch": 0.6832, "grad_norm": 7.803800032255029, "learning_rate": 5.517925475819539e-06, "loss": 1.34, "step": 2135 }, { "epoch": 0.6848, "grad_norm": 7.2010746561843755, "learning_rate": 5.468060341288286e-06, "loss": 1.2944, "step": 2140 }, { "epoch": 0.6864, "grad_norm": 6.695133409250474, "learning_rate": 5.418336620660658e-06, "loss": 1.2467, "step": 2145 }, { "epoch": 0.688, "grad_norm": 6.811091226395264, "learning_rate": 5.36875586550749e-06, "loss": 1.2451, "step": 2150 }, { "epoch": 0.6896, "grad_norm": 7.299012057736775, "learning_rate": 5.319319622938563e-06, "loss": 1.2175, "step": 2155 }, { "epoch": 0.6912, "grad_norm": 6.523483550221802, "learning_rate": 5.270029435554295e-06, "loss": 1.3653, "step": 2160 }, { "epoch": 0.6928, "grad_norm": 7.006983737221144, "learning_rate": 5.22088684139763e-06, "loss": 1.3037, "step": 2165 }, { "epoch": 0.6944, "grad_norm": 9.658145298208773, "learning_rate": 5.171893373906036e-06, "loss": 1.311, "step": 2170 }, { "epoch": 0.696, "grad_norm": 6.9236063967034145, "learning_rate": 5.1230505618636575e-06, "loss": 1.3054, "step": 2175 }, { "epoch": 0.6976, "grad_norm": 6.910379366692173, "learning_rate": 5.074359929353604e-06, "loss": 1.1699, "step": 2180 }, { "epoch": 0.6992, "grad_norm": 7.780823172347376, "learning_rate": 5.025822995710414e-06, "loss": 1.2552, "step": 2185 }, { "epoch": 0.7008, "grad_norm": 6.782111700218943, "learning_rate": 4.977441275472622e-06, "loss": 1.4024, "step": 2190 }, { "epoch": 0.7024, "grad_norm": 7.01625141674153, "learning_rate": 4.929216278335508e-06, "loss": 1.3484, "step": 2195 }, { "epoch": 0.704, "grad_norm": 6.611844691268078, "learning_rate": 4.881149509103993e-06, "loss": 1.3066, "step": 2200 }, { "epoch": 0.7056, "grad_norm": 6.399244868619568, "learning_rate": 4.833242467645677e-06, "loss": 1.4538, "step": 2205 }, { "epoch": 0.7072, "grad_norm": 7.977931392416363, "learning_rate": 4.785496648844049e-06, "loss": 1.219, "step": 2210 }, { "epoch": 0.7088, "grad_norm": 7.034256666397906, "learning_rate": 4.737913542551824e-06, "loss": 1.319, "step": 2215 }, { "epoch": 0.7104, "grad_norm": 7.5123049812985565, "learning_rate": 4.690494633544466e-06, "loss": 1.3271, "step": 2220 }, { "epoch": 0.712, "grad_norm": 6.515957790093329, "learning_rate": 4.643241401473849e-06, "loss": 1.3427, "step": 2225 }, { "epoch": 0.7136, "grad_norm": 6.839639945427634, "learning_rate": 4.596155320822103e-06, "loss": 1.2736, "step": 2230 }, { "epoch": 0.7152, "grad_norm": 7.689485372330819, "learning_rate": 4.549237860855578e-06, "loss": 1.278, "step": 2235 }, { "epoch": 0.7168, "grad_norm": 7.447174910713708, "learning_rate": 4.502490485579024e-06, "loss": 1.2915, "step": 2240 }, { "epoch": 0.7184, "grad_norm": 7.345469812975139, "learning_rate": 4.455914653689889e-06, "loss": 1.2697, "step": 2245 }, { "epoch": 0.72, "grad_norm": 6.852760914424491, "learning_rate": 4.409511818532809e-06, "loss": 1.3326, "step": 2250 }, { "epoch": 0.7216, "grad_norm": 9.60045018569782, "learning_rate": 4.363283428054262e-06, "loss": 1.2859, "step": 2255 }, { "epoch": 0.7232, "grad_norm": 7.195814960386886, "learning_rate": 4.317230924757379e-06, "loss": 1.2189, "step": 2260 }, { "epoch": 0.7248, "grad_norm": 6.838988916881499, "learning_rate": 4.271355745656934e-06, "loss": 1.2248, "step": 2265 }, { "epoch": 0.7264, "grad_norm": 7.060258323380059, "learning_rate": 4.2256593222345185e-06, "loss": 1.3148, "step": 2270 }, { "epoch": 0.728, "grad_norm": 7.168253685594636, "learning_rate": 4.1801430803938496e-06, "loss": 1.2706, "step": 2275 }, { "epoch": 0.7296, "grad_norm": 8.110296241359842, "learning_rate": 4.1348084404162895e-06, "loss": 1.22, "step": 2280 }, { "epoch": 0.7312, "grad_norm": 6.146120619441478, "learning_rate": 4.089656816916525e-06, "loss": 1.2596, "step": 2285 }, { "epoch": 0.7328, "grad_norm": 7.51200746635356, "learning_rate": 4.0446896187984275e-06, "loss": 1.2051, "step": 2290 }, { "epoch": 0.7344, "grad_norm": 7.00889844522249, "learning_rate": 3.999908249211096e-06, "loss": 1.3089, "step": 2295 }, { "epoch": 0.736, "grad_norm": 7.433128694008251, "learning_rate": 3.955314105505056e-06, "loss": 1.1858, "step": 2300 }, { "epoch": 0.7376, "grad_norm": 7.682196556731105, "learning_rate": 3.910908579188672e-06, "loss": 1.2912, "step": 2305 }, { "epoch": 0.7392, "grad_norm": 8.377582759029309, "learning_rate": 3.866693055884723e-06, "loss": 1.2959, "step": 2310 }, { "epoch": 0.7408, "grad_norm": 7.525533674291028, "learning_rate": 3.8226689152871576e-06, "loss": 1.3002, "step": 2315 }, { "epoch": 0.7424, "grad_norm": 6.5405611376318085, "learning_rate": 3.7788375311180624e-06, "loss": 1.1595, "step": 2320 }, { "epoch": 0.744, "grad_norm": 6.351041058258562, "learning_rate": 3.735200271084779e-06, "loss": 1.2756, "step": 2325 }, { "epoch": 0.7456, "grad_norm": 6.945497138509825, "learning_rate": 3.691758496837228e-06, "loss": 1.3431, "step": 2330 }, { "epoch": 0.7472, "grad_norm": 7.80452005536971, "learning_rate": 3.6485135639254234e-06, "loss": 1.1743, "step": 2335 }, { "epoch": 0.7488, "grad_norm": 6.904075689429772, "learning_rate": 3.6054668217571774e-06, "loss": 1.2647, "step": 2340 }, { "epoch": 0.7504, "grad_norm": 5.826075155016658, "learning_rate": 3.5626196135559898e-06, "loss": 1.4307, "step": 2345 }, { "epoch": 0.752, "grad_norm": 7.4800929811344234, "learning_rate": 3.5199732763191317e-06, "loss": 1.3035, "step": 2350 }, { "epoch": 0.7536, "grad_norm": 6.592404311501557, "learning_rate": 3.4775291407759393e-06, "loss": 1.3101, "step": 2355 }, { "epoch": 0.7552, "grad_norm": 6.0603172825038625, "learning_rate": 3.435288531346269e-06, "loss": 1.1216, "step": 2360 }, { "epoch": 0.7568, "grad_norm": 7.862732877894627, "learning_rate": 3.3932527660991877e-06, "loss": 1.2707, "step": 2365 }, { "epoch": 0.7584, "grad_norm": 6.934609447705353, "learning_rate": 3.351423156711836e-06, "loss": 1.1406, "step": 2370 }, { "epoch": 0.76, "grad_norm": 6.676258909833237, "learning_rate": 3.309801008428498e-06, "loss": 1.1754, "step": 2375 }, { "epoch": 0.7616, "grad_norm": 7.059333862950036, "learning_rate": 3.268387620019885e-06, "loss": 1.214, "step": 2380 }, { "epoch": 0.7632, "grad_norm": 6.4985460219892115, "learning_rate": 3.2271842837425917e-06, "loss": 1.2429, "step": 2385 }, { "epoch": 0.7648, "grad_norm": 6.318472678199636, "learning_rate": 3.1861922852987794e-06, "loss": 1.2567, "step": 2390 }, { "epoch": 0.7664, "grad_norm": 8.106856700967048, "learning_rate": 3.1454129037960614e-06, "loss": 1.2827, "step": 2395 }, { "epoch": 0.768, "grad_norm": 6.544814378582256, "learning_rate": 3.1048474117075834e-06, "loss": 1.2408, "step": 2400 }, { "epoch": 0.7696, "grad_norm": 8.187702879843156, "learning_rate": 3.0644970748323253e-06, "loss": 1.3093, "step": 2405 }, { "epoch": 0.7712, "grad_norm": 7.060252319548262, "learning_rate": 3.0243631522556027e-06, "loss": 1.2043, "step": 2410 }, { "epoch": 0.7728, "grad_norm": 7.212166473447804, "learning_rate": 2.984446896309764e-06, "loss": 1.29, "step": 2415 }, { "epoch": 0.7744, "grad_norm": 6.205766494665441, "learning_rate": 2.94474955253513e-06, "loss": 1.2984, "step": 2420 }, { "epoch": 0.776, "grad_norm": 8.705823518308824, "learning_rate": 2.9052723596411194e-06, "loss": 1.272, "step": 2425 }, { "epoch": 0.7776, "grad_norm": 6.193262549894205, "learning_rate": 2.866016549467602e-06, "loss": 1.2184, "step": 2430 }, { "epoch": 0.7792, "grad_norm": 7.425984999552959, "learning_rate": 2.82698334694645e-06, "loss": 1.2093, "step": 2435 }, { "epoch": 0.7808, "grad_norm": 6.720543568523316, "learning_rate": 2.7881739700633382e-06, "loss": 1.2015, "step": 2440 }, { "epoch": 0.7824, "grad_norm": 7.41759595687626, "learning_rate": 2.749589629819708e-06, "loss": 1.1781, "step": 2445 }, { "epoch": 0.784, "grad_norm": 6.10376266829222, "learning_rate": 2.7112315301949986e-06, "loss": 1.2669, "step": 2450 }, { "epoch": 0.7856, "grad_norm": 7.211278115666269, "learning_rate": 2.6731008681090763e-06, "loss": 1.2374, "step": 2455 }, { "epoch": 0.7872, "grad_norm": 6.323650227039773, "learning_rate": 2.6351988333848787e-06, "loss": 1.2188, "step": 2460 }, { "epoch": 0.7888, "grad_norm": 8.023736585244432, "learning_rate": 2.5975266087113015e-06, "loss": 1.1623, "step": 2465 }, { "epoch": 0.7904, "grad_norm": 6.611440556841849, "learning_rate": 2.5600853696062766e-06, "loss": 1.2767, "step": 2470 }, { "epoch": 0.792, "grad_norm": 6.563637825114816, "learning_rate": 2.5228762843801047e-06, "loss": 1.255, "step": 2475 }, { "epoch": 0.7936, "grad_norm": 7.2147131416231325, "learning_rate": 2.485900514098991e-06, "loss": 1.1983, "step": 2480 }, { "epoch": 0.7952, "grad_norm": 6.707678159171325, "learning_rate": 2.4491592125488206e-06, "loss": 1.0984, "step": 2485 }, { "epoch": 0.7968, "grad_norm": 6.811597254056078, "learning_rate": 2.4126535261991577e-06, "loss": 1.256, "step": 2490 }, { "epoch": 0.7984, "grad_norm": 7.126090842225368, "learning_rate": 2.3763845941674703e-06, "loss": 1.0681, "step": 2495 }, { "epoch": 0.8, "grad_norm": 7.3532700559553215, "learning_rate": 2.340353548183575e-06, "loss": 1.318, "step": 2500 }, { "epoch": 0.8016, "grad_norm": 7.106651921925259, "learning_rate": 2.3045615125543353e-06, "loss": 1.1499, "step": 2505 }, { "epoch": 0.8032, "grad_norm": 6.895206052430061, "learning_rate": 2.2690096041285757e-06, "loss": 1.2491, "step": 2510 }, { "epoch": 0.8048, "grad_norm": 7.077398116538314, "learning_rate": 2.2336989322622306e-06, "loss": 1.1645, "step": 2515 }, { "epoch": 0.8064, "grad_norm": 7.163449736184592, "learning_rate": 2.198630598783723e-06, "loss": 1.2058, "step": 2520 }, { "epoch": 0.808, "grad_norm": 7.411419676699458, "learning_rate": 2.1638056979596012e-06, "loss": 1.2915, "step": 2525 }, { "epoch": 0.8096, "grad_norm": 6.83424681277287, "learning_rate": 2.1292253164603673e-06, "loss": 1.236, "step": 2530 }, { "epoch": 0.8112, "grad_norm": 7.517941880919823, "learning_rate": 2.094890533326589e-06, "loss": 1.215, "step": 2535 }, { "epoch": 0.8128, "grad_norm": 7.707098663070382, "learning_rate": 2.0608024199352216e-06, "loss": 1.2385, "step": 2540 }, { "epoch": 0.8144, "grad_norm": 7.918662426971106, "learning_rate": 2.026962039966176e-06, "loss": 1.1269, "step": 2545 }, { "epoch": 0.816, "grad_norm": 8.318700772754628, "learning_rate": 1.9933704493691354e-06, "loss": 1.1145, "step": 2550 }, { "epoch": 0.8176, "grad_norm": 7.58106431146203, "learning_rate": 1.960028696330596e-06, "loss": 1.2279, "step": 2555 }, { "epoch": 0.8192, "grad_norm": 7.987545520077578, "learning_rate": 1.926937821241164e-06, "loss": 1.1885, "step": 2560 }, { "epoch": 0.8208, "grad_norm": 7.251449924956015, "learning_rate": 1.8940988566630903e-06, "loss": 1.2175, "step": 2565 }, { "epoch": 0.8224, "grad_norm": 6.84899161592509, "learning_rate": 1.861512827298051e-06, "loss": 1.0726, "step": 2570 }, { "epoch": 0.824, "grad_norm": 7.124772325379492, "learning_rate": 1.8291807499551772e-06, "loss": 1.238, "step": 2575 }, { "epoch": 0.8256, "grad_norm": 7.245848004677167, "learning_rate": 1.7971036335193249e-06, "loss": 1.1557, "step": 2580 }, { "epoch": 0.8272, "grad_norm": 8.125243585964292, "learning_rate": 1.7652824789195811e-06, "loss": 1.2371, "step": 2585 }, { "epoch": 0.8288, "grad_norm": 7.1050108164531975, "learning_rate": 1.73371827909805e-06, "loss": 1.2496, "step": 2590 }, { "epoch": 0.8304, "grad_norm": 6.936091197387642, "learning_rate": 1.7024120189788573e-06, "loss": 1.136, "step": 2595 }, { "epoch": 0.832, "grad_norm": 7.21210256556138, "learning_rate": 1.6713646754374225e-06, "loss": 1.1357, "step": 2600 }, { "epoch": 0.8336, "grad_norm": 6.3383661134472264, "learning_rate": 1.6405772172699696e-06, "loss": 1.1153, "step": 2605 }, { "epoch": 0.8352, "grad_norm": 8.718135834059972, "learning_rate": 1.6100506051633136e-06, "loss": 1.1553, "step": 2610 }, { "epoch": 0.8368, "grad_norm": 8.95033808717262, "learning_rate": 1.5797857916648596e-06, "loss": 1.2361, "step": 2615 }, { "epoch": 0.8384, "grad_norm": 6.673199959521796, "learning_rate": 1.5497837211528965e-06, "loss": 1.185, "step": 2620 }, { "epoch": 0.84, "grad_norm": 6.696629258511363, "learning_rate": 1.5200453298071238e-06, "loss": 1.2785, "step": 2625 }, { "epoch": 0.8416, "grad_norm": 7.572117970721777, "learning_rate": 1.4905715455794379e-06, "loss": 1.2972, "step": 2630 }, { "epoch": 0.8432, "grad_norm": 9.096173969266975, "learning_rate": 1.461363288164983e-06, "loss": 1.262, "step": 2635 }, { "epoch": 0.8448, "grad_norm": 7.743393031922831, "learning_rate": 1.432421468973444e-06, "loss": 1.1165, "step": 2640 }, { "epoch": 0.8464, "grad_norm": 7.2853153371541675, "learning_rate": 1.4037469911006096e-06, "loss": 1.2781, "step": 2645 }, { "epoch": 0.848, "grad_norm": 7.4707067773918965, "learning_rate": 1.3753407493001968e-06, "loss": 1.2033, "step": 2650 }, { "epoch": 0.8496, "grad_norm": 6.265617058277776, "learning_rate": 1.3472036299559255e-06, "loss": 1.1115, "step": 2655 }, { "epoch": 0.8512, "grad_norm": 6.784902517223355, "learning_rate": 1.3193365110538647e-06, "loss": 1.218, "step": 2660 }, { "epoch": 0.8528, "grad_norm": 6.719604118534906, "learning_rate": 1.2917402621550369e-06, "loss": 1.0628, "step": 2665 }, { "epoch": 0.8544, "grad_norm": 9.25094531147658, "learning_rate": 1.2644157443682737e-06, "loss": 1.2604, "step": 2670 }, { "epoch": 0.856, "grad_norm": 7.5401988168853125, "learning_rate": 1.23736381032336e-06, "loss": 1.2161, "step": 2675 }, { "epoch": 0.8576, "grad_norm": 5.8986225646917845, "learning_rate": 1.2105853041444172e-06, "loss": 1.1727, "step": 2680 }, { "epoch": 0.8592, "grad_norm": 7.05150428234311, "learning_rate": 1.184081061423572e-06, "loss": 1.2429, "step": 2685 }, { "epoch": 0.8608, "grad_norm": 7.8831648202761055, "learning_rate": 1.157851909194876e-06, "loss": 1.1535, "step": 2690 }, { "epoch": 0.8624, "grad_norm": 6.764706659905809, "learning_rate": 1.1318986659085062e-06, "loss": 1.2213, "step": 2695 }, { "epoch": 0.864, "grad_norm": 7.091883009667597, "learning_rate": 1.10622214140522e-06, "loss": 1.2937, "step": 2700 }, { "epoch": 0.8656, "grad_norm": 8.205435093308155, "learning_rate": 1.080823136891086e-06, "loss": 1.0753, "step": 2705 }, { "epoch": 0.8672, "grad_norm": 7.707709432086696, "learning_rate": 1.0557024449124854e-06, "loss": 1.22, "step": 2710 }, { "epoch": 0.8688, "grad_norm": 7.701900190738418, "learning_rate": 1.0308608493313776e-06, "loss": 1.2737, "step": 2715 }, { "epoch": 0.8704, "grad_norm": 7.957037822325651, "learning_rate": 1.0062991253008525e-06, "loss": 1.1962, "step": 2720 }, { "epoch": 0.872, "grad_norm": 7.098180709287834, "learning_rate": 9.820180392409252e-07, "loss": 1.2093, "step": 2725 }, { "epoch": 0.8736, "grad_norm": 7.610067003127613, "learning_rate": 9.580183488146323e-07, "loss": 1.2104, "step": 2730 }, { "epoch": 0.8752, "grad_norm": 7.399913871553443, "learning_rate": 9.343008029043876e-07, "loss": 1.2166, "step": 2735 }, { "epoch": 0.8768, "grad_norm": 7.845545343933413, "learning_rate": 9.108661415886111e-07, "loss": 1.224, "step": 2740 }, { "epoch": 0.8784, "grad_norm": 8.250129826284104, "learning_rate": 8.87715096118642e-07, "loss": 1.2828, "step": 2745 }, { "epoch": 0.88, "grad_norm": 7.754702962013714, "learning_rate": 8.64848388895917e-07, "loss": 1.2148, "step": 2750 }, { "epoch": 0.8816, "grad_norm": 7.13532064965064, "learning_rate": 8.42266733449425e-07, "loss": 1.1721, "step": 2755 }, { "epoch": 0.8832, "grad_norm": 7.085924370440946, "learning_rate": 8.199708344134493e-07, "loss": 1.1104, "step": 2760 }, { "epoch": 0.8848, "grad_norm": 7.73698788059968, "learning_rate": 7.979613875055736e-07, "loss": 1.2089, "step": 2765 }, { "epoch": 0.8864, "grad_norm": 7.791514673450203, "learning_rate": 7.76239079504979e-07, "loss": 1.2016, "step": 2770 }, { "epoch": 0.888, "grad_norm": 6.7303438186194855, "learning_rate": 7.548045882310084e-07, "loss": 1.0739, "step": 2775 }, { "epoch": 0.8896, "grad_norm": 6.776201230293054, "learning_rate": 7.336585825220244e-07, "loss": 1.1817, "step": 2780 }, { "epoch": 0.8912, "grad_norm": 7.513647159391496, "learning_rate": 7.128017222145267e-07, "loss": 1.1978, "step": 2785 }, { "epoch": 0.8928, "grad_norm": 7.203824082806031, "learning_rate": 6.922346581225725e-07, "loss": 1.1137, "step": 2790 }, { "epoch": 0.8944, "grad_norm": 7.674190673017307, "learning_rate": 6.719580320174657e-07, "loss": 1.1589, "step": 2795 }, { "epoch": 0.896, "grad_norm": 7.427895020925505, "learning_rate": 6.519724766077262e-07, "loss": 1.1643, "step": 2800 }, { "epoch": 0.8976, "grad_norm": 7.522555453911278, "learning_rate": 6.322786155193594e-07, "loss": 1.1759, "step": 2805 }, { "epoch": 0.8992, "grad_norm": 7.160309775741213, "learning_rate": 6.128770632763825e-07, "loss": 1.1401, "step": 2810 }, { "epoch": 0.9008, "grad_norm": 7.064709958323231, "learning_rate": 5.937684252816578e-07, "loss": 1.2335, "step": 2815 }, { "epoch": 0.9024, "grad_norm": 7.336329439580518, "learning_rate": 5.749532977979977e-07, "loss": 1.1347, "step": 2820 }, { "epoch": 0.904, "grad_norm": 7.690187518015022, "learning_rate": 5.564322679295619e-07, "loss": 1.1116, "step": 2825 }, { "epoch": 0.9056, "grad_norm": 7.108832953401795, "learning_rate": 5.382059136035389e-07, "loss": 1.1324, "step": 2830 }, { "epoch": 0.9072, "grad_norm": 7.578023550663353, "learning_rate": 5.202748035521021e-07, "loss": 1.1919, "step": 2835 }, { "epoch": 0.9088, "grad_norm": 6.903932907016643, "learning_rate": 5.026394972946813e-07, "loss": 1.1659, "step": 2840 }, { "epoch": 0.9104, "grad_norm": 6.079707510002676, "learning_rate": 4.85300545120484e-07, "loss": 1.2324, "step": 2845 }, { "epoch": 0.912, "grad_norm": 7.452754847950774, "learning_rate": 4.6825848807133813e-07, "loss": 1.2998, "step": 2850 }, { "epoch": 0.9136, "grad_norm": 6.122150122845729, "learning_rate": 4.515138579248035e-07, "loss": 1.2366, "step": 2855 }, { "epoch": 0.9152, "grad_norm": 8.029542518584094, "learning_rate": 4.350671771775772e-07, "loss": 1.1056, "step": 2860 }, { "epoch": 0.9168, "grad_norm": 5.867322954044555, "learning_rate": 4.189189590291975e-07, "loss": 1.2582, "step": 2865 }, { "epoch": 0.9184, "grad_norm": 8.148008523277348, "learning_rate": 4.030697073660217e-07, "loss": 1.2402, "step": 2870 }, { "epoch": 0.92, "grad_norm": 6.608369807826164, "learning_rate": 3.875199167455035e-07, "loss": 1.2935, "step": 2875 }, { "epoch": 0.9216, "grad_norm": 6.225848002947905, "learning_rate": 3.7227007238076596e-07, "loss": 1.1169, "step": 2880 }, { "epoch": 0.9232, "grad_norm": 7.4677935130036435, "learning_rate": 3.573206501254556e-07, "loss": 1.2367, "step": 2885 }, { "epoch": 0.9248, "grad_norm": 6.610580655839769, "learning_rate": 3.4267211645890306e-07, "loss": 1.2118, "step": 2890 }, { "epoch": 0.9264, "grad_norm": 5.9140036705595564, "learning_rate": 3.283249284715528e-07, "loss": 1.1834, "step": 2895 }, { "epoch": 0.928, "grad_norm": 6.959822912506896, "learning_rate": 3.1427953385071207e-07, "loss": 1.2302, "step": 2900 }, { "epoch": 0.9296, "grad_norm": 7.043289170717119, "learning_rate": 3.005363708665765e-07, "loss": 1.0955, "step": 2905 }, { "epoch": 0.9312, "grad_norm": 6.32656303603902, "learning_rate": 2.870958683585545e-07, "loss": 1.1267, "step": 2910 }, { "epoch": 0.9328, "grad_norm": 6.352958182641917, "learning_rate": 2.7395844572188915e-07, "loss": 1.1378, "step": 2915 }, { "epoch": 0.9344, "grad_norm": 7.408074075398155, "learning_rate": 2.6112451289456495e-07, "loss": 1.159, "step": 2920 }, { "epoch": 0.936, "grad_norm": 6.867410397272755, "learning_rate": 2.4859447034452424e-07, "loss": 1.1076, "step": 2925 }, { "epoch": 0.9376, "grad_norm": 7.248191222610278, "learning_rate": 2.3636870905716424e-07, "loss": 1.0558, "step": 2930 }, { "epoch": 0.9392, "grad_norm": 6.2060869333572946, "learning_rate": 2.2444761052313857e-07, "loss": 1.0717, "step": 2935 }, { "epoch": 0.9408, "grad_norm": 7.123428377796361, "learning_rate": 2.1283154672645522e-07, "loss": 1.2433, "step": 2940 }, { "epoch": 0.9424, "grad_norm": 7.587226030653606, "learning_rate": 2.015208801328694e-07, "loss": 1.137, "step": 2945 }, { "epoch": 0.944, "grad_norm": 8.023362652127654, "learning_rate": 1.905159636785714e-07, "loss": 1.1554, "step": 2950 }, { "epoch": 0.9456, "grad_norm": 7.1454832029115485, "learning_rate": 1.79817140759172e-07, "loss": 1.1245, "step": 2955 }, { "epoch": 0.9472, "grad_norm": 6.22220394431011, "learning_rate": 1.6942474521899232e-07, "loss": 1.1412, "step": 2960 }, { "epoch": 0.9488, "grad_norm": 6.91072459977162, "learning_rate": 1.5933910134064202e-07, "loss": 1.1616, "step": 2965 }, { "epoch": 0.9504, "grad_norm": 7.785687521486665, "learning_rate": 1.4956052383490295e-07, "loss": 1.2241, "step": 2970 }, { "epoch": 0.952, "grad_norm": 6.558516061888123, "learning_rate": 1.4008931783090707e-07, "loss": 1.1245, "step": 2975 }, { "epoch": 0.9536, "grad_norm": 7.69748620578705, "learning_rate": 1.309257788666174e-07, "loss": 1.0904, "step": 2980 }, { "epoch": 0.9552, "grad_norm": 7.747365211882861, "learning_rate": 1.220701928796042e-07, "loss": 1.1514, "step": 2985 }, { "epoch": 0.9568, "grad_norm": 6.611064880165885, "learning_rate": 1.1352283619812443e-07, "loss": 1.0847, "step": 2990 }, { "epoch": 0.9584, "grad_norm": 7.002989095386839, "learning_rate": 1.0528397553249636e-07, "loss": 1.1879, "step": 2995 }, { "epoch": 0.96, "grad_norm": 7.0152838160784805, "learning_rate": 9.73538679667807e-08, "loss": 1.2758, "step": 3000 }, { "epoch": 0.9616, "grad_norm": 7.094274121519631, "learning_rate": 8.97327609507559e-08, "loss": 1.1738, "step": 3005 }, { "epoch": 0.9632, "grad_norm": 8.216237482299412, "learning_rate": 8.242089229219984e-08, "loss": 1.1826, "step": 3010 }, { "epoch": 0.9648, "grad_norm": 7.697329542588331, "learning_rate": 7.541849014946479e-08, "loss": 1.1891, "step": 3015 }, { "epoch": 0.9664, "grad_norm": 5.975230690605909, "learning_rate": 6.872577302436179e-08, "loss": 1.2118, "step": 3020 }, { "epoch": 0.968, "grad_norm": 6.8498253272657434, "learning_rate": 6.234294975534183e-08, "loss": 1.0708, "step": 3025 }, { "epoch": 0.9696, "grad_norm": 7.431417632188548, "learning_rate": 5.6270219510975445e-08, "loss": 1.2071, "step": 3030 }, { "epoch": 0.9712, "grad_norm": 7.544413890406052, "learning_rate": 5.050777178374544e-08, "loss": 1.1812, "step": 3035 }, { "epoch": 0.9728, "grad_norm": 7.824359941406518, "learning_rate": 4.505578638412722e-08, "loss": 1.1669, "step": 3040 }, { "epoch": 0.9744, "grad_norm": 7.681706989711725, "learning_rate": 3.9914433434982135e-08, "loss": 1.1548, "step": 3045 }, { "epoch": 0.976, "grad_norm": 6.900708976315526, "learning_rate": 3.508387336624619e-08, "loss": 1.0863, "step": 3050 }, { "epoch": 0.9776, "grad_norm": 8.898326692320722, "learning_rate": 3.056425690992404e-08, "loss": 1.2441, "step": 3055 }, { "epoch": 0.9792, "grad_norm": 6.965073943785161, "learning_rate": 2.6355725095389416e-08, "loss": 1.2815, "step": 3060 }, { "epoch": 0.9808, "grad_norm": 8.318803563609041, "learning_rate": 2.2458409244979772e-08, "loss": 1.266, "step": 3065 }, { "epoch": 0.9824, "grad_norm": 7.233363975555559, "learning_rate": 1.8872430969901766e-08, "loss": 1.227, "step": 3070 }, { "epoch": 0.984, "grad_norm": 7.484219660419537, "learning_rate": 1.559790216643542e-08, "loss": 1.1119, "step": 3075 }, { "epoch": 0.9856, "grad_norm": 6.4306561615634426, "learning_rate": 1.2634925012440235e-08, "loss": 1.1649, "step": 3080 }, { "epoch": 0.9872, "grad_norm": 7.328910487014067, "learning_rate": 9.983591964171091e-09, "loss": 1.2878, "step": 3085 }, { "epoch": 0.9888, "grad_norm": 7.278976626995467, "learning_rate": 7.643985753390537e-09, "loss": 1.0835, "step": 3090 }, { "epoch": 0.9904, "grad_norm": 7.408823192054949, "learning_rate": 5.616179384788645e-09, "loss": 1.2181, "step": 3095 }, { "epoch": 0.992, "grad_norm": 7.7605788783327485, "learning_rate": 3.900236133703717e-09, "loss": 1.1201, "step": 3100 }, { "epoch": 0.9936, "grad_norm": 7.585618897897224, "learning_rate": 2.496209544147199e-09, "loss": 1.2209, "step": 3105 }, { "epoch": 0.9952, "grad_norm": 7.2343937302933, "learning_rate": 1.4041434271350184e-09, "loss": 1.1871, "step": 3110 }, { "epoch": 0.9968, "grad_norm": 7.2380371709860265, "learning_rate": 6.240718593208961e-10, "loss": 1.0477, "step": 3115 }, { "epoch": 0.9984, "grad_norm": 7.4068514185411995, "learning_rate": 1.5601918192942322e-10, "loss": 1.1603, "step": 3120 }, { "epoch": 1.0, "grad_norm": 7.2891019708519105, "learning_rate": 0.0, "loss": 1.1876, "step": 3125 }, { "epoch": 1.0, "eval_loss": 1.1837999820709229, "eval_runtime": 37.7062, "eval_samples_per_second": 13.26, "eval_steps_per_second": 0.849, "step": 3125 }, { "epoch": 1.0, "step": 3125, "total_flos": 40105673687040.0, "train_loss": 1.4248850550079346, "train_runtime": 11739.5291, "train_samples_per_second": 4.259, "train_steps_per_second": 0.266 } ], "logging_steps": 5, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 40105673687040.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }