{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22228396776882467, "eval_steps": 34, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005557099194220616, "eval_loss": 2.150275230407715, "eval_runtime": 385.5335, "eval_samples_per_second": 7.862, "eval_steps_per_second": 0.983, "step": 1 }, { "epoch": 0.0016671297582661851, "grad_norm": 0.9782189726829529, "learning_rate": 1.5e-05, "loss": 8.4821, "step": 3 }, { "epoch": 0.0033342595165323703, "grad_norm": 9.340497970581055, "learning_rate": 3e-05, "loss": 8.0841, "step": 6 }, { "epoch": 0.005001389274798555, "grad_norm": 1.2668017148971558, "learning_rate": 4.5e-05, "loss": 9.0, "step": 9 }, { "epoch": 0.0066685190330647405, "grad_norm": 1.2630752325057983, "learning_rate": 4.999675562428437e-05, "loss": 8.5273, "step": 12 }, { "epoch": 0.008335648791330925, "grad_norm": 1.4245824813842773, "learning_rate": 4.9979724954289244e-05, "loss": 8.1841, "step": 15 }, { "epoch": 0.01000277854959711, "grad_norm": 1.4201760292053223, "learning_rate": 4.994810682835951e-05, "loss": 7.569, "step": 18 }, { "epoch": 0.011669908307863295, "grad_norm": 1.8734160661697388, "learning_rate": 4.990191971059033e-05, "loss": 7.0022, "step": 21 }, { "epoch": 0.013337038066129481, "grad_norm": 1.1166455745697021, "learning_rate": 4.984119057295783e-05, "loss": 6.704, "step": 24 }, { "epoch": 0.015004167824395665, "grad_norm": 1.3420771360397339, "learning_rate": 4.976595487956823e-05, "loss": 7.354, "step": 27 }, { "epoch": 0.01667129758266185, "grad_norm": 1.4245537519454956, "learning_rate": 4.967625656594782e-05, "loss": 6.7949, "step": 30 }, { "epoch": 0.018338427340928037, "grad_norm": 1.3899712562561035, "learning_rate": 4.957214801338581e-05, "loss": 6.8879, "step": 33 }, { "epoch": 0.018894137260350097, "eval_loss": 1.6003342866897583, "eval_runtime": 387.5811, "eval_samples_per_second": 7.82, "eval_steps_per_second": 0.978, "step": 34 }, { "epoch": 0.02000555709919422, "grad_norm": 1.2631561756134033, "learning_rate": 4.9453690018345144e-05, "loss": 6.9872, "step": 36 }, { "epoch": 0.021672686857460405, "grad_norm": 1.1402790546417236, "learning_rate": 4.932095175695911e-05, "loss": 6.2949, "step": 39 }, { "epoch": 0.02333981661572659, "grad_norm": 1.0827226638793945, "learning_rate": 4.917401074463441e-05, "loss": 6.049, "step": 42 }, { "epoch": 0.025006946373992776, "grad_norm": 1.1316773891448975, "learning_rate": 4.901295279078431e-05, "loss": 6.1119, "step": 45 }, { "epoch": 0.026674076132258962, "grad_norm": 1.0732563734054565, "learning_rate": 4.883787194871841e-05, "loss": 5.899, "step": 48 }, { "epoch": 0.028341205890525144, "grad_norm": 1.0603082180023193, "learning_rate": 4.864887046071813e-05, "loss": 5.7208, "step": 51 }, { "epoch": 0.03000833564879133, "grad_norm": 1.1684924364089966, "learning_rate": 4.8446058698330115e-05, "loss": 6.4161, "step": 54 }, { "epoch": 0.03167546540705751, "grad_norm": 1.147234320640564, "learning_rate": 4.822955509791233e-05, "loss": 6.0586, "step": 57 }, { "epoch": 0.0333425951653237, "grad_norm": 1.4725079536437988, "learning_rate": 4.799948609147061e-05, "loss": 6.2541, "step": 60 }, { "epoch": 0.035009724923589884, "grad_norm": 1.3403408527374268, "learning_rate": 4.7755986032825864e-05, "loss": 6.0201, "step": 63 }, { "epoch": 0.03667685468185607, "grad_norm": 1.272765040397644, "learning_rate": 4.74991971191553e-05, "loss": 5.6225, "step": 66 }, { "epoch": 0.037788274520700195, "eval_loss": 1.4521716833114624, "eval_runtime": 387.3586, "eval_samples_per_second": 7.825, "eval_steps_per_second": 0.978, "step": 68 }, { "epoch": 0.038343984440122256, "grad_norm": 1.3769108057022095, "learning_rate": 4.7229269307953235e-05, "loss": 5.515, "step": 69 }, { "epoch": 0.04001111419838844, "grad_norm": 1.1973274946212769, "learning_rate": 4.694636022946012e-05, "loss": 5.7673, "step": 72 }, { "epoch": 0.04167824395665463, "grad_norm": 1.3785252571105957, "learning_rate": 4.665063509461097e-05, "loss": 5.945, "step": 75 }, { "epoch": 0.04334537371492081, "grad_norm": 1.3699363470077515, "learning_rate": 4.6342266598556814e-05, "loss": 5.5223, "step": 78 }, { "epoch": 0.045012503473187, "grad_norm": 1.2516443729400635, "learning_rate": 4.6021434819815555e-05, "loss": 5.6879, "step": 81 }, { "epoch": 0.04667963323145318, "grad_norm": 1.6110479831695557, "learning_rate": 4.568832711511125e-05, "loss": 5.6658, "step": 84 }, { "epoch": 0.048346762989719363, "grad_norm": 1.2466659545898438, "learning_rate": 4.534313800996299e-05, "loss": 5.6652, "step": 87 }, { "epoch": 0.05001389274798555, "grad_norm": 1.5405720472335815, "learning_rate": 4.498606908508754e-05, "loss": 5.4541, "step": 90 }, { "epoch": 0.051681022506251735, "grad_norm": 1.4913195371627808, "learning_rate": 4.46173288586818e-05, "loss": 6.3391, "step": 93 }, { "epoch": 0.053348152264517924, "grad_norm": 1.4445130825042725, "learning_rate": 4.4237132664654154e-05, "loss": 5.5362, "step": 96 }, { "epoch": 0.05501528202278411, "grad_norm": 1.4353359937667847, "learning_rate": 4.384570252687542e-05, "loss": 5.6292, "step": 99 }, { "epoch": 0.05668241178105029, "grad_norm": 1.515329122543335, "learning_rate": 4.344326702952326e-05, "loss": 5.9767, "step": 102 }, { "epoch": 0.05668241178105029, "eval_loss": 1.3901065587997437, "eval_runtime": 387.1962, "eval_samples_per_second": 7.828, "eval_steps_per_second": 0.979, "step": 102 }, { "epoch": 0.05834954153931648, "grad_norm": 1.3371084928512573, "learning_rate": 4.303006118359537e-05, "loss": 5.0829, "step": 105 }, { "epoch": 0.06001667129758266, "grad_norm": 1.3986601829528809, "learning_rate": 4.260632628966974e-05, "loss": 4.9783, "step": 108 }, { "epoch": 0.06168380105584885, "grad_norm": 1.6927534341812134, "learning_rate": 4.217230979699188e-05, "loss": 5.5207, "step": 111 }, { "epoch": 0.06335093081411503, "grad_norm": 1.4875972270965576, "learning_rate": 4.172826515897146e-05, "loss": 4.9577, "step": 114 }, { "epoch": 0.06501806057238121, "grad_norm": 1.5819252729415894, "learning_rate": 4.12744516851726e-05, "loss": 5.5329, "step": 117 }, { "epoch": 0.0666851903306474, "grad_norm": 1.7034679651260376, "learning_rate": 4.0811134389884433e-05, "loss": 5.8642, "step": 120 }, { "epoch": 0.06835232008891359, "grad_norm": 1.4892022609710693, "learning_rate": 4.0338583837360225e-05, "loss": 4.9988, "step": 123 }, { "epoch": 0.07001944984717977, "grad_norm": 1.8653110265731812, "learning_rate": 3.985707598381544e-05, "loss": 5.5333, "step": 126 }, { "epoch": 0.07168657960544596, "grad_norm": 1.6267685890197754, "learning_rate": 3.9366892016277096e-05, "loss": 5.5853, "step": 129 }, { "epoch": 0.07335370936371215, "grad_norm": 1.710310935974121, "learning_rate": 3.886831818837847e-05, "loss": 5.5769, "step": 132 }, { "epoch": 0.07502083912197832, "grad_norm": 2.016270160675049, "learning_rate": 3.8361645653195026e-05, "loss": 5.5546, "step": 135 }, { "epoch": 0.07557654904140039, "eval_loss": 1.3533298969268799, "eval_runtime": 387.1489, "eval_samples_per_second": 7.829, "eval_steps_per_second": 0.979, "step": 136 }, { "epoch": 0.07668796888024451, "grad_norm": 2.021454095840454, "learning_rate": 3.784717029321922e-05, "loss": 5.2784, "step": 138 }, { "epoch": 0.0783550986385107, "grad_norm": 1.5546106100082397, "learning_rate": 3.732519254757344e-05, "loss": 5.21, "step": 141 }, { "epoch": 0.08002222839677688, "grad_norm": 1.662156343460083, "learning_rate": 3.679601723656205e-05, "loss": 5.3778, "step": 144 }, { "epoch": 0.08168935815504307, "grad_norm": 1.6201971769332886, "learning_rate": 3.625995338366492e-05, "loss": 5.4251, "step": 147 }, { "epoch": 0.08335648791330925, "grad_norm": 1.6661242246627808, "learning_rate": 3.5717314035076355e-05, "loss": 5.292, "step": 150 }, { "epoch": 0.08502361767157544, "grad_norm": 1.595383644104004, "learning_rate": 3.516841607689501e-05, "loss": 5.128, "step": 153 }, { "epoch": 0.08669074742984162, "grad_norm": 1.7640243768692017, "learning_rate": 3.461358005007128e-05, "loss": 5.2638, "step": 156 }, { "epoch": 0.08835787718810781, "grad_norm": 2.1148338317871094, "learning_rate": 3.405312996322042e-05, "loss": 5.1089, "step": 159 }, { "epoch": 0.090025006946374, "grad_norm": 1.9622715711593628, "learning_rate": 3.348739310341068e-05, "loss": 4.7301, "step": 162 }, { "epoch": 0.09169213670464017, "grad_norm": 1.920733094215393, "learning_rate": 3.2916699845036816e-05, "loss": 5.0066, "step": 165 }, { "epoch": 0.09335926646290636, "grad_norm": 1.6611617803573608, "learning_rate": 3.234138345689077e-05, "loss": 5.198, "step": 168 }, { "epoch": 0.09447068630175048, "eval_loss": 1.3297733068466187, "eval_runtime": 387.3661, "eval_samples_per_second": 7.825, "eval_steps_per_second": 0.978, "step": 170 }, { "epoch": 0.09502639622117255, "grad_norm": 1.5192731618881226, "learning_rate": 3.17617799075421e-05, "loss": 4.9727, "step": 171 }, { "epoch": 0.09669352597943873, "grad_norm": 2.1325037479400635, "learning_rate": 3.1178227669141744e-05, "loss": 5.287, "step": 174 }, { "epoch": 0.09836065573770492, "grad_norm": 1.6394548416137695, "learning_rate": 3.0591067519763895e-05, "loss": 5.0878, "step": 177 }, { "epoch": 0.1000277854959711, "grad_norm": 1.954785704612732, "learning_rate": 3.0000642344401113e-05, "loss": 5.7474, "step": 180 }, { "epoch": 0.1016949152542373, "grad_norm": 1.7333064079284668, "learning_rate": 2.9407296934729227e-05, "loss": 5.2069, "step": 183 }, { "epoch": 0.10336204501250347, "grad_norm": 1.7775465250015259, "learning_rate": 2.8811377787758636e-05, "loss": 4.8365, "step": 186 }, { "epoch": 0.10502917477076966, "grad_norm": 1.766340970993042, "learning_rate": 2.8213232903489865e-05, "loss": 4.8806, "step": 189 }, { "epoch": 0.10669630452903585, "grad_norm": 2.064275026321411, "learning_rate": 2.761321158169134e-05, "loss": 5.1876, "step": 192 }, { "epoch": 0.10836343428730202, "grad_norm": 1.731985330581665, "learning_rate": 2.7011664217918154e-05, "loss": 4.6924, "step": 195 }, { "epoch": 0.11003056404556821, "grad_norm": 1.8852187395095825, "learning_rate": 2.6408942098890936e-05, "loss": 5.0911, "step": 198 }, { "epoch": 0.1116976938038344, "grad_norm": 1.8446505069732666, "learning_rate": 2.580539719735433e-05, "loss": 5.0379, "step": 201 }, { "epoch": 0.11336482356210058, "grad_norm": 1.863871455192566, "learning_rate": 2.5201381966534748e-05, "loss": 5.3173, "step": 204 }, { "epoch": 0.11336482356210058, "eval_loss": 1.3148518800735474, "eval_runtime": 387.4809, "eval_samples_per_second": 7.822, "eval_steps_per_second": 0.978, "step": 204 }, { "epoch": 0.11503195332036677, "grad_norm": 2.1243629455566406, "learning_rate": 2.459724913431772e-05, "loss": 5.1268, "step": 207 }, { "epoch": 0.11669908307863296, "grad_norm": 1.8287479877471924, "learning_rate": 2.399335149726463e-05, "loss": 4.8911, "step": 210 }, { "epoch": 0.11836621283689913, "grad_norm": 1.827951431274414, "learning_rate": 2.3390041714589514e-05, "loss": 5.0788, "step": 213 }, { "epoch": 0.12003334259516532, "grad_norm": 1.9181324243545532, "learning_rate": 2.2787672102216042e-05, "loss": 5.2716, "step": 216 }, { "epoch": 0.12170047235343151, "grad_norm": 2.334996461868286, "learning_rate": 2.2186594427034864e-05, "loss": 5.4529, "step": 219 }, { "epoch": 0.1233676021116977, "grad_norm": 2.042280673980713, "learning_rate": 2.1587159701481716e-05, "loss": 4.8902, "step": 222 }, { "epoch": 0.1250347318699639, "grad_norm": 1.9080718755722046, "learning_rate": 2.098971797855599e-05, "loss": 4.933, "step": 225 }, { "epoch": 0.12670186162823005, "grad_norm": 1.7101670503616333, "learning_rate": 2.0394618147399713e-05, "loss": 5.0186, "step": 228 }, { "epoch": 0.12836899138649624, "grad_norm": 2.011359453201294, "learning_rate": 1.980220772955602e-05, "loss": 4.7936, "step": 231 }, { "epoch": 0.13003612114476243, "grad_norm": 2.302273750305176, "learning_rate": 1.921283267602643e-05, "loss": 5.1487, "step": 234 }, { "epoch": 0.13170325090302862, "grad_norm": 2.4797189235687256, "learning_rate": 1.8626837165245165e-05, "loss": 5.2862, "step": 237 }, { "epoch": 0.13225896082245067, "eval_loss": 1.303543210029602, "eval_runtime": 387.3037, "eval_samples_per_second": 7.826, "eval_steps_per_second": 0.979, "step": 238 }, { "epoch": 0.1333703806612948, "grad_norm": 1.8348701000213623, "learning_rate": 1.8044563402088684e-05, "loss": 5.0605, "step": 240 }, { "epoch": 0.135037510419561, "grad_norm": 2.109149217605591, "learning_rate": 1.746635141803761e-05, "loss": 4.9242, "step": 243 }, { "epoch": 0.13670464017782719, "grad_norm": 2.1694352626800537, "learning_rate": 1.6892538872607937e-05, "loss": 5.0852, "step": 246 }, { "epoch": 0.13837176993609335, "grad_norm": 2.145925998687744, "learning_rate": 1.6323460856167426e-05, "loss": 4.9473, "step": 249 }, { "epoch": 0.14003889969435954, "grad_norm": 2.4143218994140625, "learning_rate": 1.5759449694252226e-05, "loss": 5.2909, "step": 252 }, { "epoch": 0.14170602945262573, "grad_norm": 2.154897689819336, "learning_rate": 1.5200834753498128e-05, "loss": 5.2477, "step": 255 }, { "epoch": 0.14337315921089192, "grad_norm": 2.189666986465454, "learning_rate": 1.4647942249299707e-05, "loss": 5.4482, "step": 258 }, { "epoch": 0.1450402889691581, "grad_norm": 1.9108495712280273, "learning_rate": 1.4101095055309746e-05, "loss": 5.1046, "step": 261 }, { "epoch": 0.1467074187274243, "grad_norm": 1.8444137573242188, "learning_rate": 1.356061251489012e-05, "loss": 5.0423, "step": 264 }, { "epoch": 0.14837454848569048, "grad_norm": 2.031024694442749, "learning_rate": 1.302681025462424e-05, "loss": 4.5857, "step": 267 }, { "epoch": 0.15004167824395664, "grad_norm": 2.1987197399139404, "learning_rate": 1.2500000000000006e-05, "loss": 5.381, "step": 270 }, { "epoch": 0.15115309808280078, "eval_loss": 1.2963863611221313, "eval_runtime": 387.5334, "eval_samples_per_second": 7.821, "eval_steps_per_second": 0.978, "step": 272 }, { "epoch": 0.15170880800222283, "grad_norm": 1.8340719938278198, "learning_rate": 1.1980489393370938e-05, "loss": 5.4333, "step": 273 }, { "epoch": 0.15337593776048902, "grad_norm": 2.617314577102661, "learning_rate": 1.1468581814301717e-05, "loss": 5.757, "step": 276 }, { "epoch": 0.1550430675187552, "grad_norm": 2.0526530742645264, "learning_rate": 1.096457620240298e-05, "loss": 4.9528, "step": 279 }, { "epoch": 0.1567101972770214, "grad_norm": 2.3239846229553223, "learning_rate": 1.0468766882759094e-05, "loss": 5.2481, "step": 282 }, { "epoch": 0.1583773270352876, "grad_norm": 2.1533966064453125, "learning_rate": 9.981443394050525e-06, "loss": 5.6509, "step": 285 }, { "epoch": 0.16004445679355375, "grad_norm": 1.9592647552490234, "learning_rate": 9.502890319471491e-06, "loss": 4.9243, "step": 288 }, { "epoch": 0.16171158655181994, "grad_norm": 2.204939126968384, "learning_rate": 9.033387120541306e-06, "loss": 5.2745, "step": 291 }, { "epoch": 0.16337871631008613, "grad_norm": 2.236279010772705, "learning_rate": 8.573207973906735e-06, "loss": 5.4374, "step": 294 }, { "epoch": 0.16504584606835232, "grad_norm": 2.4140145778656006, "learning_rate": 8.1226216112306e-06, "loss": 5.4428, "step": 297 }, { "epoch": 0.1667129758266185, "grad_norm": 2.0701277256011963, "learning_rate": 7.681891162260015e-06, "loss": 5.4502, "step": 300 }, { "epoch": 0.1683801055848847, "grad_norm": 2.154461622238159, "learning_rate": 7.251274001166044e-06, "loss": 5.0715, "step": 303 }, { "epoch": 0.1700472353431509, "grad_norm": 2.3085010051727295, "learning_rate": 6.831021596244424e-06, "loss": 4.8451, "step": 306 }, { "epoch": 0.1700472353431509, "eval_loss": 1.2917685508728027, "eval_runtime": 387.2177, "eval_samples_per_second": 7.828, "eval_steps_per_second": 0.979, "step": 306 }, { "epoch": 0.17171436510141705, "grad_norm": 2.220491886138916, "learning_rate": 6.421379363065142e-06, "loss": 5.3908, "step": 309 }, { "epoch": 0.17338149485968324, "grad_norm": 2.2047910690307617, "learning_rate": 6.022586521156715e-06, "loss": 5.2721, "step": 312 }, { "epoch": 0.17504862461794943, "grad_norm": 2.1623401641845703, "learning_rate": 5.634875954308638e-06, "loss": 5.5073, "step": 315 }, { "epoch": 0.17671575437621562, "grad_norm": 1.9954192638397217, "learning_rate": 5.258474074573877e-06, "loss": 5.1791, "step": 318 }, { "epoch": 0.1783828841344818, "grad_norm": 2.24808669090271, "learning_rate": 4.893600690050579e-06, "loss": 5.0704, "step": 321 }, { "epoch": 0.180050013892748, "grad_norm": 2.2592039108276367, "learning_rate": 4.540468876520323e-06, "loss": 5.0177, "step": 324 }, { "epoch": 0.18171714365101416, "grad_norm": 1.9192252159118652, "learning_rate": 4.199284853017896e-06, "loss": 5.2738, "step": 327 }, { "epoch": 0.18338427340928035, "grad_norm": 2.021440267562866, "learning_rate": 3.8702478614051355e-06, "loss": 4.6439, "step": 330 }, { "epoch": 0.18505140316754654, "grad_norm": 2.309406042098999, "learning_rate": 3.5535500500193357e-06, "loss": 5.4409, "step": 333 }, { "epoch": 0.18671853292581272, "grad_norm": 2.2390878200531006, "learning_rate": 3.249376361464021e-06, "loss": 5.1074, "step": 336 }, { "epoch": 0.18838566268407891, "grad_norm": 2.540015697479248, "learning_rate": 2.957904424607652e-06, "loss": 5.2675, "step": 339 }, { "epoch": 0.18894137260350097, "eval_loss": 1.2895426750183105, "eval_runtime": 387.2989, "eval_samples_per_second": 7.826, "eval_steps_per_second": 0.979, "step": 340 }, { "epoch": 0.1900527924423451, "grad_norm": 2.492077112197876, "learning_rate": 2.679304450853401e-06, "loss": 5.211, "step": 342 }, { "epoch": 0.1917199222006113, "grad_norm": 1.7696568965911865, "learning_rate": 2.4137391347404476e-06, "loss": 5.3133, "step": 345 }, { "epoch": 0.19338705195887745, "grad_norm": 1.9813653230667114, "learning_rate": 2.1613635589349756e-06, "loss": 5.0046, "step": 348 }, { "epoch": 0.19505418171714364, "grad_norm": 2.0188918113708496, "learning_rate": 1.922325103666281e-06, "loss": 5.1457, "step": 351 }, { "epoch": 0.19672131147540983, "grad_norm": 2.231536865234375, "learning_rate": 1.696763360660808e-06, "loss": 4.8438, "step": 354 }, { "epoch": 0.19838844123367602, "grad_norm": 2.102440357208252, "learning_rate": 1.4848100516245717e-06, "loss": 5.4079, "step": 357 }, { "epoch": 0.2000555709919422, "grad_norm": 2.369569778442383, "learning_rate": 1.286588951321363e-06, "loss": 5.661, "step": 360 }, { "epoch": 0.2017227007502084, "grad_norm": 1.8659355640411377, "learning_rate": 1.102215815291774e-06, "loss": 4.9097, "step": 363 }, { "epoch": 0.2033898305084746, "grad_norm": 2.071213722229004, "learning_rate": 9.317983122552332e-07, "loss": 5.0217, "step": 366 }, { "epoch": 0.20505696026674075, "grad_norm": 2.435093402862549, "learning_rate": 7.754359612344859e-07, "loss": 5.44, "step": 369 }, { "epoch": 0.20672409002500694, "grad_norm": 2.3098435401916504, "learning_rate": 6.332200734393057e-07, "loss": 5.2669, "step": 372 }, { "epoch": 0.20783550986385108, "eval_loss": 1.2886923551559448, "eval_runtime": 387.521, "eval_samples_per_second": 7.822, "eval_steps_per_second": 0.978, "step": 374 }, { "epoch": 0.20839121978327313, "grad_norm": 2.0424201488494873, "learning_rate": 5.052336989433082e-07, "loss": 4.992, "step": 375 }, { "epoch": 0.21005834954153932, "grad_norm": 1.7625197172164917, "learning_rate": 3.915515781850565e-07, "loss": 5.1093, "step": 378 }, { "epoch": 0.2117254792998055, "grad_norm": 2.2345762252807617, "learning_rate": 2.922400983217416e-07, "loss": 5.2131, "step": 381 }, { "epoch": 0.2133926090580717, "grad_norm": 2.1834142208099365, "learning_rate": 2.0735725446094923e-07, "loss": 5.2563, "step": 384 }, { "epoch": 0.21505973881633786, "grad_norm": 2.676879405975342, "learning_rate": 1.3695261579316777e-07, "loss": 5.3258, "step": 387 }, { "epoch": 0.21672686857460405, "grad_norm": 2.2723827362060547, "learning_rate": 8.106729664475176e-08, "loss": 4.9767, "step": 390 }, { "epoch": 0.21839399833287024, "grad_norm": 2.160226345062256, "learning_rate": 3.9733932468333234e-08, "loss": 4.8822, "step": 393 }, { "epoch": 0.22006112809113643, "grad_norm": 2.3494038581848145, "learning_rate": 1.297666078462767e-08, "loss": 5.1916, "step": 396 }, { "epoch": 0.22172825784940262, "grad_norm": 2.137200117111206, "learning_rate": 8.111070868010995e-10, "loss": 5.5166, "step": 399 } ], "logging_steps": 3, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 34, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.95313700272128e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }