{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 3125, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00032, |
|
"grad_norm": 20.05617380625413, |
|
"learning_rate": 6.389776357827476e-08, |
|
"loss": 1.817, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0016, |
|
"grad_norm": 24.170327240578573, |
|
"learning_rate": 3.194888178913738e-07, |
|
"loss": 1.8628, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 17.63201705481064, |
|
"learning_rate": 6.389776357827476e-07, |
|
"loss": 1.7731, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0048, |
|
"grad_norm": 12.659034102299168, |
|
"learning_rate": 9.584664536741215e-07, |
|
"loss": 1.6969, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 8.71204128591294, |
|
"learning_rate": 1.2779552715654952e-06, |
|
"loss": 1.5956, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 9.482900190556371, |
|
"learning_rate": 1.5974440894568691e-06, |
|
"loss": 1.5189, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 9.082892230070819, |
|
"learning_rate": 1.916932907348243e-06, |
|
"loss": 1.5307, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0112, |
|
"grad_norm": 8.71715912913066, |
|
"learning_rate": 2.2364217252396165e-06, |
|
"loss": 1.4863, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 8.266982509072658, |
|
"learning_rate": 2.5559105431309904e-06, |
|
"loss": 1.4411, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0144, |
|
"grad_norm": 7.43964847984838, |
|
"learning_rate": 2.8753993610223648e-06, |
|
"loss": 1.4772, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 8.285226877478248, |
|
"learning_rate": 3.1948881789137383e-06, |
|
"loss": 1.4813, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0176, |
|
"grad_norm": 8.144708422975583, |
|
"learning_rate": 3.514376996805112e-06, |
|
"loss": 1.4048, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 7.597036261894608, |
|
"learning_rate": 3.833865814696486e-06, |
|
"loss": 1.3852, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0208, |
|
"grad_norm": 7.424686782409782, |
|
"learning_rate": 4.15335463258786e-06, |
|
"loss": 1.4124, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 7.01187059061448, |
|
"learning_rate": 4.472843450479233e-06, |
|
"loss": 1.3525, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 8.203285756665721, |
|
"learning_rate": 4.792332268370608e-06, |
|
"loss": 1.4641, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 8.91120559492055, |
|
"learning_rate": 5.111821086261981e-06, |
|
"loss": 1.4319, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0272, |
|
"grad_norm": 7.95728896500941, |
|
"learning_rate": 5.431309904153355e-06, |
|
"loss": 1.4198, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 8.93733979889198, |
|
"learning_rate": 5.7507987220447296e-06, |
|
"loss": 1.4431, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0304, |
|
"grad_norm": 7.821680056160703, |
|
"learning_rate": 6.070287539936103e-06, |
|
"loss": 1.45, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 8.599771427970811, |
|
"learning_rate": 6.3897763578274765e-06, |
|
"loss": 1.4271, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0336, |
|
"grad_norm": 8.069041724261744, |
|
"learning_rate": 6.709265175718851e-06, |
|
"loss": 1.4014, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 8.444189736987067, |
|
"learning_rate": 7.028753993610224e-06, |
|
"loss": 1.4432, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0368, |
|
"grad_norm": 8.440082783021184, |
|
"learning_rate": 7.348242811501598e-06, |
|
"loss": 1.4234, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 9.19834025523869, |
|
"learning_rate": 7.667731629392972e-06, |
|
"loss": 1.4825, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 9.44008152483298, |
|
"learning_rate": 7.987220447284347e-06, |
|
"loss": 1.4536, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 9.137864717981165, |
|
"learning_rate": 8.30670926517572e-06, |
|
"loss": 1.4832, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0432, |
|
"grad_norm": 7.259296421594005, |
|
"learning_rate": 8.626198083067093e-06, |
|
"loss": 1.4355, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 9.129262436644341, |
|
"learning_rate": 8.945686900958466e-06, |
|
"loss": 1.48, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0464, |
|
"grad_norm": 7.342624679137957, |
|
"learning_rate": 9.265175718849841e-06, |
|
"loss": 1.4388, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 9.409291604105896, |
|
"learning_rate": 9.584664536741216e-06, |
|
"loss": 1.506, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0496, |
|
"grad_norm": 7.195158336471212, |
|
"learning_rate": 9.904153354632589e-06, |
|
"loss": 1.4852, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 7.894560573501637, |
|
"learning_rate": 1.0223642172523962e-05, |
|
"loss": 1.4228, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0528, |
|
"grad_norm": 9.3990419995412, |
|
"learning_rate": 1.0543130990415335e-05, |
|
"loss": 1.4624, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 6.764474220048631, |
|
"learning_rate": 1.086261980830671e-05, |
|
"loss": 1.529, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 8.762621410789437, |
|
"learning_rate": 1.1182108626198084e-05, |
|
"loss": 1.4889, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 9.026712826027282, |
|
"learning_rate": 1.1501597444089459e-05, |
|
"loss": 1.4889, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0592, |
|
"grad_norm": 10.327681032497772, |
|
"learning_rate": 1.1821086261980832e-05, |
|
"loss": 1.4632, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 8.916460811078778, |
|
"learning_rate": 1.2140575079872205e-05, |
|
"loss": 1.4957, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0624, |
|
"grad_norm": 9.570316165134905, |
|
"learning_rate": 1.2460063897763578e-05, |
|
"loss": 1.4974, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 7.931027623078715, |
|
"learning_rate": 1.2779552715654953e-05, |
|
"loss": 1.4986, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0656, |
|
"grad_norm": 9.001437919455842, |
|
"learning_rate": 1.3099041533546326e-05, |
|
"loss": 1.4583, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 8.357858455204125, |
|
"learning_rate": 1.3418530351437703e-05, |
|
"loss": 1.5086, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0688, |
|
"grad_norm": 7.4570481772053325, |
|
"learning_rate": 1.3738019169329076e-05, |
|
"loss": 1.5331, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 9.286290584701554, |
|
"learning_rate": 1.4057507987220449e-05, |
|
"loss": 1.5065, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 7.764076796357793, |
|
"learning_rate": 1.4376996805111822e-05, |
|
"loss": 1.489, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 10.09652114646815, |
|
"learning_rate": 1.4696485623003197e-05, |
|
"loss": 1.5768, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0752, |
|
"grad_norm": 8.809821488359987, |
|
"learning_rate": 1.501597444089457e-05, |
|
"loss": 1.5554, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 8.262499421512615, |
|
"learning_rate": 1.5335463258785944e-05, |
|
"loss": 1.5407, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0784, |
|
"grad_norm": 8.979598030923686, |
|
"learning_rate": 1.5654952076677316e-05, |
|
"loss": 1.5761, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.996608590588298, |
|
"learning_rate": 1.5974440894568694e-05, |
|
"loss": 1.5723, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0816, |
|
"grad_norm": 8.28398837332719, |
|
"learning_rate": 1.6293929712460065e-05, |
|
"loss": 1.511, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 10.050176837076577, |
|
"learning_rate": 1.661341853035144e-05, |
|
"loss": 1.62, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0848, |
|
"grad_norm": 8.625153292163999, |
|
"learning_rate": 1.693290734824281e-05, |
|
"loss": 1.5696, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 7.036269960407658, |
|
"learning_rate": 1.7252396166134186e-05, |
|
"loss": 1.6274, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 9.06080432840939, |
|
"learning_rate": 1.757188498402556e-05, |
|
"loss": 1.5238, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 7.954340307347254, |
|
"learning_rate": 1.7891373801916932e-05, |
|
"loss": 1.6112, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0912, |
|
"grad_norm": 8.326902382980357, |
|
"learning_rate": 1.8210862619808307e-05, |
|
"loss": 1.5639, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 8.465352754166233, |
|
"learning_rate": 1.8530351437699682e-05, |
|
"loss": 1.6031, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0944, |
|
"grad_norm": 7.805713490581876, |
|
"learning_rate": 1.8849840255591057e-05, |
|
"loss": 1.5684, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 7.8032797694553, |
|
"learning_rate": 1.916932907348243e-05, |
|
"loss": 1.5322, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0976, |
|
"grad_norm": 7.502438683575227, |
|
"learning_rate": 1.9488817891373803e-05, |
|
"loss": 1.578, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 7.42748507278892, |
|
"learning_rate": 1.9808306709265177e-05, |
|
"loss": 1.6214, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1008, |
|
"grad_norm": 8.776137139385414, |
|
"learning_rate": 1.9999975036876365e-05, |
|
"loss": 1.5879, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 7.577899989222258, |
|
"learning_rate": 1.9999694203166786e-05, |
|
"loss": 1.5625, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 9.304934816231782, |
|
"learning_rate": 1.999910134063538e-05, |
|
"loss": 1.5502, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 8.943302127293093, |
|
"learning_rate": 1.9998196467781738e-05, |
|
"loss": 1.6447, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1072, |
|
"grad_norm": 10.263653381131993, |
|
"learning_rate": 1.999697961284136e-05, |
|
"loss": 1.6365, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 7.259185945143334, |
|
"learning_rate": 1.9995450813784785e-05, |
|
"loss": 1.6016, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1104, |
|
"grad_norm": 9.483550631073872, |
|
"learning_rate": 1.9993610118316417e-05, |
|
"loss": 1.6867, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 9.069753200477997, |
|
"learning_rate": 1.999145758387301e-05, |
|
"loss": 1.6832, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1136, |
|
"grad_norm": 9.791327883527874, |
|
"learning_rate": 1.99889932776219e-05, |
|
"loss": 1.6604, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 10.288995257955278, |
|
"learning_rate": 1.9986217276458898e-05, |
|
"loss": 1.5407, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1168, |
|
"grad_norm": 7.343516596486213, |
|
"learning_rate": 1.9983129667005887e-05, |
|
"loss": 1.704, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 8.414567171107853, |
|
"learning_rate": 1.9979730545608128e-05, |
|
"loss": 1.6423, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 31.248624844046244, |
|
"learning_rate": 1.9976020018331244e-05, |
|
"loss": 1.6615, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 7.805253897850886, |
|
"learning_rate": 1.997199820095793e-05, |
|
"loss": 1.7482, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1232, |
|
"grad_norm": 12.941755164396426, |
|
"learning_rate": 1.9967665218984308e-05, |
|
"loss": 1.6302, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 10.256908147942266, |
|
"learning_rate": 1.996302120761605e-05, |
|
"loss": 1.6527, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.1264, |
|
"grad_norm": 10.323844537641683, |
|
"learning_rate": 1.9958066311764115e-05, |
|
"loss": 1.6225, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 8.17544314720727, |
|
"learning_rate": 1.9952800686040268e-05, |
|
"loss": 1.6112, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1296, |
|
"grad_norm": 9.155308331409806, |
|
"learning_rate": 1.9947224494752236e-05, |
|
"loss": 1.645, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 8.093102355577809, |
|
"learning_rate": 1.994133791189857e-05, |
|
"loss": 1.6012, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.1328, |
|
"grad_norm": 8.539319454896912, |
|
"learning_rate": 1.993514112116325e-05, |
|
"loss": 1.6163, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 9.376291632603513, |
|
"learning_rate": 1.992863431590991e-05, |
|
"loss": 1.7232, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 8.427174660902148, |
|
"learning_rate": 1.9921817699175844e-05, |
|
"loss": 1.6041, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 11.225741174921657, |
|
"learning_rate": 1.991469148366564e-05, |
|
"loss": 1.5705, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1392, |
|
"grad_norm": 7.973508937531369, |
|
"learning_rate": 1.9907255891744562e-05, |
|
"loss": 1.6436, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 9.589627221263973, |
|
"learning_rate": 1.989951115543161e-05, |
|
"loss": 1.6753, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.1424, |
|
"grad_norm": 6.8530374739159345, |
|
"learning_rate": 1.9891457516392257e-05, |
|
"loss": 1.6441, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 8.969880080037996, |
|
"learning_rate": 1.988309522593095e-05, |
|
"loss": 1.632, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1456, |
|
"grad_norm": 8.147836497286043, |
|
"learning_rate": 1.9874424544983224e-05, |
|
"loss": 1.7166, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 10.804136842902594, |
|
"learning_rate": 1.9865445744107593e-05, |
|
"loss": 1.6071, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1488, |
|
"grad_norm": 7.770558071875093, |
|
"learning_rate": 1.9856159103477085e-05, |
|
"loss": 1.7466, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 11.053317637390126, |
|
"learning_rate": 1.9846564912870523e-05, |
|
"loss": 1.6221, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 8.025748690782159, |
|
"learning_rate": 1.9836663471663454e-05, |
|
"loss": 1.7206, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 8.94152560502608, |
|
"learning_rate": 1.9826455088818832e-05, |
|
"loss": 1.6794, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1552, |
|
"grad_norm": 8.723262455381919, |
|
"learning_rate": 1.9815940082877367e-05, |
|
"loss": 1.6909, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 11.572838719884283, |
|
"learning_rate": 1.980511878194758e-05, |
|
"loss": 1.7067, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.1584, |
|
"grad_norm": 7.98682940495796, |
|
"learning_rate": 1.9793991523695578e-05, |
|
"loss": 1.7714, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.447864398870841, |
|
"learning_rate": 1.9782558655334505e-05, |
|
"loss": 1.632, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1616, |
|
"grad_norm": 8.942126665322432, |
|
"learning_rate": 1.9770820533613716e-05, |
|
"loss": 1.6463, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 8.629146616823142, |
|
"learning_rate": 1.9758777524807636e-05, |
|
"loss": 1.6367, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1648, |
|
"grad_norm": 7.603172313414887, |
|
"learning_rate": 1.9746430004704353e-05, |
|
"loss": 1.6558, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 11.152682711750428, |
|
"learning_rate": 1.9733778358593852e-05, |
|
"loss": 1.6537, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 9.887507696721732, |
|
"learning_rate": 1.9720822981256034e-05, |
|
"loss": 1.6715, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 9.158423868070782, |
|
"learning_rate": 1.970756427694837e-05, |
|
"loss": 1.6244, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1712, |
|
"grad_norm": 8.442009066532837, |
|
"learning_rate": 1.9694002659393306e-05, |
|
"loss": 1.7073, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 7.50714623452273, |
|
"learning_rate": 1.9680138551765335e-05, |
|
"loss": 1.6241, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.1744, |
|
"grad_norm": 8.325913790000977, |
|
"learning_rate": 1.9665972386677796e-05, |
|
"loss": 1.5779, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 13.09863126345551, |
|
"learning_rate": 1.9651504606169395e-05, |
|
"loss": 1.6549, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1776, |
|
"grad_norm": 7.5020724745885925, |
|
"learning_rate": 1.9636735661690385e-05, |
|
"loss": 1.683, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 8.857991440508771, |
|
"learning_rate": 1.9621666014088495e-05, |
|
"loss": 1.5727, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1808, |
|
"grad_norm": 10.518277008681551, |
|
"learning_rate": 1.960629613359454e-05, |
|
"loss": 1.5516, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 9.763384545927902, |
|
"learning_rate": 1.959062649980776e-05, |
|
"loss": 1.7654, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 14.784685999297329, |
|
"learning_rate": 1.957465760168084e-05, |
|
"loss": 1.6173, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 9.47338828693639, |
|
"learning_rate": 1.9558389937504664e-05, |
|
"loss": 1.6706, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1872, |
|
"grad_norm": 8.255320111181135, |
|
"learning_rate": 1.954182401489277e-05, |
|
"loss": 1.6829, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 9.748611896295628, |
|
"learning_rate": 1.952496035076549e-05, |
|
"loss": 1.6963, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.1904, |
|
"grad_norm": 10.593417080535822, |
|
"learning_rate": 1.9507799471333842e-05, |
|
"loss": 1.5429, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 8.495208947428331, |
|
"learning_rate": 1.9490341912083103e-05, |
|
"loss": 1.5996, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1936, |
|
"grad_norm": 8.116738913737667, |
|
"learning_rate": 1.947258821775609e-05, |
|
"loss": 1.6509, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 8.843543056646068, |
|
"learning_rate": 1.945453894233618e-05, |
|
"loss": 1.6621, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1968, |
|
"grad_norm": 8.429840678269503, |
|
"learning_rate": 1.9436194649030006e-05, |
|
"loss": 1.6238, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 7.060618638753134, |
|
"learning_rate": 1.9417555910249905e-05, |
|
"loss": 1.5455, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8.735661880336002, |
|
"learning_rate": 1.939862330759602e-05, |
|
"loss": 1.7242, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 7.728994210262772, |
|
"learning_rate": 1.9379397431838194e-05, |
|
"loss": 1.6244, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.2032, |
|
"grad_norm": 8.142561603441006, |
|
"learning_rate": 1.935987888289751e-05, |
|
"loss": 1.6033, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 7.087122461619784, |
|
"learning_rate": 1.9340068269827567e-05, |
|
"loss": 1.6478, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2064, |
|
"grad_norm": 8.796044551707043, |
|
"learning_rate": 1.93199662107955e-05, |
|
"loss": 1.622, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 7.637980213604651, |
|
"learning_rate": 1.929957333306267e-05, |
|
"loss": 1.6378, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.2096, |
|
"grad_norm": 8.176443820248227, |
|
"learning_rate": 1.9278890272965097e-05, |
|
"loss": 1.603, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 8.821331818781406, |
|
"learning_rate": 1.92579176758936e-05, |
|
"loss": 1.6285, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.2128, |
|
"grad_norm": 8.751441102706378, |
|
"learning_rate": 1.9236656196273676e-05, |
|
"loss": 1.6783, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.2144, |
|
"grad_norm": 7.729648207326943, |
|
"learning_rate": 1.9215106497545047e-05, |
|
"loss": 1.7185, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 9.632019880573054, |
|
"learning_rate": 1.919326925214099e-05, |
|
"loss": 1.7044, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 7.864267682010497, |
|
"learning_rate": 1.9171145141467336e-05, |
|
"loss": 1.6628, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.2192, |
|
"grad_norm": 7.961635669857337, |
|
"learning_rate": 1.9148734855881218e-05, |
|
"loss": 1.6602, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.2208, |
|
"grad_norm": 10.292993487639919, |
|
"learning_rate": 1.912603909466952e-05, |
|
"loss": 1.5731, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.2224, |
|
"grad_norm": 8.162330383972137, |
|
"learning_rate": 1.9103058566027062e-05, |
|
"loss": 1.6291, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 7.554506895298207, |
|
"learning_rate": 1.9079793987034497e-05, |
|
"loss": 1.6665, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2256, |
|
"grad_norm": 8.135535319158393, |
|
"learning_rate": 1.9056246083635943e-05, |
|
"loss": 1.6404, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.2272, |
|
"grad_norm": 9.74804156354211, |
|
"learning_rate": 1.9032415590616323e-05, |
|
"loss": 1.5687, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.2288, |
|
"grad_norm": 8.104975985879758, |
|
"learning_rate": 1.9008303251578445e-05, |
|
"loss": 1.6728, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 8.720184023804999, |
|
"learning_rate": 1.898390981891979e-05, |
|
"loss": 1.634, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 9.210829756244634, |
|
"learning_rate": 1.895923605380904e-05, |
|
"loss": 1.6422, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.2336, |
|
"grad_norm": 6.632734132335917, |
|
"learning_rate": 1.8934282726162325e-05, |
|
"loss": 1.5709, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2352, |
|
"grad_norm": 7.931181213386623, |
|
"learning_rate": 1.8909050614619197e-05, |
|
"loss": 1.6244, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 8.05038767649013, |
|
"learning_rate": 1.8883540506518336e-05, |
|
"loss": 1.6756, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.2384, |
|
"grad_norm": 9.138124074487578, |
|
"learning_rate": 1.885775319787298e-05, |
|
"loss": 1.5782, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 9.052424132753602, |
|
"learning_rate": 1.8831689493346095e-05, |
|
"loss": 1.6486, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2416, |
|
"grad_norm": 12.11857562509831, |
|
"learning_rate": 1.880535020622525e-05, |
|
"loss": 1.5488, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 8.346673787548578, |
|
"learning_rate": 1.8778736158397244e-05, |
|
"loss": 1.5675, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.2448, |
|
"grad_norm": 7.922560429887788, |
|
"learning_rate": 1.8751848180322476e-05, |
|
"loss": 1.5814, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.2464, |
|
"grad_norm": 7.448862840223704, |
|
"learning_rate": 1.872468711100902e-05, |
|
"loss": 1.696, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 7.381712408385049, |
|
"learning_rate": 1.869725379798643e-05, |
|
"loss": 1.5801, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 7.520192699988718, |
|
"learning_rate": 1.866954909727932e-05, |
|
"loss": 1.5583, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2512, |
|
"grad_norm": 8.458007153827033, |
|
"learning_rate": 1.864157387338064e-05, |
|
"loss": 1.5893, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.2528, |
|
"grad_norm": 11.023935513739714, |
|
"learning_rate": 1.86133289992247e-05, |
|
"loss": 1.6194, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.2544, |
|
"grad_norm": 7.279897256975987, |
|
"learning_rate": 1.8584815356159932e-05, |
|
"loss": 1.6186, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 7.807795169814656, |
|
"learning_rate": 1.8556033833921386e-05, |
|
"loss": 1.7446, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2576, |
|
"grad_norm": 7.782885718886869, |
|
"learning_rate": 1.8526985330602973e-05, |
|
"loss": 1.6365, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.2592, |
|
"grad_norm": 7.881283617531603, |
|
"learning_rate": 1.8497670752629437e-05, |
|
"loss": 1.7161, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2608, |
|
"grad_norm": 7.32964730840398, |
|
"learning_rate": 1.8468091014728076e-05, |
|
"loss": 1.6361, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 7.4867507136615075, |
|
"learning_rate": 1.843824703990019e-05, |
|
"loss": 1.7187, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 7.878583656595656, |
|
"learning_rate": 1.840813975939229e-05, |
|
"loss": 1.6064, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.2656, |
|
"grad_norm": 7.641254628614174, |
|
"learning_rate": 1.8377770112667024e-05, |
|
"loss": 1.6189, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.2672, |
|
"grad_norm": 7.869146750924817, |
|
"learning_rate": 1.8347139047373885e-05, |
|
"loss": 1.6925, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 7.488977024706195, |
|
"learning_rate": 1.8316247519319625e-05, |
|
"loss": 1.6646, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.2704, |
|
"grad_norm": 7.519265775038011, |
|
"learning_rate": 1.8285096492438424e-05, |
|
"loss": 1.6267, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 9.052319258879985, |
|
"learning_rate": 1.825368693876183e-05, |
|
"loss": 1.6113, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2736, |
|
"grad_norm": 7.425912485447768, |
|
"learning_rate": 1.8222019838388422e-05, |
|
"loss": 1.6341, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 7.616489466474544, |
|
"learning_rate": 1.8190096179453213e-05, |
|
"loss": 1.6195, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2768, |
|
"grad_norm": 8.57212898020857, |
|
"learning_rate": 1.8157916958096837e-05, |
|
"loss": 1.5484, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.2784, |
|
"grad_norm": 8.058170481678996, |
|
"learning_rate": 1.8125483178434448e-05, |
|
"loss": 1.6392, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 7.759361204922903, |
|
"learning_rate": 1.8092795852524404e-05, |
|
"loss": 1.572, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 7.814935898355, |
|
"learning_rate": 1.8059856000336675e-05, |
|
"loss": 1.5755, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2832, |
|
"grad_norm": 8.862683471146346, |
|
"learning_rate": 1.8026664649721016e-05, |
|
"loss": 1.5711, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.2848, |
|
"grad_norm": 9.253730250566822, |
|
"learning_rate": 1.7993222836374904e-05, |
|
"loss": 1.6258, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.2864, |
|
"grad_norm": 8.877026192983683, |
|
"learning_rate": 1.795953160381121e-05, |
|
"loss": 1.6478, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 9.101400243172016, |
|
"learning_rate": 1.792559200332564e-05, |
|
"loss": 1.6171, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2896, |
|
"grad_norm": 7.136286110367266, |
|
"learning_rate": 1.789140509396394e-05, |
|
"loss": 1.5754, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.2912, |
|
"grad_norm": 7.912268476013011, |
|
"learning_rate": 1.7856971942488826e-05, |
|
"loss": 1.5975, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2928, |
|
"grad_norm": 7.20611974657277, |
|
"learning_rate": 1.7822293623346736e-05, |
|
"loss": 1.658, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 10.223106004235314, |
|
"learning_rate": 1.7787371218634263e-05, |
|
"loss": 1.6974, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 7.6171324527749, |
|
"learning_rate": 1.77522058180644e-05, |
|
"loss": 1.692, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.2976, |
|
"grad_norm": 9.169763628140181, |
|
"learning_rate": 1.7716798518932564e-05, |
|
"loss": 1.623, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.2992, |
|
"grad_norm": 7.322606018864152, |
|
"learning_rate": 1.7681150426082322e-05, |
|
"loss": 1.6117, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 7.081332396019932, |
|
"learning_rate": 1.7645262651870926e-05, |
|
"loss": 1.7026, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3024, |
|
"grad_norm": 7.141747828380935, |
|
"learning_rate": 1.7609136316134616e-05, |
|
"loss": 1.5313, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 8.73987298901696, |
|
"learning_rate": 1.7572772546153657e-05, |
|
"loss": 1.6529, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3056, |
|
"grad_norm": 8.397556088140782, |
|
"learning_rate": 1.7536172476617183e-05, |
|
"loss": 1.6707, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 7.447729626374141, |
|
"learning_rate": 1.749933724958777e-05, |
|
"loss": 1.6343, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3088, |
|
"grad_norm": 6.907736776247304, |
|
"learning_rate": 1.746226801446582e-05, |
|
"loss": 1.6028, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.3104, |
|
"grad_norm": 8.374764423916597, |
|
"learning_rate": 1.742496592795368e-05, |
|
"loss": 1.5854, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 6.656458694402831, |
|
"learning_rate": 1.738743215401955e-05, |
|
"loss": 1.5937, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 8.119235048754572, |
|
"learning_rate": 1.7349667863861175e-05, |
|
"loss": 1.6238, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.3152, |
|
"grad_norm": 10.758877616976827, |
|
"learning_rate": 1.7311674235869285e-05, |
|
"loss": 1.6329, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.3168, |
|
"grad_norm": 6.855294162792502, |
|
"learning_rate": 1.7273452455590835e-05, |
|
"loss": 1.5509, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3184, |
|
"grad_norm": 7.436276047950917, |
|
"learning_rate": 1.7235003715691996e-05, |
|
"loss": 1.6302, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 9.112788481739514, |
|
"learning_rate": 1.7196329215920963e-05, |
|
"loss": 1.5555, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3216, |
|
"grad_norm": 7.707360400350489, |
|
"learning_rate": 1.71574301630705e-05, |
|
"loss": 1.5711, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.3232, |
|
"grad_norm": 8.985392901674105, |
|
"learning_rate": 1.711830777094028e-05, |
|
"loss": 1.5719, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3248, |
|
"grad_norm": 7.854703662416667, |
|
"learning_rate": 1.707896326029903e-05, |
|
"loss": 1.687, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 7.991591880545251, |
|
"learning_rate": 1.7039397858846428e-05, |
|
"loss": 1.5265, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 8.38735368829394, |
|
"learning_rate": 1.6999612801174782e-05, |
|
"loss": 1.5071, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 7.088683318490433, |
|
"learning_rate": 1.6959609328730526e-05, |
|
"loss": 1.5594, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.3312, |
|
"grad_norm": 7.732979117471935, |
|
"learning_rate": 1.6919388689775463e-05, |
|
"loss": 1.5789, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 7.480645400348206, |
|
"learning_rate": 1.6878952139347834e-05, |
|
"loss": 1.5923, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.3344, |
|
"grad_norm": 7.858191371747195, |
|
"learning_rate": 1.6838300939223144e-05, |
|
"loss": 1.5487, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 8.85580923674649, |
|
"learning_rate": 1.679743635787479e-05, |
|
"loss": 1.5857, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3376, |
|
"grad_norm": 7.901857553601464, |
|
"learning_rate": 1.6756359670434478e-05, |
|
"loss": 1.593, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 9.026231263712882, |
|
"learning_rate": 1.6715072158652444e-05, |
|
"loss": 1.6723, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.3408, |
|
"grad_norm": 13.504122295950328, |
|
"learning_rate": 1.6673575110857457e-05, |
|
"loss": 1.6176, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 10.243586542935969, |
|
"learning_rate": 1.6631869821916602e-05, |
|
"loss": 1.5779, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 7.489622112226259, |
|
"learning_rate": 1.6589957593194887e-05, |
|
"loss": 1.6419, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 7.573818239927097, |
|
"learning_rate": 1.6547839732514646e-05, |
|
"loss": 1.5614, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3472, |
|
"grad_norm": 10.0936611445675, |
|
"learning_rate": 1.650551755411471e-05, |
|
"loss": 1.5626, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 7.874542575121299, |
|
"learning_rate": 1.646299237860941e-05, |
|
"loss": 1.6234, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.3504, |
|
"grad_norm": 8.356085115655546, |
|
"learning_rate": 1.6420265532947364e-05, |
|
"loss": 1.4703, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 9.024857376590765, |
|
"learning_rate": 1.6377338350370077e-05, |
|
"loss": 1.5622, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3536, |
|
"grad_norm": 7.46614009546006, |
|
"learning_rate": 1.6334212170370323e-05, |
|
"loss": 1.6042, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.3552, |
|
"grad_norm": 8.565479511807924, |
|
"learning_rate": 1.6290888338650373e-05, |
|
"loss": 1.4699, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3568, |
|
"grad_norm": 7.2168157043192025, |
|
"learning_rate": 1.624736820707998e-05, |
|
"loss": 1.5334, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 6.584169033723296, |
|
"learning_rate": 1.6203653133654213e-05, |
|
"loss": 1.6832, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.225227952156274, |
|
"learning_rate": 1.615974448245107e-05, |
|
"loss": 1.5323, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.3616, |
|
"grad_norm": 10.601001964167809, |
|
"learning_rate": 1.6115643623588915e-05, |
|
"loss": 1.6198, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3632, |
|
"grad_norm": 10.795794042349174, |
|
"learning_rate": 1.6071351933183736e-05, |
|
"loss": 1.5853, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 7.346429916746835, |
|
"learning_rate": 1.602687079330619e-05, |
|
"loss": 1.5795, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.3664, |
|
"grad_norm": 6.659543911557079, |
|
"learning_rate": 1.5982201591938496e-05, |
|
"loss": 1.5804, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 7.221758133107811, |
|
"learning_rate": 1.5937345722931098e-05, |
|
"loss": 1.6121, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3696, |
|
"grad_norm": 9.731811312205782, |
|
"learning_rate": 1.5892304585959193e-05, |
|
"loss": 1.6606, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 8.589776786846603, |
|
"learning_rate": 1.5847079586479052e-05, |
|
"loss": 1.6185, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3728, |
|
"grad_norm": 7.386876191799319, |
|
"learning_rate": 1.580167213568416e-05, |
|
"loss": 1.556, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.3744, |
|
"grad_norm": 8.140967833939259, |
|
"learning_rate": 1.575608365046118e-05, |
|
"loss": 1.5469, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 12.874290112872854, |
|
"learning_rate": 1.571031555334575e-05, |
|
"loss": 1.5306, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 7.6631355790039395, |
|
"learning_rate": 1.566436927247808e-05, |
|
"loss": 1.5181, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3792, |
|
"grad_norm": 6.840987993796332, |
|
"learning_rate": 1.5618246241558402e-05, |
|
"loss": 1.6786, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.3808, |
|
"grad_norm": 10.867647495390518, |
|
"learning_rate": 1.5571947899802227e-05, |
|
"loss": 1.6774, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.3824, |
|
"grad_norm": 7.370178174067006, |
|
"learning_rate": 1.5525475691895438e-05, |
|
"loss": 1.5014, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 7.247801379833239, |
|
"learning_rate": 1.5478831067949203e-05, |
|
"loss": 1.5683, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3856, |
|
"grad_norm": 7.548129806153257, |
|
"learning_rate": 1.5432015483454736e-05, |
|
"loss": 1.5433, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.3872, |
|
"grad_norm": 7.403487909131467, |
|
"learning_rate": 1.5385030399237878e-05, |
|
"loss": 1.617, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3888, |
|
"grad_norm": 7.085555289389217, |
|
"learning_rate": 1.533787728141351e-05, |
|
"loss": 1.5484, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 6.688525472576119, |
|
"learning_rate": 1.5290557601339807e-05, |
|
"loss": 1.603, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 7.062961511805713, |
|
"learning_rate": 1.5243072835572319e-05, |
|
"loss": 1.6592, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.3936, |
|
"grad_norm": 7.338639370748799, |
|
"learning_rate": 1.5195424465817911e-05, |
|
"loss": 1.5468, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3952, |
|
"grad_norm": 7.404342838798467, |
|
"learning_rate": 1.5147613978888514e-05, |
|
"loss": 1.4803, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 7.815793838467086, |
|
"learning_rate": 1.5099642866654747e-05, |
|
"loss": 1.6002, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.3984, |
|
"grad_norm": 6.983719434750113, |
|
"learning_rate": 1.505151262599934e-05, |
|
"loss": 1.4679, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 7.689132853147396, |
|
"learning_rate": 1.5003224758770447e-05, |
|
"loss": 1.5426, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4016, |
|
"grad_norm": 7.99779675336406, |
|
"learning_rate": 1.4954780771734783e-05, |
|
"loss": 1.6113, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 7.956111242399946, |
|
"learning_rate": 1.4906182176530588e-05, |
|
"loss": 1.6137, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.4048, |
|
"grad_norm": 8.34462398939598, |
|
"learning_rate": 1.4857430489620476e-05, |
|
"loss": 1.6686, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.4064, |
|
"grad_norm": 6.87092130604808, |
|
"learning_rate": 1.4808527232244113e-05, |
|
"loss": 1.5328, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 7.069072459949788, |
|
"learning_rate": 1.4759473930370738e-05, |
|
"loss": 1.5685, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 9.020054344062027, |
|
"learning_rate": 1.4710272114651555e-05, |
|
"loss": 1.5414, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.4112, |
|
"grad_norm": 9.445393180818964, |
|
"learning_rate": 1.4660923320371974e-05, |
|
"loss": 1.5297, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.4128, |
|
"grad_norm": 7.558351219070578, |
|
"learning_rate": 1.4611429087403695e-05, |
|
"loss": 1.5524, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.4144, |
|
"grad_norm": 7.092500501081219, |
|
"learning_rate": 1.456179096015667e-05, |
|
"loss": 1.686, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 7.207957772603401, |
|
"learning_rate": 1.4512010487530899e-05, |
|
"loss": 1.5716, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4176, |
|
"grad_norm": 7.34637827560702, |
|
"learning_rate": 1.4462089222868099e-05, |
|
"loss": 1.469, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.4192, |
|
"grad_norm": 9.334037108256902, |
|
"learning_rate": 1.4412028723903251e-05, |
|
"loss": 1.5605, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.4208, |
|
"grad_norm": 7.693732678205702, |
|
"learning_rate": 1.4361830552715973e-05, |
|
"loss": 1.6241, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 8.129038938122736, |
|
"learning_rate": 1.4311496275681785e-05, |
|
"loss": 1.4064, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 6.237348905953311, |
|
"learning_rate": 1.4261027463423232e-05, |
|
"loss": 1.4584, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.4256, |
|
"grad_norm": 6.031996271045834, |
|
"learning_rate": 1.4210425690760876e-05, |
|
"loss": 1.5006, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.4272, |
|
"grad_norm": 7.539819121652781, |
|
"learning_rate": 1.4159692536664147e-05, |
|
"loss": 1.5603, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 7.523103920232781, |
|
"learning_rate": 1.410882958420209e-05, |
|
"loss": 1.5839, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.4304, |
|
"grad_norm": 7.666216752529669, |
|
"learning_rate": 1.405783842049395e-05, |
|
"loss": 1.5192, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 7.32146225643351, |
|
"learning_rate": 1.4006720636659656e-05, |
|
"loss": 1.5088, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4336, |
|
"grad_norm": 8.854549575824441, |
|
"learning_rate": 1.3955477827770174e-05, |
|
"loss": 1.5657, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 8.471943810603415, |
|
"learning_rate": 1.3904111592797724e-05, |
|
"loss": 1.4975, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4368, |
|
"grad_norm": 7.881715416808106, |
|
"learning_rate": 1.3852623534565901e-05, |
|
"loss": 1.4941, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.4384, |
|
"grad_norm": 7.441858474366784, |
|
"learning_rate": 1.3801015259699648e-05, |
|
"loss": 1.5424, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 8.277844600049658, |
|
"learning_rate": 1.3749288378575133e-05, |
|
"loss": 1.5154, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 7.1055378423472835, |
|
"learning_rate": 1.3697444505269489e-05, |
|
"loss": 1.5948, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.4432, |
|
"grad_norm": 7.146958460316305, |
|
"learning_rate": 1.3645485257510456e-05, |
|
"loss": 1.5807, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.4448, |
|
"grad_norm": 6.5302083293339965, |
|
"learning_rate": 1.3593412256625898e-05, |
|
"loss": 1.4385, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.4464, |
|
"grad_norm": 7.24352130131181, |
|
"learning_rate": 1.3541227127493218e-05, |
|
"loss": 1.4763, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 7.695971362233706, |
|
"learning_rate": 1.348893149848865e-05, |
|
"loss": 1.4818, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4496, |
|
"grad_norm": 7.454125385613667, |
|
"learning_rate": 1.3436527001436437e-05, |
|
"loss": 1.4103, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.4512, |
|
"grad_norm": 7.82334723806768, |
|
"learning_rate": 1.3384015271557938e-05, |
|
"loss": 1.4473, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4528, |
|
"grad_norm": 7.813760357009132, |
|
"learning_rate": 1.3331397947420578e-05, |
|
"loss": 1.4581, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 6.911969016129946, |
|
"learning_rate": 1.3278676670886728e-05, |
|
"loss": 1.4735, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 7.885416119688343, |
|
"learning_rate": 1.3225853087062481e-05, |
|
"loss": 1.4771, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.4576, |
|
"grad_norm": 7.171521274701965, |
|
"learning_rate": 1.3172928844246297e-05, |
|
"loss": 1.4909, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.4592, |
|
"grad_norm": 6.038998548284215, |
|
"learning_rate": 1.3119905593877593e-05, |
|
"loss": 1.4862, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 7.850277653518824, |
|
"learning_rate": 1.3066784990485202e-05, |
|
"loss": 1.4361, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.4624, |
|
"grad_norm": 8.240535964327846, |
|
"learning_rate": 1.3013568691635733e-05, |
|
"loss": 1.4437, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 6.764355409884025, |
|
"learning_rate": 1.2960258357881875e-05, |
|
"loss": 1.5192, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4656, |
|
"grad_norm": 7.159838600132645, |
|
"learning_rate": 1.2906855652710557e-05, |
|
"loss": 1.5827, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 8.306859326063204, |
|
"learning_rate": 1.2853362242491054e-05, |
|
"loss": 1.5463, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.4688, |
|
"grad_norm": 7.019302747818116, |
|
"learning_rate": 1.279977979642299e-05, |
|
"loss": 1.4193, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.4704, |
|
"grad_norm": 7.622978792348708, |
|
"learning_rate": 1.2746109986484236e-05, |
|
"loss": 1.422, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 6.56284738301426, |
|
"learning_rate": 1.2692354487378768e-05, |
|
"loss": 1.5312, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 6.387578487452188, |
|
"learning_rate": 1.2638514976484384e-05, |
|
"loss": 1.4795, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.4752, |
|
"grad_norm": 6.050045115060615, |
|
"learning_rate": 1.2584593133800374e-05, |
|
"loss": 1.4694, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.4768, |
|
"grad_norm": 7.023845599021495, |
|
"learning_rate": 1.2530590641895089e-05, |
|
"loss": 1.5678, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.4784, |
|
"grad_norm": 6.600644496789812, |
|
"learning_rate": 1.2476509185853456e-05, |
|
"loss": 1.5587, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.483920132677383, |
|
"learning_rate": 1.242235045322438e-05, |
|
"loss": 1.4022, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4816, |
|
"grad_norm": 7.393698504360439, |
|
"learning_rate": 1.2368116133968091e-05, |
|
"loss": 1.4542, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.4832, |
|
"grad_norm": 6.59515492702019, |
|
"learning_rate": 1.2313807920403419e-05, |
|
"loss": 1.4563, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.4848, |
|
"grad_norm": 11.46310019096542, |
|
"learning_rate": 1.2259427507154964e-05, |
|
"loss": 1.4436, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 7.492782202626292, |
|
"learning_rate": 1.2204976591100253e-05, |
|
"loss": 1.5193, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 7.7736261505421025, |
|
"learning_rate": 1.2150456871316758e-05, |
|
"loss": 1.4168, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.4896, |
|
"grad_norm": 7.9354251057792515, |
|
"learning_rate": 1.2095870049028898e-05, |
|
"loss": 1.4414, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4912, |
|
"grad_norm": 16.059395970692933, |
|
"learning_rate": 1.2041217827554939e-05, |
|
"loss": 1.4695, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.4928, |
|
"grad_norm": 6.808900956376759, |
|
"learning_rate": 1.1986501912253863e-05, |
|
"loss": 1.4531, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.4944, |
|
"grad_norm": 7.736857454242817, |
|
"learning_rate": 1.1931724010472135e-05, |
|
"loss": 1.4924, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 8.3359924924688, |
|
"learning_rate": 1.1876885831490442e-05, |
|
"loss": 1.555, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4976, |
|
"grad_norm": 7.120202493727813, |
|
"learning_rate": 1.1821989086470349e-05, |
|
"loss": 1.4645, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 7.599913279445224, |
|
"learning_rate": 1.1767035488400903e-05, |
|
"loss": 1.4863, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5008, |
|
"grad_norm": 7.125989534788617, |
|
"learning_rate": 1.1712026752045189e-05, |
|
"loss": 1.4652, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.5024, |
|
"grad_norm": 8.088722450627488, |
|
"learning_rate": 1.1656964593886819e-05, |
|
"loss": 1.3223, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 7.516194041566506, |
|
"learning_rate": 1.1601850732076361e-05, |
|
"loss": 1.5801, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.5056, |
|
"grad_norm": 7.239851442002106, |
|
"learning_rate": 1.1546686886377745e-05, |
|
"loss": 1.4731, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.5072, |
|
"grad_norm": 7.520103127972609, |
|
"learning_rate": 1.1491474778114588e-05, |
|
"loss": 1.3683, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.5088, |
|
"grad_norm": 9.131577917600616, |
|
"learning_rate": 1.143621613011648e-05, |
|
"loss": 1.562, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.5104, |
|
"grad_norm": 7.486194993220994, |
|
"learning_rate": 1.1380912666665234e-05, |
|
"loss": 1.4111, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 6.496362336710197, |
|
"learning_rate": 1.1325566113441074e-05, |
|
"loss": 1.5706, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5136, |
|
"grad_norm": 6.823989380990125, |
|
"learning_rate": 1.1270178197468788e-05, |
|
"loss": 1.4575, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.5152, |
|
"grad_norm": 7.5663657895225125, |
|
"learning_rate": 1.121475064706385e-05, |
|
"loss": 1.3982, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.5168, |
|
"grad_norm": 6.5005524818168965, |
|
"learning_rate": 1.1159285191778473e-05, |
|
"loss": 1.4888, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 7.387470263764186, |
|
"learning_rate": 1.1103783562347642e-05, |
|
"loss": 1.359, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 7.505035445167394, |
|
"learning_rate": 1.1048247490635133e-05, |
|
"loss": 1.4775, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.5216, |
|
"grad_norm": 6.645939875937268, |
|
"learning_rate": 1.099267870957943e-05, |
|
"loss": 1.5036, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.5232, |
|
"grad_norm": 6.913899992846711, |
|
"learning_rate": 1.0937078953139691e-05, |
|
"loss": 1.4273, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.5248, |
|
"grad_norm": 7.88033348199259, |
|
"learning_rate": 1.0881449956241616e-05, |
|
"loss": 1.2594, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.5264, |
|
"grad_norm": 7.305128083335553, |
|
"learning_rate": 1.0825793454723325e-05, |
|
"loss": 1.4711, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 8.41830449904285, |
|
"learning_rate": 1.0770111185281182e-05, |
|
"loss": 1.4567, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5296, |
|
"grad_norm": 8.793536036546655, |
|
"learning_rate": 1.071440488541562e-05, |
|
"loss": 1.4354, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.5312, |
|
"grad_norm": 7.225955221122391, |
|
"learning_rate": 1.0658676293376894e-05, |
|
"loss": 1.4268, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.5328, |
|
"grad_norm": 7.296610012317386, |
|
"learning_rate": 1.0602927148110882e-05, |
|
"loss": 1.355, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.5344, |
|
"grad_norm": 7.368094614876765, |
|
"learning_rate": 1.0547159189204788e-05, |
|
"loss": 1.3505, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 6.381999606829369, |
|
"learning_rate": 1.0491374156832875e-05, |
|
"loss": 1.3813, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 7.047380620438761, |
|
"learning_rate": 1.043557379170217e-05, |
|
"loss": 1.4362, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.5392, |
|
"grad_norm": 6.040591759821475, |
|
"learning_rate": 1.0379759834998133e-05, |
|
"loss": 1.4112, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.5408, |
|
"grad_norm": 6.915219064223463, |
|
"learning_rate": 1.0323934028330337e-05, |
|
"loss": 1.5057, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.5424, |
|
"grad_norm": 7.271768400559689, |
|
"learning_rate": 1.0268098113678124e-05, |
|
"loss": 1.4705, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 6.053160359266474, |
|
"learning_rate": 1.0212253833336237e-05, |
|
"loss": 1.477, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5456, |
|
"grad_norm": 6.534057764124044, |
|
"learning_rate": 1.015640292986046e-05, |
|
"loss": 1.4277, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.5472, |
|
"grad_norm": 8.2582991694942, |
|
"learning_rate": 1.0100547146013252e-05, |
|
"loss": 1.4827, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5488, |
|
"grad_norm": 6.985572611193969, |
|
"learning_rate": 1.0044688224709346e-05, |
|
"loss": 1.3615, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.5504, |
|
"grad_norm": 6.902453232669348, |
|
"learning_rate": 9.988827908961392e-06, |
|
"loss": 1.4461, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 6.996523125729606, |
|
"learning_rate": 9.932967941825539e-06, |
|
"loss": 1.3792, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.5536, |
|
"grad_norm": 6.402198205444748, |
|
"learning_rate": 9.87711006634706e-06, |
|
"loss": 1.5359, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5552, |
|
"grad_norm": 6.398796292025464, |
|
"learning_rate": 9.821256025505964e-06, |
|
"loss": 1.3898, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 6.940842238375595, |
|
"learning_rate": 9.765407562162606e-06, |
|
"loss": 1.4426, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5584, |
|
"grad_norm": 8.063938708721054, |
|
"learning_rate": 9.709566419003292e-06, |
|
"loss": 1.4324, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.113579365707868, |
|
"learning_rate": 9.653734338485924e-06, |
|
"loss": 1.3696, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5616, |
|
"grad_norm": 9.305625046247098, |
|
"learning_rate": 9.597913062785603e-06, |
|
"loss": 1.3767, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 7.713589183858762, |
|
"learning_rate": 9.54210433374028e-06, |
|
"loss": 1.3993, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5648, |
|
"grad_norm": 6.882609362707895, |
|
"learning_rate": 9.486309892796413e-06, |
|
"loss": 1.2881, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.5664, |
|
"grad_norm": 7.382583057213062, |
|
"learning_rate": 9.430531480954605e-06, |
|
"loss": 1.3868, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 7.060465219449526, |
|
"learning_rate": 9.374770838715289e-06, |
|
"loss": 1.3008, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.5696, |
|
"grad_norm": 6.548055523692506, |
|
"learning_rate": 9.319029706024428e-06, |
|
"loss": 1.4179, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5712, |
|
"grad_norm": 7.296597392978876, |
|
"learning_rate": 9.2633098222192e-06, |
|
"loss": 1.3648, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.5728, |
|
"grad_norm": 7.6566983852007695, |
|
"learning_rate": 9.20761292597375e-06, |
|
"loss": 1.48, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.5744, |
|
"grad_norm": 6.346223117268573, |
|
"learning_rate": 9.151940755244912e-06, |
|
"loss": 1.4082, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 7.11841923298027, |
|
"learning_rate": 9.096295047217988e-06, |
|
"loss": 1.3294, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5776, |
|
"grad_norm": 8.50296732146637, |
|
"learning_rate": 9.040677538252555e-06, |
|
"loss": 1.4083, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.5792, |
|
"grad_norm": 7.014419702757138, |
|
"learning_rate": 8.985089963828262e-06, |
|
"loss": 1.4773, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.5808, |
|
"grad_norm": 7.176240770998421, |
|
"learning_rate": 8.929534058490682e-06, |
|
"loss": 1.2781, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.5824, |
|
"grad_norm": 7.803766989451665, |
|
"learning_rate": 8.8740115557972e-06, |
|
"loss": 1.4017, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 7.779698071769793, |
|
"learning_rate": 8.8185241882629e-06, |
|
"loss": 1.4537, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.5856, |
|
"grad_norm": 6.415990926093912, |
|
"learning_rate": 8.763073687306523e-06, |
|
"loss": 1.2469, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5872, |
|
"grad_norm": 8.66418584105546, |
|
"learning_rate": 8.707661783196432e-06, |
|
"loss": 1.352, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.5888, |
|
"grad_norm": 7.910777048810527, |
|
"learning_rate": 8.652290204996613e-06, |
|
"loss": 1.4686, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5904, |
|
"grad_norm": 6.3176543086259995, |
|
"learning_rate": 8.59696068051273e-06, |
|
"loss": 1.4047, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 7.512370939346078, |
|
"learning_rate": 8.541674936238219e-06, |
|
"loss": 1.422, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5936, |
|
"grad_norm": 7.8096285750182215, |
|
"learning_rate": 8.486434697300394e-06, |
|
"loss": 1.4087, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.5952, |
|
"grad_norm": 6.178330789312308, |
|
"learning_rate": 8.431241687406631e-06, |
|
"loss": 1.3726, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5968, |
|
"grad_norm": 6.624672273010726, |
|
"learning_rate": 8.376097628790586e-06, |
|
"loss": 1.3789, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.5984, |
|
"grad_norm": 7.3241310271547295, |
|
"learning_rate": 8.321004242158439e-06, |
|
"loss": 1.3609, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 6.04309140021108, |
|
"learning_rate": 8.265963246635212e-06, |
|
"loss": 1.288, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.6016, |
|
"grad_norm": 6.865692791476556, |
|
"learning_rate": 8.210976359711124e-06, |
|
"loss": 1.3762, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.6032, |
|
"grad_norm": 6.771762456175988, |
|
"learning_rate": 8.156045297187994e-06, |
|
"loss": 1.2717, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.6048, |
|
"grad_norm": 6.154571636328125, |
|
"learning_rate": 8.101171773125716e-06, |
|
"loss": 1.3669, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.6064, |
|
"grad_norm": 6.572968900317892, |
|
"learning_rate": 8.046357499788757e-06, |
|
"loss": 1.38, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 8.531193639514733, |
|
"learning_rate": 7.991604187592732e-06, |
|
"loss": 1.5369, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6096, |
|
"grad_norm": 6.698720683326016, |
|
"learning_rate": 7.93691354505103e-06, |
|
"loss": 1.4231, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.6112, |
|
"grad_norm": 7.521329845585438, |
|
"learning_rate": 7.882287278721523e-06, |
|
"loss": 1.4031, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.6128, |
|
"grad_norm": 7.6211310302707505, |
|
"learning_rate": 7.82772709315328e-06, |
|
"loss": 1.3337, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 6.018003479238453, |
|
"learning_rate": 7.77323469083341e-06, |
|
"loss": 1.3658, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 6.464569749783369, |
|
"learning_rate": 7.718811772133918e-06, |
|
"loss": 1.2912, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.6176, |
|
"grad_norm": 6.514713737748858, |
|
"learning_rate": 7.664460035258651e-06, |
|
"loss": 1.5244, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.6192, |
|
"grad_norm": 6.205352658391803, |
|
"learning_rate": 7.610181176190318e-06, |
|
"loss": 1.2526, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.6208, |
|
"grad_norm": 7.014357158748993, |
|
"learning_rate": 7.555976888637556e-06, |
|
"loss": 1.3358, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.6224, |
|
"grad_norm": 7.288009471718571, |
|
"learning_rate": 7.501848863982082e-06, |
|
"loss": 1.419, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 7.468562712854107, |
|
"learning_rate": 7.447798791225925e-06, |
|
"loss": 1.2961, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6256, |
|
"grad_norm": 7.203677073348877, |
|
"learning_rate": 7.393828356938709e-06, |
|
"loss": 1.3685, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.6272, |
|
"grad_norm": 7.047746758873238, |
|
"learning_rate": 7.3399392452050385e-06, |
|
"loss": 1.4267, |
|
"step": 1960 |
|
}, |
|
    {
      "epoch": 0.6288,
      "grad_norm": 7.352486974428156,
      "learning_rate": 7.286133137571938e-06,
      "loss": 1.284,
      "step": 1965
    },
    {
      "epoch": 0.6304,
      "grad_norm": 6.913404968326128,
      "learning_rate": 7.2324117129963815e-06,
      "loss": 1.3034,
      "step": 1970
    },
    {
      "epoch": 0.632,
      "grad_norm": 7.129552158427298,
      "learning_rate": 7.178776647792918e-06,
      "loss": 1.3451,
      "step": 1975
    },
    {
      "epoch": 0.6336,
      "grad_norm": 7.337355717351341,
      "learning_rate": 7.125229615581346e-06,
      "loss": 1.2403,
      "step": 1980
    },
    {
      "epoch": 0.6352,
      "grad_norm": 7.212501447309288,
      "learning_rate": 7.071772287234497e-06,
      "loss": 1.2539,
      "step": 1985
    },
    {
      "epoch": 0.6368,
      "grad_norm": 7.268649220162584,
      "learning_rate": 7.018406330826096e-06,
      "loss": 1.3539,
      "step": 1990
    },
    {
      "epoch": 0.6384,
      "grad_norm": 7.2645693935162905,
      "learning_rate": 6.96513341157872e-06,
      "loss": 1.3748,
      "step": 1995
    },
    {
      "epoch": 0.64,
      "grad_norm": 8.342662156660523,
      "learning_rate": 6.911955191811819e-06,
      "loss": 1.3324,
      "step": 2000
    },
    {
      "epoch": 0.6416,
      "grad_norm": 7.283104811608162,
      "learning_rate": 6.858873330889868e-06,
      "loss": 1.3388,
      "step": 2005
    },
    {
      "epoch": 0.6432,
      "grad_norm": 6.514509652191098,
      "learning_rate": 6.8058894851705655e-06,
      "loss": 1.3698,
      "step": 2010
    },
    {
      "epoch": 0.6448,
      "grad_norm": 7.590737838567425,
      "learning_rate": 6.7530053079531664e-06,
      "loss": 1.3399,
      "step": 2015
    },
    {
      "epoch": 0.6464,
      "grad_norm": 10.218109612754096,
      "learning_rate": 6.700222449426885e-06,
      "loss": 1.3822,
      "step": 2020
    },
    {
      "epoch": 0.648,
      "grad_norm": 7.9917404827520055,
      "learning_rate": 6.6475425566194006e-06,
      "loss": 1.406,
      "step": 2025
    },
    {
      "epoch": 0.6496,
      "grad_norm": 5.976353190616539,
      "learning_rate": 6.59496727334547e-06,
      "loss": 1.3219,
      "step": 2030
    },
    {
      "epoch": 0.6512,
      "grad_norm": 8.101871551936691,
      "learning_rate": 6.5424982401556305e-06,
      "loss": 1.3451,
      "step": 2035
    },
    {
      "epoch": 0.6528,
      "grad_norm": 6.374533499924878,
      "learning_rate": 6.490137094285008e-06,
      "loss": 1.3402,
      "step": 2040
    },
    {
      "epoch": 0.6544,
      "grad_norm": 6.459403952059301,
      "learning_rate": 6.437885469602235e-06,
      "loss": 1.3804,
      "step": 2045
    },
    {
      "epoch": 0.656,
      "grad_norm": 7.0688192727901935,
      "learning_rate": 6.385744996558456e-06,
      "loss": 1.3406,
      "step": 2050
    },
    {
      "epoch": 0.6576,
      "grad_norm": 6.7313792745199486,
      "learning_rate": 6.333717302136457e-06,
      "loss": 1.3536,
      "step": 2055
    },
    {
      "epoch": 0.6592,
      "grad_norm": 6.882985114725421,
      "learning_rate": 6.28180400979991e-06,
      "loss": 1.2695,
      "step": 2060
    },
    {
      "epoch": 0.6608,
      "grad_norm": 6.533077979959867,
      "learning_rate": 6.230006739442692e-06,
      "loss": 1.2323,
      "step": 2065
    },
    {
      "epoch": 0.6624,
      "grad_norm": 6.540348895029834,
      "learning_rate": 6.178327107338353e-06,
      "loss": 1.4556,
      "step": 2070
    },
    {
      "epoch": 0.664,
      "grad_norm": 8.070984765040714,
      "learning_rate": 6.1267667260896755e-06,
      "loss": 1.3297,
      "step": 2075
    },
    {
      "epoch": 0.6656,
      "grad_norm": 10.638827242283833,
      "learning_rate": 6.075327204578363e-06,
      "loss": 1.3589,
      "step": 2080
    },
    {
      "epoch": 0.6672,
      "grad_norm": 6.9949360978136434,
      "learning_rate": 6.024010147914826e-06,
      "loss": 1.2704,
      "step": 2085
    },
    {
      "epoch": 0.6688,
      "grad_norm": 7.6982660712041255,
      "learning_rate": 5.972817157388106e-06,
      "loss": 1.3201,
      "step": 2090
    },
    {
      "epoch": 0.6704,
      "grad_norm": 7.526592127466844,
      "learning_rate": 5.921749830415905e-06,
      "loss": 1.3338,
      "step": 2095
    },
    {
      "epoch": 0.672,
      "grad_norm": 7.590856683299185,
      "learning_rate": 5.870809760494734e-06,
      "loss": 1.2958,
      "step": 2100
    },
    {
      "epoch": 0.6736,
      "grad_norm": 6.9630835241832125,
      "learning_rate": 5.819998537150203e-06,
      "loss": 1.2639,
      "step": 2105
    },
    {
      "epoch": 0.6752,
      "grad_norm": 6.253694167385759,
      "learning_rate": 5.769317745887413e-06,
      "loss": 1.324,
      "step": 2110
    },
    {
      "epoch": 0.6768,
      "grad_norm": 6.682246580824873,
      "learning_rate": 5.718768968141482e-06,
      "loss": 1.33,
      "step": 2115
    },
    {
      "epoch": 0.6784,
      "grad_norm": 7.24085307136985,
      "learning_rate": 5.668353781228193e-06,
      "loss": 1.3596,
      "step": 2120
    },
    {
      "epoch": 0.68,
      "grad_norm": 6.889515698119121,
      "learning_rate": 5.618073758294802e-06,
      "loss": 1.244,
      "step": 2125
    },
    {
      "epoch": 0.6816,
      "grad_norm": 7.707218870893382,
      "learning_rate": 5.567930468270911e-06,
      "loss": 1.3282,
      "step": 2130
    },
    {
      "epoch": 0.6832,
      "grad_norm": 7.803800032255029,
      "learning_rate": 5.517925475819539e-06,
      "loss": 1.34,
      "step": 2135
    },
    {
      "epoch": 0.6848,
      "grad_norm": 7.2010746561843755,
      "learning_rate": 5.468060341288286e-06,
      "loss": 1.2944,
      "step": 2140
    },
    {
      "epoch": 0.6864,
      "grad_norm": 6.695133409250474,
      "learning_rate": 5.418336620660658e-06,
      "loss": 1.2467,
      "step": 2145
    },
    {
      "epoch": 0.688,
      "grad_norm": 6.811091226395264,
      "learning_rate": 5.36875586550749e-06,
      "loss": 1.2451,
      "step": 2150
    },
    {
      "epoch": 0.6896,
      "grad_norm": 7.299012057736775,
      "learning_rate": 5.319319622938563e-06,
      "loss": 1.2175,
      "step": 2155
    },
    {
      "epoch": 0.6912,
      "grad_norm": 6.523483550221802,
      "learning_rate": 5.270029435554295e-06,
      "loss": 1.3653,
      "step": 2160
    },
    {
      "epoch": 0.6928,
      "grad_norm": 7.006983737221144,
      "learning_rate": 5.22088684139763e-06,
      "loss": 1.3037,
      "step": 2165
    },
    {
      "epoch": 0.6944,
      "grad_norm": 9.658145298208773,
      "learning_rate": 5.171893373906036e-06,
      "loss": 1.311,
      "step": 2170
    },
    {
      "epoch": 0.696,
      "grad_norm": 6.9236063967034145,
      "learning_rate": 5.1230505618636575e-06,
      "loss": 1.3054,
      "step": 2175
    },
    {
      "epoch": 0.6976,
      "grad_norm": 6.910379366692173,
      "learning_rate": 5.074359929353604e-06,
      "loss": 1.1699,
      "step": 2180
    },
    {
      "epoch": 0.6992,
      "grad_norm": 7.780823172347376,
      "learning_rate": 5.025822995710414e-06,
      "loss": 1.2552,
      "step": 2185
    },
    {
      "epoch": 0.7008,
      "grad_norm": 6.782111700218943,
      "learning_rate": 4.977441275472622e-06,
      "loss": 1.4024,
      "step": 2190
    },
    {
      "epoch": 0.7024,
      "grad_norm": 7.01625141674153,
      "learning_rate": 4.929216278335508e-06,
      "loss": 1.3484,
      "step": 2195
    },
    {
      "epoch": 0.704,
      "grad_norm": 6.611844691268078,
      "learning_rate": 4.881149509103993e-06,
      "loss": 1.3066,
      "step": 2200
    },
    {
      "epoch": 0.7056,
      "grad_norm": 6.399244868619568,
      "learning_rate": 4.833242467645677e-06,
      "loss": 1.4538,
      "step": 2205
    },
    {
      "epoch": 0.7072,
      "grad_norm": 7.977931392416363,
      "learning_rate": 4.785496648844049e-06,
      "loss": 1.219,
      "step": 2210
    },
    {
      "epoch": 0.7088,
      "grad_norm": 7.034256666397906,
      "learning_rate": 4.737913542551824e-06,
      "loss": 1.319,
      "step": 2215
    },
    {
      "epoch": 0.7104,
      "grad_norm": 7.5123049812985565,
      "learning_rate": 4.690494633544466e-06,
      "loss": 1.3271,
      "step": 2220
    },
    {
      "epoch": 0.712,
      "grad_norm": 6.515957790093329,
      "learning_rate": 4.643241401473849e-06,
      "loss": 1.3427,
      "step": 2225
    },
    {
      "epoch": 0.7136,
      "grad_norm": 6.839639945427634,
      "learning_rate": 4.596155320822103e-06,
      "loss": 1.2736,
      "step": 2230
    },
    {
      "epoch": 0.7152,
      "grad_norm": 7.689485372330819,
      "learning_rate": 4.549237860855578e-06,
      "loss": 1.278,
      "step": 2235
    },
    {
      "epoch": 0.7168,
      "grad_norm": 7.447174910713708,
      "learning_rate": 4.502490485579024e-06,
      "loss": 1.2915,
      "step": 2240
    },
    {
      "epoch": 0.7184,
      "grad_norm": 7.345469812975139,
      "learning_rate": 4.455914653689889e-06,
      "loss": 1.2697,
      "step": 2245
    },
    {
      "epoch": 0.72,
      "grad_norm": 6.852760914424491,
      "learning_rate": 4.409511818532809e-06,
      "loss": 1.3326,
      "step": 2250
    },
    {
      "epoch": 0.7216,
      "grad_norm": 9.60045018569782,
      "learning_rate": 4.363283428054262e-06,
      "loss": 1.2859,
      "step": 2255
    },
    {
      "epoch": 0.7232,
      "grad_norm": 7.195814960386886,
      "learning_rate": 4.317230924757379e-06,
      "loss": 1.2189,
      "step": 2260
    },
    {
      "epoch": 0.7248,
      "grad_norm": 6.838988916881499,
      "learning_rate": 4.271355745656934e-06,
      "loss": 1.2248,
      "step": 2265
    },
    {
      "epoch": 0.7264,
      "grad_norm": 7.060258323380059,
      "learning_rate": 4.2256593222345185e-06,
      "loss": 1.3148,
      "step": 2270
    },
    {
      "epoch": 0.728,
      "grad_norm": 7.168253685594636,
      "learning_rate": 4.1801430803938496e-06,
      "loss": 1.2706,
      "step": 2275
    },
    {
      "epoch": 0.7296,
      "grad_norm": 8.110296241359842,
      "learning_rate": 4.1348084404162895e-06,
      "loss": 1.22,
      "step": 2280
    },
    {
      "epoch": 0.7312,
      "grad_norm": 6.146120619441478,
      "learning_rate": 4.089656816916525e-06,
      "loss": 1.2596,
      "step": 2285
    },
    {
      "epoch": 0.7328,
      "grad_norm": 7.51200746635356,
      "learning_rate": 4.0446896187984275e-06,
      "loss": 1.2051,
      "step": 2290
    },
    {
      "epoch": 0.7344,
      "grad_norm": 7.00889844522249,
      "learning_rate": 3.999908249211096e-06,
      "loss": 1.3089,
      "step": 2295
    },
    {
      "epoch": 0.736,
      "grad_norm": 7.433128694008251,
      "learning_rate": 3.955314105505056e-06,
      "loss": 1.1858,
      "step": 2300
    },
    {
      "epoch": 0.7376,
      "grad_norm": 7.682196556731105,
      "learning_rate": 3.910908579188672e-06,
      "loss": 1.2912,
      "step": 2305
    },
    {
      "epoch": 0.7392,
      "grad_norm": 8.377582759029309,
      "learning_rate": 3.866693055884723e-06,
      "loss": 1.2959,
      "step": 2310
    },
    {
      "epoch": 0.7408,
      "grad_norm": 7.525533674291028,
      "learning_rate": 3.8226689152871576e-06,
      "loss": 1.3002,
      "step": 2315
    },
    {
      "epoch": 0.7424,
      "grad_norm": 6.5405611376318085,
      "learning_rate": 3.7788375311180624e-06,
      "loss": 1.1595,
      "step": 2320
    },
    {
      "epoch": 0.744,
      "grad_norm": 6.351041058258562,
      "learning_rate": 3.735200271084779e-06,
      "loss": 1.2756,
      "step": 2325
    },
    {
      "epoch": 0.7456,
      "grad_norm": 6.945497138509825,
      "learning_rate": 3.691758496837228e-06,
      "loss": 1.3431,
      "step": 2330
    },
    {
      "epoch": 0.7472,
      "grad_norm": 7.80452005536971,
      "learning_rate": 3.6485135639254234e-06,
      "loss": 1.1743,
      "step": 2335
    },
    {
      "epoch": 0.7488,
      "grad_norm": 6.904075689429772,
      "learning_rate": 3.6054668217571774e-06,
      "loss": 1.2647,
      "step": 2340
    },
    {
      "epoch": 0.7504,
      "grad_norm": 5.826075155016658,
      "learning_rate": 3.5626196135559898e-06,
      "loss": 1.4307,
      "step": 2345
    },
    {
      "epoch": 0.752,
      "grad_norm": 7.4800929811344234,
      "learning_rate": 3.5199732763191317e-06,
      "loss": 1.3035,
      "step": 2350
    },
    {
      "epoch": 0.7536,
      "grad_norm": 6.592404311501557,
      "learning_rate": 3.4775291407759393e-06,
      "loss": 1.3101,
      "step": 2355
    },
    {
      "epoch": 0.7552,
      "grad_norm": 6.0603172825038625,
      "learning_rate": 3.435288531346269e-06,
      "loss": 1.1216,
      "step": 2360
    },
    {
      "epoch": 0.7568,
      "grad_norm": 7.862732877894627,
      "learning_rate": 3.3932527660991877e-06,
      "loss": 1.2707,
      "step": 2365
    },
    {
      "epoch": 0.7584,
      "grad_norm": 6.934609447705353,
      "learning_rate": 3.351423156711836e-06,
      "loss": 1.1406,
      "step": 2370
    },
    {
      "epoch": 0.76,
      "grad_norm": 6.676258909833237,
      "learning_rate": 3.309801008428498e-06,
      "loss": 1.1754,
      "step": 2375
    },
    {
      "epoch": 0.7616,
      "grad_norm": 7.059333862950036,
      "learning_rate": 3.268387620019885e-06,
      "loss": 1.214,
      "step": 2380
    },
    {
      "epoch": 0.7632,
      "grad_norm": 6.4985460219892115,
      "learning_rate": 3.2271842837425917e-06,
      "loss": 1.2429,
      "step": 2385
    },
    {
      "epoch": 0.7648,
      "grad_norm": 6.318472678199636,
      "learning_rate": 3.1861922852987794e-06,
      "loss": 1.2567,
      "step": 2390
    },
    {
      "epoch": 0.7664,
      "grad_norm": 8.106856700967048,
      "learning_rate": 3.1454129037960614e-06,
      "loss": 1.2827,
      "step": 2395
    },
    {
      "epoch": 0.768,
      "grad_norm": 6.544814378582256,
      "learning_rate": 3.1048474117075834e-06,
      "loss": 1.2408,
      "step": 2400
    },
    {
      "epoch": 0.7696,
      "grad_norm": 8.187702879843156,
      "learning_rate": 3.0644970748323253e-06,
      "loss": 1.3093,
      "step": 2405
    },
    {
      "epoch": 0.7712,
      "grad_norm": 7.060252319548262,
      "learning_rate": 3.0243631522556027e-06,
      "loss": 1.2043,
      "step": 2410
    },
    {
      "epoch": 0.7728,
      "grad_norm": 7.212166473447804,
      "learning_rate": 2.984446896309764e-06,
      "loss": 1.29,
      "step": 2415
    },
    {
      "epoch": 0.7744,
      "grad_norm": 6.205766494665441,
      "learning_rate": 2.94474955253513e-06,
      "loss": 1.2984,
      "step": 2420
    },
    {
      "epoch": 0.776,
      "grad_norm": 8.705823518308824,
      "learning_rate": 2.9052723596411194e-06,
      "loss": 1.272,
      "step": 2425
    },
    {
      "epoch": 0.7776,
      "grad_norm": 6.193262549894205,
      "learning_rate": 2.866016549467602e-06,
      "loss": 1.2184,
      "step": 2430
    },
    {
      "epoch": 0.7792,
      "grad_norm": 7.425984999552959,
      "learning_rate": 2.82698334694645e-06,
      "loss": 1.2093,
      "step": 2435
    },
    {
      "epoch": 0.7808,
      "grad_norm": 6.720543568523316,
      "learning_rate": 2.7881739700633382e-06,
      "loss": 1.2015,
      "step": 2440
    },
    {
      "epoch": 0.7824,
      "grad_norm": 7.41759595687626,
      "learning_rate": 2.749589629819708e-06,
      "loss": 1.1781,
      "step": 2445
    },
    {
      "epoch": 0.784,
      "grad_norm": 6.10376266829222,
      "learning_rate": 2.7112315301949986e-06,
      "loss": 1.2669,
      "step": 2450
    },
    {
      "epoch": 0.7856,
      "grad_norm": 7.211278115666269,
      "learning_rate": 2.6731008681090763e-06,
      "loss": 1.2374,
      "step": 2455
    },
    {
      "epoch": 0.7872,
      "grad_norm": 6.323650227039773,
      "learning_rate": 2.6351988333848787e-06,
      "loss": 1.2188,
      "step": 2460
    },
    {
      "epoch": 0.7888,
      "grad_norm": 8.023736585244432,
      "learning_rate": 2.5975266087113015e-06,
      "loss": 1.1623,
      "step": 2465
    },
    {
      "epoch": 0.7904,
      "grad_norm": 6.611440556841849,
      "learning_rate": 2.5600853696062766e-06,
      "loss": 1.2767,
      "step": 2470
    },
    {
      "epoch": 0.792,
      "grad_norm": 6.563637825114816,
      "learning_rate": 2.5228762843801047e-06,
      "loss": 1.255,
      "step": 2475
    },
    {
      "epoch": 0.7936,
      "grad_norm": 7.2147131416231325,
      "learning_rate": 2.485900514098991e-06,
      "loss": 1.1983,
      "step": 2480
    },
    {
      "epoch": 0.7952,
      "grad_norm": 6.707678159171325,
      "learning_rate": 2.4491592125488206e-06,
      "loss": 1.0984,
      "step": 2485
    },
    {
      "epoch": 0.7968,
      "grad_norm": 6.811597254056078,
      "learning_rate": 2.4126535261991577e-06,
      "loss": 1.256,
      "step": 2490
    },
    {
      "epoch": 0.7984,
      "grad_norm": 7.126090842225368,
      "learning_rate": 2.3763845941674703e-06,
      "loss": 1.0681,
      "step": 2495
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.3532700559553215,
      "learning_rate": 2.340353548183575e-06,
      "loss": 1.318,
      "step": 2500
    },
    {
      "epoch": 0.8016,
      "grad_norm": 7.106651921925259,
      "learning_rate": 2.3045615125543353e-06,
      "loss": 1.1499,
      "step": 2505
    },
    {
      "epoch": 0.8032,
      "grad_norm": 6.895206052430061,
      "learning_rate": 2.2690096041285757e-06,
      "loss": 1.2491,
      "step": 2510
    },
    {
      "epoch": 0.8048,
      "grad_norm": 7.077398116538314,
      "learning_rate": 2.2336989322622306e-06,
      "loss": 1.1645,
      "step": 2515
    },
    {
      "epoch": 0.8064,
      "grad_norm": 7.163449736184592,
      "learning_rate": 2.198630598783723e-06,
      "loss": 1.2058,
      "step": 2520
    },
    {
      "epoch": 0.808,
      "grad_norm": 7.411419676699458,
      "learning_rate": 2.1638056979596012e-06,
      "loss": 1.2915,
      "step": 2525
    },
    {
      "epoch": 0.8096,
      "grad_norm": 6.83424681277287,
      "learning_rate": 2.1292253164603673e-06,
      "loss": 1.236,
      "step": 2530
    },
    {
      "epoch": 0.8112,
      "grad_norm": 7.517941880919823,
      "learning_rate": 2.094890533326589e-06,
      "loss": 1.215,
      "step": 2535
    },
    {
      "epoch": 0.8128,
      "grad_norm": 7.707098663070382,
      "learning_rate": 2.0608024199352216e-06,
      "loss": 1.2385,
      "step": 2540
    },
    {
      "epoch": 0.8144,
      "grad_norm": 7.918662426971106,
      "learning_rate": 2.026962039966176e-06,
      "loss": 1.1269,
      "step": 2545
    },
    {
      "epoch": 0.816,
      "grad_norm": 8.318700772754628,
      "learning_rate": 1.9933704493691354e-06,
      "loss": 1.1145,
      "step": 2550
    },
    {
      "epoch": 0.8176,
      "grad_norm": 7.58106431146203,
      "learning_rate": 1.960028696330596e-06,
      "loss": 1.2279,
      "step": 2555
    },
    {
      "epoch": 0.8192,
      "grad_norm": 7.987545520077578,
      "learning_rate": 1.926937821241164e-06,
      "loss": 1.1885,
      "step": 2560
    },
    {
      "epoch": 0.8208,
      "grad_norm": 7.251449924956015,
      "learning_rate": 1.8940988566630903e-06,
      "loss": 1.2175,
      "step": 2565
    },
    {
      "epoch": 0.8224,
      "grad_norm": 6.84899161592509,
      "learning_rate": 1.861512827298051e-06,
      "loss": 1.0726,
      "step": 2570
    },
    {
      "epoch": 0.824,
      "grad_norm": 7.124772325379492,
      "learning_rate": 1.8291807499551772e-06,
      "loss": 1.238,
      "step": 2575
    },
    {
      "epoch": 0.8256,
      "grad_norm": 7.245848004677167,
      "learning_rate": 1.7971036335193249e-06,
      "loss": 1.1557,
      "step": 2580
    },
    {
      "epoch": 0.8272,
      "grad_norm": 8.125243585964292,
      "learning_rate": 1.7652824789195811e-06,
      "loss": 1.2371,
      "step": 2585
    },
    {
      "epoch": 0.8288,
      "grad_norm": 7.1050108164531975,
      "learning_rate": 1.73371827909805e-06,
      "loss": 1.2496,
      "step": 2590
    },
    {
      "epoch": 0.8304,
      "grad_norm": 6.936091197387642,
      "learning_rate": 1.7024120189788573e-06,
      "loss": 1.136,
      "step": 2595
    },
    {
      "epoch": 0.832,
      "grad_norm": 7.21210256556138,
      "learning_rate": 1.6713646754374225e-06,
      "loss": 1.1357,
      "step": 2600
    },
    {
      "epoch": 0.8336,
      "grad_norm": 6.3383661134472264,
      "learning_rate": 1.6405772172699696e-06,
      "loss": 1.1153,
      "step": 2605
    },
    {
      "epoch": 0.8352,
      "grad_norm": 8.718135834059972,
      "learning_rate": 1.6100506051633136e-06,
      "loss": 1.1553,
      "step": 2610
    },
    {
      "epoch": 0.8368,
      "grad_norm": 8.95033808717262,
      "learning_rate": 1.5797857916648596e-06,
      "loss": 1.2361,
      "step": 2615
    },
    {
      "epoch": 0.8384,
      "grad_norm": 6.673199959521796,
      "learning_rate": 1.5497837211528965e-06,
      "loss": 1.185,
      "step": 2620
    },
    {
      "epoch": 0.84,
      "grad_norm": 6.696629258511363,
      "learning_rate": 1.5200453298071238e-06,
      "loss": 1.2785,
      "step": 2625
    },
    {
      "epoch": 0.8416,
      "grad_norm": 7.572117970721777,
      "learning_rate": 1.4905715455794379e-06,
      "loss": 1.2972,
      "step": 2630
    },
    {
      "epoch": 0.8432,
      "grad_norm": 9.096173969266975,
      "learning_rate": 1.461363288164983e-06,
      "loss": 1.262,
      "step": 2635
    },
    {
      "epoch": 0.8448,
      "grad_norm": 7.743393031922831,
      "learning_rate": 1.432421468973444e-06,
      "loss": 1.1165,
      "step": 2640
    },
    {
      "epoch": 0.8464,
      "grad_norm": 7.2853153371541675,
      "learning_rate": 1.4037469911006096e-06,
      "loss": 1.2781,
      "step": 2645
    },
    {
      "epoch": 0.848,
      "grad_norm": 7.4707067773918965,
      "learning_rate": 1.3753407493001968e-06,
      "loss": 1.2033,
      "step": 2650
    },
    {
      "epoch": 0.8496,
      "grad_norm": 6.265617058277776,
      "learning_rate": 1.3472036299559255e-06,
      "loss": 1.1115,
      "step": 2655
    },
    {
      "epoch": 0.8512,
      "grad_norm": 6.784902517223355,
      "learning_rate": 1.3193365110538647e-06,
      "loss": 1.218,
      "step": 2660
    },
    {
      "epoch": 0.8528,
      "grad_norm": 6.719604118534906,
      "learning_rate": 1.2917402621550369e-06,
      "loss": 1.0628,
      "step": 2665
    },
    {
      "epoch": 0.8544,
      "grad_norm": 9.25094531147658,
      "learning_rate": 1.2644157443682737e-06,
      "loss": 1.2604,
      "step": 2670
    },
    {
      "epoch": 0.856,
      "grad_norm": 7.5401988168853125,
      "learning_rate": 1.23736381032336e-06,
      "loss": 1.2161,
      "step": 2675
    },
    {
      "epoch": 0.8576,
      "grad_norm": 5.8986225646917845,
      "learning_rate": 1.2105853041444172e-06,
      "loss": 1.1727,
      "step": 2680
    },
    {
      "epoch": 0.8592,
      "grad_norm": 7.05150428234311,
      "learning_rate": 1.184081061423572e-06,
      "loss": 1.2429,
      "step": 2685
    },
    {
      "epoch": 0.8608,
      "grad_norm": 7.8831648202761055,
      "learning_rate": 1.157851909194876e-06,
      "loss": 1.1535,
      "step": 2690
    },
    {
      "epoch": 0.8624,
      "grad_norm": 6.764706659905809,
      "learning_rate": 1.1318986659085062e-06,
      "loss": 1.2213,
      "step": 2695
    },
    {
      "epoch": 0.864,
      "grad_norm": 7.091883009667597,
      "learning_rate": 1.10622214140522e-06,
      "loss": 1.2937,
      "step": 2700
    },
    {
      "epoch": 0.8656,
      "grad_norm": 8.205435093308155,
      "learning_rate": 1.080823136891086e-06,
      "loss": 1.0753,
      "step": 2705
    },
    {
      "epoch": 0.8672,
      "grad_norm": 7.707709432086696,
      "learning_rate": 1.0557024449124854e-06,
      "loss": 1.22,
      "step": 2710
    },
    {
      "epoch": 0.8688,
      "grad_norm": 7.701900190738418,
      "learning_rate": 1.0308608493313776e-06,
      "loss": 1.2737,
      "step": 2715
    },
    {
      "epoch": 0.8704,
      "grad_norm": 7.957037822325651,
      "learning_rate": 1.0062991253008525e-06,
      "loss": 1.1962,
      "step": 2720
    },
    {
      "epoch": 0.872,
      "grad_norm": 7.098180709287834,
      "learning_rate": 9.820180392409252e-07,
      "loss": 1.2093,
      "step": 2725
    },
    {
      "epoch": 0.8736,
      "grad_norm": 7.610067003127613,
      "learning_rate": 9.580183488146323e-07,
      "loss": 1.2104,
      "step": 2730
    },
    {
      "epoch": 0.8752,
      "grad_norm": 7.399913871553443,
      "learning_rate": 9.343008029043876e-07,
      "loss": 1.2166,
      "step": 2735
    },
    {
      "epoch": 0.8768,
      "grad_norm": 7.845545343933413,
      "learning_rate": 9.108661415886111e-07,
      "loss": 1.224,
      "step": 2740
    },
    {
      "epoch": 0.8784,
      "grad_norm": 8.250129826284104,
      "learning_rate": 8.87715096118642e-07,
      "loss": 1.2828,
      "step": 2745
    },
    {
      "epoch": 0.88,
      "grad_norm": 7.754702962013714,
      "learning_rate": 8.64848388895917e-07,
      "loss": 1.2148,
      "step": 2750
    },
    {
      "epoch": 0.8816,
      "grad_norm": 7.13532064965064,
      "learning_rate": 8.42266733449425e-07,
      "loss": 1.1721,
      "step": 2755
    },
    {
      "epoch": 0.8832,
      "grad_norm": 7.085924370440946,
      "learning_rate": 8.199708344134493e-07,
      "loss": 1.1104,
      "step": 2760
    },
    {
      "epoch": 0.8848,
      "grad_norm": 7.73698788059968,
      "learning_rate": 7.979613875055736e-07,
      "loss": 1.2089,
      "step": 2765
    },
    {
      "epoch": 0.8864,
      "grad_norm": 7.791514673450203,
      "learning_rate": 7.76239079504979e-07,
      "loss": 1.2016,
      "step": 2770
    },
    {
      "epoch": 0.888,
      "grad_norm": 6.7303438186194855,
      "learning_rate": 7.548045882310084e-07,
      "loss": 1.0739,
      "step": 2775
    },
    {
      "epoch": 0.8896,
      "grad_norm": 6.776201230293054,
      "learning_rate": 7.336585825220244e-07,
      "loss": 1.1817,
      "step": 2780
    },
    {
      "epoch": 0.8912,
      "grad_norm": 7.513647159391496,
      "learning_rate": 7.128017222145267e-07,
      "loss": 1.1978,
      "step": 2785
    },
    {
      "epoch": 0.8928,
      "grad_norm": 7.203824082806031,
      "learning_rate": 6.922346581225725e-07,
      "loss": 1.1137,
      "step": 2790
    },
    {
      "epoch": 0.8944,
      "grad_norm": 7.674190673017307,
      "learning_rate": 6.719580320174657e-07,
      "loss": 1.1589,
      "step": 2795
    },
    {
      "epoch": 0.896,
      "grad_norm": 7.427895020925505,
      "learning_rate": 6.519724766077262e-07,
      "loss": 1.1643,
      "step": 2800
    },
    {
      "epoch": 0.8976,
      "grad_norm": 7.522555453911278,
      "learning_rate": 6.322786155193594e-07,
      "loss": 1.1759,
      "step": 2805
    },
    {
      "epoch": 0.8992,
      "grad_norm": 7.160309775741213,
      "learning_rate": 6.128770632763825e-07,
      "loss": 1.1401,
      "step": 2810
    },
    {
      "epoch": 0.9008,
      "grad_norm": 7.064709958323231,
      "learning_rate": 5.937684252816578e-07,
      "loss": 1.2335,
      "step": 2815
    },
    {
      "epoch": 0.9024,
      "grad_norm": 7.336329439580518,
      "learning_rate": 5.749532977979977e-07,
      "loss": 1.1347,
      "step": 2820
    },
    {
      "epoch": 0.904,
      "grad_norm": 7.690187518015022,
      "learning_rate": 5.564322679295619e-07,
      "loss": 1.1116,
      "step": 2825
    },
    {
      "epoch": 0.9056,
      "grad_norm": 7.108832953401795,
      "learning_rate": 5.382059136035389e-07,
      "loss": 1.1324,
      "step": 2830
    },
    {
      "epoch": 0.9072,
      "grad_norm": 7.578023550663353,
      "learning_rate": 5.202748035521021e-07,
      "loss": 1.1919,
      "step": 2835
    },
    {
      "epoch": 0.9088,
      "grad_norm": 6.903932907016643,
      "learning_rate": 5.026394972946813e-07,
      "loss": 1.1659,
      "step": 2840
    },
    {
      "epoch": 0.9104,
      "grad_norm": 6.079707510002676,
      "learning_rate": 4.85300545120484e-07,
      "loss": 1.2324,
      "step": 2845
    },
    {
      "epoch": 0.912,
      "grad_norm": 7.452754847950774,
      "learning_rate": 4.6825848807133813e-07,
      "loss": 1.2998,
      "step": 2850
    },
    {
      "epoch": 0.9136,
      "grad_norm": 6.122150122845729,
      "learning_rate": 4.515138579248035e-07,
      "loss": 1.2366,
      "step": 2855
    },
    {
      "epoch": 0.9152,
      "grad_norm": 8.029542518584094,
      "learning_rate": 4.350671771775772e-07,
      "loss": 1.1056,
      "step": 2860
    },
    {
      "epoch": 0.9168,
      "grad_norm": 5.867322954044555,
      "learning_rate": 4.189189590291975e-07,
      "loss": 1.2582,
      "step": 2865
    },
    {
      "epoch": 0.9184,
      "grad_norm": 8.148008523277348,
      "learning_rate": 4.030697073660217e-07,
      "loss": 1.2402,
      "step": 2870
    },
    {
      "epoch": 0.92,
      "grad_norm": 6.608369807826164,
      "learning_rate": 3.875199167455035e-07,
      "loss": 1.2935,
      "step": 2875
    },
    {
      "epoch": 0.9216,
      "grad_norm": 6.225848002947905,
      "learning_rate": 3.7227007238076596e-07,
      "loss": 1.1169,
      "step": 2880
    },
    {
      "epoch": 0.9232,
      "grad_norm": 7.4677935130036435,
      "learning_rate": 3.573206501254556e-07,
      "loss": 1.2367,
      "step": 2885
    },
    {
      "epoch": 0.9248,
      "grad_norm": 6.610580655839769,
      "learning_rate": 3.4267211645890306e-07,
      "loss": 1.2118,
      "step": 2890
    },
    {
      "epoch": 0.9264,
      "grad_norm": 5.9140036705595564,
      "learning_rate": 3.283249284715528e-07,
      "loss": 1.1834,
      "step": 2895
    },
    {
      "epoch": 0.928,
      "grad_norm": 6.959822912506896,
      "learning_rate": 3.1427953385071207e-07,
      "loss": 1.2302,
      "step": 2900
    },
    {
      "epoch": 0.9296,
      "grad_norm": 7.043289170717119,
      "learning_rate": 3.005363708665765e-07,
      "loss": 1.0955,
      "step": 2905
    },
    {
      "epoch": 0.9312,
      "grad_norm": 6.32656303603902,
      "learning_rate": 2.870958683585545e-07,
      "loss": 1.1267,
      "step": 2910
    },
    {
      "epoch": 0.9328,
      "grad_norm": 6.352958182641917,
      "learning_rate": 2.7395844572188915e-07,
      "loss": 1.1378,
      "step": 2915
    },
    {
      "epoch": 0.9344,
      "grad_norm": 7.408074075398155,
      "learning_rate": 2.6112451289456495e-07,
      "loss": 1.159,
      "step": 2920
    },
    {
      "epoch": 0.936,
      "grad_norm": 6.867410397272755,
      "learning_rate": 2.4859447034452424e-07,
      "loss": 1.1076,
      "step": 2925
    },
    {
      "epoch": 0.9376,
      "grad_norm": 7.248191222610278,
      "learning_rate": 2.3636870905716424e-07,
      "loss": 1.0558,
      "step": 2930
    },
    {
      "epoch": 0.9392,
      "grad_norm": 6.2060869333572946,
      "learning_rate": 2.2444761052313857e-07,
      "loss": 1.0717,
      "step": 2935
    },
    {
      "epoch": 0.9408,
      "grad_norm": 7.123428377796361,
      "learning_rate": 2.1283154672645522e-07,
      "loss": 1.2433,
      "step": 2940
    },
    {
      "epoch": 0.9424,
      "grad_norm": 7.587226030653606,
      "learning_rate": 2.015208801328694e-07,
      "loss": 1.137,
      "step": 2945
    },
    {
      "epoch": 0.944,
      "grad_norm": 8.023362652127654,
      "learning_rate": 1.905159636785714e-07,
      "loss": 1.1554,
      "step": 2950
    },
    {
      "epoch": 0.9456,
      "grad_norm": 7.1454832029115485,
      "learning_rate": 1.79817140759172e-07,
      "loss": 1.1245,
      "step": 2955
    },
    {
      "epoch": 0.9472,
      "grad_norm": 6.22220394431011,
      "learning_rate": 1.6942474521899232e-07,
      "loss": 1.1412,
      "step": 2960
    },
    {
      "epoch": 0.9488,
      "grad_norm": 6.91072459977162,
      "learning_rate": 1.5933910134064202e-07,
      "loss": 1.1616,
      "step": 2965
    },
    {
      "epoch": 0.9504,
      "grad_norm": 7.785687521486665,
      "learning_rate": 1.4956052383490295e-07,
      "loss": 1.2241,
      "step": 2970
    },
    {
      "epoch": 0.952,
      "grad_norm": 6.558516061888123,
      "learning_rate": 1.4008931783090707e-07,
      "loss": 1.1245,
      "step": 2975
    },
    {
      "epoch": 0.9536,
      "grad_norm": 7.69748620578705,
      "learning_rate": 1.309257788666174e-07,
      "loss": 1.0904,
      "step": 2980
    },
    {
      "epoch": 0.9552,
      "grad_norm": 7.747365211882861,
      "learning_rate": 1.220701928796042e-07,
      "loss": 1.1514,
      "step": 2985
    },
    {
      "epoch": 0.9568,
      "grad_norm": 6.611064880165885,
      "learning_rate": 1.1352283619812443e-07,
      "loss": 1.0847,
      "step": 2990
    },
    {
      "epoch": 0.9584,
      "grad_norm": 7.002989095386839,
      "learning_rate": 1.0528397553249636e-07,
      "loss": 1.1879,
      "step": 2995
    },
    {
      "epoch": 0.96,
      "grad_norm": 7.0152838160784805,
      "learning_rate": 9.73538679667807e-08,
      "loss": 1.2758,
      "step": 3000
    },
    {
      "epoch": 0.9616,
      "grad_norm": 7.094274121519631,
      "learning_rate": 8.97327609507559e-08,
      "loss": 1.1738,
      "step": 3005
    },
    {
      "epoch": 0.9632,
      "grad_norm": 8.216237482299412,
      "learning_rate": 8.242089229219984e-08,
      "loss": 1.1826,
      "step": 3010
    },
    {
      "epoch": 0.9648,
      "grad_norm": 7.697329542588331,
      "learning_rate": 7.541849014946479e-08,
      "loss": 1.1891,
      "step": 3015
    },
    {
      "epoch": 0.9664,
      "grad_norm": 5.975230690605909,
      "learning_rate": 6.872577302436179e-08,
      "loss": 1.2118,
      "step": 3020
    },
    {
      "epoch": 0.968,
      "grad_norm": 6.8498253272657434,
      "learning_rate": 6.234294975534183e-08,
      "loss": 1.0708,
      "step": 3025
    },
    {
      "epoch": 0.9696,
      "grad_norm": 7.431417632188548,
      "learning_rate": 5.6270219510975445e-08,
      "loss": 1.2071,
      "step": 3030
    },
    {
      "epoch": 0.9712,
      "grad_norm": 7.544413890406052,
      "learning_rate": 5.050777178374544e-08,
      "loss": 1.1812,
      "step": 3035
    },
    {
      "epoch": 0.9728,
      "grad_norm": 7.824359941406518,
      "learning_rate": 4.505578638412722e-08,
      "loss": 1.1669,
      "step": 3040
    },
    {
      "epoch": 0.9744,
      "grad_norm": 7.681706989711725,
      "learning_rate": 3.9914433434982135e-08,
      "loss": 1.1548,
      "step": 3045
    },
    {
      "epoch": 0.976,
      "grad_norm": 6.900708976315526,
      "learning_rate": 3.508387336624619e-08,
      "loss": 1.0863,
      "step": 3050
    },
    {
      "epoch": 0.9776,
      "grad_norm": 8.898326692320722,
      "learning_rate": 3.056425690992404e-08,
      "loss": 1.2441,
      "step": 3055
    },
    {
      "epoch": 0.9792,
      "grad_norm": 6.965073943785161,
      "learning_rate": 2.6355725095389416e-08,
      "loss": 1.2815,
      "step": 3060
    },
    {
      "epoch": 0.9808,
      "grad_norm": 8.318803563609041,
      "learning_rate": 2.2458409244979772e-08,
      "loss": 1.266,
      "step": 3065
    },
    {
      "epoch": 0.9824,
      "grad_norm": 7.233363975555559,
      "learning_rate": 1.8872430969901766e-08,
      "loss": 1.227,
      "step": 3070
    },
    {
      "epoch": 0.984,
      "grad_norm": 7.484219660419537,
      "learning_rate": 1.559790216643542e-08,
      "loss": 1.1119,
      "step": 3075
    },
    {
      "epoch": 0.9856,
      "grad_norm": 6.4306561615634426,
      "learning_rate": 1.2634925012440235e-08,
      "loss": 1.1649,
      "step": 3080
    },
    {
      "epoch": 0.9872,
      "grad_norm": 7.328910487014067,
      "learning_rate": 9.983591964171091e-09,
      "loss": 1.2878,
      "step": 3085
    },
    {
      "epoch": 0.9888,
      "grad_norm": 7.278976626995467,
      "learning_rate": 7.643985753390537e-09,
      "loss": 1.0835,
      "step": 3090
    },
    {
      "epoch": 0.9904,
      "grad_norm": 7.408823192054949,
      "learning_rate": 5.616179384788645e-09,
      "loss": 1.2181,
      "step": 3095
    },
    {
      "epoch": 0.992,
      "grad_norm": 7.7605788783327485,
      "learning_rate": 3.900236133703717e-09,
      "loss": 1.1201,
      "step": 3100
    },
    {
      "epoch": 0.9936,
      "grad_norm": 7.585618897897224,
      "learning_rate": 2.496209544147199e-09,
      "loss": 1.2209,
      "step": 3105
    },
    {
      "epoch": 0.9952,
      "grad_norm": 7.2343937302933,
      "learning_rate": 1.4041434271350184e-09,
      "loss": 1.1871,
      "step": 3110
    },
    {
      "epoch": 0.9968,
      "grad_norm": 7.2380371709860265,
      "learning_rate": 6.240718593208961e-10,
      "loss": 1.0477,
      "step": 3115
    },
    {
      "epoch": 0.9984,
      "grad_norm": 7.4068514185411995,
      "learning_rate": 1.5601918192942322e-10,
      "loss": 1.1603,
      "step": 3120
    },
    {
      "epoch": 1.0,
      "grad_norm": 7.2891019708519105,
      "learning_rate": 0.0,
      "loss": 1.1876,
      "step": 3125
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.1837999820709229,
      "eval_runtime": 37.7062,
      "eval_samples_per_second": 13.26,
      "eval_steps_per_second": 0.849,
      "step": 3125
    },
    {
      "epoch": 1.0,
      "step": 3125,
      "total_flos": 40105673687040.0,
      "train_loss": 1.4248850550079346,
      "train_runtime": 11739.5291,
      "train_samples_per_second": 4.259,
      "train_steps_per_second": 0.266
    }
  ],
  "logging_steps": 5,
  "max_steps": 3125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 40105673687040.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}