|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.20783550986385108, |
|
"eval_steps": 34, |
|
"global_step": 374, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005557099194220616, |
|
"eval_loss": 2.150275230407715, |
|
"eval_runtime": 385.5335, |
|
"eval_samples_per_second": 7.862, |
|
"eval_steps_per_second": 0.983, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0016671297582661851, |
|
"grad_norm": 0.9782189726829529, |
|
"learning_rate": 1.5e-05, |
|
"loss": 8.4821, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0033342595165323703, |
|
"grad_norm": 9.340497970581055, |
|
"learning_rate": 3e-05, |
|
"loss": 8.0841, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.005001389274798555, |
|
"grad_norm": 1.2668017148971558, |
|
"learning_rate": 4.5e-05, |
|
"loss": 9.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0066685190330647405, |
|
"grad_norm": 1.2630752325057983, |
|
"learning_rate": 4.999675562428437e-05, |
|
"loss": 8.5273, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.008335648791330925, |
|
"grad_norm": 1.4245824813842773, |
|
"learning_rate": 4.9979724954289244e-05, |
|
"loss": 8.1841, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.01000277854959711, |
|
"grad_norm": 1.4201760292053223, |
|
"learning_rate": 4.994810682835951e-05, |
|
"loss": 7.569, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.011669908307863295, |
|
"grad_norm": 1.8734160661697388, |
|
"learning_rate": 4.990191971059033e-05, |
|
"loss": 7.0022, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.013337038066129481, |
|
"grad_norm": 1.1166455745697021, |
|
"learning_rate": 4.984119057295783e-05, |
|
"loss": 6.704, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.015004167824395665, |
|
"grad_norm": 1.3420771360397339, |
|
"learning_rate": 4.976595487956823e-05, |
|
"loss": 7.354, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.01667129758266185, |
|
"grad_norm": 1.4245537519454956, |
|
"learning_rate": 4.967625656594782e-05, |
|
"loss": 6.7949, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.018338427340928037, |
|
"grad_norm": 1.3899712562561035, |
|
"learning_rate": 4.957214801338581e-05, |
|
"loss": 6.8879, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.018894137260350097, |
|
"eval_loss": 1.6003342866897583, |
|
"eval_runtime": 387.5811, |
|
"eval_samples_per_second": 7.82, |
|
"eval_steps_per_second": 0.978, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.02000555709919422, |
|
"grad_norm": 1.2631561756134033, |
|
"learning_rate": 4.9453690018345144e-05, |
|
"loss": 6.9872, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.021672686857460405, |
|
"grad_norm": 1.1402790546417236, |
|
"learning_rate": 4.932095175695911e-05, |
|
"loss": 6.2949, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.02333981661572659, |
|
"grad_norm": 1.0827226638793945, |
|
"learning_rate": 4.917401074463441e-05, |
|
"loss": 6.049, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.025006946373992776, |
|
"grad_norm": 1.1316773891448975, |
|
"learning_rate": 4.901295279078431e-05, |
|
"loss": 6.1119, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.026674076132258962, |
|
"grad_norm": 1.0732563734054565, |
|
"learning_rate": 4.883787194871841e-05, |
|
"loss": 5.899, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.028341205890525144, |
|
"grad_norm": 1.0603082180023193, |
|
"learning_rate": 4.864887046071813e-05, |
|
"loss": 5.7208, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.03000833564879133, |
|
"grad_norm": 1.1684924364089966, |
|
"learning_rate": 4.8446058698330115e-05, |
|
"loss": 6.4161, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.03167546540705751, |
|
"grad_norm": 1.147234320640564, |
|
"learning_rate": 4.822955509791233e-05, |
|
"loss": 6.0586, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0333425951653237, |
|
"grad_norm": 1.4725079536437988, |
|
"learning_rate": 4.799948609147061e-05, |
|
"loss": 6.2541, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.035009724923589884, |
|
"grad_norm": 1.3403408527374268, |
|
"learning_rate": 4.7755986032825864e-05, |
|
"loss": 6.0201, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.03667685468185607, |
|
"grad_norm": 1.272765040397644, |
|
"learning_rate": 4.74991971191553e-05, |
|
"loss": 5.6225, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.037788274520700195, |
|
"eval_loss": 1.4521716833114624, |
|
"eval_runtime": 387.3586, |
|
"eval_samples_per_second": 7.825, |
|
"eval_steps_per_second": 0.978, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.038343984440122256, |
|
"grad_norm": 1.3769108057022095, |
|
"learning_rate": 4.7229269307953235e-05, |
|
"loss": 5.515, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.04001111419838844, |
|
"grad_norm": 1.1973274946212769, |
|
"learning_rate": 4.694636022946012e-05, |
|
"loss": 5.7673, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.04167824395665463, |
|
"grad_norm": 1.3785252571105957, |
|
"learning_rate": 4.665063509461097e-05, |
|
"loss": 5.945, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.04334537371492081, |
|
"grad_norm": 1.3699363470077515, |
|
"learning_rate": 4.6342266598556814e-05, |
|
"loss": 5.5223, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.045012503473187, |
|
"grad_norm": 1.2516443729400635, |
|
"learning_rate": 4.6021434819815555e-05, |
|
"loss": 5.6879, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.04667963323145318, |
|
"grad_norm": 1.6110479831695557, |
|
"learning_rate": 4.568832711511125e-05, |
|
"loss": 5.6658, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.048346762989719363, |
|
"grad_norm": 1.2466659545898438, |
|
"learning_rate": 4.534313800996299e-05, |
|
"loss": 5.6652, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.05001389274798555, |
|
"grad_norm": 1.5405720472335815, |
|
"learning_rate": 4.498606908508754e-05, |
|
"loss": 5.4541, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.051681022506251735, |
|
"grad_norm": 1.4913195371627808, |
|
"learning_rate": 4.46173288586818e-05, |
|
"loss": 6.3391, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.053348152264517924, |
|
"grad_norm": 1.4445130825042725, |
|
"learning_rate": 4.4237132664654154e-05, |
|
"loss": 5.5362, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.05501528202278411, |
|
"grad_norm": 1.4353359937667847, |
|
"learning_rate": 4.384570252687542e-05, |
|
"loss": 5.6292, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.05668241178105029, |
|
"grad_norm": 1.515329122543335, |
|
"learning_rate": 4.344326702952326e-05, |
|
"loss": 5.9767, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.05668241178105029, |
|
"eval_loss": 1.3901065587997437, |
|
"eval_runtime": 387.1962, |
|
"eval_samples_per_second": 7.828, |
|
"eval_steps_per_second": 0.979, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.05834954153931648, |
|
"grad_norm": 1.3371084928512573, |
|
"learning_rate": 4.303006118359537e-05, |
|
"loss": 5.0829, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06001667129758266, |
|
"grad_norm": 1.3986601829528809, |
|
"learning_rate": 4.260632628966974e-05, |
|
"loss": 4.9783, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.06168380105584885, |
|
"grad_norm": 1.6927534341812134, |
|
"learning_rate": 4.217230979699188e-05, |
|
"loss": 5.5207, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.06335093081411503, |
|
"grad_norm": 1.4875972270965576, |
|
"learning_rate": 4.172826515897146e-05, |
|
"loss": 4.9577, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.06501806057238121, |
|
"grad_norm": 1.5819252729415894, |
|
"learning_rate": 4.12744516851726e-05, |
|
"loss": 5.5329, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.0666851903306474, |
|
"grad_norm": 1.7034679651260376, |
|
"learning_rate": 4.0811134389884433e-05, |
|
"loss": 5.8642, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06835232008891359, |
|
"grad_norm": 1.4892022609710693, |
|
"learning_rate": 4.0338583837360225e-05, |
|
"loss": 4.9988, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.07001944984717977, |
|
"grad_norm": 1.8653110265731812, |
|
"learning_rate": 3.985707598381544e-05, |
|
"loss": 5.5333, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.07168657960544596, |
|
"grad_norm": 1.6267685890197754, |
|
"learning_rate": 3.9366892016277096e-05, |
|
"loss": 5.5853, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.07335370936371215, |
|
"grad_norm": 1.710310935974121, |
|
"learning_rate": 3.886831818837847e-05, |
|
"loss": 5.5769, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.07502083912197832, |
|
"grad_norm": 2.016270160675049, |
|
"learning_rate": 3.8361645653195026e-05, |
|
"loss": 5.5546, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.07557654904140039, |
|
"eval_loss": 1.3533298969268799, |
|
"eval_runtime": 387.1489, |
|
"eval_samples_per_second": 7.829, |
|
"eval_steps_per_second": 0.979, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.07668796888024451, |
|
"grad_norm": 2.021454095840454, |
|
"learning_rate": 3.784717029321922e-05, |
|
"loss": 5.2784, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.0783550986385107, |
|
"grad_norm": 1.5546106100082397, |
|
"learning_rate": 3.732519254757344e-05, |
|
"loss": 5.21, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.08002222839677688, |
|
"grad_norm": 1.662156343460083, |
|
"learning_rate": 3.679601723656205e-05, |
|
"loss": 5.3778, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.08168935815504307, |
|
"grad_norm": 1.6201971769332886, |
|
"learning_rate": 3.625995338366492e-05, |
|
"loss": 5.4251, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.08335648791330925, |
|
"grad_norm": 1.6661242246627808, |
|
"learning_rate": 3.5717314035076355e-05, |
|
"loss": 5.292, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08502361767157544, |
|
"grad_norm": 1.595383644104004, |
|
"learning_rate": 3.516841607689501e-05, |
|
"loss": 5.128, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.08669074742984162, |
|
"grad_norm": 1.7640243768692017, |
|
"learning_rate": 3.461358005007128e-05, |
|
"loss": 5.2638, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.08835787718810781, |
|
"grad_norm": 2.1148338317871094, |
|
"learning_rate": 3.405312996322042e-05, |
|
"loss": 5.1089, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.090025006946374, |
|
"grad_norm": 1.9622715711593628, |
|
"learning_rate": 3.348739310341068e-05, |
|
"loss": 4.7301, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.09169213670464017, |
|
"grad_norm": 1.920733094215393, |
|
"learning_rate": 3.2916699845036816e-05, |
|
"loss": 5.0066, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.09335926646290636, |
|
"grad_norm": 1.6611617803573608, |
|
"learning_rate": 3.234138345689077e-05, |
|
"loss": 5.198, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.09447068630175048, |
|
"eval_loss": 1.3297733068466187, |
|
"eval_runtime": 387.3661, |
|
"eval_samples_per_second": 7.825, |
|
"eval_steps_per_second": 0.978, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09502639622117255, |
|
"grad_norm": 1.5192731618881226, |
|
"learning_rate": 3.17617799075421e-05, |
|
"loss": 4.9727, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.09669352597943873, |
|
"grad_norm": 2.1325037479400635, |
|
"learning_rate": 3.1178227669141744e-05, |
|
"loss": 5.287, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.09836065573770492, |
|
"grad_norm": 1.6394548416137695, |
|
"learning_rate": 3.0591067519763895e-05, |
|
"loss": 5.0878, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.1000277854959711, |
|
"grad_norm": 1.954785704612732, |
|
"learning_rate": 3.0000642344401113e-05, |
|
"loss": 5.7474, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1016949152542373, |
|
"grad_norm": 1.7333064079284668, |
|
"learning_rate": 2.9407296934729227e-05, |
|
"loss": 5.2069, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.10336204501250347, |
|
"grad_norm": 1.7775465250015259, |
|
"learning_rate": 2.8811377787758636e-05, |
|
"loss": 4.8365, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.10502917477076966, |
|
"grad_norm": 1.766340970993042, |
|
"learning_rate": 2.8213232903489865e-05, |
|
"loss": 4.8806, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.10669630452903585, |
|
"grad_norm": 2.064275026321411, |
|
"learning_rate": 2.761321158169134e-05, |
|
"loss": 5.1876, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.10836343428730202, |
|
"grad_norm": 1.731985330581665, |
|
"learning_rate": 2.7011664217918154e-05, |
|
"loss": 4.6924, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.11003056404556821, |
|
"grad_norm": 1.8852187395095825, |
|
"learning_rate": 2.6408942098890936e-05, |
|
"loss": 5.0911, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.1116976938038344, |
|
"grad_norm": 1.8446505069732666, |
|
"learning_rate": 2.580539719735433e-05, |
|
"loss": 5.0379, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.11336482356210058, |
|
"grad_norm": 1.863871455192566, |
|
"learning_rate": 2.5201381966534748e-05, |
|
"loss": 5.3173, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.11336482356210058, |
|
"eval_loss": 1.3148518800735474, |
|
"eval_runtime": 387.4809, |
|
"eval_samples_per_second": 7.822, |
|
"eval_steps_per_second": 0.978, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.11503195332036677, |
|
"grad_norm": 2.1243629455566406, |
|
"learning_rate": 2.459724913431772e-05, |
|
"loss": 5.1268, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.11669908307863296, |
|
"grad_norm": 1.8287479877471924, |
|
"learning_rate": 2.399335149726463e-05, |
|
"loss": 4.8911, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11836621283689913, |
|
"grad_norm": 1.827951431274414, |
|
"learning_rate": 2.3390041714589514e-05, |
|
"loss": 5.0788, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.12003334259516532, |
|
"grad_norm": 1.9181324243545532, |
|
"learning_rate": 2.2787672102216042e-05, |
|
"loss": 5.2716, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.12170047235343151, |
|
"grad_norm": 2.334996461868286, |
|
"learning_rate": 2.2186594427034864e-05, |
|
"loss": 5.4529, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.1233676021116977, |
|
"grad_norm": 2.042280673980713, |
|
"learning_rate": 2.1587159701481716e-05, |
|
"loss": 4.8902, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.1250347318699639, |
|
"grad_norm": 1.9080718755722046, |
|
"learning_rate": 2.098971797855599e-05, |
|
"loss": 4.933, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.12670186162823005, |
|
"grad_norm": 1.7101670503616333, |
|
"learning_rate": 2.0394618147399713e-05, |
|
"loss": 5.0186, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.12836899138649624, |
|
"grad_norm": 2.011359453201294, |
|
"learning_rate": 1.980220772955602e-05, |
|
"loss": 4.7936, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.13003612114476243, |
|
"grad_norm": 2.302273750305176, |
|
"learning_rate": 1.921283267602643e-05, |
|
"loss": 5.1487, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.13170325090302862, |
|
"grad_norm": 2.4797189235687256, |
|
"learning_rate": 1.8626837165245165e-05, |
|
"loss": 5.2862, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.13225896082245067, |
|
"eval_loss": 1.303543210029602, |
|
"eval_runtime": 387.3037, |
|
"eval_samples_per_second": 7.826, |
|
"eval_steps_per_second": 0.979, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.1333703806612948, |
|
"grad_norm": 1.8348701000213623, |
|
"learning_rate": 1.8044563402088684e-05, |
|
"loss": 5.0605, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.135037510419561, |
|
"grad_norm": 2.109149217605591, |
|
"learning_rate": 1.746635141803761e-05, |
|
"loss": 4.9242, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.13670464017782719, |
|
"grad_norm": 2.1694352626800537, |
|
"learning_rate": 1.6892538872607937e-05, |
|
"loss": 5.0852, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.13837176993609335, |
|
"grad_norm": 2.145925998687744, |
|
"learning_rate": 1.6323460856167426e-05, |
|
"loss": 4.9473, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.14003889969435954, |
|
"grad_norm": 2.4143218994140625, |
|
"learning_rate": 1.5759449694252226e-05, |
|
"loss": 5.2909, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.14170602945262573, |
|
"grad_norm": 2.154897689819336, |
|
"learning_rate": 1.5200834753498128e-05, |
|
"loss": 5.2477, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.14337315921089192, |
|
"grad_norm": 2.189666986465454, |
|
"learning_rate": 1.4647942249299707e-05, |
|
"loss": 5.4482, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.1450402889691581, |
|
"grad_norm": 1.9108495712280273, |
|
"learning_rate": 1.4101095055309746e-05, |
|
"loss": 5.1046, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.1467074187274243, |
|
"grad_norm": 1.8444137573242188, |
|
"learning_rate": 1.356061251489012e-05, |
|
"loss": 5.0423, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.14837454848569048, |
|
"grad_norm": 2.031024694442749, |
|
"learning_rate": 1.302681025462424e-05, |
|
"loss": 4.5857, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.15004167824395664, |
|
"grad_norm": 2.1987197399139404, |
|
"learning_rate": 1.2500000000000006e-05, |
|
"loss": 5.381, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.15115309808280078, |
|
"eval_loss": 1.2963863611221313, |
|
"eval_runtime": 387.5334, |
|
"eval_samples_per_second": 7.821, |
|
"eval_steps_per_second": 0.978, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.15170880800222283, |
|
"grad_norm": 1.8340719938278198, |
|
"learning_rate": 1.1980489393370938e-05, |
|
"loss": 5.4333, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.15337593776048902, |
|
"grad_norm": 2.617314577102661, |
|
"learning_rate": 1.1468581814301717e-05, |
|
"loss": 5.757, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.1550430675187552, |
|
"grad_norm": 2.0526530742645264, |
|
"learning_rate": 1.096457620240298e-05, |
|
"loss": 4.9528, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.1567101972770214, |
|
"grad_norm": 2.3239846229553223, |
|
"learning_rate": 1.0468766882759094e-05, |
|
"loss": 5.2481, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.1583773270352876, |
|
"grad_norm": 2.1533966064453125, |
|
"learning_rate": 9.981443394050525e-06, |
|
"loss": 5.6509, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.16004445679355375, |
|
"grad_norm": 1.9592647552490234, |
|
"learning_rate": 9.502890319471491e-06, |
|
"loss": 4.9243, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.16171158655181994, |
|
"grad_norm": 2.204939126968384, |
|
"learning_rate": 9.033387120541306e-06, |
|
"loss": 5.2745, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.16337871631008613, |
|
"grad_norm": 2.236279010772705, |
|
"learning_rate": 8.573207973906735e-06, |
|
"loss": 5.4374, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.16504584606835232, |
|
"grad_norm": 2.4140145778656006, |
|
"learning_rate": 8.1226216112306e-06, |
|
"loss": 5.4428, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.1667129758266185, |
|
"grad_norm": 2.0701277256011963, |
|
"learning_rate": 7.681891162260015e-06, |
|
"loss": 5.4502, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1683801055848847, |
|
"grad_norm": 2.154461622238159, |
|
"learning_rate": 7.251274001166044e-06, |
|
"loss": 5.0715, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.1700472353431509, |
|
"grad_norm": 2.3085010051727295, |
|
"learning_rate": 6.831021596244424e-06, |
|
"loss": 4.8451, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.1700472353431509, |
|
"eval_loss": 1.2917685508728027, |
|
"eval_runtime": 387.2177, |
|
"eval_samples_per_second": 7.828, |
|
"eval_steps_per_second": 0.979, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.17171436510141705, |
|
"grad_norm": 2.220491886138916, |
|
"learning_rate": 6.421379363065142e-06, |
|
"loss": 5.3908, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.17338149485968324, |
|
"grad_norm": 2.2047910690307617, |
|
"learning_rate": 6.022586521156715e-06, |
|
"loss": 5.2721, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.17504862461794943, |
|
"grad_norm": 2.1623401641845703, |
|
"learning_rate": 5.634875954308638e-06, |
|
"loss": 5.5073, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.17671575437621562, |
|
"grad_norm": 1.9954192638397217, |
|
"learning_rate": 5.258474074573877e-06, |
|
"loss": 5.1791, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.1783828841344818, |
|
"grad_norm": 2.24808669090271, |
|
"learning_rate": 4.893600690050579e-06, |
|
"loss": 5.0704, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.180050013892748, |
|
"grad_norm": 2.2592039108276367, |
|
"learning_rate": 4.540468876520323e-06, |
|
"loss": 5.0177, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.18171714365101416, |
|
"grad_norm": 1.9192252159118652, |
|
"learning_rate": 4.199284853017896e-06, |
|
"loss": 5.2738, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.18338427340928035, |
|
"grad_norm": 2.021440267562866, |
|
"learning_rate": 3.8702478614051355e-06, |
|
"loss": 4.6439, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.18505140316754654, |
|
"grad_norm": 2.309406042098999, |
|
"learning_rate": 3.5535500500193357e-06, |
|
"loss": 5.4409, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.18671853292581272, |
|
"grad_norm": 2.2390878200531006, |
|
"learning_rate": 3.249376361464021e-06, |
|
"loss": 5.1074, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.18838566268407891, |
|
"grad_norm": 2.540015697479248, |
|
"learning_rate": 2.957904424607652e-06, |
|
"loss": 5.2675, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.18894137260350097, |
|
"eval_loss": 1.2895426750183105, |
|
"eval_runtime": 387.2989, |
|
"eval_samples_per_second": 7.826, |
|
"eval_steps_per_second": 0.979, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1900527924423451, |
|
"grad_norm": 2.492077112197876, |
|
"learning_rate": 2.679304450853401e-06, |
|
"loss": 5.211, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.1917199222006113, |
|
"grad_norm": 1.7696568965911865, |
|
"learning_rate": 2.4137391347404476e-06, |
|
"loss": 5.3133, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.19338705195887745, |
|
"grad_norm": 1.9813653230667114, |
|
"learning_rate": 2.1613635589349756e-06, |
|
"loss": 5.0046, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.19505418171714364, |
|
"grad_norm": 2.0188918113708496, |
|
"learning_rate": 1.922325103666281e-06, |
|
"loss": 5.1457, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.19672131147540983, |
|
"grad_norm": 2.231536865234375, |
|
"learning_rate": 1.696763360660808e-06, |
|
"loss": 4.8438, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.19838844123367602, |
|
"grad_norm": 2.102440357208252, |
|
"learning_rate": 1.4848100516245717e-06, |
|
"loss": 5.4079, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.2000555709919422, |
|
"grad_norm": 2.369569778442383, |
|
"learning_rate": 1.286588951321363e-06, |
|
"loss": 5.661, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2017227007502084, |
|
"grad_norm": 1.8659355640411377, |
|
"learning_rate": 1.102215815291774e-06, |
|
"loss": 4.9097, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.2033898305084746, |
|
"grad_norm": 2.071213722229004, |
|
"learning_rate": 9.317983122552332e-07, |
|
"loss": 5.0217, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.20505696026674075, |
|
"grad_norm": 2.435093402862549, |
|
"learning_rate": 7.754359612344859e-07, |
|
"loss": 5.44, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.20672409002500694, |
|
"grad_norm": 2.3098435401916504, |
|
"learning_rate": 6.332200734393057e-07, |
|
"loss": 5.2669, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.20783550986385108, |
|
"eval_loss": 1.2886923551559448, |
|
"eval_runtime": 387.521, |
|
"eval_samples_per_second": 7.822, |
|
"eval_steps_per_second": 0.978, |
|
"step": 374 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 34, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.43602420813824e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|