|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.5960942634494413, |
|
"eval_steps": 5000, |
|
"global_step": 17000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009388789784996713, |
|
"grad_norm": 72.80598449707031, |
|
"learning_rate": 9.103707179727828e-07, |
|
"loss": 16.0755, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.018777579569993427, |
|
"grad_norm": 72.60921478271484, |
|
"learning_rate": 1.8301267010793056e-06, |
|
"loss": 13.0643, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02816636935499014, |
|
"grad_norm": 10.017908096313477, |
|
"learning_rate": 2.7592679493195683e-06, |
|
"loss": 9.3474, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03755515913998685, |
|
"grad_norm": 5.694988250732422, |
|
"learning_rate": 3.6977944626935713e-06, |
|
"loss": 8.2606, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.04694394892498357, |
|
"grad_norm": 4.844100475311279, |
|
"learning_rate": 4.6363209760675744e-06, |
|
"loss": 8.084, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.05633273870998028, |
|
"grad_norm": 3.6125142574310303, |
|
"learning_rate": 5.574847489441577e-06, |
|
"loss": 8.0581, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.06572152849497699, |
|
"grad_norm": 3.166046380996704, |
|
"learning_rate": 6.51337400281558e-06, |
|
"loss": 8.0175, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.0751103182799737, |
|
"grad_norm": 2.600433111190796, |
|
"learning_rate": 7.451900516189583e-06, |
|
"loss": 8.0285, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.08449910806497042, |
|
"grad_norm": 2.3805315494537354, |
|
"learning_rate": 8.390427029563585e-06, |
|
"loss": 8.0024, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.09388789784996714, |
|
"grad_norm": 12.382240295410156, |
|
"learning_rate": 9.328953542937589e-06, |
|
"loss": 8.0161, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10327668763496385, |
|
"grad_norm": 2.7355728149414062, |
|
"learning_rate": 1.0267480056311592e-05, |
|
"loss": 7.9941, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.11266547741996057, |
|
"grad_norm": 2.0243470668792725, |
|
"learning_rate": 1.1206006569685594e-05, |
|
"loss": 8.0233, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.12205426720495728, |
|
"grad_norm": 1.9162158966064453, |
|
"learning_rate": 1.2144533083059597e-05, |
|
"loss": 8.0141, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.13144305698995398, |
|
"grad_norm": 11.409939765930176, |
|
"learning_rate": 1.3083059596433601e-05, |
|
"loss": 7.9644, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.1408318467749507, |
|
"grad_norm": 1.712424635887146, |
|
"learning_rate": 1.4021586109807603e-05, |
|
"loss": 8.0311, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.1502206365599474, |
|
"grad_norm": 2.4589834213256836, |
|
"learning_rate": 1.4960112623181606e-05, |
|
"loss": 8.0306, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.15960942634494413, |
|
"grad_norm": 1.7343533039093018, |
|
"learning_rate": 1.589863913655561e-05, |
|
"loss": 7.989, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.16899821612994084, |
|
"grad_norm": 2.0726826190948486, |
|
"learning_rate": 1.6837165649929613e-05, |
|
"loss": 8.0034, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.17838700591493756, |
|
"grad_norm": 1.7758458852767944, |
|
"learning_rate": 1.7775692163303613e-05, |
|
"loss": 8.0107, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.18777579569993427, |
|
"grad_norm": 3.3816475868225098, |
|
"learning_rate": 1.8714218676677617e-05, |
|
"loss": 7.9737, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.197164585484931, |
|
"grad_norm": 1.8136950731277466, |
|
"learning_rate": 1.965274519005162e-05, |
|
"loss": 7.9827, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.2065533752699277, |
|
"grad_norm": 1.7819303274154663, |
|
"learning_rate": 1.9934275728965626e-05, |
|
"loss": 8.0389, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.21594216505492442, |
|
"grad_norm": 2.269160509109497, |
|
"learning_rate": 1.9829951489228525e-05, |
|
"loss": 7.973, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.22533095483992113, |
|
"grad_norm": 3.3508036136627197, |
|
"learning_rate": 1.972562724949142e-05, |
|
"loss": 7.9669, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.23471974462491785, |
|
"grad_norm": 1.674142599105835, |
|
"learning_rate": 1.962130300975432e-05, |
|
"loss": 8.0296, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.24410853440991456, |
|
"grad_norm": 1.454300880432129, |
|
"learning_rate": 1.9516978770017215e-05, |
|
"loss": 7.9984, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.2534973241949113, |
|
"grad_norm": 2.2951695919036865, |
|
"learning_rate": 1.9412654530280113e-05, |
|
"loss": 7.9772, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.26288611397990796, |
|
"grad_norm": 6.295051574707031, |
|
"learning_rate": 1.930833029054301e-05, |
|
"loss": 7.9838, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.2722749037649047, |
|
"grad_norm": 1.8874555826187134, |
|
"learning_rate": 1.9204006050805904e-05, |
|
"loss": 7.9816, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.2816636935499014, |
|
"grad_norm": 20.835277557373047, |
|
"learning_rate": 1.9099681811068803e-05, |
|
"loss": 8.0021, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.29105248333489814, |
|
"grad_norm": 2.1683876514434814, |
|
"learning_rate": 1.8995357571331702e-05, |
|
"loss": 7.9715, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.3004412731198948, |
|
"grad_norm": 1.6533387899398804, |
|
"learning_rate": 1.8891033331594598e-05, |
|
"loss": 7.9809, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.30983006290489157, |
|
"grad_norm": 4.595189094543457, |
|
"learning_rate": 1.8786709091857496e-05, |
|
"loss": 7.9849, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.31921885268988826, |
|
"grad_norm": 1.994147539138794, |
|
"learning_rate": 1.8682384852120392e-05, |
|
"loss": 7.9463, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.328607642474885, |
|
"grad_norm": 1.961474895477295, |
|
"learning_rate": 1.8578060612383287e-05, |
|
"loss": 8.0067, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3379964322598817, |
|
"grad_norm": 24.005535125732422, |
|
"learning_rate": 1.8473736372646186e-05, |
|
"loss": 7.9431, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.34738522204487843, |
|
"grad_norm": 1.9433845281600952, |
|
"learning_rate": 1.8369412132909085e-05, |
|
"loss": 7.9877, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.3567740118298751, |
|
"grad_norm": 10.296500205993652, |
|
"learning_rate": 1.826508789317198e-05, |
|
"loss": 7.9494, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.36616280161487186, |
|
"grad_norm": 2.194976568222046, |
|
"learning_rate": 1.8160763653434876e-05, |
|
"loss": 7.9466, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.37555159139986855, |
|
"grad_norm": 1.5201098918914795, |
|
"learning_rate": 1.8056439413697775e-05, |
|
"loss": 7.9708, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3849403811848653, |
|
"grad_norm": 2.9077212810516357, |
|
"learning_rate": 1.795211517396067e-05, |
|
"loss": 7.9525, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.394329170969862, |
|
"grad_norm": 2.041530132293701, |
|
"learning_rate": 1.784779093422357e-05, |
|
"loss": 7.9322, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.4037179607548587, |
|
"grad_norm": 2.0275838375091553, |
|
"learning_rate": 1.7743466694486468e-05, |
|
"loss": 7.9415, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.4131067505398554, |
|
"grad_norm": 2.000778913497925, |
|
"learning_rate": 1.7639142454749364e-05, |
|
"loss": 7.9932, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.42249554032485215, |
|
"grad_norm": 2.2984609603881836, |
|
"learning_rate": 1.753481821501226e-05, |
|
"loss": 7.9481, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.43188433010984884, |
|
"grad_norm": 1.0885875225067139, |
|
"learning_rate": 1.7430493975275155e-05, |
|
"loss": 7.976, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.4412731198948456, |
|
"grad_norm": 4.441020488739014, |
|
"learning_rate": 1.7326169735538053e-05, |
|
"loss": 7.971, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.45066190967984227, |
|
"grad_norm": 2.2056221961975098, |
|
"learning_rate": 1.7221845495800952e-05, |
|
"loss": 7.9647, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.460050699464839, |
|
"grad_norm": 2.1192028522491455, |
|
"learning_rate": 1.7117521256063848e-05, |
|
"loss": 7.9217, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.4694394892498357, |
|
"grad_norm": 14.011516571044922, |
|
"learning_rate": 1.7013197016326747e-05, |
|
"loss": 7.9374, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.4694394892498357, |
|
"eval_loss": 7.951793193817139, |
|
"eval_runtime": 900.8057, |
|
"eval_samples_per_second": 378.358, |
|
"eval_steps_per_second": 2.956, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.4788282790348324, |
|
"grad_norm": 39.19038009643555, |
|
"learning_rate": 1.6908872776589642e-05, |
|
"loss": 7.9026, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.4882170688198291, |
|
"grad_norm": 1.812458872795105, |
|
"learning_rate": 1.6804548536852537e-05, |
|
"loss": 7.9304, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.4976058586048258, |
|
"grad_norm": 70.96247863769531, |
|
"learning_rate": 1.6700224297115436e-05, |
|
"loss": 7.9148, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.5069946483898226, |
|
"grad_norm": 1.6605011224746704, |
|
"learning_rate": 1.6595900057378335e-05, |
|
"loss": 7.9538, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.5163834381748192, |
|
"grad_norm": 2.0463483333587646, |
|
"learning_rate": 1.649157581764123e-05, |
|
"loss": 8.0002, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.5257722279598159, |
|
"grad_norm": 3.8601722717285156, |
|
"learning_rate": 1.6387251577904126e-05, |
|
"loss": 7.9571, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.5351610177448127, |
|
"grad_norm": 2.184122323989868, |
|
"learning_rate": 1.6282927338167025e-05, |
|
"loss": 7.932, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.5445498075298094, |
|
"grad_norm": 2.165367603302002, |
|
"learning_rate": 1.617860309842992e-05, |
|
"loss": 7.9047, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.5539385973148061, |
|
"grad_norm": 1.5312166213989258, |
|
"learning_rate": 1.607427885869282e-05, |
|
"loss": 7.9353, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.5633273870998028, |
|
"grad_norm": 21.75490379333496, |
|
"learning_rate": 1.5969954618955715e-05, |
|
"loss": 7.9203, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.5727161768847996, |
|
"grad_norm": 1.8674250841140747, |
|
"learning_rate": 1.5865630379218614e-05, |
|
"loss": 7.8967, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.5821049666697963, |
|
"grad_norm": 49.87809371948242, |
|
"learning_rate": 1.576130613948151e-05, |
|
"loss": 7.9414, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.591493756454793, |
|
"grad_norm": 54.42366409301758, |
|
"learning_rate": 1.5658025142141778e-05, |
|
"loss": 7.9631, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.6008825462397896, |
|
"grad_norm": 37.58320236206055, |
|
"learning_rate": 1.5554744144802047e-05, |
|
"loss": 7.9606, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.6102713360247864, |
|
"grad_norm": 3.1502482891082764, |
|
"learning_rate": 1.5450419905064945e-05, |
|
"loss": 7.9377, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.6196601258097831, |
|
"grad_norm": 2.5369224548339844, |
|
"learning_rate": 1.534609566532784e-05, |
|
"loss": 7.9108, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.6290489155947798, |
|
"grad_norm": 2.5891005992889404, |
|
"learning_rate": 1.5241771425590736e-05, |
|
"loss": 7.9225, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.6384377053797765, |
|
"grad_norm": 1.7794080972671509, |
|
"learning_rate": 1.5137447185853635e-05, |
|
"loss": 7.9154, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.6478264951647733, |
|
"grad_norm": 2.324805974960327, |
|
"learning_rate": 1.5033122946116532e-05, |
|
"loss": 7.9191, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.65721528494977, |
|
"grad_norm": 2.601715564727783, |
|
"learning_rate": 1.4928798706379428e-05, |
|
"loss": 7.8903, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.6666040747347667, |
|
"grad_norm": 2.4438092708587646, |
|
"learning_rate": 1.4824474466642325e-05, |
|
"loss": 7.9213, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.6759928645197634, |
|
"grad_norm": 8.118125915527344, |
|
"learning_rate": 1.4720150226905224e-05, |
|
"loss": 7.9202, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.6853816543047602, |
|
"grad_norm": 3.908555746078491, |
|
"learning_rate": 1.461582598716812e-05, |
|
"loss": 7.8998, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.6947704440897569, |
|
"grad_norm": 2.72293758392334, |
|
"learning_rate": 1.4511501747431017e-05, |
|
"loss": 7.9153, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.7041592338747535, |
|
"grad_norm": 3.108797073364258, |
|
"learning_rate": 1.4407177507693915e-05, |
|
"loss": 7.9037, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7135480236597502, |
|
"grad_norm": 2.6256439685821533, |
|
"learning_rate": 1.430285326795681e-05, |
|
"loss": 7.9146, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.7229368134447469, |
|
"grad_norm": 3.5525624752044678, |
|
"learning_rate": 1.4198529028219708e-05, |
|
"loss": 7.8972, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.7323256032297437, |
|
"grad_norm": 2.6983673572540283, |
|
"learning_rate": 1.4094204788482607e-05, |
|
"loss": 7.9374, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.7417143930147404, |
|
"grad_norm": 1.545486569404602, |
|
"learning_rate": 1.3989880548745502e-05, |
|
"loss": 7.8647, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.7511031827997371, |
|
"grad_norm": 2.5116941928863525, |
|
"learning_rate": 1.38855563090084e-05, |
|
"loss": 7.8915, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.7604919725847338, |
|
"grad_norm": 1.8576518297195435, |
|
"learning_rate": 1.3781232069271295e-05, |
|
"loss": 7.8846, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.7698807623697306, |
|
"grad_norm": 3.3226571083068848, |
|
"learning_rate": 1.3676907829534194e-05, |
|
"loss": 7.8988, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.7792695521547273, |
|
"grad_norm": 2.946324586868286, |
|
"learning_rate": 1.3572583589797091e-05, |
|
"loss": 7.8702, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.788658341939724, |
|
"grad_norm": 2.5089969635009766, |
|
"learning_rate": 1.3468259350059986e-05, |
|
"loss": 7.923, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.7980471317247206, |
|
"grad_norm": 2.2807912826538086, |
|
"learning_rate": 1.3363935110322885e-05, |
|
"loss": 7.891, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.8074359215097174, |
|
"grad_norm": 2.5889735221862793, |
|
"learning_rate": 1.3259610870585782e-05, |
|
"loss": 7.8832, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.8168247112947141, |
|
"grad_norm": 2.8306784629821777, |
|
"learning_rate": 1.3155286630848678e-05, |
|
"loss": 7.8726, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.8262135010797108, |
|
"grad_norm": 2.626786231994629, |
|
"learning_rate": 1.3050962391111577e-05, |
|
"loss": 7.8813, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.8356022908647075, |
|
"grad_norm": 3.195319414138794, |
|
"learning_rate": 1.2946638151374474e-05, |
|
"loss": 7.8986, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.8449910806497043, |
|
"grad_norm": 5.254043102264404, |
|
"learning_rate": 1.284231391163737e-05, |
|
"loss": 7.8743, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.854379870434701, |
|
"grad_norm": 2.9493279457092285, |
|
"learning_rate": 1.2737989671900267e-05, |
|
"loss": 7.8791, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.8637686602196977, |
|
"grad_norm": 3.130415439605713, |
|
"learning_rate": 1.2633665432163165e-05, |
|
"loss": 7.8783, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.8731574500046944, |
|
"grad_norm": 4.030152797698975, |
|
"learning_rate": 1.2529341192426061e-05, |
|
"loss": 7.8528, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.8825462397896912, |
|
"grad_norm": 2.9882099628448486, |
|
"learning_rate": 1.2425016952688958e-05, |
|
"loss": 7.8864, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.8919350295746878, |
|
"grad_norm": 3.802172899246216, |
|
"learning_rate": 1.2320692712951855e-05, |
|
"loss": 7.8989, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.9013238193596845, |
|
"grad_norm": 2.724433183670044, |
|
"learning_rate": 1.2216368473214752e-05, |
|
"loss": 7.8617, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.9107126091446812, |
|
"grad_norm": 2.459376573562622, |
|
"learning_rate": 1.211204423347765e-05, |
|
"loss": 7.8371, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.920101398929678, |
|
"grad_norm": 4.715926647186279, |
|
"learning_rate": 1.2007719993740547e-05, |
|
"loss": 7.8566, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.9294901887146747, |
|
"grad_norm": 2.6845057010650635, |
|
"learning_rate": 1.1903395754003444e-05, |
|
"loss": 7.8776, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.9388789784996714, |
|
"grad_norm": 2.62907075881958, |
|
"learning_rate": 1.1799071514266341e-05, |
|
"loss": 7.8558, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.9388789784996714, |
|
"eval_loss": 7.849188327789307, |
|
"eval_runtime": 1155.7489, |
|
"eval_samples_per_second": 294.897, |
|
"eval_steps_per_second": 2.304, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.9482677682846681, |
|
"grad_norm": 4.570381164550781, |
|
"learning_rate": 1.1694747274529237e-05, |
|
"loss": 7.848, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.9576565580696648, |
|
"grad_norm": 21.764062881469727, |
|
"learning_rate": 1.1590423034792135e-05, |
|
"loss": 7.8227, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.9670453478546616, |
|
"grad_norm": 18.442140579223633, |
|
"learning_rate": 1.1486098795055033e-05, |
|
"loss": 7.8311, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.9764341376396583, |
|
"grad_norm": 4.737902641296387, |
|
"learning_rate": 1.1381774555317928e-05, |
|
"loss": 7.8437, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.9858229274246549, |
|
"grad_norm": 3.0295650959014893, |
|
"learning_rate": 1.1277450315580827e-05, |
|
"loss": 7.8454, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.9952117172096516, |
|
"grad_norm": 3.0269651412963867, |
|
"learning_rate": 1.1173126075843724e-05, |
|
"loss": 7.8362, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.0046005069946484, |
|
"grad_norm": 4.033662796020508, |
|
"learning_rate": 1.1069845078503991e-05, |
|
"loss": 7.8681, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.013989296779645, |
|
"grad_norm": 3.5319488048553467, |
|
"learning_rate": 1.0965520838766888e-05, |
|
"loss": 7.8745, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.0233780865646418, |
|
"grad_norm": 2.7731130123138428, |
|
"learning_rate": 1.0861196599029787e-05, |
|
"loss": 7.8339, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.0327668763496385, |
|
"grad_norm": 4.000971794128418, |
|
"learning_rate": 1.0756872359292683e-05, |
|
"loss": 7.8458, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.0421556661346352, |
|
"grad_norm": 15.05604362487793, |
|
"learning_rate": 1.065254811955558e-05, |
|
"loss": 7.8493, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.0515444559196319, |
|
"grad_norm": 4.498584747314453, |
|
"learning_rate": 1.0548223879818477e-05, |
|
"loss": 7.8317, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.0609332457046288, |
|
"grad_norm": 2.8218085765838623, |
|
"learning_rate": 1.0443899640081374e-05, |
|
"loss": 7.841, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.0703220354896255, |
|
"grad_norm": 3.627685785293579, |
|
"learning_rate": 1.0339575400344271e-05, |
|
"loss": 7.8292, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.0797108252746221, |
|
"grad_norm": 4.804520606994629, |
|
"learning_rate": 1.0235251160607167e-05, |
|
"loss": 7.8121, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.0890996150596188, |
|
"grad_norm": 15.256156921386719, |
|
"learning_rate": 1.0130926920870066e-05, |
|
"loss": 7.8165, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.0984884048446155, |
|
"grad_norm": 3.684401273727417, |
|
"learning_rate": 1.0026602681132963e-05, |
|
"loss": 7.8259, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.1078771946296122, |
|
"grad_norm": 3.7146763801574707, |
|
"learning_rate": 9.92227844139586e-06, |
|
"loss": 7.8303, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.117265984414609, |
|
"grad_norm": 3.4437708854675293, |
|
"learning_rate": 9.817954201658755e-06, |
|
"loss": 7.809, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 1.1266547741996056, |
|
"grad_norm": 4.232120990753174, |
|
"learning_rate": 9.713629961921654e-06, |
|
"loss": 7.818, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.1360435639846025, |
|
"grad_norm": 3.4967739582061768, |
|
"learning_rate": 9.60930572218455e-06, |
|
"loss": 7.8071, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.1454323537695992, |
|
"grad_norm": 10.542444229125977, |
|
"learning_rate": 9.504981482447447e-06, |
|
"loss": 7.801, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.1548211435545959, |
|
"grad_norm": 3.744981527328491, |
|
"learning_rate": 9.400657242710344e-06, |
|
"loss": 7.8123, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 1.1642099333395926, |
|
"grad_norm": 3.3549323081970215, |
|
"learning_rate": 9.296333002973241e-06, |
|
"loss": 7.8203, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 1.1735987231245892, |
|
"grad_norm": 5.337845325469971, |
|
"learning_rate": 9.192008763236138e-06, |
|
"loss": 7.8609, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.182987512909586, |
|
"grad_norm": 3.204465627670288, |
|
"learning_rate": 9.087684523499036e-06, |
|
"loss": 7.7782, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 1.1923763026945826, |
|
"grad_norm": 4.669897079467773, |
|
"learning_rate": 8.983360283761933e-06, |
|
"loss": 7.8092, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 1.2017650924795793, |
|
"grad_norm": 3.1824800968170166, |
|
"learning_rate": 8.87903604402483e-06, |
|
"loss": 7.815, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 1.211153882264576, |
|
"grad_norm": 3.6459527015686035, |
|
"learning_rate": 8.774711804287727e-06, |
|
"loss": 7.8196, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 1.220542672049573, |
|
"grad_norm": 3.732983112335205, |
|
"learning_rate": 8.670387564550624e-06, |
|
"loss": 7.8206, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.2299314618345696, |
|
"grad_norm": 4.478656768798828, |
|
"learning_rate": 8.566063324813521e-06, |
|
"loss": 7.8022, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 1.2393202516195663, |
|
"grad_norm": 3.7781801223754883, |
|
"learning_rate": 8.461739085076418e-06, |
|
"loss": 7.8043, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 1.248709041404563, |
|
"grad_norm": 5.932605743408203, |
|
"learning_rate": 8.357414845339316e-06, |
|
"loss": 7.7823, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 1.2580978311895596, |
|
"grad_norm": 3.8288068771362305, |
|
"learning_rate": 8.254133847999584e-06, |
|
"loss": 7.8061, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 1.2674866209745563, |
|
"grad_norm": 4.60470724105835, |
|
"learning_rate": 8.14980960826248e-06, |
|
"loss": 7.8016, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.276875410759553, |
|
"grad_norm": 5.450839996337891, |
|
"learning_rate": 8.045485368525377e-06, |
|
"loss": 7.8076, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 1.28626420054455, |
|
"grad_norm": 7.866298198699951, |
|
"learning_rate": 7.941161128788276e-06, |
|
"loss": 7.7996, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 1.2956529903295464, |
|
"grad_norm": 3.059967041015625, |
|
"learning_rate": 7.836836889051171e-06, |
|
"loss": 7.8035, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 1.3050417801145433, |
|
"grad_norm": 3.5380911827087402, |
|
"learning_rate": 7.732512649314069e-06, |
|
"loss": 7.8092, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 1.31443056989954, |
|
"grad_norm": 4.589097499847412, |
|
"learning_rate": 7.628188409576966e-06, |
|
"loss": 7.7902, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.3238193596845367, |
|
"grad_norm": 6.932407855987549, |
|
"learning_rate": 7.523864169839863e-06, |
|
"loss": 7.8114, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 1.3332081494695334, |
|
"grad_norm": 3.5786869525909424, |
|
"learning_rate": 7.41953993010276e-06, |
|
"loss": 7.8112, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 1.34259693925453, |
|
"grad_norm": 4.283187389373779, |
|
"learning_rate": 7.315215690365657e-06, |
|
"loss": 7.8036, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 1.3519857290395267, |
|
"grad_norm": 14.625285148620605, |
|
"learning_rate": 7.210891450628554e-06, |
|
"loss": 7.8178, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 1.3613745188245234, |
|
"grad_norm": 3.5072567462921143, |
|
"learning_rate": 7.106567210891451e-06, |
|
"loss": 7.8391, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.3707633086095203, |
|
"grad_norm": 4.140475749969482, |
|
"learning_rate": 7.002242971154349e-06, |
|
"loss": 7.8151, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 1.380152098394517, |
|
"grad_norm": 6.985396385192871, |
|
"learning_rate": 6.897918731417246e-06, |
|
"loss": 7.7957, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 1.3895408881795137, |
|
"grad_norm": 3.8024065494537354, |
|
"learning_rate": 6.793594491680142e-06, |
|
"loss": 7.7833, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 1.3989296779645104, |
|
"grad_norm": 4.183823585510254, |
|
"learning_rate": 6.689270251943039e-06, |
|
"loss": 7.8049, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 1.408318467749507, |
|
"grad_norm": 3.431105375289917, |
|
"learning_rate": 6.5849460122059365e-06, |
|
"loss": 7.8163, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.408318467749507, |
|
"eval_loss": 7.807833671569824, |
|
"eval_runtime": 335.7694, |
|
"eval_samples_per_second": 1015.063, |
|
"eval_steps_per_second": 7.931, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.4177072575345038, |
|
"grad_norm": 8.183846473693848, |
|
"learning_rate": 6.480621772468834e-06, |
|
"loss": 7.7864, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 1.4270960473195005, |
|
"grad_norm": 11.66592788696289, |
|
"learning_rate": 6.376297532731731e-06, |
|
"loss": 7.8241, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 1.4364848371044971, |
|
"grad_norm": 9.620813369750977, |
|
"learning_rate": 6.271973292994628e-06, |
|
"loss": 7.7694, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 1.4458736268894938, |
|
"grad_norm": 4.250575065612793, |
|
"learning_rate": 6.167649053257525e-06, |
|
"loss": 7.7784, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 1.4552624166744907, |
|
"grad_norm": 3.8679049015045166, |
|
"learning_rate": 6.0633248135204214e-06, |
|
"loss": 7.7628, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.4646512064594874, |
|
"grad_norm": 4.648382186889648, |
|
"learning_rate": 5.959000573783319e-06, |
|
"loss": 7.8044, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 1.4740399962444841, |
|
"grad_norm": 4.5424113273620605, |
|
"learning_rate": 5.854676334046217e-06, |
|
"loss": 7.7871, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 1.4834287860294808, |
|
"grad_norm": 4.026553630828857, |
|
"learning_rate": 5.750352094309113e-06, |
|
"loss": 7.809, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 1.4928175758144775, |
|
"grad_norm": 6.175102233886719, |
|
"learning_rate": 5.647071096969381e-06, |
|
"loss": 7.7955, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 1.5022063655994742, |
|
"grad_norm": 4.672608375549316, |
|
"learning_rate": 5.542746857232279e-06, |
|
"loss": 7.8056, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.5115951553844709, |
|
"grad_norm": 7.012312412261963, |
|
"learning_rate": 5.438422617495176e-06, |
|
"loss": 7.774, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 1.5209839451694678, |
|
"grad_norm": 5.2042131423950195, |
|
"learning_rate": 5.334098377758072e-06, |
|
"loss": 7.7874, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 1.5303727349544642, |
|
"grad_norm": 3.745805501937866, |
|
"learning_rate": 5.22977413802097e-06, |
|
"loss": 7.7918, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 1.5397615247394612, |
|
"grad_norm": 4.060446262359619, |
|
"learning_rate": 5.125449898283867e-06, |
|
"loss": 7.7787, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 1.5491503145244578, |
|
"grad_norm": 21.851919174194336, |
|
"learning_rate": 5.021125658546764e-06, |
|
"loss": 7.7881, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.5585391043094545, |
|
"grad_norm": 4.261013507843018, |
|
"learning_rate": 4.916801418809661e-06, |
|
"loss": 7.7723, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 1.5679278940944512, |
|
"grad_norm": 3.9473931789398193, |
|
"learning_rate": 4.812477179072558e-06, |
|
"loss": 7.7809, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 1.577316683879448, |
|
"grad_norm": 6.088964939117432, |
|
"learning_rate": 4.709196181732826e-06, |
|
"loss": 7.8096, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 1.5867054736644448, |
|
"grad_norm": 7.912614822387695, |
|
"learning_rate": 4.604871941995723e-06, |
|
"loss": 7.7559, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 1.5960942634494413, |
|
"grad_norm": 7.268245697021484, |
|
"learning_rate": 4.50054770225862e-06, |
|
"loss": 7.8063, |
|
"step": 17000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 21302, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|