{
  "best_metric": 0.5025785565376282,
  "best_model_checkpoint": "models/E-Coli-FFT/KCYHSM/checkpoint-21500",
  "epoch": 8.96,
  "eval_steps": 500,
  "global_step": 28000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.16, "grad_norm": 0.560947835445404, "learning_rate": 4.996016e-05, "loss": 2.0872, "step": 500 },
    { "epoch": 0.16, "eval_accuracy_per_token": 0.30261626839637756, "eval_loss": 2.0467002391815186, "eval_runtime": 226.6763, "eval_samples_per_second": 110.289, "eval_steps_per_second": 6.895, "step": 500 },
    { "epoch": 0.32, "grad_norm": 0.5276467204093933, "learning_rate": 4.992016000000001e-05, "loss": 2.033, "step": 1000 },
    { "epoch": 0.32, "eval_accuracy_per_token": 0.31899595260620117, "eval_loss": 2.0086700916290283, "eval_runtime": 215.1833, "eval_samples_per_second": 116.18, "eval_steps_per_second": 7.264, "step": 1000 },
    { "epoch": 0.48, "grad_norm": 1.3343302011489868, "learning_rate": 4.988016e-05, "loss": 1.9439, "step": 1500 },
    { "epoch": 0.48, "eval_accuracy_per_token": 0.3831162750720978, "eval_loss": 1.8512518405914307, "eval_runtime": 215.5299, "eval_samples_per_second": 115.993, "eval_steps_per_second": 7.252, "step": 1500 },
    { "epoch": 0.64, "grad_norm": 1.4562740325927734, "learning_rate": 4.984016e-05, "loss": 1.7198, "step": 2000 },
    { "epoch": 0.64, "eval_accuracy_per_token": 0.47227564454078674, "eval_loss": 1.6024138927459717, "eval_runtime": 215.7037, "eval_samples_per_second": 115.9, "eval_steps_per_second": 7.246, "step": 2000 },
    { "epoch": 0.8, "grad_norm": 2.136140823364258, "learning_rate": 4.980016e-05, "loss": 1.5124, "step": 2500 },
    { "epoch": 0.8, "eval_accuracy_per_token": 0.5323951840400696, "eval_loss": 1.4281255006790161, "eval_runtime": 215.8041, "eval_samples_per_second": 115.846, "eval_steps_per_second": 7.243, "step": 2500 },
    { "epoch": 0.96, "grad_norm": 1.6090725660324097, "learning_rate": 4.976016e-05, "loss": 1.3463, "step": 3000 },
    { "epoch": 0.96, "eval_accuracy_per_token": 0.5815901160240173, "eval_loss": 1.2842084169387817, "eval_runtime": 215.7888, "eval_samples_per_second": 115.854, "eval_steps_per_second": 7.243, "step": 3000 },
    { "epoch": 1.12, "grad_norm": 2.4967901706695557, "learning_rate": 4.972016e-05, "loss": 1.1657, "step": 3500 },
    { "epoch": 1.12, "eval_accuracy_per_token": 0.6198335886001587, "eval_loss": 1.1727232933044434, "eval_runtime": 215.8392, "eval_samples_per_second": 115.827, "eval_steps_per_second": 7.242, "step": 3500 },
    { "epoch": 1.28, "grad_norm": 2.750229597091675, "learning_rate": 4.968016e-05, "loss": 1.0624, "step": 4000 },
    { "epoch": 1.28, "eval_accuracy_per_token": 0.6496000289916992, "eval_loss": 1.0832889080047607, "eval_runtime": 216.0152, "eval_samples_per_second": 115.733, "eval_steps_per_second": 7.236, "step": 4000 },
    { "epoch": 1.44, "grad_norm": 2.4956400394439697, "learning_rate": 4.9640160000000003e-05, "loss": 0.9957, "step": 4500 },
    { "epoch": 1.44, "eval_accuracy_per_token": 0.6765108108520508, "eval_loss": 1.0036195516586304, "eval_runtime": 215.7199, "eval_samples_per_second": 115.891, "eval_steps_per_second": 7.246, "step": 4500 },
    { "epoch": 1.6, "grad_norm": 2.816002368927002, "learning_rate": 4.9600160000000004e-05, "loss": 0.913, "step": 5000 },
    { "epoch": 1.6, "eval_accuracy_per_token": 0.6984472274780273, "eval_loss": 0.9418182969093323, "eval_runtime": 215.8333, "eval_samples_per_second": 115.83, "eval_steps_per_second": 7.242, "step": 5000 },
    { "epoch": 1.76, "grad_norm": 2.293124198913574, "learning_rate": 4.956016e-05, "loss": 0.8747, "step": 5500 },
    { "epoch": 1.76, "eval_accuracy_per_token": 0.7186679244041443, "eval_loss": 0.8788002133369446, "eval_runtime": 215.9244, "eval_samples_per_second": 115.781, "eval_steps_per_second": 7.239, "step": 5500 },
    { "epoch": 1.92, "grad_norm": 2.6259233951568604, "learning_rate": 4.9520160000000005e-05, "loss": 0.8064, "step": 6000 },
    { "epoch": 1.92, "eval_accuracy_per_token": 0.7395341992378235, "eval_loss": 0.8194286823272705, "eval_runtime": 215.8802, "eval_samples_per_second": 115.805, "eval_steps_per_second": 7.24, "step": 6000 },
    { "epoch": 2.08, "grad_norm": 2.7774460315704346, "learning_rate": 4.9480160000000005e-05, "loss": 0.7329, "step": 6500 },
    { "epoch": 2.08, "eval_accuracy_per_token": 0.7511833310127258, "eval_loss": 0.7874646782875061, "eval_runtime": 215.9936, "eval_samples_per_second": 115.744, "eval_steps_per_second": 7.236, "step": 6500 },
    { "epoch": 2.24, "grad_norm": 2.8111910820007324, "learning_rate": 4.9440160000000005e-05, "loss": 0.6541, "step": 7000 },
    { "epoch": 2.24, "eval_accuracy_per_token": 0.7654264569282532, "eval_loss": 0.7462261915206909, "eval_runtime": 215.7774, "eval_samples_per_second": 115.86, "eval_steps_per_second": 7.244, "step": 7000 },
    { "epoch": 2.4, "grad_norm": 2.734023332595825, "learning_rate": 4.940016e-05, "loss": 0.6368, "step": 7500 },
    { "epoch": 2.4, "eval_accuracy_per_token": 0.7762272953987122, "eval_loss": 0.7165001630783081, "eval_runtime": 215.5714, "eval_samples_per_second": 115.971, "eval_steps_per_second": 7.25, "step": 7500 },
    { "epoch": 2.56, "grad_norm": 3.085864782333374, "learning_rate": 4.936016e-05, "loss": 0.6019, "step": 8000 },
    { "epoch": 2.56, "eval_accuracy_per_token": 0.7840808033943176, "eval_loss": 0.6941312551498413, "eval_runtime": 215.5763, "eval_samples_per_second": 115.968, "eval_steps_per_second": 7.25, "step": 8000 },
    { "epoch": 2.7199999999999998, "grad_norm": 2.5468504428863525, "learning_rate": 4.9320320000000004e-05, "loss": 0.5855, "step": 8500 },
    { "epoch": 2.7199999999999998, "eval_accuracy_per_token": 0.7905924916267395, "eval_loss": 0.6726610064506531, "eval_runtime": 215.5941, "eval_samples_per_second": 115.959, "eval_steps_per_second": 7.25, "step": 8500 },
    { "epoch": 2.88, "grad_norm": 2.7991392612457275, "learning_rate": 4.9280320000000005e-05, "loss": 0.565, "step": 9000 },
    { "epoch": 2.88, "eval_accuracy_per_token": 0.7970213890075684, "eval_loss": 0.6574403047561646, "eval_runtime": 215.6086, "eval_samples_per_second": 115.951, "eval_steps_per_second": 7.249, "step": 9000 },
    { "epoch": 3.04, "grad_norm": 2.0875983238220215, "learning_rate": 4.924032e-05, "loss": 0.5437, "step": 9500 },
    { "epoch": 3.04, "eval_accuracy_per_token": 0.8036227226257324, "eval_loss": 0.6450381278991699, "eval_runtime": 215.7698, "eval_samples_per_second": 115.864, "eval_steps_per_second": 7.244, "step": 9500 },
    { "epoch": 3.2, "grad_norm": 2.820004940032959, "learning_rate": 4.920032e-05, "loss": 0.4618, "step": 10000 },
    { "epoch": 3.2, "eval_accuracy_per_token": 0.8065482378005981, "eval_loss": 0.6290712952613831, "eval_runtime": 215.5858, "eval_samples_per_second": 115.963, "eval_steps_per_second": 7.25, "step": 10000 },
    { "epoch": 3.36, "grad_norm": 2.534975051879883, "learning_rate": 4.916048e-05, "loss": 0.4836, "step": 10500 },
    { "epoch": 3.36, "eval_accuracy_per_token": 0.8109478950500488, "eval_loss": 0.621654748916626, "eval_runtime": 215.6832, "eval_samples_per_second": 115.911, "eval_steps_per_second": 7.247, "step": 10500 },
    { "epoch": 3.52, "grad_norm": 2.2555694580078125, "learning_rate": 4.9120480000000004e-05, "loss": 0.4793, "step": 11000 },
    { "epoch": 3.52, "eval_accuracy_per_token": 0.8153020739555359, "eval_loss": 0.6032074093818665, "eval_runtime": 215.7864, "eval_samples_per_second": 115.855, "eval_steps_per_second": 7.243, "step": 11000 },
    { "epoch": 3.68, "grad_norm": 2.6011011600494385, "learning_rate": 4.9080480000000004e-05, "loss": 0.4639, "step": 11500 },
    { "epoch": 3.68, "eval_accuracy_per_token": 0.8189014792442322, "eval_loss": 0.5884661078453064, "eval_runtime": 215.9181, "eval_samples_per_second": 115.785, "eval_steps_per_second": 7.239, "step": 11500 },
    { "epoch": 3.84, "grad_norm": 2.201638698577881, "learning_rate": 4.904048e-05, "loss": 0.4734, "step": 12000 },
    { "epoch": 3.84, "eval_accuracy_per_token": 0.8225154280662537, "eval_loss": 0.5901506543159485, "eval_runtime": 215.7481, "eval_samples_per_second": 115.876, "eval_steps_per_second": 7.245, "step": 12000 },
    { "epoch": 4.0, "grad_norm": 2.115771770477295, "learning_rate": 4.9000480000000005e-05, "loss": 0.4589, "step": 12500 },
    { "epoch": 4.0, "eval_accuracy_per_token": 0.824912428855896, "eval_loss": 0.57036292552948, "eval_runtime": 215.9409, "eval_samples_per_second": 115.772, "eval_steps_per_second": 7.238, "step": 12500 },
    { "epoch": 4.16, "grad_norm": 2.13425350189209, "learning_rate": 4.8960480000000005e-05, "loss": 0.3846, "step": 13000 },
    { "epoch": 4.16, "eval_accuracy_per_token": 0.8280689716339111, "eval_loss": 0.5732296705245972, "eval_runtime": 215.8629, "eval_samples_per_second": 115.814, "eval_steps_per_second": 7.241, "step": 13000 },
    { "epoch": 4.32, "grad_norm": 2.227823257446289, "learning_rate": 4.892048e-05, "loss": 0.3907, "step": 13500 },
    { "epoch": 4.32, "eval_accuracy_per_token": 0.8309552073478699, "eval_loss": 0.5606500506401062, "eval_runtime": 215.8455, "eval_samples_per_second": 115.824, "eval_steps_per_second": 7.241, "step": 13500 },
    { "epoch": 4.48, "grad_norm": 1.9877572059631348, "learning_rate": 4.888048e-05, "loss": 0.3781, "step": 14000 },
    { "epoch": 4.48, "eval_accuracy_per_token": 0.833513617515564, "eval_loss": 0.5528165102005005, "eval_runtime": 216.0002, "eval_samples_per_second": 115.741, "eval_steps_per_second": 7.236, "step": 14000 },
    { "epoch": 4.64, "grad_norm": 2.0553159713745117, "learning_rate": 4.884048e-05, "loss": 0.3914, "step": 14500 },
    { "epoch": 4.64, "eval_accuracy_per_token": 0.8335487246513367, "eval_loss": 0.549156904220581, "eval_runtime": 215.7201, "eval_samples_per_second": 115.891, "eval_steps_per_second": 7.246, "step": 14500 },
    { "epoch": 4.8, "grad_norm": 2.2741925716400146, "learning_rate": 4.880048000000001e-05, "loss": 0.3894, "step": 15000 },
    { "epoch": 4.8, "eval_accuracy_per_token": 0.8362560272216797, "eval_loss": 0.5504783391952515, "eval_runtime": 215.7979, "eval_samples_per_second": 115.849, "eval_steps_per_second": 7.243, "step": 15000 },
    { "epoch": 4.96, "grad_norm": 2.08858585357666, "learning_rate": 4.876048e-05, "loss": 0.3944, "step": 15500 },
    { "epoch": 4.96, "eval_accuracy_per_token": 0.8395103812217712, "eval_loss": 0.5324631929397583, "eval_runtime": 215.9188, "eval_samples_per_second": 115.784, "eval_steps_per_second": 7.239, "step": 15500 },
    { "epoch": 5.12, "grad_norm": 2.1368110179901123, "learning_rate": 4.872048e-05, "loss": 0.3307, "step": 16000 },
    { "epoch": 5.12, "eval_accuracy_per_token": 0.839469850063324, "eval_loss": 0.5465147495269775, "eval_runtime": 216.142, "eval_samples_per_second": 115.665, "eval_steps_per_second": 7.231, "step": 16000 },
    { "epoch": 5.28, "grad_norm": 2.2641913890838623, "learning_rate": 4.868048e-05, "loss": 0.3125, "step": 16500 },
    { "epoch": 5.28, "eval_accuracy_per_token": 0.8418903946876526, "eval_loss": 0.5389049053192139, "eval_runtime": 216.0271, "eval_samples_per_second": 115.726, "eval_steps_per_second": 7.235, "step": 16500 },
    { "epoch": 5.44, "grad_norm": 1.6883608102798462, "learning_rate": 4.864048e-05, "loss": 0.3213, "step": 17000 },
    { "epoch": 5.44, "eval_accuracy_per_token": 0.8448163270950317, "eval_loss": 0.5376200675964355, "eval_runtime": 216.0239, "eval_samples_per_second": 115.728, "eval_steps_per_second": 7.235, "step": 17000 },
    { "epoch": 5.6, "grad_norm": 2.2113685607910156, "learning_rate": 4.860048e-05, "loss": 0.3269, "step": 17500 },
    { "epoch": 5.6, "eval_accuracy_per_token": 0.8447655439376831, "eval_loss": 0.5422470569610596, "eval_runtime": 215.062, "eval_samples_per_second": 116.246, "eval_steps_per_second": 7.268, "step": 17500 },
    { "epoch": 5.76, "grad_norm": 1.9853556156158447, "learning_rate": 4.856048e-05, "loss": 0.3374, "step": 18000 },
    { "epoch": 5.76, "eval_accuracy_per_token": 0.847917914390564, "eval_loss": 0.5255292057991028, "eval_runtime": 213.7043, "eval_samples_per_second": 116.984, "eval_steps_per_second": 7.314, "step": 18000 },
    { "epoch": 5.92, "grad_norm": 2.584245204925537, "learning_rate": 4.852048e-05, "loss": 0.3229, "step": 18500 },
    { "epoch": 5.92, "eval_accuracy_per_token": 0.8491668701171875, "eval_loss": 0.5113908052444458, "eval_runtime": 214.1955, "eval_samples_per_second": 116.716, "eval_steps_per_second": 7.297, "step": 18500 },
    { "epoch": 6.08, "grad_norm": 2.0866963863372803, "learning_rate": 4.848056e-05, "loss": 0.3061, "step": 19000 },
    { "epoch": 6.08, "eval_accuracy_per_token": 0.8502768874168396, "eval_loss": 0.5418649911880493, "eval_runtime": 214.0933, "eval_samples_per_second": 116.771, "eval_steps_per_second": 7.301, "step": 19000 },
    { "epoch": 6.24, "grad_norm": 2.0000994205474854, "learning_rate": 4.844056e-05, "loss": 0.2556, "step": 19500 },
    { "epoch": 6.24, "eval_accuracy_per_token": 0.85043865442276, "eval_loss": 0.542536199092865, "eval_runtime": 213.1241, "eval_samples_per_second": 117.303, "eval_steps_per_second": 7.334, "step": 19500 },
    { "epoch": 6.4, "grad_norm": 2.849112033843994, "learning_rate": 4.840064e-05, "loss": 0.2672, "step": 20000 },
    { "epoch": 6.4, "eval_accuracy_per_token": 0.8512768149375916, "eval_loss": 0.5348747372627258, "eval_runtime": 215.5865, "eval_samples_per_second": 115.963, "eval_steps_per_second": 7.25, "step": 20000 },
    { "epoch": 6.5600000000000005, "grad_norm": 2.216937303543091, "learning_rate": 4.836064e-05, "loss": 0.2791, "step": 20500 },
    { "epoch": 6.5600000000000005, "eval_accuracy_per_token": 0.8536346554756165, "eval_loss": 0.5185777544975281, "eval_runtime": 215.5365, "eval_samples_per_second": 115.99, "eval_steps_per_second": 7.252, "step": 20500 },
    { "epoch": 6.72, "grad_norm": 2.2237415313720703, "learning_rate": 4.832064e-05, "loss": 0.2792, "step": 21000 },
    { "epoch": 6.72, "eval_accuracy_per_token": 0.8542323112487793, "eval_loss": 0.5137789249420166, "eval_runtime": 215.6897, "eval_samples_per_second": 115.907, "eval_steps_per_second": 7.247, "step": 21000 },
    { "epoch": 6.88, "grad_norm": 2.6220571994781494, "learning_rate": 4.828064e-05, "loss": 0.2867, "step": 21500 },
    { "epoch": 6.88, "eval_accuracy_per_token": 0.8552057147026062, "eval_loss": 0.5025785565376282, "eval_runtime": 215.6135, "eval_samples_per_second": 115.948, "eval_steps_per_second": 7.249, "step": 21500 },
    { "epoch": 7.04, "grad_norm": 2.4813005924224854, "learning_rate": 4.82408e-05, "loss": 0.2628, "step": 22000 },
    { "epoch": 7.04, "eval_accuracy_per_token": 0.8553439974784851, "eval_loss": 0.5604137778282166, "eval_runtime": 215.6781, "eval_samples_per_second": 115.913, "eval_steps_per_second": 7.247, "step": 22000 },
    { "epoch": 7.2, "grad_norm": 2.316831588745117, "learning_rate": 4.82008e-05, "loss": 0.2124, "step": 22500 },
    { "epoch": 7.2, "eval_accuracy_per_token": 0.8566614389419556, "eval_loss": 0.5436919927597046, "eval_runtime": 215.5335, "eval_samples_per_second": 115.991, "eval_steps_per_second": 7.252, "step": 22500 },
    { "epoch": 7.36, "grad_norm": 2.0445761680603027, "learning_rate": 4.81608e-05, "loss": 0.2246, "step": 23000 },
    { "epoch": 7.36, "eval_accuracy_per_token": 0.8579392433166504, "eval_loss": 0.5327755212783813, "eval_runtime": 215.6442, "eval_samples_per_second": 115.932, "eval_steps_per_second": 7.248, "step": 23000 },
    { "epoch": 7.52, "grad_norm": 1.8908203840255737, "learning_rate": 4.81208e-05, "loss": 0.2266, "step": 23500 },
    { "epoch": 7.52, "eval_accuracy_per_token": 0.8593510389328003, "eval_loss": 0.5242588520050049, "eval_runtime": 215.5467, "eval_samples_per_second": 115.984, "eval_steps_per_second": 7.251, "step": 23500 },
    { "epoch": 7.68, "grad_norm": 1.698264241218567, "learning_rate": 4.80808e-05, "loss": 0.2335, "step": 24000 },
    { "epoch": 7.68, "eval_accuracy_per_token": 0.8600181341171265, "eval_loss": 0.5188168883323669, "eval_runtime": 215.5491, "eval_samples_per_second": 115.983, "eval_steps_per_second": 7.251, "step": 24000 },
    { "epoch": 7.84, "grad_norm": 2.196866035461426, "learning_rate": 4.80408e-05, "loss": 0.2375, "step": 24500 },
    { "epoch": 7.84, "eval_accuracy_per_token": 0.8607679605484009, "eval_loss": 0.5174950957298279, "eval_runtime": 215.872, "eval_samples_per_second": 115.809, "eval_steps_per_second": 7.24, "step": 24500 },
    { "epoch": 8.0, "grad_norm": 2.599553346633911, "learning_rate": 4.800088e-05, "loss": 0.2426, "step": 25000 },
    { "epoch": 8.0, "eval_accuracy_per_token": 0.8615555763244629, "eval_loss": 0.5121429562568665, "eval_runtime": 215.6323, "eval_samples_per_second": 115.938, "eval_steps_per_second": 7.248, "step": 25000 },
    { "epoch": 8.16, "grad_norm": 2.2151308059692383, "learning_rate": 4.796088e-05, "loss": 0.1708, "step": 25500 },
    { "epoch": 8.16, "eval_accuracy_per_token": 0.8606916666030884, "eval_loss": 0.5612675547599792, "eval_runtime": 215.6202, "eval_samples_per_second": 115.945, "eval_steps_per_second": 7.249, "step": 25500 },
    { "epoch": 8.32, "grad_norm": 2.0478286743164062, "learning_rate": 4.792088e-05, "loss": 0.1739, "step": 26000 },
    { "epoch": 8.32, "eval_accuracy_per_token": 0.8610015511512756, "eval_loss": 0.5620591640472412, "eval_runtime": 215.5503, "eval_samples_per_second": 115.982, "eval_steps_per_second": 7.251, "step": 26000 },
    { "epoch": 8.48, "grad_norm": 2.2109131813049316, "learning_rate": 4.788088e-05, "loss": 0.1893, "step": 26500 },
    { "epoch": 8.48, "eval_accuracy_per_token": 0.861874520778656, "eval_loss": 0.5581731200218201, "eval_runtime": 215.6555, "eval_samples_per_second": 115.926, "eval_steps_per_second": 7.248, "step": 26500 },
    { "epoch": 8.64, "grad_norm": 2.688985586166382, "learning_rate": 4.784096e-05, "loss": 0.1948, "step": 27000 },
    { "epoch": 8.64, "eval_accuracy_per_token": 0.864142119884491, "eval_loss": 0.5396531820297241, "eval_runtime": 215.5944, "eval_samples_per_second": 115.959, "eval_steps_per_second": 7.25, "step": 27000 },
    { "epoch": 8.8, "grad_norm": 2.5816333293914795, "learning_rate": 4.780096e-05, "loss": 0.2022, "step": 27500 },
    { "epoch": 8.8, "eval_accuracy_per_token": 0.8647276759147644, "eval_loss": 0.537101149559021, "eval_runtime": 215.522, "eval_samples_per_second": 115.997, "eval_steps_per_second": 7.252, "step": 27500 },
    { "epoch": 8.96, "grad_norm": 2.262164354324341, "learning_rate": 4.776096e-05, "loss": 0.2027, "step": 28000 },
    { "epoch": 8.96, "eval_accuracy_per_token": 0.8652188777923584, "eval_loss": 0.5241075158119202, "eval_runtime": 215.8588, "eval_samples_per_second": 115.816, "eval_steps_per_second": 7.241, "step": 28000 }
  ],
  "logging_steps": 500,
  "max_steps": 625000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 200,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 19,
        "early_stopping_threshold": 0.01
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.043038124900352e+18,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}