|
{ |
|
"best_metric": 0.9794921875, |
|
"best_model_checkpoint": "resnet-Alzheimer/checkpoint-3600", |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 4000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 2.1754038333892822, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3812, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.91097092628479, |
|
"learning_rate": 0.0001, |
|
"loss": 1.34, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.134509563446045, |
|
"learning_rate": 0.00015, |
|
"loss": 1.2635, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 6.377187252044678, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1824, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 7.713193416595459, |
|
"learning_rate": 0.00025, |
|
"loss": 1.1226, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.569382429122925, |
|
"learning_rate": 0.0003, |
|
"loss": 1.0308, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 5.650737285614014, |
|
"learning_rate": 0.00035, |
|
"loss": 1.0115, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.970870494842529, |
|
"learning_rate": 0.0004, |
|
"loss": 1.0127, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.5087890625, |
|
"eval_loss": 0.9888483285903931, |
|
"eval_runtime": 5.4153, |
|
"eval_samples_per_second": 189.093, |
|
"eval_steps_per_second": 11.818, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.92154860496521, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 1.0175, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 4.131512641906738, |
|
"learning_rate": 0.0005, |
|
"loss": 0.918, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 2.9838943481445312, |
|
"learning_rate": 0.00055, |
|
"loss": 0.9613, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.4230854511260986, |
|
"learning_rate": 0.0006, |
|
"loss": 0.9188, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 3.774385690689087, |
|
"learning_rate": 0.0006500000000000001, |
|
"loss": 0.9245, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 3.175440549850464, |
|
"learning_rate": 0.0007, |
|
"loss": 0.8713, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.7985305786132812, |
|
"learning_rate": 0.00075, |
|
"loss": 0.9264, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.5870747566223145, |
|
"learning_rate": 0.0008, |
|
"loss": 0.9345, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5302734375, |
|
"eval_loss": 0.942151665687561, |
|
"eval_runtime": 5.1954, |
|
"eval_samples_per_second": 197.099, |
|
"eval_steps_per_second": 12.319, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.3990691900253296, |
|
"learning_rate": 0.00085, |
|
"loss": 0.8851, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.4066977500915527, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 0.9081, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 1.5531185865402222, |
|
"learning_rate": 0.00095, |
|
"loss": 0.8888, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.960204839706421, |
|
"learning_rate": 0.001, |
|
"loss": 0.9206, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.0218795537948608, |
|
"learning_rate": 0.0010500000000000002, |
|
"loss": 0.8735, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.9216176271438599, |
|
"learning_rate": 0.0011, |
|
"loss": 0.924, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.9017541408538818, |
|
"learning_rate": 0.00115, |
|
"loss": 0.9327, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.556686282157898, |
|
"learning_rate": 0.0012, |
|
"loss": 0.8889, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.578125, |
|
"eval_loss": 0.8723889589309692, |
|
"eval_runtime": 4.9225, |
|
"eval_samples_per_second": 208.026, |
|
"eval_steps_per_second": 13.002, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.9459726810455322, |
|
"learning_rate": 0.00125, |
|
"loss": 0.8643, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.749912679195404, |
|
"learning_rate": 0.0013000000000000002, |
|
"loss": 0.8978, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.9228126406669617, |
|
"learning_rate": 0.00135, |
|
"loss": 0.8838, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 1.0743939876556396, |
|
"learning_rate": 0.0014, |
|
"loss": 0.8868, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.997053325176239, |
|
"learning_rate": 0.00145, |
|
"loss": 0.8632, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.5891302227973938, |
|
"learning_rate": 0.0015, |
|
"loss": 0.8501, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.0819345712661743, |
|
"learning_rate": 0.0015500000000000002, |
|
"loss": 0.884, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.5622245073318481, |
|
"learning_rate": 0.0016, |
|
"loss": 0.8843, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.5888671875, |
|
"eval_loss": 0.8535706996917725, |
|
"eval_runtime": 6.5338, |
|
"eval_samples_per_second": 156.725, |
|
"eval_steps_per_second": 9.795, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.7301604747772217, |
|
"learning_rate": 0.00165, |
|
"loss": 0.8241, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 1.222732424736023, |
|
"learning_rate": 0.0017, |
|
"loss": 0.8929, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.8520879745483398, |
|
"learning_rate": 0.00175, |
|
"loss": 0.8726, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.6151734590530396, |
|
"learning_rate": 0.0018000000000000002, |
|
"loss": 0.8851, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.6786526441574097, |
|
"learning_rate": 0.00185, |
|
"loss": 0.8544, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.8025469779968262, |
|
"learning_rate": 0.0019, |
|
"loss": 0.8432, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.0158729553222656, |
|
"learning_rate": 0.00195, |
|
"loss": 0.8261, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.7815405130386353, |
|
"learning_rate": 0.002, |
|
"loss": 0.8397, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.615234375, |
|
"eval_loss": 0.8353910446166992, |
|
"eval_runtime": 5.3022, |
|
"eval_samples_per_second": 193.129, |
|
"eval_steps_per_second": 12.071, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 0.9535025954246521, |
|
"learning_rate": 0.0019944444444444445, |
|
"loss": 0.8636, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.43990448117256165, |
|
"learning_rate": 0.001988888888888889, |
|
"loss": 0.7931, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 0.6062633395195007, |
|
"learning_rate": 0.0019833333333333335, |
|
"loss": 0.8345, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.6349042057991028, |
|
"learning_rate": 0.001977777777777778, |
|
"loss": 0.8593, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 0.6786915063858032, |
|
"learning_rate": 0.0019722222222222224, |
|
"loss": 0.8435, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 0.7090786695480347, |
|
"learning_rate": 0.0019666666666666665, |
|
"loss": 0.8008, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 0.6538481712341309, |
|
"learning_rate": 0.001961111111111111, |
|
"loss": 0.763, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.4316461682319641, |
|
"learning_rate": 0.0019555555555555554, |
|
"loss": 0.8624, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5380859375, |
|
"eval_loss": 0.9221189022064209, |
|
"eval_runtime": 4.9173, |
|
"eval_samples_per_second": 208.245, |
|
"eval_steps_per_second": 13.015, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 0.5696819424629211, |
|
"learning_rate": 0.00195, |
|
"loss": 0.8065, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.6260728240013123, |
|
"learning_rate": 0.0019444444444444444, |
|
"loss": 0.7873, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 0.6295855045318604, |
|
"learning_rate": 0.0019388888888888889, |
|
"loss": 0.7802, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.6074417233467102, |
|
"learning_rate": 0.0019333333333333333, |
|
"loss": 0.7907, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 0.6099679470062256, |
|
"learning_rate": 0.0019277777777777778, |
|
"loss": 0.7391, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.9349565505981445, |
|
"learning_rate": 0.0019222222222222223, |
|
"loss": 0.7749, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 0.6923946142196655, |
|
"learning_rate": 0.0019166666666666668, |
|
"loss": 0.8232, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.5967056751251221, |
|
"learning_rate": 0.0019111111111111113, |
|
"loss": 0.7543, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.6474609375, |
|
"eval_loss": 0.7568103671073914, |
|
"eval_runtime": 5.3014, |
|
"eval_samples_per_second": 193.156, |
|
"eval_steps_per_second": 12.072, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 0.7925052642822266, |
|
"learning_rate": 0.0019055555555555555, |
|
"loss": 0.7086, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.715761125087738, |
|
"learning_rate": 0.0019, |
|
"loss": 0.7901, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"grad_norm": 0.6602711081504822, |
|
"learning_rate": 0.0018944444444444445, |
|
"loss": 0.7375, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.5104066729545593, |
|
"learning_rate": 0.001888888888888889, |
|
"loss": 0.7805, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"grad_norm": 0.6333702802658081, |
|
"learning_rate": 0.0018833333333333334, |
|
"loss": 0.7017, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.5703239440917969, |
|
"learning_rate": 0.001877777777777778, |
|
"loss": 0.7086, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 0.8939486742019653, |
|
"learning_rate": 0.0018722222222222222, |
|
"loss": 0.7399, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.6808524131774902, |
|
"learning_rate": 0.0018666666666666666, |
|
"loss": 0.6993, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.61328125, |
|
"eval_loss": 0.8830391764640808, |
|
"eval_runtime": 4.9073, |
|
"eval_samples_per_second": 208.667, |
|
"eval_steps_per_second": 13.042, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.7670312523841858, |
|
"learning_rate": 0.0018611111111111111, |
|
"loss": 0.7304, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.518883466720581, |
|
"learning_rate": 0.0018555555555555556, |
|
"loss": 0.6759, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"grad_norm": 0.6331384778022766, |
|
"learning_rate": 0.00185, |
|
"loss": 0.7323, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.5934571027755737, |
|
"learning_rate": 0.0018444444444444446, |
|
"loss": 0.7109, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 0.5555841326713562, |
|
"learning_rate": 0.0018388888888888888, |
|
"loss": 0.7361, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.45028582215309143, |
|
"learning_rate": 0.0018333333333333333, |
|
"loss": 0.7209, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 0.4313984811306, |
|
"learning_rate": 0.0018277777777777778, |
|
"loss": 0.692, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.6221916675567627, |
|
"learning_rate": 0.0018222222222222223, |
|
"loss": 0.7045, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.658203125, |
|
"eval_loss": 0.7372878789901733, |
|
"eval_runtime": 4.9218, |
|
"eval_samples_per_second": 208.053, |
|
"eval_steps_per_second": 13.003, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 0.9794626832008362, |
|
"learning_rate": 0.0018166666666666667, |
|
"loss": 0.641, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.8530990481376648, |
|
"learning_rate": 0.0018111111111111112, |
|
"loss": 0.6616, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"grad_norm": 0.5696712136268616, |
|
"learning_rate": 0.0018055555555555557, |
|
"loss": 0.6685, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.6695945858955383, |
|
"learning_rate": 0.0018000000000000002, |
|
"loss": 0.6079, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 0.9470874667167664, |
|
"learning_rate": 0.0017944444444444446, |
|
"loss": 0.6362, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 1.0435755252838135, |
|
"learning_rate": 0.001788888888888889, |
|
"loss": 0.7036, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 0.4934737980365753, |
|
"learning_rate": 0.0017833333333333334, |
|
"loss": 0.6955, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.49625110626220703, |
|
"learning_rate": 0.0017777777777777776, |
|
"loss": 0.6557, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7451171875, |
|
"eval_loss": 0.6075544357299805, |
|
"eval_runtime": 5.2806, |
|
"eval_samples_per_second": 193.918, |
|
"eval_steps_per_second": 12.12, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 10.12, |
|
"grad_norm": 0.49739229679107666, |
|
"learning_rate": 0.0017722222222222221, |
|
"loss": 0.605, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 10.25, |
|
"grad_norm": 0.6317277550697327, |
|
"learning_rate": 0.0017666666666666666, |
|
"loss": 0.5332, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 10.38, |
|
"grad_norm": 0.756879985332489, |
|
"learning_rate": 0.001761111111111111, |
|
"loss": 0.5619, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 10.5, |
|
"grad_norm": 0.6143298149108887, |
|
"learning_rate": 0.0017555555555555556, |
|
"loss": 0.601, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 10.62, |
|
"grad_norm": 0.7249147891998291, |
|
"learning_rate": 0.00175, |
|
"loss": 0.5935, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 10.75, |
|
"grad_norm": 0.4532654285430908, |
|
"learning_rate": 0.0017444444444444445, |
|
"loss": 0.5988, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.88, |
|
"grad_norm": 0.5738415718078613, |
|
"learning_rate": 0.001738888888888889, |
|
"loss": 0.6634, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"grad_norm": 0.5514868497848511, |
|
"learning_rate": 0.0017333333333333335, |
|
"loss": 0.5876, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.69921875, |
|
"eval_loss": 0.7281272411346436, |
|
"eval_runtime": 4.8994, |
|
"eval_samples_per_second": 209.004, |
|
"eval_steps_per_second": 13.063, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 11.12, |
|
"grad_norm": 0.7158863544464111, |
|
"learning_rate": 0.001727777777777778, |
|
"loss": 0.606, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 11.25, |
|
"grad_norm": 0.7355363368988037, |
|
"learning_rate": 0.0017222222222222224, |
|
"loss": 0.5923, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 11.38, |
|
"grad_norm": 0.7794367671012878, |
|
"learning_rate": 0.0017166666666666667, |
|
"loss": 0.5935, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 11.5, |
|
"grad_norm": 0.9755826592445374, |
|
"learning_rate": 0.0017111111111111112, |
|
"loss": 0.644, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 11.62, |
|
"grad_norm": 0.6257722973823547, |
|
"learning_rate": 0.0017055555555555554, |
|
"loss": 0.617, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 11.75, |
|
"grad_norm": 0.8550503253936768, |
|
"learning_rate": 0.0017, |
|
"loss": 0.5854, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 11.88, |
|
"grad_norm": 0.7347137928009033, |
|
"learning_rate": 0.0016944444444444444, |
|
"loss": 0.6358, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"grad_norm": 0.7867416739463806, |
|
"learning_rate": 0.0016888888888888889, |
|
"loss": 0.5732, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7509765625, |
|
"eval_loss": 0.5769097208976746, |
|
"eval_runtime": 5.275, |
|
"eval_samples_per_second": 194.122, |
|
"eval_steps_per_second": 12.133, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 12.12, |
|
"grad_norm": 0.6022630333900452, |
|
"learning_rate": 0.0016833333333333333, |
|
"loss": 0.5643, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 12.25, |
|
"grad_norm": 0.599958062171936, |
|
"learning_rate": 0.0016777777777777778, |
|
"loss": 0.5438, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 12.38, |
|
"grad_norm": 0.6484814286231995, |
|
"learning_rate": 0.0016722222222222223, |
|
"loss": 0.5208, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 0.8167735934257507, |
|
"learning_rate": 0.0016666666666666668, |
|
"loss": 0.5369, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 12.62, |
|
"grad_norm": 0.49088793992996216, |
|
"learning_rate": 0.0016611111111111113, |
|
"loss": 0.4803, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 12.75, |
|
"grad_norm": 0.6817615628242493, |
|
"learning_rate": 0.0016555555555555555, |
|
"loss": 0.5102, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 12.88, |
|
"grad_norm": 0.8656439781188965, |
|
"learning_rate": 0.00165, |
|
"loss": 0.5287, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"grad_norm": 0.5195401310920715, |
|
"learning_rate": 0.0016444444444444445, |
|
"loss": 0.4864, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.8310546875, |
|
"eval_loss": 0.445728600025177, |
|
"eval_runtime": 4.9054, |
|
"eval_samples_per_second": 208.751, |
|
"eval_steps_per_second": 13.047, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 13.12, |
|
"grad_norm": 0.7564366459846497, |
|
"learning_rate": 0.001638888888888889, |
|
"loss": 0.4715, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 13.25, |
|
"grad_norm": 0.6976212859153748, |
|
"learning_rate": 0.0016333333333333334, |
|
"loss": 0.471, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 13.38, |
|
"grad_norm": 0.7652568817138672, |
|
"learning_rate": 0.001627777777777778, |
|
"loss": 0.4821, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.7834269404411316, |
|
"learning_rate": 0.0016222222222222222, |
|
"loss": 0.5091, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 13.62, |
|
"grad_norm": 0.8186032176017761, |
|
"learning_rate": 0.0016166666666666666, |
|
"loss": 0.4611, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 13.75, |
|
"grad_norm": 0.7720199227333069, |
|
"learning_rate": 0.0016111111111111111, |
|
"loss": 0.5397, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 13.88, |
|
"grad_norm": 0.6797453165054321, |
|
"learning_rate": 0.0016055555555555556, |
|
"loss": 0.5135, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"grad_norm": 0.726184606552124, |
|
"learning_rate": 0.0016, |
|
"loss": 0.5175, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7841796875, |
|
"eval_loss": 0.5278125405311584, |
|
"eval_runtime": 4.8906, |
|
"eval_samples_per_second": 209.383, |
|
"eval_steps_per_second": 13.086, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 14.12, |
|
"grad_norm": 0.6777172088623047, |
|
"learning_rate": 0.0015944444444444446, |
|
"loss": 0.4831, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.6228752732276917, |
|
"learning_rate": 0.0015888888888888888, |
|
"loss": 0.4657, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 14.38, |
|
"grad_norm": 0.7296370267868042, |
|
"learning_rate": 0.0015833333333333333, |
|
"loss": 0.5084, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 14.5, |
|
"grad_norm": 0.7809439897537231, |
|
"learning_rate": 0.0015777777777777778, |
|
"loss": 0.4749, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 14.62, |
|
"grad_norm": 0.4627506136894226, |
|
"learning_rate": 0.0015722222222222223, |
|
"loss": 0.4157, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 14.75, |
|
"grad_norm": 0.465811163187027, |
|
"learning_rate": 0.0015666666666666667, |
|
"loss": 0.4192, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 14.88, |
|
"grad_norm": 0.636384129524231, |
|
"learning_rate": 0.0015611111111111112, |
|
"loss": 0.4627, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"grad_norm": 0.8339561223983765, |
|
"learning_rate": 0.0015555555555555557, |
|
"loss": 0.4865, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.837890625, |
|
"eval_loss": 0.4163576364517212, |
|
"eval_runtime": 5.3394, |
|
"eval_samples_per_second": 191.782, |
|
"eval_steps_per_second": 11.986, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 15.12, |
|
"grad_norm": 0.5218497514724731, |
|
"learning_rate": 0.0015500000000000002, |
|
"loss": 0.4253, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 15.25, |
|
"grad_norm": 0.6273193359375, |
|
"learning_rate": 0.0015444444444444446, |
|
"loss": 0.4474, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 15.38, |
|
"grad_norm": 0.6019622087478638, |
|
"learning_rate": 0.001538888888888889, |
|
"loss": 0.4008, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 15.5, |
|
"grad_norm": 0.7020573616027832, |
|
"learning_rate": 0.0015333333333333334, |
|
"loss": 0.3768, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 15.62, |
|
"grad_norm": 0.577691376209259, |
|
"learning_rate": 0.0015277777777777776, |
|
"loss": 0.4108, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 15.75, |
|
"grad_norm": 0.8489026427268982, |
|
"learning_rate": 0.0015222222222222221, |
|
"loss": 0.3994, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 15.88, |
|
"grad_norm": 0.42233309149742126, |
|
"learning_rate": 0.0015166666666666666, |
|
"loss": 0.4292, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"grad_norm": 0.48867735266685486, |
|
"learning_rate": 0.001511111111111111, |
|
"loss": 0.4049, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.830078125, |
|
"eval_loss": 0.4204105734825134, |
|
"eval_runtime": 4.8855, |
|
"eval_samples_per_second": 209.602, |
|
"eval_steps_per_second": 13.1, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 16.12, |
|
"grad_norm": 0.6492818593978882, |
|
"learning_rate": 0.0015055555555555556, |
|
"loss": 0.3885, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 16.25, |
|
"grad_norm": 0.4546281397342682, |
|
"learning_rate": 0.0015, |
|
"loss": 0.4096, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 16.38, |
|
"grad_norm": 0.6827344298362732, |
|
"learning_rate": 0.0014944444444444445, |
|
"loss": 0.3618, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 16.5, |
|
"grad_norm": 0.454326868057251, |
|
"learning_rate": 0.001488888888888889, |
|
"loss": 0.3863, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 16.62, |
|
"grad_norm": 0.6911420226097107, |
|
"learning_rate": 0.0014833333333333335, |
|
"loss": 0.4264, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 16.75, |
|
"grad_norm": 0.6122339367866516, |
|
"learning_rate": 0.001477777777777778, |
|
"loss": 0.4205, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 16.88, |
|
"grad_norm": 0.5123728513717651, |
|
"learning_rate": 0.0014722222222222224, |
|
"loss": 0.4419, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"grad_norm": 1.0908498764038086, |
|
"learning_rate": 0.0014666666666666667, |
|
"loss": 0.4167, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.828125, |
|
"eval_loss": 0.47203314304351807, |
|
"eval_runtime": 5.3495, |
|
"eval_samples_per_second": 191.421, |
|
"eval_steps_per_second": 11.964, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 17.12, |
|
"grad_norm": 0.42975571751594543, |
|
"learning_rate": 0.0014611111111111112, |
|
"loss": 0.4006, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 17.25, |
|
"grad_norm": 0.6392154693603516, |
|
"learning_rate": 0.0014555555555555554, |
|
"loss": 0.3581, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 17.38, |
|
"grad_norm": 0.6548070907592773, |
|
"learning_rate": 0.00145, |
|
"loss": 0.3672, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 17.5, |
|
"grad_norm": 0.6939528584480286, |
|
"learning_rate": 0.0014444444444444444, |
|
"loss": 0.3514, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 17.62, |
|
"grad_norm": 0.6098494529724121, |
|
"learning_rate": 0.0014388888888888889, |
|
"loss": 0.3835, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 17.75, |
|
"grad_norm": 0.5356572866439819, |
|
"learning_rate": 0.0014333333333333333, |
|
"loss": 0.3326, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 17.88, |
|
"grad_norm": 0.6472760438919067, |
|
"learning_rate": 0.0014277777777777778, |
|
"loss": 0.3829, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"grad_norm": 0.67198646068573, |
|
"learning_rate": 0.0014222222222222223, |
|
"loss": 0.36, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.81640625, |
|
"eval_loss": 0.4660454988479614, |
|
"eval_runtime": 4.9124, |
|
"eval_samples_per_second": 208.451, |
|
"eval_steps_per_second": 13.028, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 18.12, |
|
"grad_norm": 0.4594449996948242, |
|
"learning_rate": 0.0014166666666666668, |
|
"loss": 0.3549, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 18.25, |
|
"grad_norm": 0.4456086754798889, |
|
"learning_rate": 0.0014111111111111112, |
|
"loss": 0.2899, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 18.38, |
|
"grad_norm": 0.724087119102478, |
|
"learning_rate": 0.0014055555555555555, |
|
"loss": 0.2976, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.8099024891853333, |
|
"learning_rate": 0.0014, |
|
"loss": 0.3706, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 18.62, |
|
"grad_norm": 0.6271733641624451, |
|
"learning_rate": 0.0013944444444444445, |
|
"loss": 0.3591, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 18.75, |
|
"grad_norm": 0.5864254236221313, |
|
"learning_rate": 0.001388888888888889, |
|
"loss": 0.3184, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 18.88, |
|
"grad_norm": 0.4915286898612976, |
|
"learning_rate": 0.0013833333333333334, |
|
"loss": 0.301, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"grad_norm": 0.6932692527770996, |
|
"learning_rate": 0.001377777777777778, |
|
"loss": 0.3195, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.876953125, |
|
"eval_loss": 0.306354820728302, |
|
"eval_runtime": 5.2563, |
|
"eval_samples_per_second": 194.815, |
|
"eval_steps_per_second": 12.176, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 19.12, |
|
"grad_norm": 0.5778792500495911, |
|
"learning_rate": 0.0013722222222222222, |
|
"loss": 0.3493, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 19.25, |
|
"grad_norm": 0.951936662197113, |
|
"learning_rate": 0.0013666666666666666, |
|
"loss": 0.3305, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 19.38, |
|
"grad_norm": 0.6778426170349121, |
|
"learning_rate": 0.0013611111111111111, |
|
"loss": 0.32, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 19.5, |
|
"grad_norm": 0.6356533765792847, |
|
"learning_rate": 0.0013555555555555556, |
|
"loss": 0.2889, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 19.62, |
|
"grad_norm": 0.6476128697395325, |
|
"learning_rate": 0.00135, |
|
"loss": 0.2907, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 19.75, |
|
"grad_norm": 0.4664938151836395, |
|
"learning_rate": 0.0013444444444444445, |
|
"loss": 0.3261, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 19.88, |
|
"grad_norm": 1.06290602684021, |
|
"learning_rate": 0.0013388888888888888, |
|
"loss": 0.3365, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 0.5365467667579651, |
|
"learning_rate": 0.0013333333333333333, |
|
"loss": 0.3652, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.912109375, |
|
"eval_loss": 0.25709766149520874, |
|
"eval_runtime": 4.9952, |
|
"eval_samples_per_second": 204.995, |
|
"eval_steps_per_second": 12.812, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 20.12, |
|
"grad_norm": 0.5051919221878052, |
|
"learning_rate": 0.0013277777777777778, |
|
"loss": 0.3147, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 20.25, |
|
"grad_norm": 0.5098996162414551, |
|
"learning_rate": 0.0013222222222222222, |
|
"loss": 0.3085, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 20.38, |
|
"grad_norm": 0.5585361123085022, |
|
"learning_rate": 0.0013166666666666667, |
|
"loss": 0.3679, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 20.5, |
|
"grad_norm": 0.38560378551483154, |
|
"learning_rate": 0.0013111111111111112, |
|
"loss": 0.2987, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 20.62, |
|
"grad_norm": 0.3209057152271271, |
|
"learning_rate": 0.0013055555555555557, |
|
"loss": 0.2792, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 20.75, |
|
"grad_norm": 0.6471489667892456, |
|
"learning_rate": 0.0013000000000000002, |
|
"loss": 0.2755, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 20.88, |
|
"grad_norm": 0.8814804553985596, |
|
"learning_rate": 0.0012944444444444446, |
|
"loss": 0.2993, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"grad_norm": 0.5392754673957825, |
|
"learning_rate": 0.001288888888888889, |
|
"loss": 0.2794, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.9150390625, |
|
"eval_loss": 0.24504294991493225, |
|
"eval_runtime": 4.8909, |
|
"eval_samples_per_second": 209.37, |
|
"eval_steps_per_second": 13.086, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 21.12, |
|
"grad_norm": 0.6234158873558044, |
|
"learning_rate": 0.0012833333333333334, |
|
"loss": 0.2926, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 21.25, |
|
"grad_norm": 0.4284802973270416, |
|
"learning_rate": 0.0012777777777777776, |
|
"loss": 0.2803, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 21.38, |
|
"grad_norm": 0.688140869140625, |
|
"learning_rate": 0.0012722222222222221, |
|
"loss": 0.2799, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 21.5, |
|
"grad_norm": 0.8576880097389221, |
|
"learning_rate": 0.0012666666666666666, |
|
"loss": 0.2868, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 21.62, |
|
"grad_norm": 0.6299762725830078, |
|
"learning_rate": 0.001261111111111111, |
|
"loss": 0.2971, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 21.75, |
|
"grad_norm": 0.7093678116798401, |
|
"learning_rate": 0.0012555555555555555, |
|
"loss": 0.2905, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 21.88, |
|
"grad_norm": 0.4271737039089203, |
|
"learning_rate": 0.00125, |
|
"loss": 0.3336, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"grad_norm": 0.6771571040153503, |
|
"learning_rate": 0.0012444444444444445, |
|
"loss": 0.2704, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.9033203125, |
|
"eval_loss": 0.23907524347305298, |
|
"eval_runtime": 5.3054, |
|
"eval_samples_per_second": 193.012, |
|
"eval_steps_per_second": 12.063, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 22.12, |
|
"grad_norm": 0.44859397411346436, |
|
"learning_rate": 0.001238888888888889, |
|
"loss": 0.28, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 22.25, |
|
"grad_norm": 0.5617765784263611, |
|
"learning_rate": 0.0012333333333333335, |
|
"loss": 0.3093, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 22.38, |
|
"grad_norm": 0.6634913682937622, |
|
"learning_rate": 0.001227777777777778, |
|
"loss": 0.2417, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 22.5, |
|
"grad_norm": 0.670782744884491, |
|
"learning_rate": 0.0012222222222222224, |
|
"loss": 0.2932, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 22.62, |
|
"grad_norm": 0.6564796566963196, |
|
"learning_rate": 0.0012166666666666667, |
|
"loss": 0.3042, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 22.75, |
|
"grad_norm": 0.34089842438697815, |
|
"learning_rate": 0.0012111111111111112, |
|
"loss": 0.2925, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 22.88, |
|
"grad_norm": 0.5612368583679199, |
|
"learning_rate": 0.0012055555555555554, |
|
"loss": 0.2559, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"grad_norm": 0.624458909034729, |
|
"learning_rate": 0.0012, |
|
"loss": 0.2612, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.927734375, |
|
"eval_loss": 0.23524078726768494, |
|
"eval_runtime": 4.902, |
|
"eval_samples_per_second": 208.893, |
|
"eval_steps_per_second": 13.056, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 23.12, |
|
"grad_norm": 0.6820557117462158, |
|
"learning_rate": 0.0011944444444444444, |
|
"loss": 0.2282, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 23.25, |
|
"grad_norm": 0.5979276895523071, |
|
"learning_rate": 0.0011888888888888889, |
|
"loss": 0.2569, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 23.38, |
|
"grad_norm": 0.5427021384239197, |
|
"learning_rate": 0.0011833333333333333, |
|
"loss": 0.2724, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 23.5, |
|
"grad_norm": 0.4382477104663849, |
|
"learning_rate": 0.0011777777777777778, |
|
"loss": 0.2616, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 23.62, |
|
"grad_norm": 0.6240445375442505, |
|
"learning_rate": 0.0011722222222222223, |
|
"loss": 0.2636, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 23.75, |
|
"grad_norm": 0.7440346479415894, |
|
"learning_rate": 0.0011666666666666668, |
|
"loss": 0.2913, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 23.88, |
|
"grad_norm": 0.4682701826095581, |
|
"learning_rate": 0.0011611111111111112, |
|
"loss": 0.2499, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"grad_norm": 0.5112751722335815, |
|
"learning_rate": 0.0011555555555555555, |
|
"loss": 0.2425, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.828125, |
|
"eval_loss": 0.4720377027988434, |
|
"eval_runtime": 5.3156, |
|
"eval_samples_per_second": 192.639, |
|
"eval_steps_per_second": 12.04, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 24.12, |
|
"grad_norm": 0.765444278717041, |
|
"learning_rate": 0.00115, |
|
"loss": 0.2736, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 24.25, |
|
"grad_norm": 0.380066841840744, |
|
"learning_rate": 0.0011444444444444445, |
|
"loss": 0.2357, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 24.38, |
|
"grad_norm": 0.43320003151893616, |
|
"learning_rate": 0.001138888888888889, |
|
"loss": 0.2518, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 24.5, |
|
"grad_norm": 0.5003307461738586, |
|
"learning_rate": 0.0011333333333333334, |
|
"loss": 0.2898, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 24.62, |
|
"grad_norm": 0.41153478622436523, |
|
"learning_rate": 0.001127777777777778, |
|
"loss": 0.2209, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 24.75, |
|
"grad_norm": 0.41805940866470337, |
|
"learning_rate": 0.0011222222222222222, |
|
"loss": 0.235, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 24.88, |
|
"grad_norm": 0.5226410627365112, |
|
"learning_rate": 0.0011166666666666666, |
|
"loss": 0.2349, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 0.3767559826374054, |
|
"learning_rate": 0.0011111111111111111, |
|
"loss": 0.2567, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.9130859375, |
|
"eval_loss": 0.22960150241851807, |
|
"eval_runtime": 4.887, |
|
"eval_samples_per_second": 209.535, |
|
"eval_steps_per_second": 13.096, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 25.12, |
|
"grad_norm": 0.6860052943229675, |
|
"learning_rate": 0.0011055555555555556, |
|
"loss": 0.2426, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 25.25, |
|
"grad_norm": 0.3876688778400421, |
|
"learning_rate": 0.0011, |
|
"loss": 0.2243, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 25.38, |
|
"grad_norm": 0.3251183032989502, |
|
"learning_rate": 0.0010944444444444445, |
|
"loss": 0.234, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 25.5, |
|
"grad_norm": 0.5538493990898132, |
|
"learning_rate": 0.0010888888888888888, |
|
"loss": 0.2547, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 25.62, |
|
"grad_norm": 0.6539644598960876, |
|
"learning_rate": 0.0010833333333333333, |
|
"loss": 0.2382, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 25.75, |
|
"grad_norm": 0.6687932014465332, |
|
"learning_rate": 0.0010777777777777778, |
|
"loss": 0.2254, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 25.88, |
|
"grad_norm": 0.6210919618606567, |
|
"learning_rate": 0.0010722222222222222, |
|
"loss": 0.2356, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"grad_norm": 0.5525135397911072, |
|
"learning_rate": 0.0010666666666666667, |
|
"loss": 0.2302, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.89453125, |
|
"eval_loss": 0.30673664808273315, |
|
"eval_runtime": 4.9576, |
|
"eval_samples_per_second": 206.552, |
|
"eval_steps_per_second": 12.909, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 26.12, |
|
"grad_norm": 0.5014208555221558, |
|
"learning_rate": 0.0010611111111111112, |
|
"loss": 0.2403, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 26.25, |
|
"grad_norm": 0.6093131303787231, |
|
"learning_rate": 0.0010555555555555557, |
|
"loss": 0.2356, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 26.38, |
|
"grad_norm": 0.3627248704433441, |
|
"learning_rate": 0.0010500000000000002, |
|
"loss": 0.2509, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 26.5, |
|
"grad_norm": 0.4119124114513397, |
|
"learning_rate": 0.0010444444444444446, |
|
"loss": 0.1915, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 26.62, |
|
"grad_norm": 0.5565811395645142, |
|
"learning_rate": 0.0010388888888888889, |
|
"loss": 0.191, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 26.75, |
|
"grad_norm": 0.44097578525543213, |
|
"learning_rate": 0.0010333333333333334, |
|
"loss": 0.2353, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 26.88, |
|
"grad_norm": 0.4542636275291443, |
|
"learning_rate": 0.0010277777777777776, |
|
"loss": 0.2144, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"grad_norm": 0.4763772785663605, |
|
"learning_rate": 0.0010222222222222221, |
|
"loss": 0.2358, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.9375, |
|
"eval_loss": 0.17758239805698395, |
|
"eval_runtime": 5.3185, |
|
"eval_samples_per_second": 192.534, |
|
"eval_steps_per_second": 12.033, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 27.12, |
|
"grad_norm": 0.7219308614730835, |
|
"learning_rate": 0.0010166666666666666, |
|
"loss": 0.238, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 27.25, |
|
"grad_norm": 0.7707520127296448, |
|
"learning_rate": 0.001011111111111111, |
|
"loss": 0.1863, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 27.38, |
|
"grad_norm": 0.6878935098648071, |
|
"learning_rate": 0.0010055555555555555, |
|
"loss": 0.2493, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 27.5, |
|
"grad_norm": 0.5451861619949341, |
|
"learning_rate": 0.001, |
|
"loss": 0.2374, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 27.62, |
|
"grad_norm": 0.39642319083213806, |
|
"learning_rate": 0.0009944444444444445, |
|
"loss": 0.2382, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 27.75, |
|
"grad_norm": 0.4122956097126007, |
|
"learning_rate": 0.000988888888888889, |
|
"loss": 0.2176, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 27.88, |
|
"grad_norm": 0.6155421733856201, |
|
"learning_rate": 0.0009833333333333332, |
|
"loss": 0.2128, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"grad_norm": 0.7283052206039429, |
|
"learning_rate": 0.0009777777777777777, |
|
"loss": 0.2173, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.94921875, |
|
"eval_loss": 0.15962785482406616, |
|
"eval_runtime": 4.9115, |
|
"eval_samples_per_second": 208.489, |
|
"eval_steps_per_second": 13.031, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 28.12, |
|
"grad_norm": 0.39027243852615356, |
|
"learning_rate": 0.0009722222222222222, |
|
"loss": 0.1979, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 28.25, |
|
"grad_norm": 0.5258718729019165, |
|
"learning_rate": 0.0009666666666666667, |
|
"loss": 0.1447, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 28.38, |
|
"grad_norm": 0.6615960001945496, |
|
"learning_rate": 0.0009611111111111112, |
|
"loss": 0.2403, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 28.5, |
|
"grad_norm": 0.4044310748577118, |
|
"learning_rate": 0.0009555555555555556, |
|
"loss": 0.1981, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 28.62, |
|
"grad_norm": 0.2666930556297302, |
|
"learning_rate": 0.00095, |
|
"loss": 0.2108, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 28.75, |
|
"grad_norm": 0.5612334609031677, |
|
"learning_rate": 0.0009444444444444445, |
|
"loss": 0.1783, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 28.88, |
|
"grad_norm": 0.48420026898384094, |
|
"learning_rate": 0.000938888888888889, |
|
"loss": 0.1848, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"grad_norm": 0.5850337743759155, |
|
"learning_rate": 0.0009333333333333333, |
|
"loss": 0.1798, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.94140625, |
|
"eval_loss": 0.1548241674900055, |
|
"eval_runtime": 5.32, |
|
"eval_samples_per_second": 192.483, |
|
"eval_steps_per_second": 12.03, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 29.12, |
|
"grad_norm": 0.5059901475906372, |
|
"learning_rate": 0.0009277777777777778, |
|
"loss": 0.1954, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 29.25, |
|
"grad_norm": 0.22623513638973236, |
|
"learning_rate": 0.0009222222222222223, |
|
"loss": 0.1604, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 29.38, |
|
"grad_norm": 0.2330830693244934, |
|
"learning_rate": 0.0009166666666666666, |
|
"loss": 0.2125, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 29.5, |
|
"grad_norm": 0.4784901440143585, |
|
"learning_rate": 0.0009111111111111111, |
|
"loss": 0.1823, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 29.62, |
|
"grad_norm": 0.6156973242759705, |
|
"learning_rate": 0.0009055555555555556, |
|
"loss": 0.2289, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 29.75, |
|
"grad_norm": 0.4373360872268677, |
|
"learning_rate": 0.0009000000000000001, |
|
"loss": 0.2127, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 29.88, |
|
"grad_norm": 0.501115083694458, |
|
"learning_rate": 0.0008944444444444445, |
|
"loss": 0.2359, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 0.411662757396698, |
|
"learning_rate": 0.0008888888888888888, |
|
"loss": 0.197, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.95703125, |
|
"eval_loss": 0.17402663826942444, |
|
"eval_runtime": 4.919, |
|
"eval_samples_per_second": 208.172, |
|
"eval_steps_per_second": 13.011, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 30.12, |
|
"grad_norm": 0.45976510643959045, |
|
"learning_rate": 0.0008833333333333333, |
|
"loss": 0.1608, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 30.25, |
|
"grad_norm": 0.3243074417114258, |
|
"learning_rate": 0.0008777777777777778, |
|
"loss": 0.1742, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 30.38, |
|
"grad_norm": 0.5205725431442261, |
|
"learning_rate": 0.0008722222222222223, |
|
"loss": 0.1718, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 30.5, |
|
"grad_norm": 0.3976719081401825, |
|
"learning_rate": 0.0008666666666666667, |
|
"loss": 0.2247, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 30.62, |
|
"grad_norm": 0.2859196662902832, |
|
"learning_rate": 0.0008611111111111112, |
|
"loss": 0.1884, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 30.75, |
|
"grad_norm": 0.5310297012329102, |
|
"learning_rate": 0.0008555555555555556, |
|
"loss": 0.1672, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 30.88, |
|
"grad_norm": 0.5172590613365173, |
|
"learning_rate": 0.00085, |
|
"loss": 0.1828, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"grad_norm": 0.6098745465278625, |
|
"learning_rate": 0.0008444444444444444, |
|
"loss": 0.1654, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.966796875, |
|
"eval_loss": 0.12167137861251831, |
|
"eval_runtime": 4.9956, |
|
"eval_samples_per_second": 204.98, |
|
"eval_steps_per_second": 12.811, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 31.12, |
|
"grad_norm": 0.3343498706817627, |
|
"learning_rate": 0.0008388888888888889, |
|
"loss": 0.1784, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 31.25, |
|
"grad_norm": 0.3938640058040619, |
|
"learning_rate": 0.0008333333333333334, |
|
"loss": 0.1697, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 31.38, |
|
"grad_norm": 0.41868484020233154, |
|
"learning_rate": 0.0008277777777777778, |
|
"loss": 0.2263, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 31.5, |
|
"grad_norm": 0.4363801181316376, |
|
"learning_rate": 0.0008222222222222222, |
|
"loss": 0.1762, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 31.62, |
|
"grad_norm": 0.5088948607444763, |
|
"learning_rate": 0.0008166666666666667, |
|
"loss": 0.1711, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 31.75, |
|
"grad_norm": 0.5423977375030518, |
|
"learning_rate": 0.0008111111111111111, |
|
"loss": 0.1675, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 31.88, |
|
"grad_norm": 0.431382954120636, |
|
"learning_rate": 0.0008055555555555556, |
|
"loss": 0.2216, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"grad_norm": 0.4037337303161621, |
|
"learning_rate": 0.0008, |
|
"loss": 0.1896, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.92578125, |
|
"eval_loss": 0.2552070617675781, |
|
"eval_runtime": 5.2019, |
|
"eval_samples_per_second": 196.853, |
|
"eval_steps_per_second": 12.303, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 32.12, |
|
"grad_norm": 0.6025939583778381, |
|
"learning_rate": 0.0007944444444444444, |
|
"loss": 0.1926, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 32.25, |
|
"grad_norm": 0.7205588221549988, |
|
"learning_rate": 0.0007888888888888889, |
|
"loss": 0.1755, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 32.38, |
|
"grad_norm": 0.3841509222984314, |
|
"learning_rate": 0.0007833333333333334, |
|
"loss": 0.1696, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 32.5, |
|
"grad_norm": 0.5659075975418091, |
|
"learning_rate": 0.0007777777777777778, |
|
"loss": 0.133, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 32.62, |
|
"grad_norm": 0.7011501789093018, |
|
"learning_rate": 0.0007722222222222223, |
|
"loss": 0.2069, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 32.75, |
|
"grad_norm": 0.5933576822280884, |
|
"learning_rate": 0.0007666666666666667, |
|
"loss": 0.1799, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 32.88, |
|
"grad_norm": 0.636463463306427, |
|
"learning_rate": 0.0007611111111111111, |
|
"loss": 0.1884, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"grad_norm": 0.36000609397888184, |
|
"learning_rate": 0.0007555555555555555, |
|
"loss": 0.1705, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.97265625, |
|
"eval_loss": 0.10305143892765045, |
|
"eval_runtime": 4.8746, |
|
"eval_samples_per_second": 210.07, |
|
"eval_steps_per_second": 13.129, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 33.12, |
|
"grad_norm": 0.25941601395606995, |
|
"learning_rate": 0.00075, |
|
"loss": 0.143, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 33.25, |
|
"grad_norm": 0.6486319899559021, |
|
"learning_rate": 0.0007444444444444445, |
|
"loss": 0.1819, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 33.38, |
|
"grad_norm": 0.34492290019989014, |
|
"learning_rate": 0.000738888888888889, |
|
"loss": 0.1877, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 33.5, |
|
"grad_norm": 0.5475990176200867, |
|
"learning_rate": 0.0007333333333333333, |
|
"loss": 0.1586, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 33.62, |
|
"grad_norm": 0.231631800532341, |
|
"learning_rate": 0.0007277777777777777, |
|
"loss": 0.145, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 33.75, |
|
"grad_norm": 0.6208530068397522, |
|
"learning_rate": 0.0007222222222222222, |
|
"loss": 0.2015, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 33.88, |
|
"grad_norm": 0.7229673862457275, |
|
"learning_rate": 0.0007166666666666667, |
|
"loss": 0.1814, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"grad_norm": 0.38056522607803345, |
|
"learning_rate": 0.0007111111111111111, |
|
"loss": 0.1689, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.96875, |
|
"eval_loss": 0.10111749172210693, |
|
"eval_runtime": 5.2922, |
|
"eval_samples_per_second": 193.491, |
|
"eval_steps_per_second": 12.093, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 34.12, |
|
"grad_norm": 0.5405479669570923, |
|
"learning_rate": 0.0007055555555555556, |
|
"loss": 0.16, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 34.25, |
|
"grad_norm": 0.5781314373016357, |
|
"learning_rate": 0.0007, |
|
"loss": 0.1598, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 34.38, |
|
"grad_norm": 0.33385559916496277, |
|
"learning_rate": 0.0006944444444444445, |
|
"loss": 0.1747, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 34.5, |
|
"grad_norm": 0.36587977409362793, |
|
"learning_rate": 0.000688888888888889, |
|
"loss": 0.1376, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 34.62, |
|
"grad_norm": 0.3459375202655792, |
|
"learning_rate": 0.0006833333333333333, |
|
"loss": 0.1297, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 34.75, |
|
"grad_norm": 0.5182803273200989, |
|
"learning_rate": 0.0006777777777777778, |
|
"loss": 0.1747, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 34.88, |
|
"grad_norm": 0.39014366269111633, |
|
"learning_rate": 0.0006722222222222223, |
|
"loss": 0.169, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"grad_norm": 0.4516375660896301, |
|
"learning_rate": 0.0006666666666666666, |
|
"loss": 0.1439, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.96484375, |
|
"eval_loss": 0.11748197674751282, |
|
"eval_runtime": 4.909, |
|
"eval_samples_per_second": 208.595, |
|
"eval_steps_per_second": 13.037, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 35.12, |
|
"grad_norm": 0.47782474756240845, |
|
"learning_rate": 0.0006611111111111111, |
|
"loss": 0.1417, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 35.25, |
|
"grad_norm": 0.11640643328428268, |
|
"learning_rate": 0.0006555555555555556, |
|
"loss": 0.1226, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 35.38, |
|
"grad_norm": 0.4363173544406891, |
|
"learning_rate": 0.0006500000000000001, |
|
"loss": 0.14, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 35.5, |
|
"grad_norm": 0.6676026582717896, |
|
"learning_rate": 0.0006444444444444444, |
|
"loss": 0.1548, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 35.62, |
|
"grad_norm": 0.4940982162952423, |
|
"learning_rate": 0.0006388888888888888, |
|
"loss": 0.1554, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 35.75, |
|
"grad_norm": 0.6478282809257507, |
|
"learning_rate": 0.0006333333333333333, |
|
"loss": 0.1641, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 35.88, |
|
"grad_norm": 0.6007707715034485, |
|
"learning_rate": 0.0006277777777777778, |
|
"loss": 0.1484, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"grad_norm": 0.4945576786994934, |
|
"learning_rate": 0.0006222222222222223, |
|
"loss": 0.1606, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.9443359375, |
|
"eval_loss": 0.18046385049819946, |
|
"eval_runtime": 5.0626, |
|
"eval_samples_per_second": 202.266, |
|
"eval_steps_per_second": 12.642, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 36.12, |
|
"grad_norm": 0.4033058285713196, |
|
"learning_rate": 0.0006166666666666667, |
|
"loss": 0.1372, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 36.25, |
|
"grad_norm": 0.30507412552833557, |
|
"learning_rate": 0.0006111111111111112, |
|
"loss": 0.1549, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 36.38, |
|
"grad_norm": 0.3899296820163727, |
|
"learning_rate": 0.0006055555555555556, |
|
"loss": 0.1667, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 36.5, |
|
"grad_norm": 0.44058963656425476, |
|
"learning_rate": 0.0006, |
|
"loss": 0.1712, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 36.62, |
|
"grad_norm": 0.4805178642272949, |
|
"learning_rate": 0.0005944444444444444, |
|
"loss": 0.1805, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 36.75, |
|
"grad_norm": 0.37880581617355347, |
|
"learning_rate": 0.0005888888888888889, |
|
"loss": 0.1411, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 36.88, |
|
"grad_norm": 0.4263412654399872, |
|
"learning_rate": 0.0005833333333333334, |
|
"loss": 0.1714, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"grad_norm": 0.2723836898803711, |
|
"learning_rate": 0.0005777777777777778, |
|
"loss": 0.1281, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.9677734375, |
|
"eval_loss": 0.1253870278596878, |
|
"eval_runtime": 5.1275, |
|
"eval_samples_per_second": 199.709, |
|
"eval_steps_per_second": 12.482, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 37.12, |
|
"grad_norm": 0.4946765601634979, |
|
"learning_rate": 0.0005722222222222222, |
|
"loss": 0.1059, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 37.25, |
|
"grad_norm": 0.4709372818470001, |
|
"learning_rate": 0.0005666666666666667, |
|
"loss": 0.1321, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 37.38, |
|
"grad_norm": 0.36459285020828247, |
|
"learning_rate": 0.0005611111111111111, |
|
"loss": 0.1351, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 37.5, |
|
"grad_norm": 0.4145031273365021, |
|
"learning_rate": 0.0005555555555555556, |
|
"loss": 0.1545, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 37.62, |
|
"grad_norm": 0.5457221865653992, |
|
"learning_rate": 0.00055, |
|
"loss": 0.1424, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 37.75, |
|
"grad_norm": 0.5123695731163025, |
|
"learning_rate": 0.0005444444444444444, |
|
"loss": 0.1508, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 37.88, |
|
"grad_norm": 0.29368171095848083, |
|
"learning_rate": 0.0005388888888888889, |
|
"loss": 0.1438, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"grad_norm": 0.6859858632087708, |
|
"learning_rate": 0.0005333333333333334, |
|
"loss": 0.1518, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.96484375, |
|
"eval_loss": 0.11837992072105408, |
|
"eval_runtime": 4.9042, |
|
"eval_samples_per_second": 208.8, |
|
"eval_steps_per_second": 13.05, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 38.12, |
|
"grad_norm": 0.3859548270702362, |
|
"learning_rate": 0.0005277777777777778, |
|
"loss": 0.1455, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 38.25, |
|
"grad_norm": 0.21001270413398743, |
|
"learning_rate": 0.0005222222222222223, |
|
"loss": 0.13, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 38.38, |
|
"grad_norm": 0.4814240038394928, |
|
"learning_rate": 0.0005166666666666667, |
|
"loss": 0.1291, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 38.5, |
|
"grad_norm": 0.4478558301925659, |
|
"learning_rate": 0.0005111111111111111, |
|
"loss": 0.1293, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 38.62, |
|
"grad_norm": 0.4811321496963501, |
|
"learning_rate": 0.0005055555555555555, |
|
"loss": 0.1231, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 38.75, |
|
"grad_norm": 0.2841961085796356, |
|
"learning_rate": 0.0005, |
|
"loss": 0.166, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 38.88, |
|
"grad_norm": 0.5479158759117126, |
|
"learning_rate": 0.0004944444444444445, |
|
"loss": 0.1044, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"grad_norm": 0.37449321150779724, |
|
"learning_rate": 0.0004888888888888889, |
|
"loss": 0.1531, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.9736328125, |
|
"eval_loss": 0.09921471774578094, |
|
"eval_runtime": 5.3451, |
|
"eval_samples_per_second": 191.577, |
|
"eval_steps_per_second": 11.974, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 39.12, |
|
"grad_norm": 0.5961503386497498, |
|
"learning_rate": 0.00048333333333333334, |
|
"loss": 0.1321, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 39.25, |
|
"grad_norm": 0.3140615224838257, |
|
"learning_rate": 0.0004777777777777778, |
|
"loss": 0.1192, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 39.38, |
|
"grad_norm": 0.8949409127235413, |
|
"learning_rate": 0.00047222222222222224, |
|
"loss": 0.122, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 39.5, |
|
"grad_norm": 0.21187840402126312, |
|
"learning_rate": 0.00046666666666666666, |
|
"loss": 0.1341, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 39.62, |
|
"grad_norm": 0.6364386081695557, |
|
"learning_rate": 0.00046111111111111114, |
|
"loss": 0.1327, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 39.75, |
|
"grad_norm": 0.2257820963859558, |
|
"learning_rate": 0.00045555555555555556, |
|
"loss": 0.1101, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 39.88, |
|
"grad_norm": 0.373692125082016, |
|
"learning_rate": 0.00045000000000000004, |
|
"loss": 0.1293, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 0.48990318179130554, |
|
"learning_rate": 0.0004444444444444444, |
|
"loss": 0.132, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.9775390625, |
|
"eval_loss": 0.09202806651592255, |
|
"eval_runtime": 4.9155, |
|
"eval_samples_per_second": 208.319, |
|
"eval_steps_per_second": 13.02, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 40.12, |
|
"grad_norm": 0.571524441242218, |
|
"learning_rate": 0.0004388888888888889, |
|
"loss": 0.1167, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 40.25, |
|
"grad_norm": 0.5896998643875122, |
|
"learning_rate": 0.00043333333333333337, |
|
"loss": 0.151, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 40.38, |
|
"grad_norm": 0.44366732239723206, |
|
"learning_rate": 0.0004277777777777778, |
|
"loss": 0.1422, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 40.5, |
|
"grad_norm": 0.314609169960022, |
|
"learning_rate": 0.0004222222222222222, |
|
"loss": 0.1253, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 40.62, |
|
"grad_norm": 0.3513747453689575, |
|
"learning_rate": 0.0004166666666666667, |
|
"loss": 0.119, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 40.75, |
|
"grad_norm": 0.3717803359031677, |
|
"learning_rate": 0.0004111111111111111, |
|
"loss": 0.1156, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 40.88, |
|
"grad_norm": 0.22342754900455475, |
|
"learning_rate": 0.00040555555555555554, |
|
"loss": 0.1372, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"grad_norm": 0.41738444566726685, |
|
"learning_rate": 0.0004, |
|
"loss": 0.134, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.9638671875, |
|
"eval_loss": 0.13908132910728455, |
|
"eval_runtime": 5.2165, |
|
"eval_samples_per_second": 196.299, |
|
"eval_steps_per_second": 12.269, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 41.12, |
|
"grad_norm": 0.41814348101615906, |
|
"learning_rate": 0.00039444444444444444, |
|
"loss": 0.1279, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 41.25, |
|
"grad_norm": 0.9678131937980652, |
|
"learning_rate": 0.0003888888888888889, |
|
"loss": 0.1398, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 41.38, |
|
"grad_norm": 0.6725767850875854, |
|
"learning_rate": 0.00038333333333333334, |
|
"loss": 0.1492, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 41.5, |
|
"grad_norm": 0.31534790992736816, |
|
"learning_rate": 0.00037777777777777777, |
|
"loss": 0.1119, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 41.62, |
|
"grad_norm": 0.632583737373352, |
|
"learning_rate": 0.00037222222222222225, |
|
"loss": 0.1131, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 41.75, |
|
"grad_norm": 0.6746741533279419, |
|
"learning_rate": 0.00036666666666666667, |
|
"loss": 0.1351, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 41.88, |
|
"grad_norm": 0.3400849997997284, |
|
"learning_rate": 0.0003611111111111111, |
|
"loss": 0.0815, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"grad_norm": 0.5605281591415405, |
|
"learning_rate": 0.00035555555555555557, |
|
"loss": 0.1413, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.9716796875, |
|
"eval_loss": 0.11220287531614304, |
|
"eval_runtime": 5.0927, |
|
"eval_samples_per_second": 201.072, |
|
"eval_steps_per_second": 12.567, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 42.12, |
|
"grad_norm": 0.5148097276687622, |
|
"learning_rate": 0.00035, |
|
"loss": 0.125, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 42.25, |
|
"grad_norm": 0.38650012016296387, |
|
"learning_rate": 0.0003444444444444445, |
|
"loss": 0.1209, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 42.38, |
|
"grad_norm": 0.3292187750339508, |
|
"learning_rate": 0.0003388888888888889, |
|
"loss": 0.1236, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 42.5, |
|
"grad_norm": 0.20681746304035187, |
|
"learning_rate": 0.0003333333333333333, |
|
"loss": 0.0973, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 42.62, |
|
"grad_norm": 0.33743348717689514, |
|
"learning_rate": 0.0003277777777777778, |
|
"loss": 0.1208, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 42.75, |
|
"grad_norm": 0.34158453345298767, |
|
"learning_rate": 0.0003222222222222222, |
|
"loss": 0.11, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 42.88, |
|
"grad_norm": 0.5730062127113342, |
|
"learning_rate": 0.00031666666666666665, |
|
"loss": 0.1292, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"grad_norm": 0.44954267144203186, |
|
"learning_rate": 0.0003111111111111111, |
|
"loss": 0.1097, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.9677734375, |
|
"eval_loss": 0.11706902086734772, |
|
"eval_runtime": 4.9202, |
|
"eval_samples_per_second": 208.12, |
|
"eval_steps_per_second": 13.007, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 43.12, |
|
"grad_norm": 0.25731635093688965, |
|
"learning_rate": 0.0003055555555555556, |
|
"loss": 0.1161, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 43.25, |
|
"grad_norm": 0.5329569578170776, |
|
"learning_rate": 0.0003, |
|
"loss": 0.1507, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 43.38, |
|
"grad_norm": 0.3034692704677582, |
|
"learning_rate": 0.00029444444444444445, |
|
"loss": 0.1447, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 43.5, |
|
"grad_norm": 0.5483482480049133, |
|
"learning_rate": 0.0002888888888888889, |
|
"loss": 0.1323, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 43.62, |
|
"grad_norm": 0.279697984457016, |
|
"learning_rate": 0.00028333333333333335, |
|
"loss": 0.1, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 43.75, |
|
"grad_norm": 0.5593113303184509, |
|
"learning_rate": 0.0002777777777777778, |
|
"loss": 0.1169, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 43.88, |
|
"grad_norm": 0.621919572353363, |
|
"learning_rate": 0.0002722222222222222, |
|
"loss": 0.1119, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"grad_norm": 0.37898024916648865, |
|
"learning_rate": 0.0002666666666666667, |
|
"loss": 0.1167, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.9765625, |
|
"eval_loss": 0.10542036592960358, |
|
"eval_runtime": 5.3473, |
|
"eval_samples_per_second": 191.5, |
|
"eval_steps_per_second": 11.969, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 44.12, |
|
"grad_norm": 0.40025296807289124, |
|
"learning_rate": 0.00026111111111111116, |
|
"loss": 0.1107, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 44.25, |
|
"grad_norm": 0.19010861217975616, |
|
"learning_rate": 0.00025555555555555553, |
|
"loss": 0.1008, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 44.38, |
|
"grad_norm": 0.33224934339523315, |
|
"learning_rate": 0.00025, |
|
"loss": 0.1355, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 44.5, |
|
"grad_norm": 0.4298325181007385, |
|
"learning_rate": 0.00024444444444444443, |
|
"loss": 0.106, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 44.62, |
|
"grad_norm": 0.4320330023765564, |
|
"learning_rate": 0.0002388888888888889, |
|
"loss": 0.1053, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 44.75, |
|
"grad_norm": 0.1121302917599678, |
|
"learning_rate": 0.00023333333333333333, |
|
"loss": 0.0845, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 44.88, |
|
"grad_norm": 0.3021819293498993, |
|
"learning_rate": 0.00022777777777777778, |
|
"loss": 0.1222, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"grad_norm": 0.7353653311729431, |
|
"learning_rate": 0.0002222222222222222, |
|
"loss": 0.1388, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.9794921875, |
|
"eval_loss": 0.09323666244745255, |
|
"eval_runtime": 4.9368, |
|
"eval_samples_per_second": 207.422, |
|
"eval_steps_per_second": 12.964, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 45.12, |
|
"grad_norm": 0.5964930057525635, |
|
"learning_rate": 0.00021666666666666668, |
|
"loss": 0.1201, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 45.25, |
|
"grad_norm": 0.17329342663288116, |
|
"learning_rate": 0.0002111111111111111, |
|
"loss": 0.0905, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 45.38, |
|
"grad_norm": 0.5378609299659729, |
|
"learning_rate": 0.00020555555555555556, |
|
"loss": 0.0981, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 45.5, |
|
"grad_norm": 0.3457593619823456, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1116, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 45.62, |
|
"grad_norm": 0.5954685211181641, |
|
"learning_rate": 0.00019444444444444446, |
|
"loss": 0.1037, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 45.75, |
|
"grad_norm": 0.1786712259054184, |
|
"learning_rate": 0.00018888888888888888, |
|
"loss": 0.0978, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 45.88, |
|
"grad_norm": 0.25224894285202026, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.1089, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"grad_norm": 0.33607247471809387, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.1221, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.9765625, |
|
"eval_loss": 0.09462323784828186, |
|
"eval_runtime": 5.2287, |
|
"eval_samples_per_second": 195.844, |
|
"eval_steps_per_second": 12.24, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 46.12, |
|
"grad_norm": 0.34634700417518616, |
|
"learning_rate": 0.00017222222222222224, |
|
"loss": 0.1243, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 46.25, |
|
"grad_norm": 0.5061681866645813, |
|
"learning_rate": 0.00016666666666666666, |
|
"loss": 0.1115, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 46.38, |
|
"grad_norm": 0.2837713658809662, |
|
"learning_rate": 0.0001611111111111111, |
|
"loss": 0.1008, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 46.5, |
|
"grad_norm": 0.2688066363334656, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.1058, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 46.62, |
|
"grad_norm": 0.32675421237945557, |
|
"learning_rate": 0.00015, |
|
"loss": 0.0897, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 46.75, |
|
"grad_norm": 0.6959260702133179, |
|
"learning_rate": 0.00014444444444444444, |
|
"loss": 0.1182, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 46.88, |
|
"grad_norm": 0.3018099069595337, |
|
"learning_rate": 0.0001388888888888889, |
|
"loss": 0.1013, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"grad_norm": 0.6018778085708618, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.1099, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.9755859375, |
|
"eval_loss": 0.1115545928478241, |
|
"eval_runtime": 5.0799, |
|
"eval_samples_per_second": 201.581, |
|
"eval_steps_per_second": 12.599, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 47.12, |
|
"grad_norm": 0.42199546098709106, |
|
"learning_rate": 0.00012777777777777776, |
|
"loss": 0.1073, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 47.25, |
|
"grad_norm": 0.6451756358146667, |
|
"learning_rate": 0.00012222222222222221, |
|
"loss": 0.099, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 47.38, |
|
"grad_norm": 0.4935210943222046, |
|
"learning_rate": 0.00011666666666666667, |
|
"loss": 0.1077, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 47.5, |
|
"grad_norm": 0.2563684582710266, |
|
"learning_rate": 0.0001111111111111111, |
|
"loss": 0.0907, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 47.62, |
|
"grad_norm": 0.3351310193538666, |
|
"learning_rate": 0.00010555555555555555, |
|
"loss": 0.1059, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 47.75, |
|
"grad_norm": 0.39526107907295227, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0868, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 47.88, |
|
"grad_norm": 0.4634101390838623, |
|
"learning_rate": 9.444444444444444e-05, |
|
"loss": 0.1098, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"grad_norm": 0.5983624458312988, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.1041, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.974609375, |
|
"eval_loss": 0.11264081299304962, |
|
"eval_runtime": 4.9279, |
|
"eval_samples_per_second": 207.795, |
|
"eval_steps_per_second": 12.987, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 48.12, |
|
"grad_norm": 0.4093017578125, |
|
"learning_rate": 8.333333333333333e-05, |
|
"loss": 0.1134, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 48.25, |
|
"grad_norm": 0.6668171286582947, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 0.0948, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 48.38, |
|
"grad_norm": 0.24066688120365143, |
|
"learning_rate": 7.222222222222222e-05, |
|
"loss": 0.0958, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 48.5, |
|
"grad_norm": 0.2770562469959259, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.1021, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 48.62, |
|
"grad_norm": 0.45978790521621704, |
|
"learning_rate": 6.111111111111111e-05, |
|
"loss": 0.1084, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 48.75, |
|
"grad_norm": 0.594672441482544, |
|
"learning_rate": 5.555555555555555e-05, |
|
"loss": 0.1373, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 48.88, |
|
"grad_norm": 0.8167428374290466, |
|
"learning_rate": 5e-05, |
|
"loss": 0.1038, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"grad_norm": 0.2987329661846161, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.1025, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.9755859375, |
|
"eval_loss": 0.11138872057199478, |
|
"eval_runtime": 5.3184, |
|
"eval_samples_per_second": 192.54, |
|
"eval_steps_per_second": 12.034, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 49.12, |
|
"grad_norm": 0.3884102404117584, |
|
"learning_rate": 3.888888888888889e-05, |
|
"loss": 0.1018, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 49.25, |
|
"grad_norm": 0.2661769688129425, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.1011, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 49.38, |
|
"grad_norm": 0.40820014476776123, |
|
"learning_rate": 2.7777777777777776e-05, |
|
"loss": 0.1488, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 49.5, |
|
"grad_norm": 0.46163231134414673, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.1258, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 49.62, |
|
"grad_norm": 0.4315054416656494, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.1018, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 49.75, |
|
"grad_norm": 0.2365369200706482, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 0.0977, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 49.88, |
|
"grad_norm": 0.4910149574279785, |
|
"learning_rate": 5.555555555555556e-06, |
|
"loss": 0.1122, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 0.2623092234134674, |
|
"learning_rate": 0.0, |
|
"loss": 0.0887, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.9755859375, |
|
"eval_loss": 0.10555899888277054, |
|
"eval_runtime": 4.9229, |
|
"eval_samples_per_second": 208.005, |
|
"eval_steps_per_second": 13.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"step": 4000, |
|
"total_flos": 5.437210780237824e+18, |
|
"train_loss": 0.3629864407479763, |
|
"train_runtime": 3465.6999, |
|
"train_samples_per_second": 73.867, |
|
"train_steps_per_second": 1.154 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 5.437210780237824e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|