|
{ |
|
"best_metric": 0.6386696730552424, |
|
"best_model_checkpoint": "dinov2-small-imagenet1k-1-layer-finetuned-galaxy10-decals/checkpoint-2480", |
|
"epoch": 19.879759519038075, |
|
"eval_steps": 500, |
|
"global_step": 2480, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08016032064128256, |
|
"grad_norm": 145.4962615966797, |
|
"learning_rate": 4.0322580645161286e-09, |
|
"loss": 3.7794, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.16032064128256512, |
|
"grad_norm": 174.03173828125, |
|
"learning_rate": 8.064516129032257e-09, |
|
"loss": 3.8424, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.24048096192384769, |
|
"grad_norm": 157.9890899658203, |
|
"learning_rate": 1.2096774193548386e-08, |
|
"loss": 3.8211, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.32064128256513025, |
|
"grad_norm": 154.46774291992188, |
|
"learning_rate": 1.6129032258064514e-08, |
|
"loss": 3.7635, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.40080160320641284, |
|
"grad_norm": 165.22471618652344, |
|
"learning_rate": 2.0161290322580644e-08, |
|
"loss": 3.8423, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.48096192384769537, |
|
"grad_norm": 149.4167022705078, |
|
"learning_rate": 2.4193548387096773e-08, |
|
"loss": 3.722, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.561122244488978, |
|
"grad_norm": 163.74615478515625, |
|
"learning_rate": 2.8225806451612906e-08, |
|
"loss": 3.7051, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6412825651302605, |
|
"grad_norm": 158.1961669921875, |
|
"learning_rate": 3.225806451612903e-08, |
|
"loss": 3.6243, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7214428857715431, |
|
"grad_norm": 158.70668029785156, |
|
"learning_rate": 3.629032258064516e-08, |
|
"loss": 3.605, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8016032064128257, |
|
"grad_norm": 135.90687561035156, |
|
"learning_rate": 4.032258064516129e-08, |
|
"loss": 3.4392, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8817635270541082, |
|
"grad_norm": 154.86976623535156, |
|
"learning_rate": 4.435483870967742e-08, |
|
"loss": 3.3917, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9619238476953907, |
|
"grad_norm": 131.125, |
|
"learning_rate": 4.8387096774193546e-08, |
|
"loss": 3.2794, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9939879759519038, |
|
"eval_accuracy": 0.09244644870349493, |
|
"eval_loss": 3.2712783813476562, |
|
"eval_runtime": 27.1498, |
|
"eval_samples_per_second": 65.341, |
|
"eval_steps_per_second": 2.063, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.0420841683366733, |
|
"grad_norm": 149.99026489257812, |
|
"learning_rate": 5.241935483870967e-08, |
|
"loss": 3.2834, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.122244488977956, |
|
"grad_norm": 125.77014923095703, |
|
"learning_rate": 5.645161290322581e-08, |
|
"loss": 3.1664, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2024048096192386, |
|
"grad_norm": 139.7942657470703, |
|
"learning_rate": 6.048387096774194e-08, |
|
"loss": 3.0851, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.282565130260521, |
|
"grad_norm": 141.2034149169922, |
|
"learning_rate": 6.451612903225806e-08, |
|
"loss": 2.8981, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.3627254509018036, |
|
"grad_norm": 79.29173278808594, |
|
"learning_rate": 6.854838709677419e-08, |
|
"loss": 2.8603, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.4428857715430863, |
|
"grad_norm": 135.16925048828125, |
|
"learning_rate": 7.258064516129032e-08, |
|
"loss": 2.7191, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.5230460921843687, |
|
"grad_norm": 105.61141204833984, |
|
"learning_rate": 7.661290322580644e-08, |
|
"loss": 2.6305, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.6032064128256514, |
|
"grad_norm": 82.95243072509766, |
|
"learning_rate": 8.064516129032257e-08, |
|
"loss": 2.5291, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6833667334669338, |
|
"grad_norm": 87.03684997558594, |
|
"learning_rate": 8.467741935483871e-08, |
|
"loss": 2.4414, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.7635270541082164, |
|
"grad_norm": 95.90193176269531, |
|
"learning_rate": 8.870967741935484e-08, |
|
"loss": 2.3726, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.843687374749499, |
|
"grad_norm": 84.5301513671875, |
|
"learning_rate": 9.274193548387096e-08, |
|
"loss": 2.2699, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.9238476953907817, |
|
"grad_norm": 71.2406997680664, |
|
"learning_rate": 9.677419354838709e-08, |
|
"loss": 2.2032, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.9959919839679359, |
|
"eval_accuracy": 0.24013528748590757, |
|
"eval_loss": 2.1442790031433105, |
|
"eval_runtime": 24.1714, |
|
"eval_samples_per_second": 73.392, |
|
"eval_steps_per_second": 2.317, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.004008016032064, |
|
"grad_norm": 78.2083969116211, |
|
"learning_rate": 9.991039426523296e-08, |
|
"loss": 2.1881, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.0841683366733466, |
|
"grad_norm": 63.044246673583984, |
|
"learning_rate": 9.946236559139784e-08, |
|
"loss": 2.1004, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.164328657314629, |
|
"grad_norm": 88.07442474365234, |
|
"learning_rate": 9.901433691756272e-08, |
|
"loss": 2.0794, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.244488977955912, |
|
"grad_norm": 59.17829895019531, |
|
"learning_rate": 9.856630824372759e-08, |
|
"loss": 2.068, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.3246492985971945, |
|
"grad_norm": 61.12708282470703, |
|
"learning_rate": 9.811827956989247e-08, |
|
"loss": 2.0244, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.404809619238477, |
|
"grad_norm": 73.2655029296875, |
|
"learning_rate": 9.767025089605734e-08, |
|
"loss": 2.0167, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.4849699398797593, |
|
"grad_norm": 70.88330841064453, |
|
"learning_rate": 9.722222222222221e-08, |
|
"loss": 1.9997, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.565130260521042, |
|
"grad_norm": 68.97590637207031, |
|
"learning_rate": 9.677419354838709e-08, |
|
"loss": 2.0024, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.6452905811623246, |
|
"grad_norm": 61.53851318359375, |
|
"learning_rate": 9.632616487455196e-08, |
|
"loss": 1.9394, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.7254509018036073, |
|
"grad_norm": 70.83650970458984, |
|
"learning_rate": 9.587813620071684e-08, |
|
"loss": 1.941, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.80561122244489, |
|
"grad_norm": 67.32137298583984, |
|
"learning_rate": 9.543010752688172e-08, |
|
"loss": 1.9437, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.8857715430861726, |
|
"grad_norm": 61.75737762451172, |
|
"learning_rate": 9.498207885304659e-08, |
|
"loss": 1.8847, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.9659318637274548, |
|
"grad_norm": 77.73907470703125, |
|
"learning_rate": 9.453405017921147e-08, |
|
"loss": 1.9114, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.997995991983968, |
|
"eval_accuracy": 0.37767756482525366, |
|
"eval_loss": 1.8624014854431152, |
|
"eval_runtime": 21.9272, |
|
"eval_samples_per_second": 80.904, |
|
"eval_steps_per_second": 2.554, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 3.0460921843687374, |
|
"grad_norm": 70.89105224609375, |
|
"learning_rate": 9.408602150537634e-08, |
|
"loss": 1.8743, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.12625250501002, |
|
"grad_norm": 62.805423736572266, |
|
"learning_rate": 9.363799283154121e-08, |
|
"loss": 1.8232, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.2064128256513027, |
|
"grad_norm": 65.72661590576172, |
|
"learning_rate": 9.318996415770609e-08, |
|
"loss": 1.7887, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.2865731462925853, |
|
"grad_norm": 64.57553100585938, |
|
"learning_rate": 9.274193548387096e-08, |
|
"loss": 1.8042, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.3667334669338675, |
|
"grad_norm": 68.34881591796875, |
|
"learning_rate": 9.229390681003584e-08, |
|
"loss": 1.8049, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.44689378757515, |
|
"grad_norm": 97.984130859375, |
|
"learning_rate": 9.184587813620072e-08, |
|
"loss": 1.7861, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.527054108216433, |
|
"grad_norm": 76.27339935302734, |
|
"learning_rate": 9.139784946236558e-08, |
|
"loss": 1.7652, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.6072144288577155, |
|
"grad_norm": 58.08464050292969, |
|
"learning_rate": 9.094982078853046e-08, |
|
"loss": 1.7745, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.687374749498998, |
|
"grad_norm": 82.49138641357422, |
|
"learning_rate": 9.050179211469534e-08, |
|
"loss": 1.7691, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.7675350701402808, |
|
"grad_norm": 65.0194091796875, |
|
"learning_rate": 9.005376344086021e-08, |
|
"loss": 1.734, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.847695390781563, |
|
"grad_norm": 68.1599349975586, |
|
"learning_rate": 8.960573476702509e-08, |
|
"loss": 1.7531, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.9278557114228456, |
|
"grad_norm": 62.770931243896484, |
|
"learning_rate": 8.915770609318996e-08, |
|
"loss": 1.6905, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.44532130777903045, |
|
"eval_loss": 1.686484932899475, |
|
"eval_runtime": 13.8629, |
|
"eval_samples_per_second": 127.967, |
|
"eval_steps_per_second": 4.04, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 4.008016032064128, |
|
"grad_norm": 71.53292083740234, |
|
"learning_rate": 8.870967741935484e-08, |
|
"loss": 1.7048, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.0881763527054105, |
|
"grad_norm": 94.61470031738281, |
|
"learning_rate": 8.826164874551971e-08, |
|
"loss": 1.6891, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.168336673346693, |
|
"grad_norm": 73.69153594970703, |
|
"learning_rate": 8.781362007168458e-08, |
|
"loss": 1.6418, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.248496993987976, |
|
"grad_norm": 62.38913345336914, |
|
"learning_rate": 8.736559139784946e-08, |
|
"loss": 1.6844, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.328657314629258, |
|
"grad_norm": 66.4658203125, |
|
"learning_rate": 8.691756272401434e-08, |
|
"loss": 1.6274, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.408817635270541, |
|
"grad_norm": 62.56648635864258, |
|
"learning_rate": 8.646953405017921e-08, |
|
"loss": 1.6291, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.488977955911824, |
|
"grad_norm": 64.94890594482422, |
|
"learning_rate": 8.602150537634409e-08, |
|
"loss": 1.6456, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.569138276553106, |
|
"grad_norm": 70.55397033691406, |
|
"learning_rate": 8.557347670250896e-08, |
|
"loss": 1.5912, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.649298597194389, |
|
"grad_norm": 77.71963500976562, |
|
"learning_rate": 8.512544802867383e-08, |
|
"loss": 1.5993, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.729458917835672, |
|
"grad_norm": 80.71322631835938, |
|
"learning_rate": 8.467741935483871e-08, |
|
"loss": 1.5824, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.809619238476954, |
|
"grad_norm": 67.88044738769531, |
|
"learning_rate": 8.422939068100358e-08, |
|
"loss": 1.6323, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.889779559118237, |
|
"grad_norm": 67.66226196289062, |
|
"learning_rate": 8.378136200716846e-08, |
|
"loss": 1.5812, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.969939879759519, |
|
"grad_norm": 78.28225708007812, |
|
"learning_rate": 8.333333333333334e-08, |
|
"loss": 1.5548, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.993987975951904, |
|
"eval_accuracy": 0.48534385569334837, |
|
"eval_loss": 1.5514700412750244, |
|
"eval_runtime": 15.6981, |
|
"eval_samples_per_second": 113.007, |
|
"eval_steps_per_second": 3.567, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 5.050100200400801, |
|
"grad_norm": 61.500125885009766, |
|
"learning_rate": 8.288530465949821e-08, |
|
"loss": 1.5763, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 5.130260521042084, |
|
"grad_norm": 53.66603469848633, |
|
"learning_rate": 8.243727598566307e-08, |
|
"loss": 1.5392, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.210420841683367, |
|
"grad_norm": 65.59963989257812, |
|
"learning_rate": 8.198924731182796e-08, |
|
"loss": 1.5559, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.290581162324649, |
|
"grad_norm": 70.21326446533203, |
|
"learning_rate": 8.154121863799282e-08, |
|
"loss": 1.5247, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 5.370741482965932, |
|
"grad_norm": 95.18133544921875, |
|
"learning_rate": 8.10931899641577e-08, |
|
"loss": 1.5343, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 5.4509018036072145, |
|
"grad_norm": 63.49616241455078, |
|
"learning_rate": 8.064516129032257e-08, |
|
"loss": 1.5013, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.531062124248497, |
|
"grad_norm": 87.86900329589844, |
|
"learning_rate": 8.019713261648746e-08, |
|
"loss": 1.5003, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 5.61122244488978, |
|
"grad_norm": 97.59938049316406, |
|
"learning_rate": 7.974910394265232e-08, |
|
"loss": 1.5498, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.6913827655310625, |
|
"grad_norm": 80.1442642211914, |
|
"learning_rate": 7.930107526881719e-08, |
|
"loss": 1.5255, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 5.771543086172345, |
|
"grad_norm": 79.7429428100586, |
|
"learning_rate": 7.885304659498207e-08, |
|
"loss": 1.4666, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.851703406813627, |
|
"grad_norm": 70.18585968017578, |
|
"learning_rate": 7.840501792114696e-08, |
|
"loss": 1.4929, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 5.9318637274549095, |
|
"grad_norm": 95.345703125, |
|
"learning_rate": 7.795698924731182e-08, |
|
"loss": 1.4678, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.995991983967936, |
|
"eval_accuracy": 0.5067643742953777, |
|
"eval_loss": 1.4546138048171997, |
|
"eval_runtime": 12.1383, |
|
"eval_samples_per_second": 146.149, |
|
"eval_steps_per_second": 4.613, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 6.012024048096192, |
|
"grad_norm": 71.77326202392578, |
|
"learning_rate": 7.75089605734767e-08, |
|
"loss": 1.4735, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 6.092184368737475, |
|
"grad_norm": 73.5064468383789, |
|
"learning_rate": 7.706093189964157e-08, |
|
"loss": 1.4785, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 6.1723446893787575, |
|
"grad_norm": 74.84465789794922, |
|
"learning_rate": 7.661290322580644e-08, |
|
"loss": 1.4793, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 6.25250501002004, |
|
"grad_norm": 116.29476165771484, |
|
"learning_rate": 7.616487455197132e-08, |
|
"loss": 1.4126, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 6.332665330661323, |
|
"grad_norm": 73.14019775390625, |
|
"learning_rate": 7.571684587813619e-08, |
|
"loss": 1.4344, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 6.412825651302605, |
|
"grad_norm": 83.63815307617188, |
|
"learning_rate": 7.526881720430107e-08, |
|
"loss": 1.4544, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.492985971943888, |
|
"grad_norm": 84.06439208984375, |
|
"learning_rate": 7.482078853046595e-08, |
|
"loss": 1.4593, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 6.573146292585171, |
|
"grad_norm": 67.01233673095703, |
|
"learning_rate": 7.437275985663082e-08, |
|
"loss": 1.4563, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.653306613226453, |
|
"grad_norm": 74.03414916992188, |
|
"learning_rate": 7.392473118279569e-08, |
|
"loss": 1.452, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 6.733466933867735, |
|
"grad_norm": 70.25257110595703, |
|
"learning_rate": 7.347670250896057e-08, |
|
"loss": 1.3841, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.813627254509018, |
|
"grad_norm": 80.05170440673828, |
|
"learning_rate": 7.302867383512544e-08, |
|
"loss": 1.4093, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 6.8937875751503, |
|
"grad_norm": 94.38542938232422, |
|
"learning_rate": 7.258064516129032e-08, |
|
"loss": 1.4171, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.973947895791583, |
|
"grad_norm": 94.03691101074219, |
|
"learning_rate": 7.213261648745519e-08, |
|
"loss": 1.3977, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 6.997995991983968, |
|
"eval_accuracy": 0.5372040586245772, |
|
"eval_loss": 1.3747113943099976, |
|
"eval_runtime": 11.127, |
|
"eval_samples_per_second": 159.432, |
|
"eval_steps_per_second": 5.033, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 7.054108216432866, |
|
"grad_norm": 71.07766723632812, |
|
"learning_rate": 7.168458781362007e-08, |
|
"loss": 1.3713, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 7.134268537074148, |
|
"grad_norm": 81.19793701171875, |
|
"learning_rate": 7.123655913978494e-08, |
|
"loss": 1.3874, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 7.214428857715431, |
|
"grad_norm": 79.04607391357422, |
|
"learning_rate": 7.078853046594981e-08, |
|
"loss": 1.3978, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 7.294589178356714, |
|
"grad_norm": 97.111328125, |
|
"learning_rate": 7.034050179211469e-08, |
|
"loss": 1.3499, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 7.374749498997996, |
|
"grad_norm": 71.67705535888672, |
|
"learning_rate": 6.989247311827957e-08, |
|
"loss": 1.3573, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 7.454909819639279, |
|
"grad_norm": 84.07296752929688, |
|
"learning_rate": 6.944444444444444e-08, |
|
"loss": 1.3843, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 7.5350701402805615, |
|
"grad_norm": 72.90270233154297, |
|
"learning_rate": 6.899641577060932e-08, |
|
"loss": 1.3852, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 7.615230460921843, |
|
"grad_norm": 66.78356170654297, |
|
"learning_rate": 6.854838709677419e-08, |
|
"loss": 1.3628, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.695390781563126, |
|
"grad_norm": 80.06150817871094, |
|
"learning_rate": 6.810035842293906e-08, |
|
"loss": 1.3676, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.775551102204409, |
|
"grad_norm": 87.77715301513672, |
|
"learning_rate": 6.765232974910394e-08, |
|
"loss": 1.3413, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 7.855711422845691, |
|
"grad_norm": 71.23929595947266, |
|
"learning_rate": 6.720430107526881e-08, |
|
"loss": 1.3765, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.935871743486974, |
|
"grad_norm": 88.73052978515625, |
|
"learning_rate": 6.675627240143369e-08, |
|
"loss": 1.3531, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5535512965050733, |
|
"eval_loss": 1.3144019842147827, |
|
"eval_runtime": 16.7857, |
|
"eval_samples_per_second": 105.685, |
|
"eval_steps_per_second": 3.336, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 8.016032064128256, |
|
"grad_norm": 115.76951599121094, |
|
"learning_rate": 6.630824372759857e-08, |
|
"loss": 1.3257, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 8.09619238476954, |
|
"grad_norm": 75.48204040527344, |
|
"learning_rate": 6.586021505376344e-08, |
|
"loss": 1.3404, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 8.176352705410821, |
|
"grad_norm": 105.67681121826172, |
|
"learning_rate": 6.541218637992831e-08, |
|
"loss": 1.3141, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 8.256513026052104, |
|
"grad_norm": 68.87728881835938, |
|
"learning_rate": 6.496415770609319e-08, |
|
"loss": 1.3586, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 8.336673346693386, |
|
"grad_norm": 89.05242156982422, |
|
"learning_rate": 6.451612903225806e-08, |
|
"loss": 1.3465, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 8.41683366733467, |
|
"grad_norm": 63.90314483642578, |
|
"learning_rate": 6.406810035842294e-08, |
|
"loss": 1.3117, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.496993987975952, |
|
"grad_norm": 64.17387390136719, |
|
"learning_rate": 6.362007168458781e-08, |
|
"loss": 1.3509, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 8.577154308617235, |
|
"grad_norm": 66.45846557617188, |
|
"learning_rate": 6.317204301075269e-08, |
|
"loss": 1.2805, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 8.657314629258517, |
|
"grad_norm": 69.26084899902344, |
|
"learning_rate": 6.272401433691757e-08, |
|
"loss": 1.3548, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 8.7374749498998, |
|
"grad_norm": 85.13467407226562, |
|
"learning_rate": 6.227598566308242e-08, |
|
"loss": 1.2757, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 8.817635270541082, |
|
"grad_norm": 69.39098358154297, |
|
"learning_rate": 6.18279569892473e-08, |
|
"loss": 1.2923, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.897795591182366, |
|
"grad_norm": 74.49005889892578, |
|
"learning_rate": 6.137992831541219e-08, |
|
"loss": 1.2824, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 8.977955911823647, |
|
"grad_norm": 73.94770050048828, |
|
"learning_rate": 6.093189964157706e-08, |
|
"loss": 1.3098, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 8.993987975951903, |
|
"eval_accuracy": 0.5659526493799324, |
|
"eval_loss": 1.265461802482605, |
|
"eval_runtime": 13.7383, |
|
"eval_samples_per_second": 129.128, |
|
"eval_steps_per_second": 4.076, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 9.05811623246493, |
|
"grad_norm": 64.57673645019531, |
|
"learning_rate": 6.048387096774194e-08, |
|
"loss": 1.2946, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 9.138276553106213, |
|
"grad_norm": 67.67877197265625, |
|
"learning_rate": 6.00358422939068e-08, |
|
"loss": 1.3228, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 9.218436873747494, |
|
"grad_norm": 72.48619842529297, |
|
"learning_rate": 5.958781362007168e-08, |
|
"loss": 1.2929, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 9.298597194388778, |
|
"grad_norm": 64.42074584960938, |
|
"learning_rate": 5.913978494623656e-08, |
|
"loss": 1.2571, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 9.37875751503006, |
|
"grad_norm": 70.90869903564453, |
|
"learning_rate": 5.8691756272401424e-08, |
|
"loss": 1.2976, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 9.458917835671343, |
|
"grad_norm": 120.41864776611328, |
|
"learning_rate": 5.8243727598566305e-08, |
|
"loss": 1.2296, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 9.539078156312625, |
|
"grad_norm": 68.10767364501953, |
|
"learning_rate": 5.779569892473119e-08, |
|
"loss": 1.304, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 9.619238476953909, |
|
"grad_norm": 98.28290557861328, |
|
"learning_rate": 5.7347670250896055e-08, |
|
"loss": 1.2379, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.69939879759519, |
|
"grad_norm": 62.52736282348633, |
|
"learning_rate": 5.689964157706093e-08, |
|
"loss": 1.2613, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 9.779559118236474, |
|
"grad_norm": 74.3100357055664, |
|
"learning_rate": 5.645161290322581e-08, |
|
"loss": 1.2616, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 9.859719438877756, |
|
"grad_norm": 86.70094299316406, |
|
"learning_rate": 5.600358422939068e-08, |
|
"loss": 1.2098, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 9.939879759519037, |
|
"grad_norm": 87.94029998779297, |
|
"learning_rate": 5.5555555555555555e-08, |
|
"loss": 1.2152, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 9.995991983967937, |
|
"eval_accuracy": 0.5817361894024803, |
|
"eval_loss": 1.2197498083114624, |
|
"eval_runtime": 13.6983, |
|
"eval_samples_per_second": 129.505, |
|
"eval_steps_per_second": 4.088, |
|
"step": 1247 |
|
}, |
|
{ |
|
"epoch": 10.02004008016032, |
|
"grad_norm": 81.81733703613281, |
|
"learning_rate": 5.510752688172042e-08, |
|
"loss": 1.2643, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 10.100200400801603, |
|
"grad_norm": 79.10731506347656, |
|
"learning_rate": 5.4659498207885304e-08, |
|
"loss": 1.2565, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 10.180360721442886, |
|
"grad_norm": 78.17452239990234, |
|
"learning_rate": 5.421146953405018e-08, |
|
"loss": 1.2499, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 10.260521042084168, |
|
"grad_norm": 71.21734619140625, |
|
"learning_rate": 5.376344086021505e-08, |
|
"loss": 1.2377, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 10.340681362725451, |
|
"grad_norm": 96.33527374267578, |
|
"learning_rate": 5.331541218637993e-08, |
|
"loss": 1.2936, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 10.420841683366733, |
|
"grad_norm": 81.22813415527344, |
|
"learning_rate": 5.2867383512544804e-08, |
|
"loss": 1.2413, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 10.501002004008017, |
|
"grad_norm": 73.4112777709961, |
|
"learning_rate": 5.241935483870967e-08, |
|
"loss": 1.2184, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 10.581162324649299, |
|
"grad_norm": 103.389404296875, |
|
"learning_rate": 5.1971326164874554e-08, |
|
"loss": 1.2365, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 10.661322645290582, |
|
"grad_norm": 71.31685638427734, |
|
"learning_rate": 5.152329749103942e-08, |
|
"loss": 1.2513, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 10.741482965931864, |
|
"grad_norm": 72.35453796386719, |
|
"learning_rate": 5.10752688172043e-08, |
|
"loss": 1.2371, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 10.821643286573146, |
|
"grad_norm": 82.98307800292969, |
|
"learning_rate": 5.062724014336918e-08, |
|
"loss": 1.1896, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 10.901803607214429, |
|
"grad_norm": 96.587646484375, |
|
"learning_rate": 5.0179211469534046e-08, |
|
"loss": 1.2517, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 10.98196392785571, |
|
"grad_norm": 90.14690399169922, |
|
"learning_rate": 4.973118279569892e-08, |
|
"loss": 1.2257, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 10.997995991983968, |
|
"eval_accuracy": 0.5924464487034949, |
|
"eval_loss": 1.185572624206543, |
|
"eval_runtime": 21.3384, |
|
"eval_samples_per_second": 83.137, |
|
"eval_steps_per_second": 2.624, |
|
"step": 1372 |
|
}, |
|
{ |
|
"epoch": 11.062124248496994, |
|
"grad_norm": 93.5479965209961, |
|
"learning_rate": 4.9283154121863796e-08, |
|
"loss": 1.1936, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 11.142284569138276, |
|
"grad_norm": 90.29972839355469, |
|
"learning_rate": 4.883512544802867e-08, |
|
"loss": 1.2202, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 11.22244488977956, |
|
"grad_norm": 88.16073608398438, |
|
"learning_rate": 4.8387096774193546e-08, |
|
"loss": 1.2298, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 11.302605210420841, |
|
"grad_norm": 63.090301513671875, |
|
"learning_rate": 4.793906810035842e-08, |
|
"loss": 1.1935, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 11.382765531062125, |
|
"grad_norm": 91.5514907836914, |
|
"learning_rate": 4.7491039426523296e-08, |
|
"loss": 1.2192, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 11.462925851703407, |
|
"grad_norm": 82.7196273803711, |
|
"learning_rate": 4.704301075268817e-08, |
|
"loss": 1.2072, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 11.54308617234469, |
|
"grad_norm": 70.3786392211914, |
|
"learning_rate": 4.6594982078853046e-08, |
|
"loss": 1.2248, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 11.623246492985972, |
|
"grad_norm": 87.86737060546875, |
|
"learning_rate": 4.614695340501792e-08, |
|
"loss": 1.2449, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 11.703406813627254, |
|
"grad_norm": 63.58481979370117, |
|
"learning_rate": 4.569892473118279e-08, |
|
"loss": 1.1992, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 11.783567134268537, |
|
"grad_norm": 67.95172119140625, |
|
"learning_rate": 4.525089605734767e-08, |
|
"loss": 1.2231, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 11.863727454909819, |
|
"grad_norm": 88.7829818725586, |
|
"learning_rate": 4.4802867383512545e-08, |
|
"loss": 1.2268, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 11.943887775551103, |
|
"grad_norm": 60.62504577636719, |
|
"learning_rate": 4.435483870967742e-08, |
|
"loss": 1.1706, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.6082299887260428, |
|
"eval_loss": 1.1539757251739502, |
|
"eval_runtime": 27.2781, |
|
"eval_samples_per_second": 65.034, |
|
"eval_steps_per_second": 2.053, |
|
"step": 1497 |
|
}, |
|
{ |
|
"epoch": 12.024048096192384, |
|
"grad_norm": 68.47216033935547, |
|
"learning_rate": 4.390681003584229e-08, |
|
"loss": 1.2074, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 12.104208416833668, |
|
"grad_norm": 75.7269515991211, |
|
"learning_rate": 4.345878136200717e-08, |
|
"loss": 1.188, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 12.18436873747495, |
|
"grad_norm": 77.85631561279297, |
|
"learning_rate": 4.3010752688172045e-08, |
|
"loss": 1.1652, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 12.264529058116233, |
|
"grad_norm": 73.39097595214844, |
|
"learning_rate": 4.256272401433691e-08, |
|
"loss": 1.1572, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 12.344689378757515, |
|
"grad_norm": 81.20556640625, |
|
"learning_rate": 4.211469534050179e-08, |
|
"loss": 1.1941, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 12.424849699398798, |
|
"grad_norm": 56.853816986083984, |
|
"learning_rate": 4.166666666666667e-08, |
|
"loss": 1.1808, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 12.50501002004008, |
|
"grad_norm": 99.04067993164062, |
|
"learning_rate": 4.121863799283154e-08, |
|
"loss": 1.2169, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 12.585170340681362, |
|
"grad_norm": 123.83197021484375, |
|
"learning_rate": 4.077060931899641e-08, |
|
"loss": 1.2092, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 12.665330661322646, |
|
"grad_norm": 74.84356689453125, |
|
"learning_rate": 4.032258064516129e-08, |
|
"loss": 1.1641, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 12.745490981963927, |
|
"grad_norm": 75.44207000732422, |
|
"learning_rate": 3.987455197132616e-08, |
|
"loss": 1.1961, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 12.82565130260521, |
|
"grad_norm": 79.65312957763672, |
|
"learning_rate": 3.942652329749104e-08, |
|
"loss": 1.1956, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 12.905811623246493, |
|
"grad_norm": 72.1136245727539, |
|
"learning_rate": 3.897849462365591e-08, |
|
"loss": 1.1539, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 12.985971943887776, |
|
"grad_norm": 75.26019287109375, |
|
"learning_rate": 3.853046594982079e-08, |
|
"loss": 1.1836, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 12.993987975951903, |
|
"eval_accuracy": 0.6121758737316798, |
|
"eval_loss": 1.1286228895187378, |
|
"eval_runtime": 14.7437, |
|
"eval_samples_per_second": 120.322, |
|
"eval_steps_per_second": 3.798, |
|
"step": 1621 |
|
}, |
|
{ |
|
"epoch": 13.066132264529058, |
|
"grad_norm": 90.93775939941406, |
|
"learning_rate": 3.808243727598566e-08, |
|
"loss": 1.1852, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 13.146292585170341, |
|
"grad_norm": 60.07513427734375, |
|
"learning_rate": 3.7634408602150537e-08, |
|
"loss": 1.1543, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 13.226452905811623, |
|
"grad_norm": 68.70402526855469, |
|
"learning_rate": 3.718637992831541e-08, |
|
"loss": 1.1385, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 13.306613226452907, |
|
"grad_norm": 87.76813507080078, |
|
"learning_rate": 3.6738351254480286e-08, |
|
"loss": 1.1811, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 13.386773547094188, |
|
"grad_norm": 80.2512435913086, |
|
"learning_rate": 3.629032258064516e-08, |
|
"loss": 1.209, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 13.46693386773547, |
|
"grad_norm": 82.47491455078125, |
|
"learning_rate": 3.5842293906810036e-08, |
|
"loss": 1.1742, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 13.547094188376754, |
|
"grad_norm": 79.13627624511719, |
|
"learning_rate": 3.5394265232974904e-08, |
|
"loss": 1.17, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 13.627254509018035, |
|
"grad_norm": 89.55215454101562, |
|
"learning_rate": 3.4946236559139786e-08, |
|
"loss": 1.1587, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 13.707414829659319, |
|
"grad_norm": 103.78477478027344, |
|
"learning_rate": 3.449820788530466e-08, |
|
"loss": 1.2009, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 13.7875751503006, |
|
"grad_norm": 85.93024444580078, |
|
"learning_rate": 3.405017921146953e-08, |
|
"loss": 1.1594, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 13.867735470941884, |
|
"grad_norm": 79.2539291381836, |
|
"learning_rate": 3.3602150537634404e-08, |
|
"loss": 1.1262, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 13.947895791583166, |
|
"grad_norm": 81.66879272460938, |
|
"learning_rate": 3.3154121863799285e-08, |
|
"loss": 1.1769, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 13.995991983967937, |
|
"eval_accuracy": 0.6161217587373168, |
|
"eval_loss": 1.1120373010635376, |
|
"eval_runtime": 10.9033, |
|
"eval_samples_per_second": 162.703, |
|
"eval_steps_per_second": 5.136, |
|
"step": 1746 |
|
}, |
|
{ |
|
"epoch": 14.02805611222445, |
|
"grad_norm": 64.20880126953125, |
|
"learning_rate": 3.2706093189964154e-08, |
|
"loss": 1.1211, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 14.108216432865731, |
|
"grad_norm": 85.90133666992188, |
|
"learning_rate": 3.225806451612903e-08, |
|
"loss": 1.1692, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 14.188376753507015, |
|
"grad_norm": 73.4815902709961, |
|
"learning_rate": 3.1810035842293903e-08, |
|
"loss": 1.1464, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 14.268537074148297, |
|
"grad_norm": 79.17596435546875, |
|
"learning_rate": 3.1362007168458785e-08, |
|
"loss": 1.163, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 14.348697394789578, |
|
"grad_norm": 69.56460571289062, |
|
"learning_rate": 3.091397849462365e-08, |
|
"loss": 1.166, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 14.428857715430862, |
|
"grad_norm": 80.14591217041016, |
|
"learning_rate": 3.046594982078853e-08, |
|
"loss": 1.1634, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 14.509018036072144, |
|
"grad_norm": 82.79823303222656, |
|
"learning_rate": 3.00179211469534e-08, |
|
"loss": 1.1717, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 14.589178356713427, |
|
"grad_norm": 65.2205581665039, |
|
"learning_rate": 2.956989247311828e-08, |
|
"loss": 1.1617, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 14.669338677354709, |
|
"grad_norm": 72.65853118896484, |
|
"learning_rate": 2.9121863799283153e-08, |
|
"loss": 1.1598, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 14.749498997995993, |
|
"grad_norm": 69.55330657958984, |
|
"learning_rate": 2.8673835125448027e-08, |
|
"loss": 1.1549, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 14.829659318637274, |
|
"grad_norm": 76.02484893798828, |
|
"learning_rate": 2.8225806451612906e-08, |
|
"loss": 1.1652, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 14.909819639278558, |
|
"grad_norm": 74.75779724121094, |
|
"learning_rate": 2.7777777777777777e-08, |
|
"loss": 1.1282, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 14.98997995991984, |
|
"grad_norm": 83.63590240478516, |
|
"learning_rate": 2.7329749103942652e-08, |
|
"loss": 1.1197, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 14.997995991983968, |
|
"eval_accuracy": 0.6234498308906427, |
|
"eval_loss": 1.0942639112472534, |
|
"eval_runtime": 23.7449, |
|
"eval_samples_per_second": 74.711, |
|
"eval_steps_per_second": 2.358, |
|
"step": 1871 |
|
}, |
|
{ |
|
"epoch": 15.070140280561123, |
|
"grad_norm": 91.89788055419922, |
|
"learning_rate": 2.6881720430107524e-08, |
|
"loss": 1.1295, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 15.150300601202405, |
|
"grad_norm": 73.01239776611328, |
|
"learning_rate": 2.6433691756272402e-08, |
|
"loss": 1.1051, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 15.230460921843687, |
|
"grad_norm": 80.28163146972656, |
|
"learning_rate": 2.5985663082437277e-08, |
|
"loss": 1.1283, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 15.31062124248497, |
|
"grad_norm": 65.41940307617188, |
|
"learning_rate": 2.553763440860215e-08, |
|
"loss": 1.1271, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 15.390781563126252, |
|
"grad_norm": 76.08214569091797, |
|
"learning_rate": 2.5089605734767023e-08, |
|
"loss": 1.1447, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 15.470941883767535, |
|
"grad_norm": 112.58313751220703, |
|
"learning_rate": 2.4641577060931898e-08, |
|
"loss": 1.1009, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 15.551102204408817, |
|
"grad_norm": 92.31614685058594, |
|
"learning_rate": 2.4193548387096773e-08, |
|
"loss": 1.1769, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 15.6312625250501, |
|
"grad_norm": 72.65853118896484, |
|
"learning_rate": 2.3745519713261648e-08, |
|
"loss": 1.1652, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 15.711422845691382, |
|
"grad_norm": 103.74449157714844, |
|
"learning_rate": 2.3297491039426523e-08, |
|
"loss": 1.1576, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 15.791583166332666, |
|
"grad_norm": 85.3733139038086, |
|
"learning_rate": 2.2849462365591394e-08, |
|
"loss": 1.1294, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 15.871743486973948, |
|
"grad_norm": 84.7496109008789, |
|
"learning_rate": 2.2401433691756273e-08, |
|
"loss": 1.1817, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 15.951903807615231, |
|
"grad_norm": 75.84141540527344, |
|
"learning_rate": 2.1953405017921144e-08, |
|
"loss": 1.1373, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.6307779030439684, |
|
"eval_loss": 1.077236533164978, |
|
"eval_runtime": 16.348, |
|
"eval_samples_per_second": 108.515, |
|
"eval_steps_per_second": 3.425, |
|
"step": 1996 |
|
}, |
|
{ |
|
"epoch": 16.03206412825651, |
|
"grad_norm": 77.71000671386719, |
|
"learning_rate": 2.1505376344086022e-08, |
|
"loss": 1.058, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 16.112224448897795, |
|
"grad_norm": 77.07865905761719, |
|
"learning_rate": 2.1057347670250894e-08, |
|
"loss": 1.1101, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 16.19238476953908, |
|
"grad_norm": 91.62117004394531, |
|
"learning_rate": 2.060931899641577e-08, |
|
"loss": 1.0807, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 16.272545090180362, |
|
"grad_norm": 111.3026351928711, |
|
"learning_rate": 2.0161290322580644e-08, |
|
"loss": 1.0986, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 16.352705410821642, |
|
"grad_norm": 88.4916763305664, |
|
"learning_rate": 1.971326164874552e-08, |
|
"loss": 1.1188, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 16.432865731462925, |
|
"grad_norm": 77.49140167236328, |
|
"learning_rate": 1.9265232974910393e-08, |
|
"loss": 1.1645, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 16.51302605210421, |
|
"grad_norm": 77.70806884765625, |
|
"learning_rate": 1.8817204301075268e-08, |
|
"loss": 1.1432, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 16.593186372745492, |
|
"grad_norm": 77.1611099243164, |
|
"learning_rate": 1.8369175627240143e-08, |
|
"loss": 1.156, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 16.673346693386772, |
|
"grad_norm": 73.50543975830078, |
|
"learning_rate": 1.7921146953405018e-08, |
|
"loss": 1.1156, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 16.753507014028056, |
|
"grad_norm": 89.7205810546875, |
|
"learning_rate": 1.7473118279569893e-08, |
|
"loss": 1.1455, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 16.83366733466934, |
|
"grad_norm": 105.73194885253906, |
|
"learning_rate": 1.7025089605734764e-08, |
|
"loss": 1.1198, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 16.91382765531062, |
|
"grad_norm": 71.158447265625, |
|
"learning_rate": 1.6577060931899643e-08, |
|
"loss": 1.1243, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 16.993987975951903, |
|
"grad_norm": 65.14007568359375, |
|
"learning_rate": 1.6129032258064514e-08, |
|
"loss": 1.1111, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 16.993987975951903, |
|
"eval_accuracy": 0.6330326944757609, |
|
"eval_loss": 1.0714222192764282, |
|
"eval_runtime": 13.3934, |
|
"eval_samples_per_second": 132.454, |
|
"eval_steps_per_second": 4.181, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 17.074148296593187, |
|
"grad_norm": 77.27207946777344, |
|
"learning_rate": 1.5681003584229392e-08, |
|
"loss": 1.1447, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 17.15430861723447, |
|
"grad_norm": 95.06674194335938, |
|
"learning_rate": 1.5232974910394264e-08, |
|
"loss": 1.116, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 17.23446893787575, |
|
"grad_norm": 66.83224487304688, |
|
"learning_rate": 1.478494623655914e-08, |
|
"loss": 1.1596, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 17.314629258517034, |
|
"grad_norm": 70.9565200805664, |
|
"learning_rate": 1.4336917562724014e-08, |
|
"loss": 1.1261, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 17.394789579158317, |
|
"grad_norm": 82.84857177734375, |
|
"learning_rate": 1.3888888888888889e-08, |
|
"loss": 1.1545, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 17.4749498997996, |
|
"grad_norm": 97.7608413696289, |
|
"learning_rate": 1.3440860215053762e-08, |
|
"loss": 1.1173, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 17.55511022044088, |
|
"grad_norm": 75.67000579833984, |
|
"learning_rate": 1.2992831541218638e-08, |
|
"loss": 1.1108, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 17.635270541082164, |
|
"grad_norm": 67.27717590332031, |
|
"learning_rate": 1.2544802867383512e-08, |
|
"loss": 1.1232, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 17.715430861723448, |
|
"grad_norm": 73.40684509277344, |
|
"learning_rate": 1.2096774193548386e-08, |
|
"loss": 1.0621, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 17.79559118236473, |
|
"grad_norm": 85.15264892578125, |
|
"learning_rate": 1.1648745519713261e-08, |
|
"loss": 1.1232, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 17.87575150300601, |
|
"grad_norm": 103.73624420166016, |
|
"learning_rate": 1.1200716845878136e-08, |
|
"loss": 1.1078, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 17.955911823647295, |
|
"grad_norm": 94.97132873535156, |
|
"learning_rate": 1.0752688172043011e-08, |
|
"loss": 1.1274, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 17.995991983967937, |
|
"eval_accuracy": 0.6335963923337091, |
|
"eval_loss": 1.0624490976333618, |
|
"eval_runtime": 17.5649, |
|
"eval_samples_per_second": 100.997, |
|
"eval_steps_per_second": 3.188, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 18.03607214428858, |
|
"grad_norm": 71.5256118774414, |
|
"learning_rate": 1.0304659498207884e-08, |
|
"loss": 1.1403, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 18.11623246492986, |
|
"grad_norm": 76.1630630493164, |
|
"learning_rate": 9.85663082437276e-09, |
|
"loss": 1.0924, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 18.196392785571142, |
|
"grad_norm": 92.13204193115234, |
|
"learning_rate": 9.408602150537634e-09, |
|
"loss": 1.1017, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 18.276553106212425, |
|
"grad_norm": 104.67939758300781, |
|
"learning_rate": 8.960573476702509e-09, |
|
"loss": 1.1629, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 18.35671342685371, |
|
"grad_norm": 82.44912719726562, |
|
"learning_rate": 8.512544802867382e-09, |
|
"loss": 1.1621, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 18.43687374749499, |
|
"grad_norm": 85.671630859375, |
|
"learning_rate": 8.064516129032257e-09, |
|
"loss": 1.1094, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 18.517034068136272, |
|
"grad_norm": 66.93050384521484, |
|
"learning_rate": 7.616487455197132e-09, |
|
"loss": 1.1134, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 18.597194388777556, |
|
"grad_norm": 74.51307678222656, |
|
"learning_rate": 7.168458781362007e-09, |
|
"loss": 1.0945, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 18.677354709418836, |
|
"grad_norm": 96.68108367919922, |
|
"learning_rate": 6.720430107526881e-09, |
|
"loss": 1.0838, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 18.75751503006012, |
|
"grad_norm": 82.29679870605469, |
|
"learning_rate": 6.272401433691756e-09, |
|
"loss": 1.0986, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 18.837675350701403, |
|
"grad_norm": 90.82588958740234, |
|
"learning_rate": 5.824372759856631e-09, |
|
"loss": 1.0962, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 18.917835671342687, |
|
"grad_norm": 83.15840911865234, |
|
"learning_rate": 5.3763440860215056e-09, |
|
"loss": 1.1204, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 18.997995991983966, |
|
"grad_norm": 74.08020782470703, |
|
"learning_rate": 4.92831541218638e-09, |
|
"loss": 1.0801, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 18.997995991983966, |
|
"eval_accuracy": 0.6381059751972943, |
|
"eval_loss": 1.0584417581558228, |
|
"eval_runtime": 13.2957, |
|
"eval_samples_per_second": 133.427, |
|
"eval_steps_per_second": 4.212, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 19.07815631262525, |
|
"grad_norm": 83.4019546508789, |
|
"learning_rate": 4.4802867383512545e-09, |
|
"loss": 1.1158, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 19.158316633266534, |
|
"grad_norm": 80.1566390991211, |
|
"learning_rate": 4.0322580645161286e-09, |
|
"loss": 1.1164, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 19.238476953907817, |
|
"grad_norm": 87.40319061279297, |
|
"learning_rate": 3.5842293906810034e-09, |
|
"loss": 1.0695, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 19.318637274549097, |
|
"grad_norm": 75.24124908447266, |
|
"learning_rate": 3.136200716845878e-09, |
|
"loss": 1.0736, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 19.39879759519038, |
|
"grad_norm": 93.60723114013672, |
|
"learning_rate": 2.6881720430107528e-09, |
|
"loss": 1.141, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 19.478957915831664, |
|
"grad_norm": 81.5455551147461, |
|
"learning_rate": 2.2401433691756273e-09, |
|
"loss": 1.1103, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 19.559118236472948, |
|
"grad_norm": 96.52420806884766, |
|
"learning_rate": 1.7921146953405017e-09, |
|
"loss": 1.1554, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 19.639278557114228, |
|
"grad_norm": 73.265380859375, |
|
"learning_rate": 1.3440860215053764e-09, |
|
"loss": 1.1075, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 19.71943887775551, |
|
"grad_norm": 89.17982482910156, |
|
"learning_rate": 8.960573476702509e-10, |
|
"loss": 1.0946, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 19.799599198396795, |
|
"grad_norm": 84.99248504638672, |
|
"learning_rate": 4.4802867383512543e-10, |
|
"loss": 1.1167, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 19.879759519038075, |
|
"grad_norm": 78.83936309814453, |
|
"learning_rate": 0.0, |
|
"loss": 1.0979, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 19.879759519038075, |
|
"eval_accuracy": 0.6386696730552424, |
|
"eval_loss": 1.0578892230987549, |
|
"eval_runtime": 14.5957, |
|
"eval_samples_per_second": 121.543, |
|
"eval_steps_per_second": 3.837, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 19.879759519038075, |
|
"step": 2480, |
|
"total_flos": 8.259382470828884e+18, |
|
"train_loss": 1.5085111414232562, |
|
"train_runtime": 5954.6222, |
|
"train_samples_per_second": 53.612, |
|
"train_steps_per_second": 0.416 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2480, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 8.259382470828884e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|