|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9997999866657777, |
|
"eval_steps": 500, |
|
"global_step": 3749, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 5.333333333333335e-07, |
|
"loss": 2.017, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 1.066666666666667e-06, |
|
"loss": 2.3713, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 2.1764, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.133333333333334e-06, |
|
"loss": 2.1757, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 2.1736, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.125, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 2.1865, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.7333333333333337e-06, |
|
"loss": 2.0263, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.266666666666668e-06, |
|
"loss": 1.9312, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 1.8754, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.7123, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 5.8666666666666675e-06, |
|
"loss": 1.8266, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 1.6079, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 6.9333333333333344e-06, |
|
"loss": 1.5006, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 7.4666666666666675e-06, |
|
"loss": 1.4158, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.068359375, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.3677, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 8.533333333333335e-06, |
|
"loss": 1.374, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 9.066666666666667e-06, |
|
"loss": 1.3052, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.0927734375, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 1.275, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.10498046875, |
|
"learning_rate": 1.0133333333333335e-05, |
|
"loss": 1.2574, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.09765625, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 1.2239, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 1.1565, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.1733333333333335e-05, |
|
"loss": 1.1758, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 1.2266666666666667e-05, |
|
"loss": 1.1843, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 1.2800000000000001e-05, |
|
"loss": 1.1936, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 1.1426, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.051513671875, |
|
"learning_rate": 1.3866666666666669e-05, |
|
"loss": 1.114, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.08154296875, |
|
"learning_rate": 1.4400000000000001e-05, |
|
"loss": 1.1195, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.4933333333333335e-05, |
|
"loss": 1.1256, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 1.546666666666667e-05, |
|
"loss": 1.102, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.1103515625, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 1.0762, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 1.6533333333333333e-05, |
|
"loss": 1.0761, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.09765625, |
|
"learning_rate": 1.706666666666667e-05, |
|
"loss": 1.0501, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.041015625, |
|
"learning_rate": 1.76e-05, |
|
"loss": 1.0105, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.10302734375, |
|
"learning_rate": 1.8133333333333335e-05, |
|
"loss": 1.0232, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.989, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 0.9894, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 1.9733333333333336e-05, |
|
"loss": 0.9824, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.0439453125, |
|
"learning_rate": 1.999989162756852e-05, |
|
"loss": 0.9569, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 1.999902466221011e-05, |
|
"loss": 0.928, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.061767578125, |
|
"learning_rate": 1.9997290806656996e-05, |
|
"loss": 0.9349, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 1.9994690211230084e-05, |
|
"loss": 0.9489, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.0289306640625, |
|
"learning_rate": 1.999122310139442e-05, |
|
"loss": 0.916, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 1.9986889777739686e-05, |
|
"loss": 0.9541, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.02978515625, |
|
"learning_rate": 1.9981690615954097e-05, |
|
"loss": 0.9155, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.0263671875, |
|
"learning_rate": 1.9975626066791855e-05, |
|
"loss": 0.9015, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 1.996869665603406e-05, |
|
"loss": 0.9115, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.996090298444313e-05, |
|
"loss": 0.896, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.0291748046875, |
|
"learning_rate": 1.9952245727710723e-05, |
|
"loss": 0.8855, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.193359375, |
|
"learning_rate": 1.9942725636399136e-05, |
|
"loss": 0.9084, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.02783203125, |
|
"learning_rate": 1.9932343535876255e-05, |
|
"loss": 0.8899, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 1.9921100326243977e-05, |
|
"loss": 0.8952, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.038330078125, |
|
"learning_rate": 1.9908996982260196e-05, |
|
"loss": 0.8857, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.09521484375, |
|
"learning_rate": 1.9896034553254284e-05, |
|
"loss": 0.8677, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.03076171875, |
|
"learning_rate": 1.988221416303611e-05, |
|
"loss": 0.8637, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 1.986753700979861e-05, |
|
"loss": 0.8384, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.026611328125, |
|
"learning_rate": 1.985200436601392e-05, |
|
"loss": 0.9015, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.05419921875, |
|
"learning_rate": 1.9835617578323038e-05, |
|
"loss": 0.8564, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.0281982421875, |
|
"learning_rate": 1.9818378067419092e-05, |
|
"loss": 0.8183, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.0252685546875, |
|
"learning_rate": 1.9800287327924152e-05, |
|
"loss": 0.8358, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.059814453125, |
|
"learning_rate": 1.9781346928259662e-05, |
|
"loss": 0.8564, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.0235595703125, |
|
"learning_rate": 1.9761558510510453e-05, |
|
"loss": 0.8295, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.0255126953125, |
|
"learning_rate": 1.974092379028239e-05, |
|
"loss": 0.8403, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.064453125, |
|
"learning_rate": 1.9719444556553616e-05, |
|
"loss": 0.8582, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.0267333984375, |
|
"learning_rate": 1.969712267151948e-05, |
|
"loss": 0.8328, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 1.9673960070431043e-05, |
|
"loss": 0.8571, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.031494140625, |
|
"learning_rate": 1.9649958761427364e-05, |
|
"loss": 0.824, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.026123046875, |
|
"learning_rate": 1.9625120825361326e-05, |
|
"loss": 0.8418, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.0247802734375, |
|
"learning_rate": 1.9599448415619283e-05, |
|
"loss": 0.8267, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.026123046875, |
|
"learning_rate": 1.957294375793435e-05, |
|
"loss": 0.8677, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.031005859375, |
|
"learning_rate": 1.954560915019343e-05, |
|
"loss": 0.827, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.031494140625, |
|
"learning_rate": 1.951744696223801e-05, |
|
"loss": 0.8303, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.024169921875, |
|
"learning_rate": 1.9488459635658687e-05, |
|
"loss": 0.8455, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.945864968358349e-05, |
|
"loss": 0.8297, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.026611328125, |
|
"learning_rate": 1.9428019690460008e-05, |
|
"loss": 0.8516, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 1.939657231183132e-05, |
|
"loss": 0.8538, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 1.9364310274105758e-05, |
|
"loss": 0.8045, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 1.933123637432054e-05, |
|
"loss": 0.8436, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 1.929735347989929e-05, |
|
"loss": 0.8123, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 1.92626645284034e-05, |
|
"loss": 0.8184, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.047119140625, |
|
"learning_rate": 1.92271725272774e-05, |
|
"loss": 0.8222, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.0277099609375, |
|
"learning_rate": 1.919088055358818e-05, |
|
"loss": 0.8434, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 1.9153791753758236e-05, |
|
"loss": 0.8413, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.05517578125, |
|
"learning_rate": 1.911590934329288e-05, |
|
"loss": 0.8323, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.1201171875, |
|
"learning_rate": 1.9077236606501465e-05, |
|
"loss": 0.7935, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.022705078125, |
|
"learning_rate": 1.903777689621263e-05, |
|
"loss": 0.8186, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.0242919921875, |
|
"learning_rate": 1.899753363348364e-05, |
|
"loss": 0.8094, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.0267333984375, |
|
"learning_rate": 1.895651030730378e-05, |
|
"loss": 0.8265, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.031494140625, |
|
"learning_rate": 1.891471047429186e-05, |
|
"loss": 0.9105, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.030517578125, |
|
"learning_rate": 1.8872137758387873e-05, |
|
"loss": 0.8028, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.0263671875, |
|
"learning_rate": 1.8828795850538804e-05, |
|
"loss": 0.8322, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 1.8784688508378655e-05, |
|
"loss": 0.798, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.0277099609375, |
|
"learning_rate": 1.8739819555902626e-05, |
|
"loss": 0.8202, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.025146484375, |
|
"learning_rate": 1.8694192883135632e-05, |
|
"loss": 0.8819, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.027587890625, |
|
"learning_rate": 1.8647812445795003e-05, |
|
"loss": 0.8273, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.0291748046875, |
|
"learning_rate": 1.8600682264947566e-05, |
|
"loss": 0.8294, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 1.8552806426661022e-05, |
|
"loss": 0.8222, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 1.8504189081649678e-05, |
|
"loss": 0.8109, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.0284423828125, |
|
"learning_rate": 1.8454834444914607e-05, |
|
"loss": 0.7972, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.0225830078125, |
|
"learning_rate": 1.8404746795378218e-05, |
|
"loss": 0.8107, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.087890625, |
|
"learning_rate": 1.8353930475513268e-05, |
|
"loss": 0.8208, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.0272216796875, |
|
"learning_rate": 1.8302389890966404e-05, |
|
"loss": 0.7933, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.049072265625, |
|
"learning_rate": 1.8250129510176183e-05, |
|
"loss": 0.814, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.0225830078125, |
|
"learning_rate": 1.8197153863985686e-05, |
|
"loss": 0.8251, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 1.8143467545249694e-05, |
|
"loss": 0.7962, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.0263671875, |
|
"learning_rate": 1.8089075208436507e-05, |
|
"loss": 0.8164, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.0223388671875, |
|
"learning_rate": 1.8033981569224404e-05, |
|
"loss": 0.8081, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.026123046875, |
|
"learning_rate": 1.797819140409282e-05, |
|
"loss": 0.8211, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.0252685546875, |
|
"learning_rate": 1.7921709549908222e-05, |
|
"loss": 0.8526, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.030517578125, |
|
"learning_rate": 1.7864540903504777e-05, |
|
"loss": 0.8195, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.0230712890625, |
|
"learning_rate": 1.7806690421259794e-05, |
|
"loss": 0.7951, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.04443359375, |
|
"learning_rate": 1.774816311866404e-05, |
|
"loss": 0.8389, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 1.768896406988689e-05, |
|
"loss": 0.8031, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.0281982421875, |
|
"learning_rate": 1.7629098407336415e-05, |
|
"loss": 0.8188, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.0242919921875, |
|
"learning_rate": 1.756857132121443e-05, |
|
"loss": 0.8232, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.043701171875, |
|
"learning_rate": 1.7507388059066492e-05, |
|
"loss": 0.8126, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.06982421875, |
|
"learning_rate": 1.7445553925326963e-05, |
|
"loss": 0.7775, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 1.7383074280859132e-05, |
|
"loss": 0.8205, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 1.7319954542490448e-05, |
|
"loss": 0.8158, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.041259765625, |
|
"learning_rate": 1.725620018254286e-05, |
|
"loss": 0.8146, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 1.7191816728358435e-05, |
|
"loss": 0.8259, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.021240234375, |
|
"learning_rate": 1.71268097618201e-05, |
|
"loss": 0.8213, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.02490234375, |
|
"learning_rate": 1.706118491886774e-05, |
|
"loss": 0.7947, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.0296630859375, |
|
"learning_rate": 1.6994947889009563e-05, |
|
"loss": 0.8364, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.0247802734375, |
|
"learning_rate": 1.692810441482884e-05, |
|
"loss": 0.8268, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 1.6860660291486023e-05, |
|
"loss": 0.7868, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.02392578125, |
|
"learning_rate": 1.6792621366216338e-05, |
|
"loss": 0.8402, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.036865234375, |
|
"learning_rate": 1.6723993537822837e-05, |
|
"loss": 0.81, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.06298828125, |
|
"learning_rate": 1.6654782756164983e-05, |
|
"loss": 0.8151, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.0235595703125, |
|
"learning_rate": 1.6584995021642814e-05, |
|
"loss": 0.7716, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.0252685546875, |
|
"learning_rate": 1.651463638467673e-05, |
|
"loss": 0.8236, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.031005859375, |
|
"learning_rate": 1.6443712945182933e-05, |
|
"loss": 0.7794, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 1.637223085204457e-05, |
|
"loss": 0.8222, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.049560546875, |
|
"learning_rate": 1.630019630257865e-05, |
|
"loss": 0.8134, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.0244140625, |
|
"learning_rate": 1.6227615541998756e-05, |
|
"loss": 0.8209, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.0301513671875, |
|
"learning_rate": 1.6154494862873588e-05, |
|
"loss": 0.79, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 1.6080840604581435e-05, |
|
"loss": 0.812, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.0260009765625, |
|
"learning_rate": 1.600665915276054e-05, |
|
"loss": 0.8119, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 1.5931956938755494e-05, |
|
"loss": 0.8073, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.0220947265625, |
|
"learning_rate": 1.585674043905966e-05, |
|
"loss": 0.8205, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.0255126953125, |
|
"learning_rate": 1.5781016174753675e-05, |
|
"loss": 0.8222, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.0302734375, |
|
"learning_rate": 1.5704790710940074e-05, |
|
"loss": 0.8307, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.028076171875, |
|
"learning_rate": 1.5628070656174135e-05, |
|
"loss": 0.8038, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.0233154296875, |
|
"learning_rate": 1.5550862661890918e-05, |
|
"loss": 0.7824, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.0252685546875, |
|
"learning_rate": 1.547317342182861e-05, |
|
"loss": 0.7891, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.058837890625, |
|
"learning_rate": 1.5395009671448186e-05, |
|
"loss": 0.8051, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.0284423828125, |
|
"learning_rate": 1.5316378187349476e-05, |
|
"loss": 0.7899, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.02734375, |
|
"learning_rate": 1.5237285786683638e-05, |
|
"loss": 0.827, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 1.515773932656213e-05, |
|
"loss": 0.8107, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.0213623046875, |
|
"learning_rate": 1.5077745703462228e-05, |
|
"loss": 0.7785, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.0274658203125, |
|
"learning_rate": 1.4997311852629097e-05, |
|
"loss": 0.8053, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.029296875, |
|
"learning_rate": 1.4916444747474542e-05, |
|
"loss": 0.8034, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 1.4835151398972424e-05, |
|
"loss": 0.8117, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.0303955078125, |
|
"learning_rate": 1.475343885505083e-05, |
|
"loss": 0.7811, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 1.4671314199981019e-05, |
|
"loss": 0.8069, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 1.4588784553763262e-05, |
|
"loss": 0.8312, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 1.4505857071509523e-05, |
|
"loss": 0.8132, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.4422538942823158e-05, |
|
"loss": 0.812, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 1.4338837391175582e-05, |
|
"loss": 0.822, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.025146484375, |
|
"learning_rate": 1.425475967328001e-05, |
|
"loss": 0.8269, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.4170313078462318e-05, |
|
"loss": 0.8027, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.115234375, |
|
"learning_rate": 1.4085504928029086e-05, |
|
"loss": 0.8123, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 1.4000342574632846e-05, |
|
"loss": 0.8028, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.0284423828125, |
|
"learning_rate": 1.3914833401634642e-05, |
|
"loss": 0.7812, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.036865234375, |
|
"learning_rate": 1.3828984822463895e-05, |
|
"loss": 0.8294, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.03369140625, |
|
"learning_rate": 1.3742804279975686e-05, |
|
"loss": 0.8118, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.0235595703125, |
|
"learning_rate": 1.3656299245805476e-05, |
|
"loss": 0.8086, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.03955078125, |
|
"learning_rate": 1.3569477219721336e-05, |
|
"loss": 0.8155, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.0262451171875, |
|
"learning_rate": 1.3482345728973742e-05, |
|
"loss": 0.8394, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.0252685546875, |
|
"learning_rate": 1.3394912327642966e-05, |
|
"loss": 0.8172, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.0291748046875, |
|
"learning_rate": 1.330718459598417e-05, |
|
"loss": 0.7993, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.0311279296875, |
|
"learning_rate": 1.3219170139770213e-05, |
|
"loss": 0.7824, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.0380859375, |
|
"learning_rate": 1.3130876589632243e-05, |
|
"loss": 0.7982, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 1.3042311600398157e-05, |
|
"loss": 0.8093, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.03271484375, |
|
"learning_rate": 1.2953482850428927e-05, |
|
"loss": 0.7864, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.0225830078125, |
|
"learning_rate": 1.2864398040952921e-05, |
|
"loss": 0.778, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.025634765625, |
|
"learning_rate": 1.2775064895398217e-05, |
|
"loss": 0.7869, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.0274658203125, |
|
"learning_rate": 1.2685491158723003e-05, |
|
"loss": 0.8012, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.0289306640625, |
|
"learning_rate": 1.2595684596744112e-05, |
|
"loss": 0.8385, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 1.250565299546374e-05, |
|
"loss": 0.8062, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.025146484375, |
|
"learning_rate": 1.2415404160394429e-05, |
|
"loss": 0.8076, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 1.2324945915882334e-05, |
|
"loss": 0.7864, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.03125, |
|
"learning_rate": 1.2234286104428884e-05, |
|
"loss": 0.833, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 1.2143432586010851e-05, |
|
"loss": 0.7848, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.037109375, |
|
"learning_rate": 1.2052393237398916e-05, |
|
"loss": 0.7838, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 1.1961175951474766e-05, |
|
"loss": 0.808, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 1.1869788636546801e-05, |
|
"loss": 0.8076, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.028076171875, |
|
"learning_rate": 1.1778239215664512e-05, |
|
"loss": 0.8196, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.0322265625, |
|
"learning_rate": 1.1686535625931566e-05, |
|
"loss": 0.7873, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.046630859375, |
|
"learning_rate": 1.1594685817817673e-05, |
|
"loss": 0.8126, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.02490234375, |
|
"learning_rate": 1.1502697754469315e-05, |
|
"loss": 0.8462, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.029296875, |
|
"learning_rate": 1.141057941101935e-05, |
|
"loss": 0.8273, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.02294921875, |
|
"learning_rate": 1.1318338773895596e-05, |
|
"loss": 0.8222, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.026611328125, |
|
"learning_rate": 1.1225983840128418e-05, |
|
"loss": 0.7724, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 1.1133522616657417e-05, |
|
"loss": 0.8072, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.0286865234375, |
|
"learning_rate": 1.104096311963724e-05, |
|
"loss": 0.8306, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.026611328125, |
|
"learning_rate": 1.0948313373742606e-05, |
|
"loss": 0.7973, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.037841796875, |
|
"learning_rate": 1.0855581411472576e-05, |
|
"loss": 0.8385, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.02197265625, |
|
"learning_rate": 1.076277527245417e-05, |
|
"loss": 0.8238, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.0284423828125, |
|
"learning_rate": 1.0669903002745343e-05, |
|
"loss": 0.8102, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 1.0576972654137411e-05, |
|
"loss": 0.8189, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.0247802734375, |
|
"learning_rate": 1.0483992283456992e-05, |
|
"loss": 0.7938, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.0218505859375, |
|
"learning_rate": 1.0390969951867482e-05, |
|
"loss": 0.8171, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.0311279296875, |
|
"learning_rate": 1.0297913724170187e-05, |
|
"loss": 0.8082, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 1.0204831668105117e-05, |
|
"loss": 0.7972, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.0218505859375, |
|
"learning_rate": 1.011173185365154e-05, |
|
"loss": 0.7743, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.03515625, |
|
"learning_rate": 1.0018622352328331e-05, |
|
"loss": 0.8095, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.03564453125, |
|
"learning_rate": 9.92551123649419e-06, |
|
"loss": 0.796, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.02197265625, |
|
"learning_rate": 9.832406578647789e-06, |
|
"loss": 0.7923, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.0230712890625, |
|
"learning_rate": 9.739316450727914e-06, |
|
"loss": 0.8106, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.0244140625, |
|
"learning_rate": 9.646248923413639e-06, |
|
"loss": 0.7871, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.027099609375, |
|
"learning_rate": 9.553212065424625e-06, |
|
"loss": 0.798, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 9.460213942821578e-06, |
|
"loss": 0.795, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 9.367262618306947e-06, |
|
"loss": 0.8001, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.02294921875, |
|
"learning_rate": 9.274366150525902e-06, |
|
"loss": 0.766, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.02587890625, |
|
"learning_rate": 9.181532593367675e-06, |
|
"loss": 0.7999, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.0224609375, |
|
"learning_rate": 9.0887699952673e-06, |
|
"loss": 0.813, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.035400390625, |
|
"learning_rate": 8.996086398507848e-06, |
|
"loss": 0.8108, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.0267333984375, |
|
"learning_rate": 8.903489838523167e-06, |
|
"loss": 0.841, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.02783203125, |
|
"learning_rate": 8.81098834320124e-06, |
|
"loss": 0.8204, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 8.71858993218818e-06, |
|
"loss": 0.7937, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 8.626302616192955e-06, |
|
"loss": 0.799, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.025634765625, |
|
"learning_rate": 8.534134396292875e-06, |
|
"loss": 0.8033, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.0277099609375, |
|
"learning_rate": 8.442093263239913e-06, |
|
"loss": 0.8089, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.0228271484375, |
|
"learning_rate": 8.350187196767942e-06, |
|
"loss": 0.792, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 8.258424164900899e-06, |
|
"loss": 0.8242, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.0250244140625, |
|
"learning_rate": 8.166812123261982e-06, |
|
"loss": 0.8058, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.0283203125, |
|
"learning_rate": 8.075359014383914e-06, |
|
"loss": 0.8084, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.0228271484375, |
|
"learning_rate": 7.984072767020359e-06, |
|
"loss": 0.7894, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.0419921875, |
|
"learning_rate": 7.892961295458496e-06, |
|
"loss": 0.7993, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.0260009765625, |
|
"learning_rate": 7.802032498832895e-06, |
|
"loss": 0.8036, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.0235595703125, |
|
"learning_rate": 7.71129426044066e-06, |
|
"loss": 0.7973, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.0242919921875, |
|
"learning_rate": 7.620754447057985e-06, |
|
"loss": 0.7964, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.036376953125, |
|
"learning_rate": 7.530420908258111e-06, |
|
"loss": 0.777, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.029541015625, |
|
"learning_rate": 7.4403014757308e-06, |
|
"loss": 0.8116, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.0263671875, |
|
"learning_rate": 7.350403962603335e-06, |
|
"loss": 0.8267, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.031982421875, |
|
"learning_rate": 7.260736162763149e-06, |
|
"loss": 0.8153, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 7.171305850182113e-06, |
|
"loss": 0.8026, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.03466796875, |
|
"learning_rate": 7.082120778242554e-06, |
|
"loss": 0.8198, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.034423828125, |
|
"learning_rate": 6.993188679065048e-06, |
|
"loss": 0.7804, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.0245361328125, |
|
"learning_rate": 6.904517262838082e-06, |
|
"loss": 0.8251, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.047607421875, |
|
"learning_rate": 6.8161142171495785e-06, |
|
"loss": 0.849, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.02880859375, |
|
"learning_rate": 6.72798720632042e-06, |
|
"loss": 0.7912, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.0245361328125, |
|
"learning_rate": 6.640143870739956e-06, |
|
"loss": 0.7864, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.02392578125, |
|
"learning_rate": 6.552591826203616e-06, |
|
"loss": 0.7713, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 6.4653386632526275e-06, |
|
"loss": 0.7925, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 6.378391946515937e-06, |
|
"loss": 0.8402, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.023681640625, |
|
"learning_rate": 6.291759214054383e-06, |
|
"loss": 0.8032, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 6.205447976707154e-06, |
|
"loss": 0.7891, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.08935546875, |
|
"learning_rate": 6.119465717440629e-06, |
|
"loss": 0.819, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.0283203125, |
|
"learning_rate": 6.033819890699616e-06, |
|
"loss": 0.8318, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 5.94851792176107e-06, |
|
"loss": 0.8241, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.0311279296875, |
|
"learning_rate": 5.863567206090348e-06, |
|
"loss": 0.7844, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.025634765625, |
|
"learning_rate": 5.778975108700031e-06, |
|
"loss": 0.8273, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.03759765625, |
|
"learning_rate": 5.694748963511396e-06, |
|
"loss": 0.8117, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 5.610896072718603e-06, |
|
"loss": 0.8067, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.0213623046875, |
|
"learning_rate": 5.527423706155586e-06, |
|
"loss": 0.7988, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.0230712890625, |
|
"learning_rate": 5.4443391006657896e-06, |
|
"loss": 0.8286, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 5.361649459474756e-06, |
|
"loss": 0.8054, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.0233154296875, |
|
"learning_rate": 5.279361951565618e-06, |
|
"loss": 0.7801, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.027587890625, |
|
"learning_rate": 5.197483711057569e-06, |
|
"loss": 0.8107, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.0247802734375, |
|
"learning_rate": 5.116021836587353e-06, |
|
"loss": 0.8273, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.0235595703125, |
|
"learning_rate": 5.0349833906938235e-06, |
|
"loss": 0.7919, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 4.954375399205655e-06, |
|
"loss": 0.8371, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.0257568359375, |
|
"learning_rate": 4.8742048506322045e-06, |
|
"loss": 0.7882, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.027099609375, |
|
"learning_rate": 4.794478695557631e-06, |
|
"loss": 0.8149, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.02685546875, |
|
"learning_rate": 4.715203846038312e-06, |
|
"loss": 0.8254, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.08447265625, |
|
"learning_rate": 4.636387175003558e-06, |
|
"loss": 0.8249, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.0264892578125, |
|
"learning_rate": 4.558035515659768e-06, |
|
"loss": 0.8004, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.0230712890625, |
|
"learning_rate": 4.480155660898001e-06, |
|
"loss": 0.7959, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 4.402754362705051e-06, |
|
"loss": 0.786, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 4.325838331578061e-06, |
|
"loss": 0.7948, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 4.249414235942755e-06, |
|
"loss": 0.7959, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.045654296875, |
|
"learning_rate": 4.173488701575274e-06, |
|
"loss": 0.775, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 4.098068311027772e-06, |
|
"loss": 0.7992, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.02783203125, |
|
"learning_rate": 4.023159603057698e-06, |
|
"loss": 0.8045, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.0244140625, |
|
"learning_rate": 3.948769072060927e-06, |
|
"loss": 0.7903, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.0458984375, |
|
"learning_rate": 3.874903167508688e-06, |
|
"loss": 0.7974, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.026123046875, |
|
"learning_rate": 3.801568293388421e-06, |
|
"loss": 0.7766, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.0400390625, |
|
"learning_rate": 3.728770807648574e-06, |
|
"loss": 0.8202, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.0250244140625, |
|
"learning_rate": 3.6565170216473744e-06, |
|
"loss": 0.8047, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.04931640625, |
|
"learning_rate": 3.584813199605658e-06, |
|
"loss": 0.8324, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.0255126953125, |
|
"learning_rate": 3.513665558063771e-06, |
|
"loss": 0.8203, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.034912109375, |
|
"learning_rate": 3.4430802653426176e-06, |
|
"loss": 0.7997, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.06298828125, |
|
"learning_rate": 3.373063441008877e-06, |
|
"loss": 0.7783, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.03857421875, |
|
"learning_rate": 3.303621155344453e-06, |
|
"loss": 0.8088, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.0250244140625, |
|
"learning_rate": 3.234759428820198e-06, |
|
"loss": 0.8231, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.0235595703125, |
|
"learning_rate": 3.1664842315739586e-06, |
|
"loss": 0.8005, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.03076171875, |
|
"learning_rate": 3.098801482892966e-06, |
|
"loss": 0.7926, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.11962890625, |
|
"learning_rate": 3.031717050700659e-06, |
|
"loss": 0.7926, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.045166015625, |
|
"learning_rate": 2.9652367510479476e-06, |
|
"loss": 0.8127, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.021728515625, |
|
"learning_rate": 2.899366347608974e-06, |
|
"loss": 0.8046, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.036865234375, |
|
"learning_rate": 2.834111551181423e-06, |
|
"loss": 0.7647, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.0260009765625, |
|
"learning_rate": 2.7694780191914005e-06, |
|
"loss": 0.7985, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.04052734375, |
|
"learning_rate": 2.7054713552029577e-06, |
|
"loss": 0.841, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.041748046875, |
|
"learning_rate": 2.6420971084322745e-06, |
|
"loss": 0.7775, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.0244140625, |
|
"learning_rate": 2.5793607732665402e-06, |
|
"loss": 0.7954, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.03125, |
|
"learning_rate": 2.5172677887876416e-06, |
|
"loss": 0.7875, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.02490234375, |
|
"learning_rate": 2.455823538300569e-06, |
|
"loss": 0.8065, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.0224609375, |
|
"learning_rate": 2.3950333488667178e-06, |
|
"loss": 0.8201, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.06298828125, |
|
"learning_rate": 2.3349024908420403e-06, |
|
"loss": 0.813, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.03076171875, |
|
"learning_rate": 2.2754361774201217e-06, |
|
"loss": 0.8076, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.2166395641802076e-06, |
|
"loss": 0.8135, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.0673828125, |
|
"learning_rate": 2.1585177486402275e-06, |
|
"loss": 0.8095, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.0341796875, |
|
"learning_rate": 2.101075769814855e-06, |
|
"loss": 0.7985, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.04150390625, |
|
"learning_rate": 2.0443186077786358e-06, |
|
"loss": 0.7976, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.04736328125, |
|
"learning_rate": 1.9882511832342297e-06, |
|
"loss": 0.7868, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.0262451171875, |
|
"learning_rate": 1.9328783570857954e-06, |
|
"loss": 0.8072, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.04296875, |
|
"learning_rate": 1.8782049300175698e-06, |
|
"loss": 0.7931, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 1.8242356420776485e-06, |
|
"loss": 0.8185, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 1.7709751722670466e-06, |
|
"loss": 0.8391, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.0274658203125, |
|
"learning_rate": 1.718428138134034e-06, |
|
"loss": 0.8502, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.02783203125, |
|
"learning_rate": 1.666599095373811e-06, |
|
"loss": 0.8185, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.025390625, |
|
"learning_rate": 1.6154925374335362e-06, |
|
"loss": 0.806, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.0225830078125, |
|
"learning_rate": 1.5651128951227613e-06, |
|
"loss": 0.783, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.033935546875, |
|
"learning_rate": 1.5154645362292853e-06, |
|
"loss": 0.8159, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.025390625, |
|
"learning_rate": 1.4665517651404814e-06, |
|
"loss": 0.7793, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.0220947265625, |
|
"learning_rate": 1.4183788224701201e-06, |
|
"loss": 0.7754, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 1.370949884690711e-06, |
|
"loss": 0.847, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.035888671875, |
|
"learning_rate": 1.3242690637714228e-06, |
|
"loss": 0.8014, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.0238037109375, |
|
"learning_rate": 1.2783404068215776e-06, |
|
"loss": 0.8235, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.033447265625, |
|
"learning_rate": 1.2331678957397819e-06, |
|
"loss": 0.7997, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.033203125, |
|
"learning_rate": 1.1887554468687046e-06, |
|
"loss": 0.8159, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.033935546875, |
|
"learning_rate": 1.145106910655538e-06, |
|
"loss": 0.8392, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.023193359375, |
|
"learning_rate": 1.1022260713181786e-06, |
|
"loss": 0.8266, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.03662109375, |
|
"learning_rate": 1.0601166465171387e-06, |
|
"loss": 0.8209, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.0294189453125, |
|
"learning_rate": 1.0187822870332398e-06, |
|
"loss": 0.8021, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.051025390625, |
|
"learning_rate": 9.782265764510968e-07, |
|
"loss": 0.7938, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.0291748046875, |
|
"learning_rate": 9.384530308484275e-07, |
|
"loss": 0.8007, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.0279541015625, |
|
"learning_rate": 8.99465098491229e-07, |
|
"loss": 0.8097, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.032470703125, |
|
"learning_rate": 8.612661595348038e-07, |
|
"loss": 0.8029, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.0234375, |
|
"learning_rate": 8.238595257307225e-07, |
|
"loss": 0.8079, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.03125, |
|
"learning_rate": 7.872484401397018e-07, |
|
"loss": 0.8074, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 7.514360768504314e-07, |
|
"loss": 0.7961, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.03125, |
|
"learning_rate": 7.164255407043986e-07, |
|
"loss": 0.7701, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.0269775390625, |
|
"learning_rate": 6.822198670266989e-07, |
|
"loss": 0.8165, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.0277099609375, |
|
"learning_rate": 6.488220213628837e-07, |
|
"loss": 0.8097, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.0322265625, |
|
"learning_rate": 6.16234899221858e-07, |
|
"loss": 0.8073, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.0234375, |
|
"learning_rate": 5.844613258248411e-07, |
|
"loss": 0.7865, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.130859375, |
|
"learning_rate": 5.535040558604299e-07, |
|
"loss": 0.8182, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.0233154296875, |
|
"learning_rate": 5.233657732457775e-07, |
|
"loss": 0.78, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.0284423828125, |
|
"learning_rate": 4.940490908938977e-07, |
|
"loss": 0.7961, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.0478515625, |
|
"learning_rate": 4.6555655048713953e-07, |
|
"loss": 0.7946, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.023193359375, |
|
"learning_rate": 4.3789062225682356e-07, |
|
"loss": 0.8022, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.060302734375, |
|
"learning_rate": 4.1105370476908104e-07, |
|
"loss": 0.8201, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.042236328125, |
|
"learning_rate": 3.8504812471690687e-07, |
|
"loss": 0.8014, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.02392578125, |
|
"learning_rate": 3.598761367184367e-07, |
|
"loss": 0.7995, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.02880859375, |
|
"learning_rate": 3.3553992312148177e-07, |
|
"loss": 0.8139, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.029052734375, |
|
"learning_rate": 3.1204159381432174e-07, |
|
"loss": 0.8127, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.028564453125, |
|
"learning_rate": 2.8938318604278314e-07, |
|
"loss": 0.8009, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.03173828125, |
|
"learning_rate": 2.675666642336172e-07, |
|
"loss": 0.8319, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.031494140625, |
|
"learning_rate": 2.4659391982418626e-07, |
|
"loss": 0.8413, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 2.264667710984836e-07, |
|
"loss": 0.8194, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.0289306640625, |
|
"learning_rate": 2.0718696302949092e-07, |
|
"loss": 0.8128, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 1.8875616712789257e-07, |
|
"loss": 0.8008, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 1.7117598129716362e-07, |
|
"loss": 0.7753, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.0260009765625, |
|
"learning_rate": 1.544479296950341e-07, |
|
"loss": 0.802, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.025634765625, |
|
"learning_rate": 1.385734626013435e-07, |
|
"loss": 0.7936, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.0240478515625, |
|
"learning_rate": 1.2355395629231493e-07, |
|
"loss": 0.7956, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.08251953125, |
|
"learning_rate": 1.0939071292122572e-07, |
|
"loss": 0.7769, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.0279541015625, |
|
"learning_rate": 9.608496040551918e-08, |
|
"loss": 0.7957, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.0277099609375, |
|
"learning_rate": 8.363785232034849e-08, |
|
"loss": 0.814, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.03173828125, |
|
"learning_rate": 7.205046779856007e-08, |
|
"loss": 0.8155, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.03173828125, |
|
"learning_rate": 6.132381143713728e-08, |
|
"loss": 0.8364, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.0264892578125, |
|
"learning_rate": 5.1458813210106815e-08, |
|
"loss": 0.8173, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.038818359375, |
|
"learning_rate": 4.245632838791092e-08, |
|
"loss": 0.8085, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0225830078125, |
|
"learning_rate": 3.431713746325449e-08, |
|
"loss": 0.8195, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 2.7041946083442573e-08, |
|
"loss": 0.8029, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.06494140625, |
|
"learning_rate": 2.0631384989202585e-08, |
|
"loss": 0.8359, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.030517578125, |
|
"learning_rate": 1.5086009959995875e-08, |
|
"loss": 0.8233, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.0228271484375, |
|
"learning_rate": 1.0406301765837346e-08, |
|
"loss": 0.7743, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.024658203125, |
|
"learning_rate": 6.592666125614377e-09, |
|
"loss": 0.7764, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.023193359375, |
|
"learning_rate": 3.645433671908283e-09, |
|
"loss": 0.798, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.0260009765625, |
|
"learning_rate": 1.5648599223316852e-09, |
|
"loss": 0.8119, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.0247802734375, |
|
"learning_rate": 3.5112525737734935e-10, |
|
"loss": 0.8295, |
|
"step": 3740 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3749, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 1.2690756062895145e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|