{
"best_metric": 0.9764913889070788,
"best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat/checkpoint-5370",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 5370,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"grad_norm": 3.459811210632324,
"learning_rate": 9.31098696461825e-07,
"loss": 0.7199,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 3.0020904541015625,
"learning_rate": 1.86219739292365e-06,
"loss": 0.7055,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 2.790972948074341,
"learning_rate": 2.7932960893854746e-06,
"loss": 0.6772,
"step": 30
},
{
"epoch": 0.07,
"grad_norm": 3.0193727016448975,
"learning_rate": 3.7243947858473e-06,
"loss": 0.6557,
"step": 40
},
{
"epoch": 0.09,
"grad_norm": 2.7702300548553467,
"learning_rate": 4.655493482309125e-06,
"loss": 0.6343,
"step": 50
},
{
"epoch": 0.11,
"grad_norm": 2.6783347129821777,
"learning_rate": 5.586592178770949e-06,
"loss": 0.5984,
"step": 60
},
{
"epoch": 0.13,
"grad_norm": 2.6775383949279785,
"learning_rate": 6.517690875232775e-06,
"loss": 0.5643,
"step": 70
},
{
"epoch": 0.15,
"grad_norm": 4.056990623474121,
"learning_rate": 7.4487895716946e-06,
"loss": 0.534,
"step": 80
},
{
"epoch": 0.17,
"grad_norm": 3.630444288253784,
"learning_rate": 8.379888268156424e-06,
"loss": 0.4963,
"step": 90
},
{
"epoch": 0.19,
"grad_norm": 5.517983913421631,
"learning_rate": 9.31098696461825e-06,
"loss": 0.4724,
"step": 100
},
{
"epoch": 0.2,
"grad_norm": 6.528829097747803,
"learning_rate": 1.0242085661080076e-05,
"loss": 0.4412,
"step": 110
},
{
"epoch": 0.22,
"grad_norm": 4.494168758392334,
"learning_rate": 1.1173184357541899e-05,
"loss": 0.426,
"step": 120
},
{
"epoch": 0.24,
"grad_norm": 6.211711883544922,
"learning_rate": 1.2104283054003724e-05,
"loss": 0.4181,
"step": 130
},
{
"epoch": 0.26,
"grad_norm": 8.021025657653809,
"learning_rate": 1.303538175046555e-05,
"loss": 0.4104,
"step": 140
},
{
"epoch": 0.28,
"grad_norm": 5.851734161376953,
"learning_rate": 1.3966480446927374e-05,
"loss": 0.396,
"step": 150
},
{
"epoch": 0.3,
"grad_norm": 7.224986553192139,
"learning_rate": 1.48975791433892e-05,
"loss": 0.3909,
"step": 160
},
{
"epoch": 0.32,
"grad_norm": 8.457038879394531,
"learning_rate": 1.5828677839851026e-05,
"loss": 0.3542,
"step": 170
},
{
"epoch": 0.34,
"grad_norm": 6.494715690612793,
"learning_rate": 1.675977653631285e-05,
"loss": 0.386,
"step": 180
},
{
"epoch": 0.35,
"grad_norm": 6.016439914703369,
"learning_rate": 1.7690875232774675e-05,
"loss": 0.3678,
"step": 190
},
{
"epoch": 0.37,
"grad_norm": 7.461198329925537,
"learning_rate": 1.86219739292365e-05,
"loss": 0.3559,
"step": 200
},
{
"epoch": 0.39,
"grad_norm": 9.391804695129395,
"learning_rate": 1.9553072625698323e-05,
"loss": 0.3505,
"step": 210
},
{
"epoch": 0.41,
"grad_norm": 7.332585334777832,
"learning_rate": 2.0484171322160152e-05,
"loss": 0.3562,
"step": 220
},
{
"epoch": 0.43,
"grad_norm": 6.92149019241333,
"learning_rate": 2.1415270018621975e-05,
"loss": 0.3258,
"step": 230
},
{
"epoch": 0.45,
"grad_norm": 7.061047554016113,
"learning_rate": 2.2346368715083797e-05,
"loss": 0.3243,
"step": 240
},
{
"epoch": 0.47,
"grad_norm": 8.112573623657227,
"learning_rate": 2.3277467411545626e-05,
"loss": 0.3315,
"step": 250
},
{
"epoch": 0.48,
"grad_norm": 5.698765277862549,
"learning_rate": 2.420856610800745e-05,
"loss": 0.3337,
"step": 260
},
{
"epoch": 0.5,
"grad_norm": 11.131732940673828,
"learning_rate": 2.5139664804469275e-05,
"loss": 0.2954,
"step": 270
},
{
"epoch": 0.52,
"grad_norm": 5.557519912719727,
"learning_rate": 2.60707635009311e-05,
"loss": 0.2806,
"step": 280
},
{
"epoch": 0.54,
"grad_norm": 7.42105770111084,
"learning_rate": 2.7001862197392923e-05,
"loss": 0.3018,
"step": 290
},
{
"epoch": 0.56,
"grad_norm": 5.84682035446167,
"learning_rate": 2.793296089385475e-05,
"loss": 0.2983,
"step": 300
},
{
"epoch": 0.58,
"grad_norm": 12.523080825805664,
"learning_rate": 2.886405959031657e-05,
"loss": 0.2942,
"step": 310
},
{
"epoch": 0.6,
"grad_norm": 10.046136856079102,
"learning_rate": 2.97951582867784e-05,
"loss": 0.3163,
"step": 320
},
{
"epoch": 0.61,
"grad_norm": 8.785089492797852,
"learning_rate": 3.0726256983240227e-05,
"loss": 0.2803,
"step": 330
},
{
"epoch": 0.63,
"grad_norm": 6.233649253845215,
"learning_rate": 3.165735567970205e-05,
"loss": 0.2968,
"step": 340
},
{
"epoch": 0.65,
"grad_norm": 6.553578853607178,
"learning_rate": 3.258845437616387e-05,
"loss": 0.2786,
"step": 350
},
{
"epoch": 0.67,
"grad_norm": 15.390639305114746,
"learning_rate": 3.35195530726257e-05,
"loss": 0.2665,
"step": 360
},
{
"epoch": 0.69,
"grad_norm": 7.992424964904785,
"learning_rate": 3.445065176908753e-05,
"loss": 0.2693,
"step": 370
},
{
"epoch": 0.71,
"grad_norm": 9.531819343566895,
"learning_rate": 3.538175046554935e-05,
"loss": 0.2914,
"step": 380
},
{
"epoch": 0.73,
"grad_norm": 8.762212753295898,
"learning_rate": 3.6312849162011175e-05,
"loss": 0.2673,
"step": 390
},
{
"epoch": 0.74,
"grad_norm": 11.937532424926758,
"learning_rate": 3.7243947858473e-05,
"loss": 0.2846,
"step": 400
},
{
"epoch": 0.76,
"grad_norm": 7.173421382904053,
"learning_rate": 3.817504655493483e-05,
"loss": 0.3207,
"step": 410
},
{
"epoch": 0.78,
"grad_norm": 10.6701021194458,
"learning_rate": 3.9106145251396646e-05,
"loss": 0.285,
"step": 420
},
{
"epoch": 0.8,
"grad_norm": 9.541986465454102,
"learning_rate": 4.003724394785848e-05,
"loss": 0.2684,
"step": 430
},
{
"epoch": 0.82,
"grad_norm": 7.715641975402832,
"learning_rate": 4.0968342644320304e-05,
"loss": 0.258,
"step": 440
},
{
"epoch": 0.84,
"grad_norm": 5.0808491706848145,
"learning_rate": 4.1899441340782123e-05,
"loss": 0.2539,
"step": 450
},
{
"epoch": 0.86,
"grad_norm": 7.925539016723633,
"learning_rate": 4.283054003724395e-05,
"loss": 0.2571,
"step": 460
},
{
"epoch": 0.88,
"grad_norm": 5.518635272979736,
"learning_rate": 4.3761638733705775e-05,
"loss": 0.2775,
"step": 470
},
{
"epoch": 0.89,
"grad_norm": 11.271161079406738,
"learning_rate": 4.4692737430167594e-05,
"loss": 0.2619,
"step": 480
},
{
"epoch": 0.91,
"grad_norm": 9.803050994873047,
"learning_rate": 4.562383612662943e-05,
"loss": 0.2623,
"step": 490
},
{
"epoch": 0.93,
"grad_norm": 6.506972312927246,
"learning_rate": 4.655493482309125e-05,
"loss": 0.278,
"step": 500
},
{
"epoch": 0.95,
"grad_norm": 9.093548774719238,
"learning_rate": 4.748603351955307e-05,
"loss": 0.2694,
"step": 510
},
{
"epoch": 0.97,
"grad_norm": 4.879490852355957,
"learning_rate": 4.84171322160149e-05,
"loss": 0.2487,
"step": 520
},
{
"epoch": 0.99,
"grad_norm": 12.4769926071167,
"learning_rate": 4.9348230912476724e-05,
"loss": 0.2428,
"step": 530
},
{
"epoch": 1.0,
"eval_accuracy": 0.9301944862811866,
"eval_loss": 0.17676062881946564,
"eval_runtime": 76.2927,
"eval_samples_per_second": 200.163,
"eval_steps_per_second": 3.133,
"step": 537
},
{
"epoch": 1.01,
"grad_norm": 8.015608787536621,
"learning_rate": 4.99689633767846e-05,
"loss": 0.2571,
"step": 540
},
{
"epoch": 1.02,
"grad_norm": 4.525390625,
"learning_rate": 4.986550796606663e-05,
"loss": 0.2506,
"step": 550
},
{
"epoch": 1.04,
"grad_norm": 5.759488105773926,
"learning_rate": 4.9762052555348645e-05,
"loss": 0.2415,
"step": 560
},
{
"epoch": 1.06,
"grad_norm": 7.032402992248535,
"learning_rate": 4.965859714463066e-05,
"loss": 0.2397,
"step": 570
},
{
"epoch": 1.08,
"grad_norm": 7.004304885864258,
"learning_rate": 4.955514173391269e-05,
"loss": 0.237,
"step": 580
},
{
"epoch": 1.1,
"grad_norm": 5.160691261291504,
"learning_rate": 4.9451686323194706e-05,
"loss": 0.2404,
"step": 590
},
{
"epoch": 1.12,
"grad_norm": 3.625582218170166,
"learning_rate": 4.9348230912476724e-05,
"loss": 0.2526,
"step": 600
},
{
"epoch": 1.14,
"grad_norm": 6.990879058837891,
"learning_rate": 4.924477550175874e-05,
"loss": 0.2412,
"step": 610
},
{
"epoch": 1.15,
"grad_norm": 5.855559825897217,
"learning_rate": 4.9141320091040766e-05,
"loss": 0.2289,
"step": 620
},
{
"epoch": 1.17,
"grad_norm": 3.9337453842163086,
"learning_rate": 4.9037864680322784e-05,
"loss": 0.2367,
"step": 630
},
{
"epoch": 1.19,
"grad_norm": 5.882688999176025,
"learning_rate": 4.89344092696048e-05,
"loss": 0.2166,
"step": 640
},
{
"epoch": 1.21,
"grad_norm": 5.7442779541015625,
"learning_rate": 4.8830953858886826e-05,
"loss": 0.2439,
"step": 650
},
{
"epoch": 1.23,
"grad_norm": 6.9317755699157715,
"learning_rate": 4.872749844816884e-05,
"loss": 0.246,
"step": 660
},
{
"epoch": 1.25,
"grad_norm": 5.778949737548828,
"learning_rate": 4.862404303745086e-05,
"loss": 0.2377,
"step": 670
},
{
"epoch": 1.27,
"grad_norm": 3.596724271774292,
"learning_rate": 4.852058762673288e-05,
"loss": 0.2367,
"step": 680
},
{
"epoch": 1.28,
"grad_norm": 5.4918670654296875,
"learning_rate": 4.84171322160149e-05,
"loss": 0.2199,
"step": 690
},
{
"epoch": 1.3,
"grad_norm": 5.873249530792236,
"learning_rate": 4.831367680529692e-05,
"loss": 0.2051,
"step": 700
},
{
"epoch": 1.32,
"grad_norm": 5.162601947784424,
"learning_rate": 4.8210221394578933e-05,
"loss": 0.2048,
"step": 710
},
{
"epoch": 1.34,
"grad_norm": 7.136524200439453,
"learning_rate": 4.810676598386096e-05,
"loss": 0.2388,
"step": 720
},
{
"epoch": 1.36,
"grad_norm": 5.694529056549072,
"learning_rate": 4.8003310573142976e-05,
"loss": 0.2468,
"step": 730
},
{
"epoch": 1.38,
"grad_norm": 8.343286514282227,
"learning_rate": 4.7899855162424994e-05,
"loss": 0.2166,
"step": 740
},
{
"epoch": 1.4,
"grad_norm": 5.000110626220703,
"learning_rate": 4.779639975170702e-05,
"loss": 0.1935,
"step": 750
},
{
"epoch": 1.42,
"grad_norm": 4.729160785675049,
"learning_rate": 4.7692944340989036e-05,
"loss": 0.2,
"step": 760
},
{
"epoch": 1.43,
"grad_norm": 3.7154366970062256,
"learning_rate": 4.7589488930271054e-05,
"loss": 0.1998,
"step": 770
},
{
"epoch": 1.45,
"grad_norm": 7.739311695098877,
"learning_rate": 4.748603351955307e-05,
"loss": 0.218,
"step": 780
},
{
"epoch": 1.47,
"grad_norm": 9.706314086914062,
"learning_rate": 4.7382578108835096e-05,
"loss": 0.1978,
"step": 790
},
{
"epoch": 1.49,
"grad_norm": 4.060732841491699,
"learning_rate": 4.7279122698117114e-05,
"loss": 0.2043,
"step": 800
},
{
"epoch": 1.51,
"grad_norm": 6.549215793609619,
"learning_rate": 4.717566728739913e-05,
"loss": 0.2051,
"step": 810
},
{
"epoch": 1.53,
"grad_norm": 4.586370468139648,
"learning_rate": 4.707221187668116e-05,
"loss": 0.2025,
"step": 820
},
{
"epoch": 1.55,
"grad_norm": 7.645646572113037,
"learning_rate": 4.696875646596317e-05,
"loss": 0.2121,
"step": 830
},
{
"epoch": 1.56,
"grad_norm": 6.611913681030273,
"learning_rate": 4.686530105524519e-05,
"loss": 0.2095,
"step": 840
},
{
"epoch": 1.58,
"grad_norm": 5.388148307800293,
"learning_rate": 4.676184564452721e-05,
"loss": 0.227,
"step": 850
},
{
"epoch": 1.6,
"grad_norm": 6.008878231048584,
"learning_rate": 4.665839023380923e-05,
"loss": 0.1992,
"step": 860
},
{
"epoch": 1.62,
"grad_norm": 4.902502536773682,
"learning_rate": 4.655493482309125e-05,
"loss": 0.2126,
"step": 870
},
{
"epoch": 1.64,
"grad_norm": 4.861505508422852,
"learning_rate": 4.6451479412373264e-05,
"loss": 0.1905,
"step": 880
},
{
"epoch": 1.66,
"grad_norm": 3.8657066822052,
"learning_rate": 4.634802400165529e-05,
"loss": 0.2017,
"step": 890
},
{
"epoch": 1.68,
"grad_norm": 3.623135805130005,
"learning_rate": 4.6244568590937306e-05,
"loss": 0.184,
"step": 900
},
{
"epoch": 1.69,
"grad_norm": 5.032660961151123,
"learning_rate": 4.6141113180219324e-05,
"loss": 0.2036,
"step": 910
},
{
"epoch": 1.71,
"grad_norm": 5.572585105895996,
"learning_rate": 4.603765776950135e-05,
"loss": 0.2066,
"step": 920
},
{
"epoch": 1.73,
"grad_norm": 5.036092281341553,
"learning_rate": 4.5934202358783367e-05,
"loss": 0.2003,
"step": 930
},
{
"epoch": 1.75,
"grad_norm": 3.459541082382202,
"learning_rate": 4.5830746948065384e-05,
"loss": 0.2143,
"step": 940
},
{
"epoch": 1.77,
"grad_norm": 3.976844072341919,
"learning_rate": 4.57272915373474e-05,
"loss": 0.18,
"step": 950
},
{
"epoch": 1.79,
"grad_norm": 4.34367036819458,
"learning_rate": 4.562383612662943e-05,
"loss": 0.2025,
"step": 960
},
{
"epoch": 1.81,
"grad_norm": 4.163506031036377,
"learning_rate": 4.5520380715911445e-05,
"loss": 0.1976,
"step": 970
},
{
"epoch": 1.82,
"grad_norm": 5.492095947265625,
"learning_rate": 4.541692530519346e-05,
"loss": 0.2001,
"step": 980
},
{
"epoch": 1.84,
"grad_norm": 5.207737445831299,
"learning_rate": 4.531346989447549e-05,
"loss": 0.2054,
"step": 990
},
{
"epoch": 1.86,
"grad_norm": 6.426153659820557,
"learning_rate": 4.52100144837575e-05,
"loss": 0.1785,
"step": 1000
},
{
"epoch": 1.88,
"grad_norm": 4.854528903961182,
"learning_rate": 4.510655907303952e-05,
"loss": 0.1873,
"step": 1010
},
{
"epoch": 1.9,
"grad_norm": 5.651273727416992,
"learning_rate": 4.500310366232154e-05,
"loss": 0.2056,
"step": 1020
},
{
"epoch": 1.92,
"grad_norm": 4.955221652984619,
"learning_rate": 4.489964825160356e-05,
"loss": 0.1934,
"step": 1030
},
{
"epoch": 1.94,
"grad_norm": 4.257241725921631,
"learning_rate": 4.479619284088558e-05,
"loss": 0.1972,
"step": 1040
},
{
"epoch": 1.96,
"grad_norm": 3.3766889572143555,
"learning_rate": 4.4692737430167594e-05,
"loss": 0.1789,
"step": 1050
},
{
"epoch": 1.97,
"grad_norm": 4.304858684539795,
"learning_rate": 4.458928201944962e-05,
"loss": 0.1899,
"step": 1060
},
{
"epoch": 1.99,
"grad_norm": 3.6677396297454834,
"learning_rate": 4.448582660873164e-05,
"loss": 0.1877,
"step": 1070
},
{
"epoch": 2.0,
"eval_accuracy": 0.9550782528976491,
"eval_loss": 0.11414149403572083,
"eval_runtime": 76.7154,
"eval_samples_per_second": 199.061,
"eval_steps_per_second": 3.115,
"step": 1074
},
{
"epoch": 2.01,
"grad_norm": 2.929933786392212,
"learning_rate": 4.4382371198013655e-05,
"loss": 0.1798,
"step": 1080
},
{
"epoch": 2.03,
"grad_norm": 4.865507125854492,
"learning_rate": 4.427891578729568e-05,
"loss": 0.1895,
"step": 1090
},
{
"epoch": 2.05,
"grad_norm": 5.74074125289917,
"learning_rate": 4.41754603765777e-05,
"loss": 0.1879,
"step": 1100
},
{
"epoch": 2.07,
"grad_norm": 3.846959114074707,
"learning_rate": 4.4072004965859715e-05,
"loss": 0.1843,
"step": 1110
},
{
"epoch": 2.09,
"grad_norm": 3.7884573936462402,
"learning_rate": 4.396854955514173e-05,
"loss": 0.1654,
"step": 1120
},
{
"epoch": 2.1,
"grad_norm": 4.030633449554443,
"learning_rate": 4.386509414442376e-05,
"loss": 0.1754,
"step": 1130
},
{
"epoch": 2.12,
"grad_norm": 4.759024620056152,
"learning_rate": 4.3761638733705775e-05,
"loss": 0.1804,
"step": 1140
},
{
"epoch": 2.14,
"grad_norm": 5.641456604003906,
"learning_rate": 4.365818332298779e-05,
"loss": 0.1822,
"step": 1150
},
{
"epoch": 2.16,
"grad_norm": 6.424627780914307,
"learning_rate": 4.355472791226982e-05,
"loss": 0.1904,
"step": 1160
},
{
"epoch": 2.18,
"grad_norm": 3.8068325519561768,
"learning_rate": 4.345127250155183e-05,
"loss": 0.1623,
"step": 1170
},
{
"epoch": 2.2,
"grad_norm": 8.320751190185547,
"learning_rate": 4.334781709083385e-05,
"loss": 0.171,
"step": 1180
},
{
"epoch": 2.22,
"grad_norm": 3.5664634704589844,
"learning_rate": 4.324436168011588e-05,
"loss": 0.1674,
"step": 1190
},
{
"epoch": 2.23,
"grad_norm": 5.247286319732666,
"learning_rate": 4.314090626939789e-05,
"loss": 0.1841,
"step": 1200
},
{
"epoch": 2.25,
"grad_norm": 4.270170211791992,
"learning_rate": 4.3037450858679914e-05,
"loss": 0.1767,
"step": 1210
},
{
"epoch": 2.27,
"grad_norm": 3.8757407665252686,
"learning_rate": 4.2933995447961925e-05,
"loss": 0.1763,
"step": 1220
},
{
"epoch": 2.29,
"grad_norm": 3.8913145065307617,
"learning_rate": 4.283054003724395e-05,
"loss": 0.1938,
"step": 1230
},
{
"epoch": 2.31,
"grad_norm": 5.493150234222412,
"learning_rate": 4.2727084626525974e-05,
"loss": 0.1815,
"step": 1240
},
{
"epoch": 2.33,
"grad_norm": 3.1902143955230713,
"learning_rate": 4.2623629215807985e-05,
"loss": 0.1666,
"step": 1250
},
{
"epoch": 2.35,
"grad_norm": 4.181407928466797,
"learning_rate": 4.252017380509001e-05,
"loss": 0.1856,
"step": 1260
},
{
"epoch": 2.36,
"grad_norm": 3.092036724090576,
"learning_rate": 4.241671839437203e-05,
"loss": 0.1582,
"step": 1270
},
{
"epoch": 2.38,
"grad_norm": 4.973979949951172,
"learning_rate": 4.2313262983654045e-05,
"loss": 0.2007,
"step": 1280
},
{
"epoch": 2.4,
"grad_norm": 3.259059429168701,
"learning_rate": 4.220980757293606e-05,
"loss": 0.1729,
"step": 1290
},
{
"epoch": 2.42,
"grad_norm": 3.7358877658843994,
"learning_rate": 4.210635216221809e-05,
"loss": 0.1996,
"step": 1300
},
{
"epoch": 2.44,
"grad_norm": 4.496426582336426,
"learning_rate": 4.2002896751500106e-05,
"loss": 0.1711,
"step": 1310
},
{
"epoch": 2.46,
"grad_norm": 4.290408611297607,
"learning_rate": 4.1899441340782123e-05,
"loss": 0.1773,
"step": 1320
},
{
"epoch": 2.48,
"grad_norm": 5.771693229675293,
"learning_rate": 4.179598593006415e-05,
"loss": 0.1777,
"step": 1330
},
{
"epoch": 2.5,
"grad_norm": 4.477756500244141,
"learning_rate": 4.169253051934616e-05,
"loss": 0.1731,
"step": 1340
},
{
"epoch": 2.51,
"grad_norm": 4.2773895263671875,
"learning_rate": 4.1589075108628184e-05,
"loss": 0.185,
"step": 1350
},
{
"epoch": 2.53,
"grad_norm": 5.028202533721924,
"learning_rate": 4.148561969791021e-05,
"loss": 0.1729,
"step": 1360
},
{
"epoch": 2.55,
"grad_norm": 5.293006896972656,
"learning_rate": 4.138216428719222e-05,
"loss": 0.1714,
"step": 1370
},
{
"epoch": 2.57,
"grad_norm": 5.110383987426758,
"learning_rate": 4.1278708876474244e-05,
"loss": 0.1697,
"step": 1380
},
{
"epoch": 2.59,
"grad_norm": 4.646900653839111,
"learning_rate": 4.1175253465756255e-05,
"loss": 0.189,
"step": 1390
},
{
"epoch": 2.61,
"grad_norm": 3.653062343597412,
"learning_rate": 4.107179805503828e-05,
"loss": 0.1619,
"step": 1400
},
{
"epoch": 2.63,
"grad_norm": 4.054281711578369,
"learning_rate": 4.0968342644320304e-05,
"loss": 0.1767,
"step": 1410
},
{
"epoch": 2.64,
"grad_norm": 2.8915677070617676,
"learning_rate": 4.0864887233602315e-05,
"loss": 0.1565,
"step": 1420
},
{
"epoch": 2.66,
"grad_norm": 6.253181457519531,
"learning_rate": 4.076143182288434e-05,
"loss": 0.1844,
"step": 1430
},
{
"epoch": 2.68,
"grad_norm": 4.847060680389404,
"learning_rate": 4.065797641216636e-05,
"loss": 0.1653,
"step": 1440
},
{
"epoch": 2.7,
"grad_norm": 6.430269718170166,
"learning_rate": 4.0554521001448376e-05,
"loss": 0.1701,
"step": 1450
},
{
"epoch": 2.72,
"grad_norm": 8.237833023071289,
"learning_rate": 4.04510655907304e-05,
"loss": 0.1612,
"step": 1460
},
{
"epoch": 2.74,
"grad_norm": 4.337214946746826,
"learning_rate": 4.034761018001242e-05,
"loss": 0.1699,
"step": 1470
},
{
"epoch": 2.76,
"grad_norm": 2.5487866401672363,
"learning_rate": 4.0244154769294436e-05,
"loss": 0.1658,
"step": 1480
},
{
"epoch": 2.77,
"grad_norm": 5.610079765319824,
"learning_rate": 4.0140699358576454e-05,
"loss": 0.1591,
"step": 1490
},
{
"epoch": 2.79,
"grad_norm": 3.776702642440796,
"learning_rate": 4.003724394785848e-05,
"loss": 0.1542,
"step": 1500
},
{
"epoch": 2.81,
"grad_norm": 7.976995944976807,
"learning_rate": 3.9933788537140496e-05,
"loss": 0.1641,
"step": 1510
},
{
"epoch": 2.83,
"grad_norm": 3.6274945735931396,
"learning_rate": 3.9830333126422514e-05,
"loss": 0.1654,
"step": 1520
},
{
"epoch": 2.85,
"grad_norm": 4.607425212860107,
"learning_rate": 3.972687771570454e-05,
"loss": 0.1819,
"step": 1530
},
{
"epoch": 2.87,
"grad_norm": 3.462461471557617,
"learning_rate": 3.962342230498655e-05,
"loss": 0.1622,
"step": 1540
},
{
"epoch": 2.89,
"grad_norm": 5.016781330108643,
"learning_rate": 3.9519966894268574e-05,
"loss": 0.1422,
"step": 1550
},
{
"epoch": 2.91,
"grad_norm": 5.226962566375732,
"learning_rate": 3.9416511483550586e-05,
"loss": 0.1747,
"step": 1560
},
{
"epoch": 2.92,
"grad_norm": 4.826117038726807,
"learning_rate": 3.931305607283261e-05,
"loss": 0.1582,
"step": 1570
},
{
"epoch": 2.94,
"grad_norm": 5.275283336639404,
"learning_rate": 3.9209600662114635e-05,
"loss": 0.1597,
"step": 1580
},
{
"epoch": 2.96,
"grad_norm": 4.76600980758667,
"learning_rate": 3.9106145251396646e-05,
"loss": 0.1685,
"step": 1590
},
{
"epoch": 2.98,
"grad_norm": 4.5946946144104,
"learning_rate": 3.900268984067867e-05,
"loss": 0.1431,
"step": 1600
},
{
"epoch": 3.0,
"grad_norm": 3.286769151687622,
"learning_rate": 3.889923442996069e-05,
"loss": 0.1574,
"step": 1610
},
{
"epoch": 3.0,
"eval_accuracy": 0.9461070001964508,
"eval_loss": 0.13586224615573883,
"eval_runtime": 75.9342,
"eval_samples_per_second": 201.108,
"eval_steps_per_second": 3.147,
"step": 1611
},
{
"epoch": 3.02,
"grad_norm": 4.025867938995361,
"learning_rate": 3.8795779019242706e-05,
"loss": 0.1492,
"step": 1620
},
{
"epoch": 3.04,
"grad_norm": 5.146902084350586,
"learning_rate": 3.869232360852473e-05,
"loss": 0.1639,
"step": 1630
},
{
"epoch": 3.05,
"grad_norm": 4.2595906257629395,
"learning_rate": 3.858886819780675e-05,
"loss": 0.1614,
"step": 1640
},
{
"epoch": 3.07,
"grad_norm": 3.0910167694091797,
"learning_rate": 3.8485412787088766e-05,
"loss": 0.1676,
"step": 1650
},
{
"epoch": 3.09,
"grad_norm": 2.9316298961639404,
"learning_rate": 3.8381957376370784e-05,
"loss": 0.1585,
"step": 1660
},
{
"epoch": 3.11,
"grad_norm": 6.207183361053467,
"learning_rate": 3.827850196565281e-05,
"loss": 0.1454,
"step": 1670
},
{
"epoch": 3.13,
"grad_norm": 6.1695170402526855,
"learning_rate": 3.817504655493483e-05,
"loss": 0.156,
"step": 1680
},
{
"epoch": 3.15,
"grad_norm": 4.578737258911133,
"learning_rate": 3.8071591144216845e-05,
"loss": 0.1499,
"step": 1690
},
{
"epoch": 3.17,
"grad_norm": 3.303900957107544,
"learning_rate": 3.796813573349887e-05,
"loss": 0.163,
"step": 1700
},
{
"epoch": 3.18,
"grad_norm": 4.704057216644287,
"learning_rate": 3.786468032278088e-05,
"loss": 0.1595,
"step": 1710
},
{
"epoch": 3.2,
"grad_norm": 6.2445068359375,
"learning_rate": 3.7761224912062905e-05,
"loss": 0.1547,
"step": 1720
},
{
"epoch": 3.22,
"grad_norm": 4.816792964935303,
"learning_rate": 3.765776950134492e-05,
"loss": 0.1509,
"step": 1730
},
{
"epoch": 3.24,
"grad_norm": 3.5534775257110596,
"learning_rate": 3.755431409062694e-05,
"loss": 0.1504,
"step": 1740
},
{
"epoch": 3.26,
"grad_norm": 3.065899133682251,
"learning_rate": 3.7450858679908965e-05,
"loss": 0.1491,
"step": 1750
},
{
"epoch": 3.28,
"grad_norm": 5.379620552062988,
"learning_rate": 3.7347403269190976e-05,
"loss": 0.1605,
"step": 1760
},
{
"epoch": 3.3,
"grad_norm": 3.993555784225464,
"learning_rate": 3.7243947858473e-05,
"loss": 0.1516,
"step": 1770
},
{
"epoch": 3.31,
"grad_norm": 3.0113396644592285,
"learning_rate": 3.714049244775502e-05,
"loss": 0.1487,
"step": 1780
},
{
"epoch": 3.33,
"grad_norm": 8.80622673034668,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.1587,
"step": 1790
},
{
"epoch": 3.35,
"grad_norm": 3.0601558685302734,
"learning_rate": 3.693358162631906e-05,
"loss": 0.1523,
"step": 1800
},
{
"epoch": 3.37,
"grad_norm": 3.8811893463134766,
"learning_rate": 3.683012621560108e-05,
"loss": 0.1665,
"step": 1810
},
{
"epoch": 3.39,
"grad_norm": 5.455690860748291,
"learning_rate": 3.67266708048831e-05,
"loss": 0.1458,
"step": 1820
},
{
"epoch": 3.41,
"grad_norm": 2.9093761444091797,
"learning_rate": 3.6623215394165115e-05,
"loss": 0.1524,
"step": 1830
},
{
"epoch": 3.43,
"grad_norm": 5.275539398193359,
"learning_rate": 3.651975998344714e-05,
"loss": 0.1551,
"step": 1840
},
{
"epoch": 3.45,
"grad_norm": 3.3126418590545654,
"learning_rate": 3.641630457272916e-05,
"loss": 0.1532,
"step": 1850
},
{
"epoch": 3.46,
"grad_norm": 4.406727313995361,
"learning_rate": 3.6312849162011175e-05,
"loss": 0.1535,
"step": 1860
},
{
"epoch": 3.48,
"grad_norm": 2.409686803817749,
"learning_rate": 3.62093937512932e-05,
"loss": 0.1617,
"step": 1870
},
{
"epoch": 3.5,
"grad_norm": 3.572361946105957,
"learning_rate": 3.610593834057521e-05,
"loss": 0.1466,
"step": 1880
},
{
"epoch": 3.52,
"grad_norm": 4.148622035980225,
"learning_rate": 3.6002482929857235e-05,
"loss": 0.1473,
"step": 1890
},
{
"epoch": 3.54,
"grad_norm": 2.6833081245422363,
"learning_rate": 3.589902751913925e-05,
"loss": 0.1533,
"step": 1900
},
{
"epoch": 3.56,
"grad_norm": 4.358566761016846,
"learning_rate": 3.579557210842127e-05,
"loss": 0.159,
"step": 1910
},
{
"epoch": 3.58,
"grad_norm": 2.758660078048706,
"learning_rate": 3.5692116697703296e-05,
"loss": 0.152,
"step": 1920
},
{
"epoch": 3.59,
"grad_norm": 6.929190158843994,
"learning_rate": 3.558866128698531e-05,
"loss": 0.1512,
"step": 1930
},
{
"epoch": 3.61,
"grad_norm": 3.7686049938201904,
"learning_rate": 3.548520587626733e-05,
"loss": 0.1507,
"step": 1940
},
{
"epoch": 3.63,
"grad_norm": 3.968449592590332,
"learning_rate": 3.538175046554935e-05,
"loss": 0.1463,
"step": 1950
},
{
"epoch": 3.65,
"grad_norm": 3.7159385681152344,
"learning_rate": 3.527829505483137e-05,
"loss": 0.1545,
"step": 1960
},
{
"epoch": 3.67,
"grad_norm": 7.977210521697998,
"learning_rate": 3.517483964411339e-05,
"loss": 0.1478,
"step": 1970
},
{
"epoch": 3.69,
"grad_norm": 4.873678207397461,
"learning_rate": 3.507138423339541e-05,
"loss": 0.145,
"step": 1980
},
{
"epoch": 3.71,
"grad_norm": 5.451579570770264,
"learning_rate": 3.496792882267743e-05,
"loss": 0.1484,
"step": 1990
},
{
"epoch": 3.72,
"grad_norm": 3.9063429832458496,
"learning_rate": 3.4864473411959445e-05,
"loss": 0.1453,
"step": 2000
},
{
"epoch": 3.74,
"grad_norm": 3.2299928665161133,
"learning_rate": 3.476101800124147e-05,
"loss": 0.1546,
"step": 2010
},
{
"epoch": 3.76,
"grad_norm": 4.653662204742432,
"learning_rate": 3.465756259052349e-05,
"loss": 0.1419,
"step": 2020
},
{
"epoch": 3.78,
"grad_norm": 3.974182367324829,
"learning_rate": 3.4554107179805505e-05,
"loss": 0.1388,
"step": 2030
},
{
"epoch": 3.8,
"grad_norm": 5.028197765350342,
"learning_rate": 3.445065176908753e-05,
"loss": 0.1664,
"step": 2040
},
{
"epoch": 3.82,
"grad_norm": 4.503687381744385,
"learning_rate": 3.434719635836954e-05,
"loss": 0.1506,
"step": 2050
},
{
"epoch": 3.84,
"grad_norm": 3.6866469383239746,
"learning_rate": 3.4243740947651566e-05,
"loss": 0.1351,
"step": 2060
},
{
"epoch": 3.85,
"grad_norm": 2.9063117504119873,
"learning_rate": 3.4140285536933584e-05,
"loss": 0.1427,
"step": 2070
},
{
"epoch": 3.87,
"grad_norm": 4.720154762268066,
"learning_rate": 3.40368301262156e-05,
"loss": 0.1422,
"step": 2080
},
{
"epoch": 3.89,
"grad_norm": 4.307085990905762,
"learning_rate": 3.3933374715497626e-05,
"loss": 0.1442,
"step": 2090
},
{
"epoch": 3.91,
"grad_norm": 3.0267748832702637,
"learning_rate": 3.382991930477964e-05,
"loss": 0.1479,
"step": 2100
},
{
"epoch": 3.93,
"grad_norm": 4.804783344268799,
"learning_rate": 3.372646389406166e-05,
"loss": 0.1515,
"step": 2110
},
{
"epoch": 3.95,
"grad_norm": 6.450136184692383,
"learning_rate": 3.362300848334368e-05,
"loss": 0.1523,
"step": 2120
},
{
"epoch": 3.97,
"grad_norm": 4.83671236038208,
"learning_rate": 3.35195530726257e-05,
"loss": 0.1428,
"step": 2130
},
{
"epoch": 3.99,
"grad_norm": 3.104628324508667,
"learning_rate": 3.341609766190772e-05,
"loss": 0.1412,
"step": 2140
},
{
"epoch": 4.0,
"eval_accuracy": 0.9521969746578482,
"eval_loss": 0.12448973953723907,
"eval_runtime": 75.9779,
"eval_samples_per_second": 200.993,
"eval_steps_per_second": 3.146,
"step": 2148
},
{
"epoch": 4.0,
"grad_norm": 3.8630151748657227,
"learning_rate": 3.331264225118974e-05,
"loss": 0.1589,
"step": 2150
},
{
"epoch": 4.02,
"grad_norm": 4.394106388092041,
"learning_rate": 3.320918684047176e-05,
"loss": 0.1574,
"step": 2160
},
{
"epoch": 4.04,
"grad_norm": 3.3176400661468506,
"learning_rate": 3.3105731429753776e-05,
"loss": 0.1453,
"step": 2170
},
{
"epoch": 4.06,
"grad_norm": 3.7267205715179443,
"learning_rate": 3.30022760190358e-05,
"loss": 0.1386,
"step": 2180
},
{
"epoch": 4.08,
"grad_norm": 4.1653313636779785,
"learning_rate": 3.289882060831782e-05,
"loss": 0.1438,
"step": 2190
},
{
"epoch": 4.1,
"grad_norm": 3.0098297595977783,
"learning_rate": 3.2795365197599836e-05,
"loss": 0.1407,
"step": 2200
},
{
"epoch": 4.12,
"grad_norm": 4.614931106567383,
"learning_rate": 3.269190978688186e-05,
"loss": 0.1359,
"step": 2210
},
{
"epoch": 4.13,
"grad_norm": 2.9281229972839355,
"learning_rate": 3.258845437616387e-05,
"loss": 0.1457,
"step": 2220
},
{
"epoch": 4.15,
"grad_norm": 3.3955490589141846,
"learning_rate": 3.2484998965445896e-05,
"loss": 0.1272,
"step": 2230
},
{
"epoch": 4.17,
"grad_norm": 3.2423532009124756,
"learning_rate": 3.2381543554727914e-05,
"loss": 0.1287,
"step": 2240
},
{
"epoch": 4.19,
"grad_norm": 3.0419697761535645,
"learning_rate": 3.227808814400993e-05,
"loss": 0.1415,
"step": 2250
},
{
"epoch": 4.21,
"grad_norm": 3.408339738845825,
"learning_rate": 3.2174632733291956e-05,
"loss": 0.1391,
"step": 2260
},
{
"epoch": 4.23,
"grad_norm": 6.2242350578308105,
"learning_rate": 3.207117732257397e-05,
"loss": 0.1465,
"step": 2270
},
{
"epoch": 4.25,
"grad_norm": 2.413308620452881,
"learning_rate": 3.196772191185599e-05,
"loss": 0.1311,
"step": 2280
},
{
"epoch": 4.26,
"grad_norm": 4.887659549713135,
"learning_rate": 3.186426650113801e-05,
"loss": 0.1389,
"step": 2290
},
{
"epoch": 4.28,
"grad_norm": 3.2473926544189453,
"learning_rate": 3.176081109042003e-05,
"loss": 0.1502,
"step": 2300
},
{
"epoch": 4.3,
"grad_norm": 4.212912559509277,
"learning_rate": 3.165735567970205e-05,
"loss": 0.1313,
"step": 2310
},
{
"epoch": 4.32,
"grad_norm": 4.970630645751953,
"learning_rate": 3.155390026898407e-05,
"loss": 0.137,
"step": 2320
},
{
"epoch": 4.34,
"grad_norm": 2.669292688369751,
"learning_rate": 3.145044485826609e-05,
"loss": 0.1447,
"step": 2330
},
{
"epoch": 4.36,
"grad_norm": 4.9535980224609375,
"learning_rate": 3.1346989447548106e-05,
"loss": 0.1385,
"step": 2340
},
{
"epoch": 4.38,
"grad_norm": 4.139229774475098,
"learning_rate": 3.124353403683013e-05,
"loss": 0.1275,
"step": 2350
},
{
"epoch": 4.39,
"grad_norm": 5.44993782043457,
"learning_rate": 3.114007862611215e-05,
"loss": 0.1364,
"step": 2360
},
{
"epoch": 4.41,
"grad_norm": 2.844508171081543,
"learning_rate": 3.1036623215394166e-05,
"loss": 0.1359,
"step": 2370
},
{
"epoch": 4.43,
"grad_norm": 3.190946578979492,
"learning_rate": 3.093316780467619e-05,
"loss": 0.1318,
"step": 2380
},
{
"epoch": 4.45,
"grad_norm": 6.107606887817383,
"learning_rate": 3.08297123939582e-05,
"loss": 0.1169,
"step": 2390
},
{
"epoch": 4.47,
"grad_norm": 3.5856211185455322,
"learning_rate": 3.0726256983240227e-05,
"loss": 0.1481,
"step": 2400
},
{
"epoch": 4.49,
"grad_norm": 3.9483227729797363,
"learning_rate": 3.0622801572522244e-05,
"loss": 0.1397,
"step": 2410
},
{
"epoch": 4.51,
"grad_norm": 4.955249786376953,
"learning_rate": 3.051934616180426e-05,
"loss": 0.1427,
"step": 2420
},
{
"epoch": 4.53,
"grad_norm": 4.119729995727539,
"learning_rate": 3.0415890751086283e-05,
"loss": 0.1265,
"step": 2430
},
{
"epoch": 4.54,
"grad_norm": 3.9766499996185303,
"learning_rate": 3.03124353403683e-05,
"loss": 0.1184,
"step": 2440
},
{
"epoch": 4.56,
"grad_norm": 3.8164212703704834,
"learning_rate": 3.0208979929650323e-05,
"loss": 0.1247,
"step": 2450
},
{
"epoch": 4.58,
"grad_norm": 3.525179862976074,
"learning_rate": 3.0105524518932344e-05,
"loss": 0.1415,
"step": 2460
},
{
"epoch": 4.6,
"grad_norm": 3.0888924598693848,
"learning_rate": 3.0002069108214358e-05,
"loss": 0.1319,
"step": 2470
},
{
"epoch": 4.62,
"grad_norm": 4.304419040679932,
"learning_rate": 2.989861369749638e-05,
"loss": 0.1439,
"step": 2480
},
{
"epoch": 4.64,
"grad_norm": 8.166552543640137,
"learning_rate": 2.97951582867784e-05,
"loss": 0.1331,
"step": 2490
},
{
"epoch": 4.66,
"grad_norm": 2.1165900230407715,
"learning_rate": 2.969170287606042e-05,
"loss": 0.1406,
"step": 2500
},
{
"epoch": 4.67,
"grad_norm": 5.2040557861328125,
"learning_rate": 2.958824746534244e-05,
"loss": 0.1234,
"step": 2510
},
{
"epoch": 4.69,
"grad_norm": 3.8155689239501953,
"learning_rate": 2.948479205462446e-05,
"loss": 0.1159,
"step": 2520
},
{
"epoch": 4.71,
"grad_norm": 7.363270282745361,
"learning_rate": 2.9381336643906475e-05,
"loss": 0.1306,
"step": 2530
},
{
"epoch": 4.73,
"grad_norm": 4.376212120056152,
"learning_rate": 2.9277881233188497e-05,
"loss": 0.1352,
"step": 2540
},
{
"epoch": 4.75,
"grad_norm": 4.796603679656982,
"learning_rate": 2.9174425822470518e-05,
"loss": 0.1363,
"step": 2550
},
{
"epoch": 4.77,
"grad_norm": 3.4734084606170654,
"learning_rate": 2.9070970411752536e-05,
"loss": 0.127,
"step": 2560
},
{
"epoch": 4.79,
"grad_norm": 4.342591285705566,
"learning_rate": 2.8967515001034557e-05,
"loss": 0.1264,
"step": 2570
},
{
"epoch": 4.8,
"grad_norm": 4.9679856300354,
"learning_rate": 2.886405959031657e-05,
"loss": 0.1397,
"step": 2580
},
{
"epoch": 4.82,
"grad_norm": 3.1558454036712646,
"learning_rate": 2.8760604179598593e-05,
"loss": 0.1304,
"step": 2590
},
{
"epoch": 4.84,
"grad_norm": 4.197761535644531,
"learning_rate": 2.8657148768880614e-05,
"loss": 0.1192,
"step": 2600
},
{
"epoch": 4.86,
"grad_norm": 3.9379701614379883,
"learning_rate": 2.8553693358162632e-05,
"loss": 0.1355,
"step": 2610
},
{
"epoch": 4.88,
"grad_norm": 4.20279598236084,
"learning_rate": 2.8450237947444653e-05,
"loss": 0.1333,
"step": 2620
},
{
"epoch": 4.9,
"grad_norm": 5.210755348205566,
"learning_rate": 2.8346782536726674e-05,
"loss": 0.1239,
"step": 2630
},
{
"epoch": 4.92,
"grad_norm": 5.406430244445801,
"learning_rate": 2.824332712600869e-05,
"loss": 0.1469,
"step": 2640
},
{
"epoch": 4.93,
"grad_norm": 5.022087097167969,
"learning_rate": 2.813987171529071e-05,
"loss": 0.1207,
"step": 2650
},
{
"epoch": 4.95,
"grad_norm": 3.3133649826049805,
"learning_rate": 2.803641630457273e-05,
"loss": 0.1271,
"step": 2660
},
{
"epoch": 4.97,
"grad_norm": 3.654719591140747,
"learning_rate": 2.793296089385475e-05,
"loss": 0.1287,
"step": 2670
},
{
"epoch": 4.99,
"grad_norm": 4.803737640380859,
"learning_rate": 2.782950548313677e-05,
"loss": 0.1289,
"step": 2680
},
{
"epoch": 5.0,
"eval_accuracy": 0.9704668980420404,
"eval_loss": 0.07738856226205826,
"eval_runtime": 76.5353,
"eval_samples_per_second": 199.529,
"eval_steps_per_second": 3.123,
"step": 2685
},
{
"epoch": 5.01,
"grad_norm": 4.131795406341553,
"learning_rate": 2.772605007241879e-05,
"loss": 0.1206,
"step": 2690
},
{
"epoch": 5.03,
"grad_norm": 3.679658889770508,
"learning_rate": 2.7622594661700806e-05,
"loss": 0.1291,
"step": 2700
},
{
"epoch": 5.05,
"grad_norm": 3.8007965087890625,
"learning_rate": 2.7519139250982827e-05,
"loss": 0.1119,
"step": 2710
},
{
"epoch": 5.07,
"grad_norm": 4.17035436630249,
"learning_rate": 2.741568384026485e-05,
"loss": 0.1317,
"step": 2720
},
{
"epoch": 5.08,
"grad_norm": 3.355526924133301,
"learning_rate": 2.7312228429546866e-05,
"loss": 0.1381,
"step": 2730
},
{
"epoch": 5.1,
"grad_norm": 5.8981547355651855,
"learning_rate": 2.7208773018828887e-05,
"loss": 0.1296,
"step": 2740
},
{
"epoch": 5.12,
"grad_norm": 2.47714900970459,
"learning_rate": 2.7105317608110902e-05,
"loss": 0.131,
"step": 2750
},
{
"epoch": 5.14,
"grad_norm": 4.262291431427002,
"learning_rate": 2.7001862197392923e-05,
"loss": 0.1357,
"step": 2760
},
{
"epoch": 5.16,
"grad_norm": 4.63747501373291,
"learning_rate": 2.6898406786674944e-05,
"loss": 0.1171,
"step": 2770
},
{
"epoch": 5.18,
"grad_norm": 3.2632124423980713,
"learning_rate": 2.6794951375956962e-05,
"loss": 0.1307,
"step": 2780
},
{
"epoch": 5.2,
"grad_norm": 4.751256942749023,
"learning_rate": 2.6691495965238983e-05,
"loss": 0.112,
"step": 2790
},
{
"epoch": 5.21,
"grad_norm": 7.088289737701416,
"learning_rate": 2.6588040554521005e-05,
"loss": 0.1389,
"step": 2800
},
{
"epoch": 5.23,
"grad_norm": 2.923245906829834,
"learning_rate": 2.648458514380302e-05,
"loss": 0.133,
"step": 2810
},
{
"epoch": 5.25,
"grad_norm": 3.6290907859802246,
"learning_rate": 2.638112973308504e-05,
"loss": 0.1215,
"step": 2820
},
{
"epoch": 5.27,
"grad_norm": 4.726309299468994,
"learning_rate": 2.627767432236706e-05,
"loss": 0.124,
"step": 2830
},
{
"epoch": 5.29,
"grad_norm": 3.569528818130493,
"learning_rate": 2.617421891164908e-05,
"loss": 0.131,
"step": 2840
},
{
"epoch": 5.31,
"grad_norm": 2.8678665161132812,
"learning_rate": 2.60707635009311e-05,
"loss": 0.1346,
"step": 2850
},
{
"epoch": 5.33,
"grad_norm": 6.845192909240723,
"learning_rate": 2.5967308090213122e-05,
"loss": 0.142,
"step": 2860
},
{
"epoch": 5.34,
"grad_norm": 3.2927472591400146,
"learning_rate": 2.5863852679495136e-05,
"loss": 0.1229,
"step": 2870
},
{
"epoch": 5.36,
"grad_norm": 3.8850090503692627,
"learning_rate": 2.5760397268777158e-05,
"loss": 0.1057,
"step": 2880
},
{
"epoch": 5.38,
"grad_norm": 4.47546911239624,
"learning_rate": 2.565694185805918e-05,
"loss": 0.1292,
"step": 2890
},
{
"epoch": 5.4,
"grad_norm": 2.9944636821746826,
"learning_rate": 2.5553486447341197e-05,
"loss": 0.1298,
"step": 2900
},
{
"epoch": 5.42,
"grad_norm": 3.4300310611724854,
"learning_rate": 2.5450031036623218e-05,
"loss": 0.1285,
"step": 2910
},
{
"epoch": 5.44,
"grad_norm": 3.3256707191467285,
"learning_rate": 2.5346575625905232e-05,
"loss": 0.1253,
"step": 2920
},
{
"epoch": 5.46,
"grad_norm": 4.314760684967041,
"learning_rate": 2.5243120215187254e-05,
"loss": 0.1353,
"step": 2930
},
{
"epoch": 5.47,
"grad_norm": 5.66748571395874,
"learning_rate": 2.5139664804469275e-05,
"loss": 0.1284,
"step": 2940
},
{
"epoch": 5.49,
"grad_norm": 4.710278511047363,
"learning_rate": 2.5036209393751293e-05,
"loss": 0.1339,
"step": 2950
},
{
"epoch": 5.51,
"grad_norm": 2.889969825744629,
"learning_rate": 2.4932753983033314e-05,
"loss": 0.1213,
"step": 2960
},
{
"epoch": 5.53,
"grad_norm": 5.408463001251221,
"learning_rate": 2.482929857231533e-05,
"loss": 0.1343,
"step": 2970
},
{
"epoch": 5.55,
"grad_norm": 2.5208628177642822,
"learning_rate": 2.4725843161597353e-05,
"loss": 0.1121,
"step": 2980
},
{
"epoch": 5.57,
"grad_norm": 3.910186290740967,
"learning_rate": 2.462238775087937e-05,
"loss": 0.1201,
"step": 2990
},
{
"epoch": 5.59,
"grad_norm": 2.9305076599121094,
"learning_rate": 2.4518932340161392e-05,
"loss": 0.1188,
"step": 3000
},
{
"epoch": 5.61,
"grad_norm": 3.034980297088623,
"learning_rate": 2.4415476929443413e-05,
"loss": 0.121,
"step": 3010
},
{
"epoch": 5.62,
"grad_norm": 4.653752326965332,
"learning_rate": 2.431202151872543e-05,
"loss": 0.1283,
"step": 3020
},
{
"epoch": 5.64,
"grad_norm": 3.6336913108825684,
"learning_rate": 2.420856610800745e-05,
"loss": 0.111,
"step": 3030
},
{
"epoch": 5.66,
"grad_norm": 3.5738136768341064,
"learning_rate": 2.4105110697289467e-05,
"loss": 0.1179,
"step": 3040
},
{
"epoch": 5.68,
"grad_norm": 2.7753243446350098,
"learning_rate": 2.4001655286571488e-05,
"loss": 0.1122,
"step": 3050
},
{
"epoch": 5.7,
"grad_norm": 3.7840399742126465,
"learning_rate": 2.389819987585351e-05,
"loss": 0.117,
"step": 3060
},
{
"epoch": 5.72,
"grad_norm": 4.982550144195557,
"learning_rate": 2.3794744465135527e-05,
"loss": 0.1179,
"step": 3070
},
{
"epoch": 5.74,
"grad_norm": 2.420515775680542,
"learning_rate": 2.3691289054417548e-05,
"loss": 0.1368,
"step": 3080
},
{
"epoch": 5.75,
"grad_norm": 3.5275652408599854,
"learning_rate": 2.3587833643699566e-05,
"loss": 0.1249,
"step": 3090
},
{
"epoch": 5.77,
"grad_norm": 4.064232349395752,
"learning_rate": 2.3484378232981584e-05,
"loss": 0.1314,
"step": 3100
},
{
"epoch": 5.79,
"grad_norm": 5.377870082855225,
"learning_rate": 2.3380922822263605e-05,
"loss": 0.1215,
"step": 3110
},
{
"epoch": 5.81,
"grad_norm": 3.4903948307037354,
"learning_rate": 2.3277467411545626e-05,
"loss": 0.1182,
"step": 3120
},
{
"epoch": 5.83,
"grad_norm": 6.624187469482422,
"learning_rate": 2.3174012000827644e-05,
"loss": 0.1233,
"step": 3130
},
{
"epoch": 5.85,
"grad_norm": 4.476204872131348,
"learning_rate": 2.3070556590109662e-05,
"loss": 0.1284,
"step": 3140
},
{
"epoch": 5.87,
"grad_norm": 2.996946096420288,
"learning_rate": 2.2967101179391683e-05,
"loss": 0.135,
"step": 3150
},
{
"epoch": 5.88,
"grad_norm": 4.674262046813965,
"learning_rate": 2.28636457686737e-05,
"loss": 0.1167,
"step": 3160
},
{
"epoch": 5.9,
"grad_norm": 3.4972784519195557,
"learning_rate": 2.2760190357955722e-05,
"loss": 0.1251,
"step": 3170
},
{
"epoch": 5.92,
"grad_norm": 3.1503241062164307,
"learning_rate": 2.2656734947237744e-05,
"loss": 0.1169,
"step": 3180
},
{
"epoch": 5.94,
"grad_norm": 3.190443277359009,
"learning_rate": 2.255327953651976e-05,
"loss": 0.1355,
"step": 3190
},
{
"epoch": 5.96,
"grad_norm": 4.845892429351807,
"learning_rate": 2.244982412580178e-05,
"loss": 0.1268,
"step": 3200
},
{
"epoch": 5.98,
"grad_norm": 3.408785343170166,
"learning_rate": 2.2346368715083797e-05,
"loss": 0.1153,
"step": 3210
},
{
"epoch": 6.0,
"grad_norm": 3.0129294395446777,
"learning_rate": 2.224291330436582e-05,
"loss": 0.1116,
"step": 3220
},
{
"epoch": 6.0,
"eval_accuracy": 0.9663414314714164,
"eval_loss": 0.08886239677667618,
"eval_runtime": 76.0016,
"eval_samples_per_second": 200.93,
"eval_steps_per_second": 3.145,
"step": 3222
},
{
"epoch": 6.01,
"grad_norm": 4.935642242431641,
"learning_rate": 2.213945789364784e-05,
"loss": 0.1182,
"step": 3230
},
{
"epoch": 6.03,
"grad_norm": 4.62550163269043,
"learning_rate": 2.2036002482929857e-05,
"loss": 0.105,
"step": 3240
},
{
"epoch": 6.05,
"grad_norm": 5.272533416748047,
"learning_rate": 2.193254707221188e-05,
"loss": 0.1216,
"step": 3250
},
{
"epoch": 6.07,
"grad_norm": 3.5938615798950195,
"learning_rate": 2.1829091661493897e-05,
"loss": 0.1134,
"step": 3260
},
{
"epoch": 6.09,
"grad_norm": 3.716996431350708,
"learning_rate": 2.1725636250775914e-05,
"loss": 0.117,
"step": 3270
},
{
"epoch": 6.11,
"grad_norm": 2.794499158859253,
"learning_rate": 2.162218084005794e-05,
"loss": 0.1197,
"step": 3280
},
{
"epoch": 6.13,
"grad_norm": 3.517066717147827,
"learning_rate": 2.1518725429339957e-05,
"loss": 0.1158,
"step": 3290
},
{
"epoch": 6.15,
"grad_norm": 3.3488523960113525,
"learning_rate": 2.1415270018621975e-05,
"loss": 0.1083,
"step": 3300
},
{
"epoch": 6.16,
"grad_norm": 4.872901916503906,
"learning_rate": 2.1311814607903992e-05,
"loss": 0.1234,
"step": 3310
},
{
"epoch": 6.18,
"grad_norm": 5.1622633934021,
"learning_rate": 2.1208359197186014e-05,
"loss": 0.1155,
"step": 3320
},
{
"epoch": 6.2,
"grad_norm": 3.0708415508270264,
"learning_rate": 2.110490378646803e-05,
"loss": 0.1293,
"step": 3330
},
{
"epoch": 6.22,
"grad_norm": 5.712008953094482,
"learning_rate": 2.1001448375750053e-05,
"loss": 0.1295,
"step": 3340
},
{
"epoch": 6.24,
"grad_norm": 2.7026169300079346,
"learning_rate": 2.0897992965032074e-05,
"loss": 0.1242,
"step": 3350
},
{
"epoch": 6.26,
"grad_norm": 2.5105152130126953,
"learning_rate": 2.0794537554314092e-05,
"loss": 0.1239,
"step": 3360
},
{
"epoch": 6.28,
"grad_norm": 3.213020086288452,
"learning_rate": 2.069108214359611e-05,
"loss": 0.1108,
"step": 3370
},
{
"epoch": 6.29,
"grad_norm": 4.593565940856934,
"learning_rate": 2.0587626732878128e-05,
"loss": 0.1085,
"step": 3380
},
{
"epoch": 6.31,
"grad_norm": 4.210085868835449,
"learning_rate": 2.0484171322160152e-05,
"loss": 0.1153,
"step": 3390
},
{
"epoch": 6.33,
"grad_norm": 3.647468328475952,
"learning_rate": 2.038071591144217e-05,
"loss": 0.1139,
"step": 3400
},
{
"epoch": 6.35,
"grad_norm": 3.584791898727417,
"learning_rate": 2.0277260500724188e-05,
"loss": 0.1083,
"step": 3410
},
{
"epoch": 6.37,
"grad_norm": 3.0671119689941406,
"learning_rate": 2.017380509000621e-05,
"loss": 0.1039,
"step": 3420
},
{
"epoch": 6.39,
"grad_norm": 4.143247127532959,
"learning_rate": 2.0070349679288227e-05,
"loss": 0.1217,
"step": 3430
},
{
"epoch": 6.41,
"grad_norm": 3.869572401046753,
"learning_rate": 1.9966894268570248e-05,
"loss": 0.1136,
"step": 3440
},
{
"epoch": 6.42,
"grad_norm": 3.644425630569458,
"learning_rate": 1.986343885785227e-05,
"loss": 0.1107,
"step": 3450
},
{
"epoch": 6.44,
"grad_norm": 4.0842814445495605,
"learning_rate": 1.9759983447134287e-05,
"loss": 0.113,
"step": 3460
},
{
"epoch": 6.46,
"grad_norm": 4.737167835235596,
"learning_rate": 1.9656528036416305e-05,
"loss": 0.118,
"step": 3470
},
{
"epoch": 6.48,
"grad_norm": 4.954039573669434,
"learning_rate": 1.9553072625698323e-05,
"loss": 0.1103,
"step": 3480
},
{
"epoch": 6.5,
"grad_norm": 3.720627784729004,
"learning_rate": 1.9449617214980344e-05,
"loss": 0.1165,
"step": 3490
},
{
"epoch": 6.52,
"grad_norm": 4.383377552032471,
"learning_rate": 1.9346161804262365e-05,
"loss": 0.1317,
"step": 3500
},
{
"epoch": 6.54,
"grad_norm": 2.1662657260894775,
"learning_rate": 1.9242706393544383e-05,
"loss": 0.1094,
"step": 3510
},
{
"epoch": 6.55,
"grad_norm": 3.569554328918457,
"learning_rate": 1.9139250982826404e-05,
"loss": 0.1179,
"step": 3520
},
{
"epoch": 6.57,
"grad_norm": 3.2241714000701904,
"learning_rate": 1.9035795572108422e-05,
"loss": 0.1194,
"step": 3530
},
{
"epoch": 6.59,
"grad_norm": 3.6238088607788086,
"learning_rate": 1.893234016139044e-05,
"loss": 0.1125,
"step": 3540
},
{
"epoch": 6.61,
"grad_norm": 4.729239463806152,
"learning_rate": 1.882888475067246e-05,
"loss": 0.1197,
"step": 3550
},
{
"epoch": 6.63,
"grad_norm": 3.336503744125366,
"learning_rate": 1.8725429339954483e-05,
"loss": 0.1158,
"step": 3560
},
{
"epoch": 6.65,
"grad_norm": 2.9191136360168457,
"learning_rate": 1.86219739292365e-05,
"loss": 0.0991,
"step": 3570
},
{
"epoch": 6.67,
"grad_norm": 4.706370830535889,
"learning_rate": 1.8518518518518518e-05,
"loss": 0.123,
"step": 3580
},
{
"epoch": 6.69,
"grad_norm": 3.0669870376586914,
"learning_rate": 1.841506310780054e-05,
"loss": 0.1188,
"step": 3590
},
{
"epoch": 6.7,
"grad_norm": 3.902052402496338,
"learning_rate": 1.8311607697082557e-05,
"loss": 0.1104,
"step": 3600
},
{
"epoch": 6.72,
"grad_norm": 5.678684711456299,
"learning_rate": 1.820815228636458e-05,
"loss": 0.1058,
"step": 3610
},
{
"epoch": 6.74,
"grad_norm": 4.781716823577881,
"learning_rate": 1.81046968756466e-05,
"loss": 0.1265,
"step": 3620
},
{
"epoch": 6.76,
"grad_norm": 4.41150426864624,
"learning_rate": 1.8001241464928618e-05,
"loss": 0.1209,
"step": 3630
},
{
"epoch": 6.78,
"grad_norm": 3.2814714908599854,
"learning_rate": 1.7897786054210635e-05,
"loss": 0.1039,
"step": 3640
},
{
"epoch": 6.8,
"grad_norm": 3.8997206687927246,
"learning_rate": 1.7794330643492653e-05,
"loss": 0.1263,
"step": 3650
},
{
"epoch": 6.82,
"grad_norm": 3.4347612857818604,
"learning_rate": 1.7690875232774675e-05,
"loss": 0.1119,
"step": 3660
},
{
"epoch": 6.83,
"grad_norm": 3.8548810482025146,
"learning_rate": 1.7587419822056696e-05,
"loss": 0.1194,
"step": 3670
},
{
"epoch": 6.85,
"grad_norm": 1.9823788404464722,
"learning_rate": 1.7483964411338714e-05,
"loss": 0.118,
"step": 3680
},
{
"epoch": 6.87,
"grad_norm": 3.7170395851135254,
"learning_rate": 1.7380509000620735e-05,
"loss": 0.1097,
"step": 3690
},
{
"epoch": 6.89,
"grad_norm": 2.754812717437744,
"learning_rate": 1.7277053589902753e-05,
"loss": 0.1059,
"step": 3700
},
{
"epoch": 6.91,
"grad_norm": 5.865429401397705,
"learning_rate": 1.717359817918477e-05,
"loss": 0.1184,
"step": 3710
},
{
"epoch": 6.93,
"grad_norm": 2.526935577392578,
"learning_rate": 1.7070142768466792e-05,
"loss": 0.1109,
"step": 3720
},
{
"epoch": 6.95,
"grad_norm": 4.929475784301758,
"learning_rate": 1.6966687357748813e-05,
"loss": 0.115,
"step": 3730
},
{
"epoch": 6.96,
"grad_norm": 3.5055129528045654,
"learning_rate": 1.686323194703083e-05,
"loss": 0.1042,
"step": 3740
},
{
"epoch": 6.98,
"grad_norm": 5.299228191375732,
"learning_rate": 1.675977653631285e-05,
"loss": 0.1091,
"step": 3750
},
{
"epoch": 7.0,
"eval_accuracy": 0.9717110863728636,
"eval_loss": 0.07997328042984009,
"eval_runtime": 77.3565,
"eval_samples_per_second": 197.411,
"eval_steps_per_second": 3.09,
"step": 3759
},
{
"epoch": 7.0,
"grad_norm": 2.833702325820923,
"learning_rate": 1.665632112559487e-05,
"loss": 0.1151,
"step": 3760
},
{
"epoch": 7.02,
"grad_norm": 3.2511253356933594,
"learning_rate": 1.6552865714876888e-05,
"loss": 0.1081,
"step": 3770
},
{
"epoch": 7.04,
"grad_norm": 4.139963150024414,
"learning_rate": 1.644941030415891e-05,
"loss": 0.104,
"step": 3780
},
{
"epoch": 7.06,
"grad_norm": 3.9693522453308105,
"learning_rate": 1.634595489344093e-05,
"loss": 0.1116,
"step": 3790
},
{
"epoch": 7.08,
"grad_norm": 3.844640016555786,
"learning_rate": 1.6242499482722948e-05,
"loss": 0.114,
"step": 3800
},
{
"epoch": 7.09,
"grad_norm": 3.2410988807678223,
"learning_rate": 1.6139044072004966e-05,
"loss": 0.1002,
"step": 3810
},
{
"epoch": 7.11,
"grad_norm": 3.647073268890381,
"learning_rate": 1.6035588661286984e-05,
"loss": 0.1086,
"step": 3820
},
{
"epoch": 7.13,
"grad_norm": 3.2348549365997314,
"learning_rate": 1.5932133250569005e-05,
"loss": 0.089,
"step": 3830
},
{
"epoch": 7.15,
"grad_norm": 4.308054447174072,
"learning_rate": 1.5828677839851026e-05,
"loss": 0.1098,
"step": 3840
},
{
"epoch": 7.17,
"grad_norm": 2.811333417892456,
"learning_rate": 1.5725222429133044e-05,
"loss": 0.1187,
"step": 3850
},
{
"epoch": 7.19,
"grad_norm": 4.352880001068115,
"learning_rate": 1.5621767018415065e-05,
"loss": 0.1115,
"step": 3860
},
{
"epoch": 7.21,
"grad_norm": 4.083710193634033,
"learning_rate": 1.5518311607697083e-05,
"loss": 0.1064,
"step": 3870
},
{
"epoch": 7.23,
"grad_norm": 3.2910239696502686,
"learning_rate": 1.54148561969791e-05,
"loss": 0.1019,
"step": 3880
},
{
"epoch": 7.24,
"grad_norm": 4.050919532775879,
"learning_rate": 1.5311400786261122e-05,
"loss": 0.1023,
"step": 3890
},
{
"epoch": 7.26,
"grad_norm": 3.7136409282684326,
"learning_rate": 1.5207945375543142e-05,
"loss": 0.1072,
"step": 3900
},
{
"epoch": 7.28,
"grad_norm": 2.733660936355591,
"learning_rate": 1.5104489964825161e-05,
"loss": 0.0913,
"step": 3910
},
{
"epoch": 7.3,
"grad_norm": 5.974127292633057,
"learning_rate": 1.5001034554107179e-05,
"loss": 0.1121,
"step": 3920
},
{
"epoch": 7.32,
"grad_norm": 3.1126036643981934,
"learning_rate": 1.48975791433892e-05,
"loss": 0.11,
"step": 3930
},
{
"epoch": 7.34,
"grad_norm": 3.3671085834503174,
"learning_rate": 1.479412373267122e-05,
"loss": 0.1213,
"step": 3940
},
{
"epoch": 7.36,
"grad_norm": 3.5733137130737305,
"learning_rate": 1.4690668321953238e-05,
"loss": 0.1109,
"step": 3950
},
{
"epoch": 7.37,
"grad_norm": 3.739729642868042,
"learning_rate": 1.4587212911235259e-05,
"loss": 0.1012,
"step": 3960
},
{
"epoch": 7.39,
"grad_norm": 3.9161570072174072,
"learning_rate": 1.4483757500517278e-05,
"loss": 0.1062,
"step": 3970
},
{
"epoch": 7.41,
"grad_norm": 3.0634665489196777,
"learning_rate": 1.4380302089799296e-05,
"loss": 0.1158,
"step": 3980
},
{
"epoch": 7.43,
"grad_norm": 6.085744380950928,
"learning_rate": 1.4276846679081316e-05,
"loss": 0.1133,
"step": 3990
},
{
"epoch": 7.45,
"grad_norm": 4.016533851623535,
"learning_rate": 1.4173391268363337e-05,
"loss": 0.1201,
"step": 4000
},
{
"epoch": 7.47,
"grad_norm": 4.826283931732178,
"learning_rate": 1.4069935857645355e-05,
"loss": 0.0988,
"step": 4010
},
{
"epoch": 7.49,
"grad_norm": 4.33713436126709,
"learning_rate": 1.3966480446927374e-05,
"loss": 0.1128,
"step": 4020
},
{
"epoch": 7.5,
"grad_norm": 3.433681011199951,
"learning_rate": 1.3863025036209396e-05,
"loss": 0.1273,
"step": 4030
},
{
"epoch": 7.52,
"grad_norm": 2.7771129608154297,
"learning_rate": 1.3759569625491414e-05,
"loss": 0.1077,
"step": 4040
},
{
"epoch": 7.54,
"grad_norm": 3.7656004428863525,
"learning_rate": 1.3656114214773433e-05,
"loss": 0.1012,
"step": 4050
},
{
"epoch": 7.56,
"grad_norm": 3.985187530517578,
"learning_rate": 1.3552658804055451e-05,
"loss": 0.114,
"step": 4060
},
{
"epoch": 7.58,
"grad_norm": 3.333801031112671,
"learning_rate": 1.3449203393337472e-05,
"loss": 0.1071,
"step": 4070
},
{
"epoch": 7.6,
"grad_norm": 3.4079647064208984,
"learning_rate": 1.3345747982619492e-05,
"loss": 0.1104,
"step": 4080
},
{
"epoch": 7.62,
"grad_norm": 3.1041195392608643,
"learning_rate": 1.324229257190151e-05,
"loss": 0.1129,
"step": 4090
},
{
"epoch": 7.64,
"grad_norm": 3.2613961696624756,
"learning_rate": 1.313883716118353e-05,
"loss": 0.1109,
"step": 4100
},
{
"epoch": 7.65,
"grad_norm": 3.5139191150665283,
"learning_rate": 1.303538175046555e-05,
"loss": 0.1173,
"step": 4110
},
{
"epoch": 7.67,
"grad_norm": 3.3949713706970215,
"learning_rate": 1.2931926339747568e-05,
"loss": 0.1133,
"step": 4120
},
{
"epoch": 7.69,
"grad_norm": 4.6892900466918945,
"learning_rate": 1.282847092902959e-05,
"loss": 0.1159,
"step": 4130
},
{
"epoch": 7.71,
"grad_norm": 2.7756004333496094,
"learning_rate": 1.2725015518311609e-05,
"loss": 0.1074,
"step": 4140
},
{
"epoch": 7.73,
"grad_norm": 2.3531765937805176,
"learning_rate": 1.2621560107593627e-05,
"loss": 0.1026,
"step": 4150
},
{
"epoch": 7.75,
"grad_norm": 3.776615858078003,
"learning_rate": 1.2518104696875646e-05,
"loss": 0.1035,
"step": 4160
},
{
"epoch": 7.77,
"grad_norm": 3.5298571586608887,
"learning_rate": 1.2414649286157666e-05,
"loss": 0.1033,
"step": 4170
},
{
"epoch": 7.78,
"grad_norm": 2.347933769226074,
"learning_rate": 1.2311193875439685e-05,
"loss": 0.0973,
"step": 4180
},
{
"epoch": 7.8,
"grad_norm": 2.9776358604431152,
"learning_rate": 1.2207738464721707e-05,
"loss": 0.1207,
"step": 4190
},
{
"epoch": 7.82,
"grad_norm": 2.641087055206299,
"learning_rate": 1.2104283054003724e-05,
"loss": 0.1083,
"step": 4200
},
{
"epoch": 7.84,
"grad_norm": 2.9380156993865967,
"learning_rate": 1.2000827643285744e-05,
"loss": 0.0985,
"step": 4210
},
{
"epoch": 7.86,
"grad_norm": 2.4157328605651855,
"learning_rate": 1.1897372232567764e-05,
"loss": 0.0976,
"step": 4220
},
{
"epoch": 7.88,
"grad_norm": 3.6187868118286133,
"learning_rate": 1.1793916821849783e-05,
"loss": 0.1199,
"step": 4230
},
{
"epoch": 7.9,
"grad_norm": 2.72450852394104,
"learning_rate": 1.1690461411131803e-05,
"loss": 0.1057,
"step": 4240
},
{
"epoch": 7.91,
"grad_norm": 3.189300298690796,
"learning_rate": 1.1587006000413822e-05,
"loss": 0.0851,
"step": 4250
},
{
"epoch": 7.93,
"grad_norm": 2.51131272315979,
"learning_rate": 1.1483550589695842e-05,
"loss": 0.1037,
"step": 4260
},
{
"epoch": 7.95,
"grad_norm": 2.9266738891601562,
"learning_rate": 1.1380095178977861e-05,
"loss": 0.1008,
"step": 4270
},
{
"epoch": 7.97,
"grad_norm": 3.532125473022461,
"learning_rate": 1.127663976825988e-05,
"loss": 0.1028,
"step": 4280
},
{
"epoch": 7.99,
"grad_norm": 3.3334052562713623,
"learning_rate": 1.1173184357541899e-05,
"loss": 0.1096,
"step": 4290
},
{
"epoch": 8.0,
"eval_accuracy": 0.9757055857507694,
"eval_loss": 0.06653288006782532,
"eval_runtime": 77.4069,
"eval_samples_per_second": 197.282,
"eval_steps_per_second": 3.088,
"step": 4296
},
{
"epoch": 8.01,
"grad_norm": 3.6473946571350098,
"learning_rate": 1.106972894682392e-05,
"loss": 0.1067,
"step": 4300
},
{
"epoch": 8.03,
"grad_norm": 3.4407718181610107,
"learning_rate": 1.096627353610594e-05,
"loss": 0.0985,
"step": 4310
},
{
"epoch": 8.04,
"grad_norm": 4.716196060180664,
"learning_rate": 1.0862818125387957e-05,
"loss": 0.1021,
"step": 4320
},
{
"epoch": 8.06,
"grad_norm": 6.6525702476501465,
"learning_rate": 1.0759362714669978e-05,
"loss": 0.1124,
"step": 4330
},
{
"epoch": 8.08,
"grad_norm": 3.4421017169952393,
"learning_rate": 1.0655907303951996e-05,
"loss": 0.1059,
"step": 4340
},
{
"epoch": 8.1,
"grad_norm": 3.29632830619812,
"learning_rate": 1.0552451893234016e-05,
"loss": 0.1049,
"step": 4350
},
{
"epoch": 8.12,
"grad_norm": 5.985255241394043,
"learning_rate": 1.0448996482516037e-05,
"loss": 0.0996,
"step": 4360
},
{
"epoch": 8.14,
"grad_norm": 4.031270503997803,
"learning_rate": 1.0345541071798055e-05,
"loss": 0.1034,
"step": 4370
},
{
"epoch": 8.16,
"grad_norm": 3.9531686305999756,
"learning_rate": 1.0242085661080076e-05,
"loss": 0.0969,
"step": 4380
},
{
"epoch": 8.18,
"grad_norm": 4.336350440979004,
"learning_rate": 1.0138630250362094e-05,
"loss": 0.0993,
"step": 4390
},
{
"epoch": 8.19,
"grad_norm": 3.5339159965515137,
"learning_rate": 1.0035174839644113e-05,
"loss": 0.1108,
"step": 4400
},
{
"epoch": 8.21,
"grad_norm": 4.038322925567627,
"learning_rate": 9.931719428926135e-06,
"loss": 0.1026,
"step": 4410
},
{
"epoch": 8.23,
"grad_norm": 5.179644584655762,
"learning_rate": 9.828264018208153e-06,
"loss": 0.1047,
"step": 4420
},
{
"epoch": 8.25,
"grad_norm": 3.643061876296997,
"learning_rate": 9.724808607490172e-06,
"loss": 0.1037,
"step": 4430
},
{
"epoch": 8.27,
"grad_norm": 2.6012673377990723,
"learning_rate": 9.621353196772192e-06,
"loss": 0.1082,
"step": 4440
},
{
"epoch": 8.29,
"grad_norm": 6.382651329040527,
"learning_rate": 9.517897786054211e-06,
"loss": 0.1017,
"step": 4450
},
{
"epoch": 8.31,
"grad_norm": 3.192500352859497,
"learning_rate": 9.41444237533623e-06,
"loss": 0.1021,
"step": 4460
},
{
"epoch": 8.32,
"grad_norm": 2.353194236755371,
"learning_rate": 9.31098696461825e-06,
"loss": 0.0942,
"step": 4470
},
{
"epoch": 8.34,
"grad_norm": 2.7383475303649902,
"learning_rate": 9.20753155390027e-06,
"loss": 0.0887,
"step": 4480
},
{
"epoch": 8.36,
"grad_norm": 3.0728166103363037,
"learning_rate": 9.10407614318229e-06,
"loss": 0.1038,
"step": 4490
},
{
"epoch": 8.38,
"grad_norm": 2.619554042816162,
"learning_rate": 9.000620732464309e-06,
"loss": 0.1013,
"step": 4500
},
{
"epoch": 8.4,
"grad_norm": 5.080254554748535,
"learning_rate": 8.897165321746327e-06,
"loss": 0.1143,
"step": 4510
},
{
"epoch": 8.42,
"grad_norm": 4.772169589996338,
"learning_rate": 8.793709911028348e-06,
"loss": 0.1027,
"step": 4520
},
{
"epoch": 8.44,
"grad_norm": 2.42454195022583,
"learning_rate": 8.690254500310367e-06,
"loss": 0.0992,
"step": 4530
},
{
"epoch": 8.45,
"grad_norm": 2.924750328063965,
"learning_rate": 8.586799089592385e-06,
"loss": 0.0977,
"step": 4540
},
{
"epoch": 8.47,
"grad_norm": 3.605734348297119,
"learning_rate": 8.483343678874407e-06,
"loss": 0.0964,
"step": 4550
},
{
"epoch": 8.49,
"grad_norm": 5.610400199890137,
"learning_rate": 8.379888268156424e-06,
"loss": 0.0883,
"step": 4560
},
{
"epoch": 8.51,
"grad_norm": 2.485067367553711,
"learning_rate": 8.276432857438444e-06,
"loss": 0.1016,
"step": 4570
},
{
"epoch": 8.53,
"grad_norm": 4.045931816101074,
"learning_rate": 8.172977446720465e-06,
"loss": 0.1047,
"step": 4580
},
{
"epoch": 8.55,
"grad_norm": 3.8962624073028564,
"learning_rate": 8.069522036002483e-06,
"loss": 0.1028,
"step": 4590
},
{
"epoch": 8.57,
"grad_norm": 2.916381359100342,
"learning_rate": 7.966066625284502e-06,
"loss": 0.1092,
"step": 4600
},
{
"epoch": 8.58,
"grad_norm": 2.839132308959961,
"learning_rate": 7.862611214566522e-06,
"loss": 0.0993,
"step": 4610
},
{
"epoch": 8.6,
"grad_norm": 3.5891973972320557,
"learning_rate": 7.759155803848542e-06,
"loss": 0.0932,
"step": 4620
},
{
"epoch": 8.62,
"grad_norm": 3.9104928970336914,
"learning_rate": 7.655700393130561e-06,
"loss": 0.0892,
"step": 4630
},
{
"epoch": 8.64,
"grad_norm": 4.489515781402588,
"learning_rate": 7.552244982412581e-06,
"loss": 0.0909,
"step": 4640
},
{
"epoch": 8.66,
"grad_norm": 3.1181390285491943,
"learning_rate": 7.4487895716946e-06,
"loss": 0.0865,
"step": 4650
},
{
"epoch": 8.68,
"grad_norm": 3.370128870010376,
"learning_rate": 7.345334160976619e-06,
"loss": 0.0902,
"step": 4660
},
{
"epoch": 8.7,
"grad_norm": 3.6510777473449707,
"learning_rate": 7.241878750258639e-06,
"loss": 0.1041,
"step": 4670
},
{
"epoch": 8.72,
"grad_norm": 4.543170928955078,
"learning_rate": 7.138423339540658e-06,
"loss": 0.1106,
"step": 4680
},
{
"epoch": 8.73,
"grad_norm": 3.1991612911224365,
"learning_rate": 7.0349679288226775e-06,
"loss": 0.0993,
"step": 4690
},
{
"epoch": 8.75,
"grad_norm": 2.5615463256835938,
"learning_rate": 6.931512518104698e-06,
"loss": 0.0927,
"step": 4700
},
{
"epoch": 8.77,
"grad_norm": 5.079352855682373,
"learning_rate": 6.8280571073867165e-06,
"loss": 0.1004,
"step": 4710
},
{
"epoch": 8.79,
"grad_norm": 2.056499481201172,
"learning_rate": 6.724601696668736e-06,
"loss": 0.0884,
"step": 4720
},
{
"epoch": 8.81,
"grad_norm": 2.651646614074707,
"learning_rate": 6.621146285950755e-06,
"loss": 0.095,
"step": 4730
},
{
"epoch": 8.83,
"grad_norm": 2.911651849746704,
"learning_rate": 6.517690875232775e-06,
"loss": 0.0971,
"step": 4740
},
{
"epoch": 8.85,
"grad_norm": 2.585360527038574,
"learning_rate": 6.414235464514795e-06,
"loss": 0.0942,
"step": 4750
},
{
"epoch": 8.86,
"grad_norm": 4.262210369110107,
"learning_rate": 6.310780053796813e-06,
"loss": 0.099,
"step": 4760
},
{
"epoch": 8.88,
"grad_norm": 3.066347599029541,
"learning_rate": 6.207324643078833e-06,
"loss": 0.0997,
"step": 4770
},
{
"epoch": 8.9,
"grad_norm": 4.1641740798950195,
"learning_rate": 6.103869232360853e-06,
"loss": 0.1004,
"step": 4780
},
{
"epoch": 8.92,
"grad_norm": 4.297872066497803,
"learning_rate": 6.000413821642872e-06,
"loss": 0.0975,
"step": 4790
},
{
"epoch": 8.94,
"grad_norm": 2.9514224529266357,
"learning_rate": 5.8969584109248915e-06,
"loss": 0.09,
"step": 4800
},
{
"epoch": 8.96,
"grad_norm": 3.211758852005005,
"learning_rate": 5.793503000206911e-06,
"loss": 0.0915,
"step": 4810
},
{
"epoch": 8.98,
"grad_norm": 3.523693084716797,
"learning_rate": 5.690047589488931e-06,
"loss": 0.0843,
"step": 4820
},
{
"epoch": 8.99,
"grad_norm": 4.310064315795898,
"learning_rate": 5.586592178770949e-06,
"loss": 0.0996,
"step": 4830
},
{
"epoch": 9.0,
"eval_accuracy": 0.9746578482090237,
"eval_loss": 0.07076110690832138,
"eval_runtime": 77.2249,
"eval_samples_per_second": 197.747,
"eval_steps_per_second": 3.095,
"step": 4833
},
{
"epoch": 9.01,
"grad_norm": 4.004106044769287,
"learning_rate": 5.48313676805297e-06,
"loss": 0.0958,
"step": 4840
},
{
"epoch": 9.03,
"grad_norm": 3.368622064590454,
"learning_rate": 5.379681357334989e-06,
"loss": 0.1035,
"step": 4850
},
{
"epoch": 9.05,
"grad_norm": 2.3737103939056396,
"learning_rate": 5.276225946617008e-06,
"loss": 0.0846,
"step": 4860
},
{
"epoch": 9.07,
"grad_norm": 3.6056108474731445,
"learning_rate": 5.1727705358990274e-06,
"loss": 0.0988,
"step": 4870
},
{
"epoch": 9.09,
"grad_norm": 6.646406173706055,
"learning_rate": 5.069315125181047e-06,
"loss": 0.077,
"step": 4880
},
{
"epoch": 9.11,
"grad_norm": 3.300297737121582,
"learning_rate": 4.965859714463067e-06,
"loss": 0.0959,
"step": 4890
},
{
"epoch": 9.12,
"grad_norm": 3.297924518585205,
"learning_rate": 4.862404303745086e-06,
"loss": 0.0904,
"step": 4900
},
{
"epoch": 9.14,
"grad_norm": 2.438100576400757,
"learning_rate": 4.7589488930271056e-06,
"loss": 0.0787,
"step": 4910
},
{
"epoch": 9.16,
"grad_norm": 6.617523670196533,
"learning_rate": 4.655493482309125e-06,
"loss": 0.1144,
"step": 4920
},
{
"epoch": 9.18,
"grad_norm": 4.281922817230225,
"learning_rate": 4.552038071591145e-06,
"loss": 0.1008,
"step": 4930
},
{
"epoch": 9.2,
"grad_norm": 2.712520122528076,
"learning_rate": 4.448582660873163e-06,
"loss": 0.0886,
"step": 4940
},
{
"epoch": 9.22,
"grad_norm": 4.191254615783691,
"learning_rate": 4.345127250155184e-06,
"loss": 0.0986,
"step": 4950
},
{
"epoch": 9.24,
"grad_norm": 3.2903385162353516,
"learning_rate": 4.241671839437203e-06,
"loss": 0.0886,
"step": 4960
},
{
"epoch": 9.26,
"grad_norm": 4.816535472869873,
"learning_rate": 4.138216428719222e-06,
"loss": 0.1063,
"step": 4970
},
{
"epoch": 9.27,
"grad_norm": 3.22310209274292,
"learning_rate": 4.0347610180012415e-06,
"loss": 0.0978,
"step": 4980
},
{
"epoch": 9.29,
"grad_norm": 3.7314705848693848,
"learning_rate": 3.931305607283261e-06,
"loss": 0.0842,
"step": 4990
},
{
"epoch": 9.31,
"grad_norm": 3.6335864067077637,
"learning_rate": 3.8278501965652806e-06,
"loss": 0.1074,
"step": 5000
},
{
"epoch": 9.33,
"grad_norm": 2.8816540241241455,
"learning_rate": 3.7243947858473e-06,
"loss": 0.093,
"step": 5010
},
{
"epoch": 9.35,
"grad_norm": 4.274160385131836,
"learning_rate": 3.6209393751293196e-06,
"loss": 0.1024,
"step": 5020
},
{
"epoch": 9.37,
"grad_norm": 2.640784502029419,
"learning_rate": 3.5174839644113387e-06,
"loss": 0.0984,
"step": 5030
},
{
"epoch": 9.39,
"grad_norm": 4.0636396408081055,
"learning_rate": 3.4140285536933583e-06,
"loss": 0.0998,
"step": 5040
},
{
"epoch": 9.4,
"grad_norm": 3.3350281715393066,
"learning_rate": 3.3105731429753774e-06,
"loss": 0.0825,
"step": 5050
},
{
"epoch": 9.42,
"grad_norm": 3.7046918869018555,
"learning_rate": 3.2071177322573973e-06,
"loss": 0.0747,
"step": 5060
},
{
"epoch": 9.44,
"grad_norm": 3.884317636489868,
"learning_rate": 3.1036623215394165e-06,
"loss": 0.0923,
"step": 5070
},
{
"epoch": 9.46,
"grad_norm": 4.088473320007324,
"learning_rate": 3.000206910821436e-06,
"loss": 0.0885,
"step": 5080
},
{
"epoch": 9.48,
"grad_norm": 2.4199376106262207,
"learning_rate": 2.8967515001034555e-06,
"loss": 0.1044,
"step": 5090
},
{
"epoch": 9.5,
"grad_norm": 4.261946678161621,
"learning_rate": 2.7932960893854746e-06,
"loss": 0.0856,
"step": 5100
},
{
"epoch": 9.52,
"grad_norm": 4.894256114959717,
"learning_rate": 2.6898406786674946e-06,
"loss": 0.0979,
"step": 5110
},
{
"epoch": 9.53,
"grad_norm": 3.232664108276367,
"learning_rate": 2.5863852679495137e-06,
"loss": 0.0912,
"step": 5120
},
{
"epoch": 9.55,
"grad_norm": 3.6954145431518555,
"learning_rate": 2.4829298572315337e-06,
"loss": 0.0837,
"step": 5130
},
{
"epoch": 9.57,
"grad_norm": 3.2980313301086426,
"learning_rate": 2.3794744465135528e-06,
"loss": 0.0866,
"step": 5140
},
{
"epoch": 9.59,
"grad_norm": 5.655994415283203,
"learning_rate": 2.2760190357955723e-06,
"loss": 0.0881,
"step": 5150
},
{
"epoch": 9.61,
"grad_norm": 4.117016792297363,
"learning_rate": 2.172563625077592e-06,
"loss": 0.0938,
"step": 5160
},
{
"epoch": 9.63,
"grad_norm": 4.604465007781982,
"learning_rate": 2.069108214359611e-06,
"loss": 0.0869,
"step": 5170
},
{
"epoch": 9.65,
"grad_norm": 2.572514057159424,
"learning_rate": 1.9656528036416305e-06,
"loss": 0.0939,
"step": 5180
},
{
"epoch": 9.66,
"grad_norm": 3.884051561355591,
"learning_rate": 1.86219739292365e-06,
"loss": 0.11,
"step": 5190
},
{
"epoch": 9.68,
"grad_norm": 3.295647621154785,
"learning_rate": 1.7587419822056694e-06,
"loss": 0.0796,
"step": 5200
},
{
"epoch": 9.7,
"grad_norm": 3.270512819290161,
"learning_rate": 1.6552865714876887e-06,
"loss": 0.1012,
"step": 5210
},
{
"epoch": 9.72,
"grad_norm": 3.492386817932129,
"learning_rate": 1.5518311607697082e-06,
"loss": 0.1028,
"step": 5220
},
{
"epoch": 9.74,
"grad_norm": 2.9747917652130127,
"learning_rate": 1.4483757500517278e-06,
"loss": 0.1021,
"step": 5230
},
{
"epoch": 9.76,
"grad_norm": 3.2330212593078613,
"learning_rate": 1.3449203393337473e-06,
"loss": 0.0865,
"step": 5240
},
{
"epoch": 9.78,
"grad_norm": 3.7194619178771973,
"learning_rate": 1.2414649286157668e-06,
"loss": 0.0992,
"step": 5250
},
{
"epoch": 9.8,
"grad_norm": 5.062513828277588,
"learning_rate": 1.1380095178977862e-06,
"loss": 0.1016,
"step": 5260
},
{
"epoch": 9.81,
"grad_norm": 3.4997618198394775,
"learning_rate": 1.0345541071798055e-06,
"loss": 0.077,
"step": 5270
},
{
"epoch": 9.83,
"grad_norm": 3.2800211906433105,
"learning_rate": 9.31098696461825e-07,
"loss": 0.0966,
"step": 5280
},
{
"epoch": 9.85,
"grad_norm": 5.51563835144043,
"learning_rate": 8.276432857438443e-07,
"loss": 0.0967,
"step": 5290
},
{
"epoch": 9.87,
"grad_norm": 5.4373698234558105,
"learning_rate": 7.241878750258639e-07,
"loss": 0.1014,
"step": 5300
},
{
"epoch": 9.89,
"grad_norm": 4.9278154373168945,
"learning_rate": 6.207324643078834e-07,
"loss": 0.1023,
"step": 5310
},
{
"epoch": 9.91,
"grad_norm": 3.8460750579833984,
"learning_rate": 5.172770535899027e-07,
"loss": 0.0992,
"step": 5320
},
{
"epoch": 9.93,
"grad_norm": 2.2577359676361084,
"learning_rate": 4.1382164287192217e-07,
"loss": 0.0822,
"step": 5330
},
{
"epoch": 9.94,
"grad_norm": 3.778047561645508,
"learning_rate": 3.103662321539417e-07,
"loss": 0.0894,
"step": 5340
},
{
"epoch": 9.96,
"grad_norm": 4.457272052764893,
"learning_rate": 2.0691082143596109e-07,
"loss": 0.0854,
"step": 5350
},
{
"epoch": 9.98,
"grad_norm": 3.2312333583831787,
"learning_rate": 1.0345541071798054e-07,
"loss": 0.0858,
"step": 5360
},
{
"epoch": 10.0,
"grad_norm": 4.007193088531494,
"learning_rate": 0.0,
"loss": 0.0992,
"step": 5370
},
{
"epoch": 10.0,
"eval_accuracy": 0.9764913889070788,
"eval_loss": 0.06747107207775116,
"eval_runtime": 75.4344,
"eval_samples_per_second": 202.441,
"eval_steps_per_second": 3.168,
"step": 5370
},
{
"epoch": 10.0,
"step": 5370,
"total_flos": 3.4161822702270628e+19,
"train_loss": 0.15793793185907148,
"train_runtime": 13574.3496,
"train_samples_per_second": 101.249,
"train_steps_per_second": 0.396
}
],
"logging_steps": 10,
"max_steps": 5370,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 3.4161822702270628e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}