{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.802654408616688, "eval_steps": 500, "global_step": 80000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011266590053854301, "grad_norm": 3268.47607421875, "learning_rate": 3.125e-06, "loss": 80243.872, "step": 500 }, { "epoch": 0.022533180107708602, "grad_norm": 1101.09423828125, "learning_rate": 6.25e-06, "loss": 455.9426, "step": 1000 }, { "epoch": 0.0337997701615629, "grad_norm": 1345.1741943359375, "learning_rate": 9.375000000000001e-06, "loss": 398.2565, "step": 1500 }, { "epoch": 0.045066360215417205, "grad_norm": 957.8522338867188, "learning_rate": 1.25e-05, "loss": 361.662, "step": 2000 }, { "epoch": 0.0563329502692715, "grad_norm": 1153.3565673828125, "learning_rate": 1.5625e-05, "loss": 351.6057, "step": 2500 }, { "epoch": 0.0675995403231258, "grad_norm": 3220.947021484375, "learning_rate": 1.8750000000000002e-05, "loss": 324.4043, "step": 3000 }, { "epoch": 0.0788661303769801, "grad_norm": 1318.7249755859375, "learning_rate": 2.1875e-05, "loss": 303.6339, "step": 3500 }, { "epoch": 0.09013272043083441, "grad_norm": 1085.81982421875, "learning_rate": 2.5e-05, "loss": 290.8328, "step": 4000 }, { "epoch": 0.1013993104846887, "grad_norm": 1732.97412109375, "learning_rate": 2.8125000000000003e-05, "loss": 286.8244, "step": 4500 }, { "epoch": 0.112665900538543, "grad_norm": 1018.1327514648438, "learning_rate": 3.125e-05, "loss": 274.4126, "step": 5000 }, { "epoch": 0.1239324905923973, "grad_norm": 1115.255859375, "learning_rate": 3.4375e-05, "loss": 258.1825, "step": 5500 }, { "epoch": 0.1351990806462516, "grad_norm": 2264.675537109375, "learning_rate": 3.7500000000000003e-05, "loss": 249.4162, "step": 6000 }, { "epoch": 0.1464656707001059, "grad_norm": 1202.668212890625, "learning_rate": 4.0625000000000005e-05, "loss": 238.756, "step": 6500 }, { "epoch": 0.1577322607539602, "grad_norm": 1271.8997802734375, "learning_rate": 4.375e-05, "loss": 234.6415, "step": 7000 }, { "epoch": 0.16899885080781452, "grad_norm": 880.48291015625, "learning_rate": 4.6875e-05, "loss": 220.2277, "step": 7500 }, { "epoch": 0.18026544086166882, "grad_norm": 1199.511962890625, "learning_rate": 5e-05, "loss": 212.284, "step": 8000 }, { "epoch": 0.19153203091552312, "grad_norm": 885.7015991210938, "learning_rate": 4.999405067699773e-05, "loss": 197.9255, "step": 8500 }, { "epoch": 0.2027986209693774, "grad_norm": 1130.3941650390625, "learning_rate": 4.997620553954645e-05, "loss": 201.8366, "step": 9000 }, { "epoch": 0.2140652110232317, "grad_norm": 1032.0145263671875, "learning_rate": 4.994647308096509e-05, "loss": 195.4686, "step": 9500 }, { "epoch": 0.225331801077086, "grad_norm": 1835.482666015625, "learning_rate": 4.990486745229364e-05, "loss": 186.7062, "step": 10000 }, { "epoch": 0.2365983911309403, "grad_norm": 1477.0096435546875, "learning_rate": 4.985140845555799e-05, "loss": 188.2111, "step": 10500 }, { "epoch": 0.2478649811847946, "grad_norm": 688.68310546875, "learning_rate": 4.9786121534345265e-05, "loss": 180.3121, "step": 11000 }, { "epoch": 0.2591315712386489, "grad_norm": 1981.5882568359375, "learning_rate": 4.970903776169402e-05, "loss": 171.624, "step": 11500 }, { "epoch": 0.2703981612925032, "grad_norm": 1986.2034912109375, "learning_rate": 4.962019382530521e-05, "loss": 178.433, "step": 12000 }, { "epoch": 0.2816647513463575, "grad_norm": 1576.7353515625, "learning_rate": 4.951963201008076e-05, "loss": 164.5357, "step": 12500 }, { "epoch": 0.2929313414002118, "grad_norm": 1520.091796875, "learning_rate": 4.940740017799833e-05, "loss": 162.9244, "step": 13000 }, { "epoch": 0.3041979314540661, "grad_norm": 1361.819091796875, "learning_rate": 4.9283551745331534e-05, "loss": 164.2202, "step": 13500 }, { "epoch": 0.3154645215079204, "grad_norm": 1085.12548828125, "learning_rate": 4.914814565722671e-05, "loss": 164.809, "step": 14000 }, { "epoch": 0.3267311115617747, "grad_norm": 1452.0218505859375, "learning_rate": 4.9001246359648224e-05, "loss": 162.6041, "step": 14500 }, { "epoch": 0.33799770161562903, "grad_norm": 1728.5352783203125, "learning_rate": 4.884292376870567e-05, "loss": 156.4712, "step": 15000 }, { "epoch": 0.3492642916694833, "grad_norm": 1888.999267578125, "learning_rate": 4.867325323737765e-05, "loss": 151.6581, "step": 15500 }, { "epoch": 0.36053088172333764, "grad_norm": 1127.0830078125, "learning_rate": 4.849231551964771e-05, "loss": 154.7625, "step": 16000 }, { "epoch": 0.3717974717771919, "grad_norm": 1579.301513671875, "learning_rate": 4.830019673206997e-05, "loss": 158.3963, "step": 16500 }, { "epoch": 0.38306406183104624, "grad_norm": 1466.5936279296875, "learning_rate": 4.8096988312782174e-05, "loss": 151.9632, "step": 17000 }, { "epoch": 0.3943306518849005, "grad_norm": 1501.413330078125, "learning_rate": 4.788278697798618e-05, "loss": 152.0479, "step": 17500 }, { "epoch": 0.4055972419387548, "grad_norm": 938.9967651367188, "learning_rate": 4.765769467591625e-05, "loss": 147.7795, "step": 18000 }, { "epoch": 0.4168638319926091, "grad_norm": 1407.6708984375, "learning_rate": 4.742181853831721e-05, "loss": 145.714, "step": 18500 }, { "epoch": 0.4281304220464634, "grad_norm": 1046.4781494140625, "learning_rate": 4.717527082945554e-05, "loss": 147.87, "step": 19000 }, { "epoch": 0.43939701210031773, "grad_norm": 1437.9764404296875, "learning_rate": 4.69181688926877e-05, "loss": 139.7023, "step": 19500 }, { "epoch": 0.450663602154172, "grad_norm": 1155.10595703125, "learning_rate": 4.665063509461097e-05, "loss": 146.1232, "step": 20000 }, { "epoch": 0.46193019220802634, "grad_norm": 1317.321044921875, "learning_rate": 4.637279676682367e-05, "loss": 139.9448, "step": 20500 }, { "epoch": 0.4731967822618806, "grad_norm": 1005.6251831054688, "learning_rate": 4.608478614532215e-05, "loss": 142.7613, "step": 21000 }, { "epoch": 0.48446337231573494, "grad_norm": 2557.992919921875, "learning_rate": 4.5786740307563636e-05, "loss": 139.5429, "step": 21500 }, { "epoch": 0.4957299623695892, "grad_norm": 1912.8707275390625, "learning_rate": 4.54788011072248e-05, "loss": 138.4188, "step": 22000 }, { "epoch": 0.5069965524234435, "grad_norm": 1349.5655517578125, "learning_rate": 4.516111510668707e-05, "loss": 135.984, "step": 22500 }, { "epoch": 0.5182631424772978, "grad_norm": 2255.250732421875, "learning_rate": 4.4833833507280884e-05, "loss": 136.2522, "step": 23000 }, { "epoch": 0.5295297325311521, "grad_norm": 947.2132568359375, "learning_rate": 4.4497112077322044e-05, "loss": 135.873, "step": 23500 }, { "epoch": 0.5407963225850064, "grad_norm": 1092.2021484375, "learning_rate": 4.415111107797445e-05, "loss": 136.0968, "step": 24000 }, { "epoch": 0.5520629126388608, "grad_norm": 2551.856201171875, "learning_rate": 4.379599518697444e-05, "loss": 133.4127, "step": 24500 }, { "epoch": 0.563329502692715, "grad_norm": 1087.750732421875, "learning_rate": 4.34319334202531e-05, "loss": 137.0111, "step": 25000 }, { "epoch": 0.5745960927465693, "grad_norm": 710.9840698242188, "learning_rate": 4.305909905149389e-05, "loss": 128.7961, "step": 25500 }, { "epoch": 0.5858626828004236, "grad_norm": 1028.0732421875, "learning_rate": 4.267766952966369e-05, "loss": 131.5538, "step": 26000 }, { "epoch": 0.597129272854278, "grad_norm": 1650.9874267578125, "learning_rate": 4.228782639455674e-05, "loss": 131.9968, "step": 26500 }, { "epoch": 0.6083958629081322, "grad_norm": 3326.53564453125, "learning_rate": 4.188975519039151e-05, "loss": 129.6315, "step": 27000 }, { "epoch": 0.6196624529619865, "grad_norm": 2218.9794921875, "learning_rate": 4.148364537750172e-05, "loss": 126.7289, "step": 27500 }, { "epoch": 0.6309290430158409, "grad_norm": 1092.4063720703125, "learning_rate": 4.1069690242163484e-05, "loss": 131.7434, "step": 28000 }, { "epoch": 0.6421956330696951, "grad_norm": 1642.626220703125, "learning_rate": 4.064808680460148e-05, "loss": 131.2289, "step": 28500 }, { "epoch": 0.6534622231235494, "grad_norm": 2752.57470703125, "learning_rate": 4.021903572521802e-05, "loss": 125.4669, "step": 29000 }, { "epoch": 0.6647288131774037, "grad_norm": 2134.545654296875, "learning_rate": 3.978274120908956e-05, "loss": 128.8268, "step": 29500 }, { "epoch": 0.6759954032312581, "grad_norm": 1844.7005615234375, "learning_rate": 3.933941090877615e-05, "loss": 126.0543, "step": 30000 }, { "epoch": 0.6872619932851123, "grad_norm": 911.3765869140625, "learning_rate": 3.888925582549006e-05, "loss": 124.5692, "step": 30500 }, { "epoch": 0.6985285833389666, "grad_norm": 1240.784423828125, "learning_rate": 3.84324902086706e-05, "loss": 129.9127, "step": 31000 }, { "epoch": 0.7097951733928209, "grad_norm": 1387.1654052734375, "learning_rate": 3.796933145401304e-05, "loss": 128.229, "step": 31500 }, { "epoch": 0.7210617634466753, "grad_norm": 5207.958984375, "learning_rate": 3.7500000000000003e-05, "loss": 123.6102, "step": 32000 }, { "epoch": 0.7323283535005295, "grad_norm": 1478.238525390625, "learning_rate": 3.702471922298469e-05, "loss": 122.5027, "step": 32500 }, { "epoch": 0.7435949435543838, "grad_norm": 1219.868408203125, "learning_rate": 3.654371533087586e-05, "loss": 121.6549, "step": 33000 }, { "epoch": 0.7548615336082382, "grad_norm": 1724.080078125, "learning_rate": 3.6057217255475034e-05, "loss": 122.2066, "step": 33500 }, { "epoch": 0.7661281236620925, "grad_norm": 2315.779052734375, "learning_rate": 3.556545654351749e-05, "loss": 124.5871, "step": 34000 }, { "epoch": 0.7773947137159467, "grad_norm": 1415.927978515625, "learning_rate": 3.5068667246468436e-05, "loss": 119.864, "step": 34500 }, { "epoch": 0.788661303769801, "grad_norm": 2292.79736328125, "learning_rate": 3.456708580912725e-05, "loss": 124.4397, "step": 35000 }, { "epoch": 0.7999278938236554, "grad_norm": 2198.923583984375, "learning_rate": 3.406095095709254e-05, "loss": 119.8706, "step": 35500 }, { "epoch": 0.8111944838775096, "grad_norm": 774.8341064453125, "learning_rate": 3.355050358314172e-05, "loss": 122.3335, "step": 36000 }, { "epoch": 0.8224610739313639, "grad_norm": 1356.7291259765625, "learning_rate": 3.303598663257904e-05, "loss": 117.6119, "step": 36500 }, { "epoch": 0.8337276639852182, "grad_norm": 0.0, "learning_rate": 3.251764498760683e-05, "loss": 122.5679, "step": 37000 }, { "epoch": 0.8449942540390726, "grad_norm": 0.0, "learning_rate": 3.1995725350774806e-05, "loss": 116.4365, "step": 37500 }, { "epoch": 0.8562608440929268, "grad_norm": 1365.280517578125, "learning_rate": 3.147047612756302e-05, "loss": 119.871, "step": 38000 }, { "epoch": 0.8675274341467811, "grad_norm": 1635.288330078125, "learning_rate": 3.094214730815433e-05, "loss": 116.4286, "step": 38500 }, { "epoch": 0.8787940242006355, "grad_norm": 1430.511962890625, "learning_rate": 3.0410990348452573e-05, "loss": 113.9206, "step": 39000 }, { "epoch": 0.8900606142544898, "grad_norm": 1990.450927734375, "learning_rate": 2.9877258050403212e-05, "loss": 118.977, "step": 39500 }, { "epoch": 0.901327204308344, "grad_norm": 875.0726928710938, "learning_rate": 2.9341204441673266e-05, "loss": 114.4106, "step": 40000 }, { "epoch": 0.9125937943621983, "grad_norm": 1655.7935791015625, "learning_rate": 2.8803084654747918e-05, "loss": 115.3111, "step": 40500 }, { "epoch": 0.9238603844160527, "grad_norm": 1956.72216796875, "learning_rate": 2.8263154805501297e-05, "loss": 117.3135, "step": 41000 }, { "epoch": 0.9351269744699069, "grad_norm": 937.9488525390625, "learning_rate": 2.7721671871299116e-05, "loss": 116.4852, "step": 41500 }, { "epoch": 0.9463935645237612, "grad_norm": 1579.9736328125, "learning_rate": 2.717889356869146e-05, "loss": 118.1533, "step": 42000 }, { "epoch": 0.9576601545776156, "grad_norm": 1284.749755859375, "learning_rate": 2.663507823075358e-05, "loss": 113.1541, "step": 42500 }, { "epoch": 0.9689267446314699, "grad_norm": 1050.632080078125, "learning_rate": 2.6090484684133404e-05, "loss": 115.3209, "step": 43000 }, { "epoch": 0.9801933346853241, "grad_norm": 1058.0616455078125, "learning_rate": 2.5545372125864032e-05, "loss": 119.323, "step": 43500 }, { "epoch": 0.9914599247391784, "grad_norm": 1310.6236572265625, "learning_rate": 2.5e-05, "loss": 111.4601, "step": 44000 }, { "epoch": 1.0, "eval_loss": 114.70393371582031, "eval_runtime": 1385.5714, "eval_samples_per_second": 14.238, "eval_steps_per_second": 3.56, "step": 44379 }, { "epoch": 1.0027265147930327, "grad_norm": 1699.930419921875, "learning_rate": 2.4454627874135974e-05, "loss": 111.6484, "step": 44500 }, { "epoch": 1.013993104846887, "grad_norm": 0.0, "learning_rate": 2.3909515315866605e-05, "loss": 111.7967, "step": 45000 }, { "epoch": 1.0252596949007413, "grad_norm": 1440.404052734375, "learning_rate": 2.3364921769246423e-05, "loss": 107.7449, "step": 45500 }, { "epoch": 1.0365262849545958, "grad_norm": 1180.1439208984375, "learning_rate": 2.2821106431308544e-05, "loss": 109.0394, "step": 46000 }, { "epoch": 1.04779287500845, "grad_norm": 1600.07568359375, "learning_rate": 2.2278328128700893e-05, "loss": 104.1056, "step": 46500 }, { "epoch": 1.0590594650623042, "grad_norm": 1731.480224609375, "learning_rate": 2.173684519449872e-05, "loss": 106.6714, "step": 47000 }, { "epoch": 1.0703260551161586, "grad_norm": 955.4271240234375, "learning_rate": 2.1196915345252084e-05, "loss": 105.6627, "step": 47500 }, { "epoch": 1.0815926451700129, "grad_norm": 2818.5283203125, "learning_rate": 2.0658795558326743e-05, "loss": 106.0866, "step": 48000 }, { "epoch": 1.092859235223867, "grad_norm": 806.58447265625, "learning_rate": 2.0122741949596797e-05, "loss": 111.0945, "step": 48500 }, { "epoch": 1.1041258252777215, "grad_norm": 1397.9573974609375, "learning_rate": 1.958900965154743e-05, "loss": 104.8343, "step": 49000 }, { "epoch": 1.1153924153315757, "grad_norm": 827.4343872070312, "learning_rate": 1.9057852691845677e-05, "loss": 103.2378, "step": 49500 }, { "epoch": 1.12665900538543, "grad_norm": 2430.602294921875, "learning_rate": 1.852952387243698e-05, "loss": 107.4349, "step": 50000 }, { "epoch": 1.1379255954392844, "grad_norm": 1653.0751953125, "learning_rate": 1.80042746492252e-05, "loss": 111.0688, "step": 50500 }, { "epoch": 1.1491921854931386, "grad_norm": 1318.4361572265625, "learning_rate": 1.7482355012393177e-05, "loss": 107.7423, "step": 51000 }, { "epoch": 1.160458775546993, "grad_norm": 1956.568603515625, "learning_rate": 1.6964013367420966e-05, "loss": 103.8008, "step": 51500 }, { "epoch": 1.1717253656008473, "grad_norm": 764.591552734375, "learning_rate": 1.6449496416858284e-05, "loss": 104.8202, "step": 52000 }, { "epoch": 1.1829919556547015, "grad_norm": 1741.2701416015625, "learning_rate": 1.5939049042907462e-05, "loss": 107.5796, "step": 52500 }, { "epoch": 1.194258545708556, "grad_norm": 2550.640625, "learning_rate": 1.5432914190872757e-05, "loss": 101.9078, "step": 53000 }, { "epoch": 1.2055251357624102, "grad_norm": 698.9768676757812, "learning_rate": 1.4931332753531574e-05, "loss": 108.1872, "step": 53500 }, { "epoch": 1.2167917258162644, "grad_norm": 3405.810546875, "learning_rate": 1.443454345648252e-05, "loss": 99.6448, "step": 54000 }, { "epoch": 1.2280583158701188, "grad_norm": 3267.17236328125, "learning_rate": 1.3942782744524973e-05, "loss": 108.0154, "step": 54500 }, { "epoch": 1.239324905923973, "grad_norm": 1127.849853515625, "learning_rate": 1.3456284669124158e-05, "loss": 100.0244, "step": 55000 }, { "epoch": 1.2505914959778273, "grad_norm": 1082.1241455078125, "learning_rate": 1.2975280777015314e-05, "loss": 104.4581, "step": 55500 }, { "epoch": 1.2618580860316817, "grad_norm": 1860.104248046875, "learning_rate": 1.2500000000000006e-05, "loss": 103.1496, "step": 56000 }, { "epoch": 1.273124676085536, "grad_norm": 1682.26708984375, "learning_rate": 1.2030668545986959e-05, "loss": 107.3074, "step": 56500 }, { "epoch": 1.2843912661393904, "grad_norm": 616.0757446289062, "learning_rate": 1.1567509791329401e-05, "loss": 102.4632, "step": 57000 }, { "epoch": 1.2956578561932446, "grad_norm": 624.0526733398438, "learning_rate": 1.1110744174509952e-05, "loss": 100.0244, "step": 57500 }, { "epoch": 1.3069244462470988, "grad_norm": 1387.1529541015625, "learning_rate": 1.0660589091223855e-05, "loss": 105.9446, "step": 58000 }, { "epoch": 1.318191036300953, "grad_norm": 1828.0877685546875, "learning_rate": 1.0217258790910448e-05, "loss": 104.6986, "step": 58500 }, { "epoch": 1.3294576263548075, "grad_norm": 650.5953369140625, "learning_rate": 9.780964274781984e-06, "loss": 105.4167, "step": 59000 }, { "epoch": 1.3407242164086617, "grad_norm": 996.328125, "learning_rate": 9.351913195398524e-06, "loss": 103.2462, "step": 59500 }, { "epoch": 1.3519908064625161, "grad_norm": 1035.3780517578125, "learning_rate": 8.930309757836517e-06, "loss": 103.7156, "step": 60000 }, { "epoch": 1.3632573965163703, "grad_norm": 1865.9571533203125, "learning_rate": 8.51635462249828e-06, "loss": 105.3619, "step": 60500 }, { "epoch": 1.3745239865702246, "grad_norm": 2139.88671875, "learning_rate": 8.110244809608495e-06, "loss": 103.2506, "step": 61000 }, { "epoch": 1.385790576624079, "grad_norm": 2942.61083984375, "learning_rate": 7.712173605443269e-06, "loss": 102.9886, "step": 61500 }, { "epoch": 1.3970571666779332, "grad_norm": 415.38153076171875, "learning_rate": 7.3223304703363135e-06, "loss": 102.6045, "step": 62000 }, { "epoch": 1.4083237567317877, "grad_norm": 1247.396728515625, "learning_rate": 6.940900948506113e-06, "loss": 102.5344, "step": 62500 }, { "epoch": 1.4195903467856419, "grad_norm": 740.3623046875, "learning_rate": 6.568066579746901e-06, "loss": 104.4504, "step": 63000 }, { "epoch": 1.430856936839496, "grad_norm": 1221.581298828125, "learning_rate": 6.204004813025568e-06, "loss": 101.7631, "step": 63500 }, { "epoch": 1.4421235268933503, "grad_norm": 1514.0115966796875, "learning_rate": 5.848888922025553e-06, "loss": 99.2285, "step": 64000 }, { "epoch": 1.4533901169472048, "grad_norm": 821.0801391601562, "learning_rate": 5.50288792267796e-06, "loss": 102.3846, "step": 64500 }, { "epoch": 1.464656707001059, "grad_norm": 1313.3104248046875, "learning_rate": 5.166166492719124e-06, "loss": 103.6035, "step": 65000 }, { "epoch": 1.4759232970549134, "grad_norm": 1329.42919921875, "learning_rate": 4.8388848933129335e-06, "loss": 104.2133, "step": 65500 }, { "epoch": 1.4871898871087676, "grad_norm": 3106.43505859375, "learning_rate": 4.521198892775203e-06, "loss": 100.1638, "step": 66000 }, { "epoch": 1.4984564771626219, "grad_norm": 996.626220703125, "learning_rate": 4.213259692436367e-06, "loss": 97.5407, "step": 66500 }, { "epoch": 1.5097230672164763, "grad_norm": 1821.3323974609375, "learning_rate": 3.9152138546778625e-06, "loss": 102.4095, "step": 67000 }, { "epoch": 1.5209896572703305, "grad_norm": 1764.0323486328125, "learning_rate": 3.6272032331763408e-06, "loss": 96.4079, "step": 67500 }, { "epoch": 1.532256247324185, "grad_norm": 1377.978271484375, "learning_rate": 3.3493649053890326e-06, "loss": 104.5846, "step": 68000 }, { "epoch": 1.5435228373780392, "grad_norm": 1291.5908203125, "learning_rate": 3.081831107312308e-06, "loss": 100.9904, "step": 68500 }, { "epoch": 1.5547894274318934, "grad_norm": 2216.6796875, "learning_rate": 2.8247291705444575e-06, "loss": 99.7059, "step": 69000 }, { "epoch": 1.5660560174857476, "grad_norm": 1271.5609130859375, "learning_rate": 2.578181461682794e-06, "loss": 99.0976, "step": 69500 }, { "epoch": 1.577322607539602, "grad_norm": 2712.908203125, "learning_rate": 2.3423053240837515e-06, "loss": 99.1392, "step": 70000 }, { "epoch": 1.5885891975934565, "grad_norm": 1653.12890625, "learning_rate": 2.1172130220138226e-06, "loss": 101.0942, "step": 70500 }, { "epoch": 1.5998557876473107, "grad_norm": 1816.012451171875, "learning_rate": 1.9030116872178316e-06, "loss": 99.6448, "step": 71000 }, { "epoch": 1.611122377701165, "grad_norm": 1233.8719482421875, "learning_rate": 1.6998032679300391e-06, "loss": 101.785, "step": 71500 }, { "epoch": 1.6223889677550192, "grad_norm": 988.7132568359375, "learning_rate": 1.5076844803522922e-06, "loss": 95.7829, "step": 72000 }, { "epoch": 1.6336555578088736, "grad_norm": 1261.22021484375, "learning_rate": 1.3267467626223606e-06, "loss": 105.201, "step": 72500 }, { "epoch": 1.6449221478627278, "grad_norm": 934.3240966796875, "learning_rate": 1.1570762312943295e-06, "loss": 98.9473, "step": 73000 }, { "epoch": 1.6561887379165823, "grad_norm": 664.5089111328125, "learning_rate": 9.98753640351785e-07, "loss": 95.5199, "step": 73500 }, { "epoch": 1.6674553279704365, "grad_norm": 1963.7371826171875, "learning_rate": 8.51854342773295e-07, "loss": 101.1854, "step": 74000 }, { "epoch": 1.6787219180242907, "grad_norm": 2443.130126953125, "learning_rate": 7.164482546684642e-07, "loss": 97.2825, "step": 74500 }, { "epoch": 1.689988508078145, "grad_norm": 1137.326416015625, "learning_rate": 5.925998220016659e-07, "loss": 100.3116, "step": 75000 }, { "epoch": 1.7012550981319994, "grad_norm": 1520.0399169921875, "learning_rate": 4.803679899192392e-07, "loss": 100.4226, "step": 75500 }, { "epoch": 1.7125216881858538, "grad_norm": 1065.4630126953125, "learning_rate": 3.7980617469479953e-07, "loss": 101.6046, "step": 76000 }, { "epoch": 1.723788278239708, "grad_norm": 1042.942138671875, "learning_rate": 2.909622383059835e-07, "loss": 99.1135, "step": 76500 }, { "epoch": 1.7350548682935623, "grad_norm": 841.5830078125, "learning_rate": 2.1387846565474045e-07, "loss": 101.7611, "step": 77000 }, { "epoch": 1.7463214583474165, "grad_norm": 3369.8330078125, "learning_rate": 1.4859154444200884e-07, "loss": 99.4417, "step": 77500 }, { "epoch": 1.757588048401271, "grad_norm": 2432.385498046875, "learning_rate": 9.513254770636137e-08, "loss": 100.2592, "step": 78000 }, { "epoch": 1.7688546384551251, "grad_norm": 1680.1993408203125, "learning_rate": 5.352691903491303e-08, "loss": 98.8878, "step": 78500 }, { "epoch": 1.7801212285089796, "grad_norm": 682.8377075195312, "learning_rate": 2.3794460453555047e-08, "loss": 97.455, "step": 79000 }, { "epoch": 1.7913878185628338, "grad_norm": 0.0, "learning_rate": 5.94932300227169e-09, "loss": 102.4157, "step": 79500 }, { "epoch": 1.802654408616688, "grad_norm": 3429.1298828125, "learning_rate": 0.0, "loss": 97.4711, "step": 80000 } ], "logging_steps": 500, "max_steps": 80000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }