diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,71113 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.999802839116719, + "eval_steps": 1268, + "global_step": 10144, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001971608832807571, + "grad_norm": 3.145049268342521, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.6851, + "step": 1 + }, + { + "epoch": 0.0001971608832807571, + "eval_loss": 0.6705650091171265, + "eval_runtime": 342.719, + "eval_samples_per_second": 23.722, + "eval_steps_per_second": 1.485, + "step": 1 + }, + { + "epoch": 0.0003943217665615142, + "grad_norm": 2.928706161869534, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6183, + "step": 2 + }, + { + "epoch": 0.0005914826498422713, + "grad_norm": 2.8063413659680485, + "learning_rate": 3e-06, + "loss": 0.6447, + "step": 3 + }, + { + "epoch": 0.0007886435331230284, + "grad_norm": 4.474770355859649, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6615, + "step": 4 + }, + { + "epoch": 0.0009858044164037854, + "grad_norm": 2.83892152250444, + "learning_rate": 5e-06, + "loss": 0.5571, + "step": 5 + }, + { + "epoch": 0.0011829652996845426, + "grad_norm": 3.174661998811544, + "learning_rate": 6e-06, + "loss": 0.6092, + "step": 6 + }, + { + "epoch": 0.0013801261829652998, + "grad_norm": 2.086667997610238, + "learning_rate": 7e-06, + "loss": 0.5469, + "step": 7 + }, + { + "epoch": 0.0015772870662460567, + "grad_norm": 2.4093348319308454, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6155, + "step": 8 + }, + { + "epoch": 0.001774447949526814, + "grad_norm": 1.753355191522501, + "learning_rate": 9e-06, + "loss": 0.5135, + "step": 9 + }, + { + "epoch": 0.001971608832807571, + "grad_norm": 1.813162138301549, + "learning_rate": 1e-05, + "loss": 0.5444, + "step": 10 + }, + { + "epoch": 0.002168769716088328, + "grad_norm": 1.5430930747728506, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.5032, + "step": 11 + }, + { + "epoch": 0.002365930599369085, + "grad_norm": 1.4956193189962286, + "learning_rate": 1.2e-05, + "loss": 0.5294, + "step": 12 + }, + { + "epoch": 0.0025630914826498424, + "grad_norm": 1.7344594420796293, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.5647, + "step": 13 + }, + { + "epoch": 0.0027602523659305996, + "grad_norm": 1.1966218841914706, + "learning_rate": 1.4e-05, + "loss": 0.5103, + "step": 14 + }, + { + "epoch": 0.0029574132492113563, + "grad_norm": 1.432228734486573, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.5, + "step": 15 + }, + { + "epoch": 0.0031545741324921135, + "grad_norm": 1.4645984670917185, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.4978, + "step": 16 + }, + { + "epoch": 0.0033517350157728706, + "grad_norm": 1.3563922951463974, + "learning_rate": 1.7e-05, + "loss": 0.506, + "step": 17 + }, + { + "epoch": 0.003548895899053628, + "grad_norm": 1.3871081225739526, + "learning_rate": 1.8e-05, + "loss": 0.5311, + "step": 18 + }, + { + "epoch": 0.003746056782334385, + "grad_norm": 1.280010346938333, + "learning_rate": 1.9e-05, + "loss": 0.5433, + "step": 19 + }, + { + "epoch": 0.003943217665615142, + "grad_norm": 2.284117117524146, + "learning_rate": 2e-05, + "loss": 0.5448, + "step": 20 + }, + { + "epoch": 0.004140378548895899, + "grad_norm": 1.6225424151126402, + "learning_rate": 1.9999999879870974e-05, + "loss": 0.5044, + "step": 21 + }, + { + "epoch": 0.004337539432176656, + "grad_norm": 1.6087069713987625, + "learning_rate": 1.99999995194839e-05, + "loss": 0.4712, + "step": 22 + }, + { + "epoch": 0.004534700315457414, + "grad_norm": 1.4815279243249726, + "learning_rate": 1.9999998918838782e-05, + "loss": 0.5043, + "step": 23 + }, + { + "epoch": 0.00473186119873817, + "grad_norm": 1.2421866810695292, + "learning_rate": 1.9999998077935636e-05, + "loss": 0.4933, + "step": 24 + }, + { + "epoch": 0.004929022082018927, + "grad_norm": 1.392319279479948, + "learning_rate": 1.9999996996774485e-05, + "loss": 0.4657, + "step": 25 + }, + { + "epoch": 0.005126182965299685, + "grad_norm": 1.613538190971824, + "learning_rate": 1.9999995675355352e-05, + "loss": 0.5023, + "step": 26 + }, + { + "epoch": 0.0053233438485804415, + "grad_norm": 1.3274753395366978, + "learning_rate": 1.999999411367827e-05, + "loss": 0.486, + "step": 27 + }, + { + "epoch": 0.005520504731861199, + "grad_norm": 1.4785198394383092, + "learning_rate": 1.9999992311743276e-05, + "loss": 0.4713, + "step": 28 + }, + { + "epoch": 0.005717665615141956, + "grad_norm": 1.4857486979381507, + "learning_rate": 1.9999990269550415e-05, + "loss": 0.4881, + "step": 29 + }, + { + "epoch": 0.005914826498422713, + "grad_norm": 1.36552629938149, + "learning_rate": 1.9999987987099734e-05, + "loss": 0.5463, + "step": 30 + }, + { + "epoch": 0.00611198738170347, + "grad_norm": 1.2130083488975558, + "learning_rate": 1.999998546439129e-05, + "loss": 0.5119, + "step": 31 + }, + { + "epoch": 0.006309148264984227, + "grad_norm": 1.4571092650286936, + "learning_rate": 1.999998270142514e-05, + "loss": 0.5107, + "step": 32 + }, + { + "epoch": 0.0065063091482649845, + "grad_norm": 1.3046426394488964, + "learning_rate": 1.9999979698201355e-05, + "loss": 0.5163, + "step": 33 + }, + { + "epoch": 0.006703470031545741, + "grad_norm": 1.3429233230107818, + "learning_rate": 1.999997645472e-05, + "loss": 0.4949, + "step": 34 + }, + { + "epoch": 0.006900630914826498, + "grad_norm": 1.1865584873872308, + "learning_rate": 1.9999972970981164e-05, + "loss": 0.5042, + "step": 35 + }, + { + "epoch": 0.007097791798107256, + "grad_norm": 1.2089421648447298, + "learning_rate": 1.999996924698492e-05, + "loss": 0.4996, + "step": 36 + }, + { + "epoch": 0.007294952681388012, + "grad_norm": 1.2171992786206751, + "learning_rate": 1.9999965282731364e-05, + "loss": 0.5232, + "step": 37 + }, + { + "epoch": 0.00749211356466877, + "grad_norm": 1.5903110532514266, + "learning_rate": 1.9999961078220587e-05, + "loss": 0.5206, + "step": 38 + }, + { + "epoch": 0.007689274447949527, + "grad_norm": 1.2539942673764422, + "learning_rate": 1.9999956633452696e-05, + "loss": 0.4656, + "step": 39 + }, + { + "epoch": 0.007886435331230283, + "grad_norm": 1.1722354147042853, + "learning_rate": 1.9999951948427793e-05, + "loss": 0.4811, + "step": 40 + }, + { + "epoch": 0.008083596214511041, + "grad_norm": 1.520380985973137, + "learning_rate": 1.9999947023145992e-05, + "loss": 0.4745, + "step": 41 + }, + { + "epoch": 0.008280757097791799, + "grad_norm": 1.181201745034852, + "learning_rate": 1.999994185760741e-05, + "loss": 0.5026, + "step": 42 + }, + { + "epoch": 0.008477917981072555, + "grad_norm": 1.2315559894723096, + "learning_rate": 1.9999936451812168e-05, + "loss": 0.5071, + "step": 43 + }, + { + "epoch": 0.008675078864353312, + "grad_norm": 1.0027266084772204, + "learning_rate": 1.9999930805760403e-05, + "loss": 0.4293, + "step": 44 + }, + { + "epoch": 0.00887223974763407, + "grad_norm": 1.5355646992423921, + "learning_rate": 1.999992491945225e-05, + "loss": 0.5075, + "step": 45 + }, + { + "epoch": 0.009069400630914827, + "grad_norm": 1.2141404668583387, + "learning_rate": 1.9999918792887844e-05, + "loss": 0.5002, + "step": 46 + }, + { + "epoch": 0.009266561514195583, + "grad_norm": 1.1874086565333581, + "learning_rate": 1.9999912426067335e-05, + "loss": 0.5214, + "step": 47 + }, + { + "epoch": 0.00946372239747634, + "grad_norm": 1.310468730323023, + "learning_rate": 1.999990581899088e-05, + "loss": 0.4954, + "step": 48 + }, + { + "epoch": 0.009660883280757098, + "grad_norm": 1.1678068139516498, + "learning_rate": 1.9999898971658632e-05, + "loss": 0.4874, + "step": 49 + }, + { + "epoch": 0.009858044164037854, + "grad_norm": 1.2436720803185994, + "learning_rate": 1.9999891884070764e-05, + "loss": 0.5053, + "step": 50 + }, + { + "epoch": 0.010055205047318612, + "grad_norm": 1.191814384498826, + "learning_rate": 1.999988455622744e-05, + "loss": 0.4437, + "step": 51 + }, + { + "epoch": 0.01025236593059937, + "grad_norm": 1.0535554735149129, + "learning_rate": 1.9999876988128832e-05, + "loss": 0.4572, + "step": 52 + }, + { + "epoch": 0.010449526813880125, + "grad_norm": 1.5861895667849393, + "learning_rate": 1.9999869179775126e-05, + "loss": 0.5274, + "step": 53 + }, + { + "epoch": 0.010646687697160883, + "grad_norm": 1.3410047214951284, + "learning_rate": 1.9999861131166513e-05, + "loss": 0.5134, + "step": 54 + }, + { + "epoch": 0.01084384858044164, + "grad_norm": 1.5169630762672912, + "learning_rate": 1.9999852842303183e-05, + "loss": 0.498, + "step": 55 + }, + { + "epoch": 0.011041009463722398, + "grad_norm": 1.221925524795734, + "learning_rate": 1.9999844313185335e-05, + "loss": 0.5275, + "step": 56 + }, + { + "epoch": 0.011238170347003154, + "grad_norm": 5.091704408593217, + "learning_rate": 1.9999835543813174e-05, + "loss": 0.4799, + "step": 57 + }, + { + "epoch": 0.011435331230283912, + "grad_norm": 2.4153909798169337, + "learning_rate": 1.9999826534186914e-05, + "loss": 0.4921, + "step": 58 + }, + { + "epoch": 0.01163249211356467, + "grad_norm": 1.547979859587359, + "learning_rate": 1.9999817284306766e-05, + "loss": 0.5229, + "step": 59 + }, + { + "epoch": 0.011829652996845425, + "grad_norm": 1.5690337913664598, + "learning_rate": 1.9999807794172955e-05, + "loss": 0.4735, + "step": 60 + }, + { + "epoch": 0.012026813880126183, + "grad_norm": 1.6033315849616043, + "learning_rate": 1.999979806378571e-05, + "loss": 0.5152, + "step": 61 + }, + { + "epoch": 0.01222397476340694, + "grad_norm": 1.2835879687259206, + "learning_rate": 1.9999788093145264e-05, + "loss": 0.5271, + "step": 62 + }, + { + "epoch": 0.012421135646687698, + "grad_norm": 1.2507778090991872, + "learning_rate": 1.9999777882251857e-05, + "loss": 0.4735, + "step": 63 + }, + { + "epoch": 0.012618296529968454, + "grad_norm": 1.0918550876107425, + "learning_rate": 1.999976743110573e-05, + "loss": 0.489, + "step": 64 + }, + { + "epoch": 0.012815457413249211, + "grad_norm": 1.3178360947947898, + "learning_rate": 1.999975673970714e-05, + "loss": 0.5017, + "step": 65 + }, + { + "epoch": 0.013012618296529969, + "grad_norm": 1.226739326525634, + "learning_rate": 1.9999745808056344e-05, + "loss": 0.4785, + "step": 66 + }, + { + "epoch": 0.013209779179810725, + "grad_norm": 1.1354158482899859, + "learning_rate": 1.99997346361536e-05, + "loss": 0.4727, + "step": 67 + }, + { + "epoch": 0.013406940063091483, + "grad_norm": 1.069961387810125, + "learning_rate": 1.9999723223999178e-05, + "loss": 0.4692, + "step": 68 + }, + { + "epoch": 0.01360410094637224, + "grad_norm": 0.9762520384137559, + "learning_rate": 1.999971157159335e-05, + "loss": 0.4922, + "step": 69 + }, + { + "epoch": 0.013801261829652996, + "grad_norm": 1.1048692949441026, + "learning_rate": 1.99996996789364e-05, + "loss": 0.4762, + "step": 70 + }, + { + "epoch": 0.013998422712933754, + "grad_norm": 1.4422307357408406, + "learning_rate": 1.9999687546028617e-05, + "loss": 0.517, + "step": 71 + }, + { + "epoch": 0.014195583596214511, + "grad_norm": 1.2130637644825035, + "learning_rate": 1.9999675172870286e-05, + "loss": 0.4549, + "step": 72 + }, + { + "epoch": 0.014392744479495269, + "grad_norm": 1.3283546807946351, + "learning_rate": 1.9999662559461704e-05, + "loss": 0.4912, + "step": 73 + }, + { + "epoch": 0.014589905362776025, + "grad_norm": 1.2835417017942559, + "learning_rate": 1.9999649705803178e-05, + "loss": 0.5079, + "step": 74 + }, + { + "epoch": 0.014787066246056782, + "grad_norm": 1.0779965377716199, + "learning_rate": 1.9999636611895018e-05, + "loss": 0.4633, + "step": 75 + }, + { + "epoch": 0.01498422712933754, + "grad_norm": 1.0321678770877212, + "learning_rate": 1.999962327773753e-05, + "loss": 0.4854, + "step": 76 + }, + { + "epoch": 0.015181388012618296, + "grad_norm": 1.2808800261718734, + "learning_rate": 1.9999609703331045e-05, + "loss": 0.4931, + "step": 77 + }, + { + "epoch": 0.015378548895899053, + "grad_norm": 1.3610125610716681, + "learning_rate": 1.999959588867588e-05, + "loss": 0.5009, + "step": 78 + }, + { + "epoch": 0.015575709779179811, + "grad_norm": 1.0655940435477698, + "learning_rate": 1.999958183377237e-05, + "loss": 0.4816, + "step": 79 + }, + { + "epoch": 0.015772870662460567, + "grad_norm": 1.2930806912423924, + "learning_rate": 1.999956753862086e-05, + "loss": 0.4807, + "step": 80 + }, + { + "epoch": 0.015970031545741326, + "grad_norm": 1.287162958203915, + "learning_rate": 1.9999553003221682e-05, + "loss": 0.4588, + "step": 81 + }, + { + "epoch": 0.016167192429022082, + "grad_norm": 1.1339051396074737, + "learning_rate": 1.9999538227575196e-05, + "loss": 0.5081, + "step": 82 + }, + { + "epoch": 0.016364353312302838, + "grad_norm": 1.1265294750507961, + "learning_rate": 1.9999523211681746e-05, + "loss": 0.4669, + "step": 83 + }, + { + "epoch": 0.016561514195583597, + "grad_norm": 1.1931313211356958, + "learning_rate": 1.99995079555417e-05, + "loss": 0.48, + "step": 84 + }, + { + "epoch": 0.016758675078864353, + "grad_norm": 0.999555485327999, + "learning_rate": 1.9999492459155424e-05, + "loss": 0.4786, + "step": 85 + }, + { + "epoch": 0.01695583596214511, + "grad_norm": 1.1669083683078765, + "learning_rate": 1.9999476722523287e-05, + "loss": 0.4581, + "step": 86 + }, + { + "epoch": 0.01715299684542587, + "grad_norm": 1.0996260118567778, + "learning_rate": 1.9999460745645673e-05, + "loss": 0.4871, + "step": 87 + }, + { + "epoch": 0.017350157728706624, + "grad_norm": 1.1438449589174153, + "learning_rate": 1.999944452852296e-05, + "loss": 0.4852, + "step": 88 + }, + { + "epoch": 0.01754731861198738, + "grad_norm": 1.0031606047710635, + "learning_rate": 1.9999428071155535e-05, + "loss": 0.5007, + "step": 89 + }, + { + "epoch": 0.01774447949526814, + "grad_norm": 1.3469859967905786, + "learning_rate": 1.9999411373543804e-05, + "loss": 0.5102, + "step": 90 + }, + { + "epoch": 0.017941640378548895, + "grad_norm": 1.1936672885104658, + "learning_rate": 1.9999394435688158e-05, + "loss": 0.5077, + "step": 91 + }, + { + "epoch": 0.018138801261829655, + "grad_norm": 1.3174882849640317, + "learning_rate": 1.9999377257589012e-05, + "loss": 0.4961, + "step": 92 + }, + { + "epoch": 0.01833596214511041, + "grad_norm": 1.2572224290689644, + "learning_rate": 1.9999359839246775e-05, + "loss": 0.498, + "step": 93 + }, + { + "epoch": 0.018533123028391166, + "grad_norm": 1.1310411895258263, + "learning_rate": 1.9999342180661863e-05, + "loss": 0.5175, + "step": 94 + }, + { + "epoch": 0.018730283911671926, + "grad_norm": 1.2365159429858565, + "learning_rate": 1.9999324281834705e-05, + "loss": 0.4588, + "step": 95 + }, + { + "epoch": 0.01892744479495268, + "grad_norm": 1.3269862941988586, + "learning_rate": 1.9999306142765726e-05, + "loss": 0.485, + "step": 96 + }, + { + "epoch": 0.019124605678233438, + "grad_norm": 1.1672780497391835, + "learning_rate": 1.9999287763455367e-05, + "loss": 0.4964, + "step": 97 + }, + { + "epoch": 0.019321766561514197, + "grad_norm": 1.2280978200722572, + "learning_rate": 1.9999269143904066e-05, + "loss": 0.5031, + "step": 98 + }, + { + "epoch": 0.019518927444794953, + "grad_norm": 1.1756264159886103, + "learning_rate": 1.999925028411227e-05, + "loss": 0.4558, + "step": 99 + }, + { + "epoch": 0.01971608832807571, + "grad_norm": 1.0797299658749118, + "learning_rate": 1.9999231184080434e-05, + "loss": 0.509, + "step": 100 + }, + { + "epoch": 0.019913249211356468, + "grad_norm": 1.5073253405939093, + "learning_rate": 1.9999211843809018e-05, + "loss": 0.4829, + "step": 101 + }, + { + "epoch": 0.020110410094637224, + "grad_norm": 1.22115492589763, + "learning_rate": 1.9999192263298485e-05, + "loss": 0.4954, + "step": 102 + }, + { + "epoch": 0.02030757097791798, + "grad_norm": 1.215526599323687, + "learning_rate": 1.9999172442549307e-05, + "loss": 0.4973, + "step": 103 + }, + { + "epoch": 0.02050473186119874, + "grad_norm": 1.0092097441473407, + "learning_rate": 1.9999152381561955e-05, + "loss": 0.461, + "step": 104 + }, + { + "epoch": 0.020701892744479495, + "grad_norm": 1.1251707920134784, + "learning_rate": 1.9999132080336915e-05, + "loss": 0.503, + "step": 105 + }, + { + "epoch": 0.02089905362776025, + "grad_norm": 1.0138460258654038, + "learning_rate": 1.9999111538874677e-05, + "loss": 0.4765, + "step": 106 + }, + { + "epoch": 0.02109621451104101, + "grad_norm": 1.2072707360767827, + "learning_rate": 1.999909075717573e-05, + "loss": 0.4894, + "step": 107 + }, + { + "epoch": 0.021293375394321766, + "grad_norm": 1.0013269764600548, + "learning_rate": 1.9999069735240578e-05, + "loss": 0.4957, + "step": 108 + }, + { + "epoch": 0.021490536277602525, + "grad_norm": 1.1353117571038165, + "learning_rate": 1.999904847306972e-05, + "loss": 0.5045, + "step": 109 + }, + { + "epoch": 0.02168769716088328, + "grad_norm": 1.057179146627742, + "learning_rate": 1.999902697066367e-05, + "loss": 0.4928, + "step": 110 + }, + { + "epoch": 0.021884858044164037, + "grad_norm": 0.9423530793571464, + "learning_rate": 1.999900522802295e-05, + "loss": 0.5128, + "step": 111 + }, + { + "epoch": 0.022082018927444796, + "grad_norm": 0.968781602059733, + "learning_rate": 1.9998983245148072e-05, + "loss": 0.4547, + "step": 112 + }, + { + "epoch": 0.022279179810725552, + "grad_norm": 0.9113728069244939, + "learning_rate": 1.999896102203957e-05, + "loss": 0.4814, + "step": 113 + }, + { + "epoch": 0.022476340694006308, + "grad_norm": 1.1042295670503781, + "learning_rate": 1.999893855869798e-05, + "loss": 0.4613, + "step": 114 + }, + { + "epoch": 0.022673501577287068, + "grad_norm": 1.034777939325516, + "learning_rate": 1.999891585512384e-05, + "loss": 0.5244, + "step": 115 + }, + { + "epoch": 0.022870662460567823, + "grad_norm": 0.9820709661002166, + "learning_rate": 1.999889291131769e-05, + "loss": 0.5079, + "step": 116 + }, + { + "epoch": 0.02306782334384858, + "grad_norm": 2.571070138444174, + "learning_rate": 1.9998869727280088e-05, + "loss": 0.4896, + "step": 117 + }, + { + "epoch": 0.02326498422712934, + "grad_norm": 1.5658204423287487, + "learning_rate": 1.9998846303011588e-05, + "loss": 0.4715, + "step": 118 + }, + { + "epoch": 0.023462145110410094, + "grad_norm": 0.9704506073467836, + "learning_rate": 1.9998822638512757e-05, + "loss": 0.4948, + "step": 119 + }, + { + "epoch": 0.02365930599369085, + "grad_norm": 1.8219918715734673, + "learning_rate": 1.9998798733784155e-05, + "loss": 0.4449, + "step": 120 + }, + { + "epoch": 0.02385646687697161, + "grad_norm": 1.3674996874754528, + "learning_rate": 1.9998774588826362e-05, + "loss": 0.4736, + "step": 121 + }, + { + "epoch": 0.024053627760252366, + "grad_norm": 1.0825809239978976, + "learning_rate": 1.999875020363996e-05, + "loss": 0.4462, + "step": 122 + }, + { + "epoch": 0.02425078864353312, + "grad_norm": 1.2402946359904268, + "learning_rate": 1.999872557822553e-05, + "loss": 0.464, + "step": 123 + }, + { + "epoch": 0.02444794952681388, + "grad_norm": 0.9734568557016353, + "learning_rate": 1.999870071258367e-05, + "loss": 0.5062, + "step": 124 + }, + { + "epoch": 0.024645110410094637, + "grad_norm": 1.32265424416242, + "learning_rate": 1.999867560671497e-05, + "loss": 0.458, + "step": 125 + }, + { + "epoch": 0.024842271293375396, + "grad_norm": 1.0916211714042101, + "learning_rate": 1.999865026062004e-05, + "loss": 0.4639, + "step": 126 + }, + { + "epoch": 0.025039432176656152, + "grad_norm": 1.0101120690939442, + "learning_rate": 1.999862467429948e-05, + "loss": 0.5054, + "step": 127 + }, + { + "epoch": 0.025236593059936908, + "grad_norm": 1.07583395846967, + "learning_rate": 1.9998598847753918e-05, + "loss": 0.4147, + "step": 128 + }, + { + "epoch": 0.025433753943217667, + "grad_norm": 1.0153450766886587, + "learning_rate": 1.999857278098396e-05, + "loss": 0.4608, + "step": 129 + }, + { + "epoch": 0.025630914826498423, + "grad_norm": 0.9636929458831386, + "learning_rate": 1.999854647399024e-05, + "loss": 0.4435, + "step": 130 + }, + { + "epoch": 0.02582807570977918, + "grad_norm": 0.976995842942242, + "learning_rate": 1.999851992677339e-05, + "loss": 0.468, + "step": 131 + }, + { + "epoch": 0.026025236593059938, + "grad_norm": 1.2900795634559068, + "learning_rate": 1.999849313933405e-05, + "loss": 0.4996, + "step": 132 + }, + { + "epoch": 0.026222397476340694, + "grad_norm": 1.053599579751552, + "learning_rate": 1.9998466111672856e-05, + "loss": 0.4973, + "step": 133 + }, + { + "epoch": 0.02641955835962145, + "grad_norm": 1.018304999333016, + "learning_rate": 1.9998438843790463e-05, + "loss": 0.4754, + "step": 134 + }, + { + "epoch": 0.02661671924290221, + "grad_norm": 0.9962199755386415, + "learning_rate": 1.9998411335687527e-05, + "loss": 0.5074, + "step": 135 + }, + { + "epoch": 0.026813880126182965, + "grad_norm": 1.0381641170143454, + "learning_rate": 1.9998383587364706e-05, + "loss": 0.5516, + "step": 136 + }, + { + "epoch": 0.02701104100946372, + "grad_norm": 1.0875760040456148, + "learning_rate": 1.999835559882267e-05, + "loss": 0.4803, + "step": 137 + }, + { + "epoch": 0.02720820189274448, + "grad_norm": 1.5281532146270271, + "learning_rate": 1.9998327370062086e-05, + "loss": 0.4919, + "step": 138 + }, + { + "epoch": 0.027405362776025236, + "grad_norm": 0.9062501284108182, + "learning_rate": 1.9998298901083637e-05, + "loss": 0.4639, + "step": 139 + }, + { + "epoch": 0.027602523659305992, + "grad_norm": 0.9073298345394162, + "learning_rate": 1.9998270191888002e-05, + "loss": 0.476, + "step": 140 + }, + { + "epoch": 0.02779968454258675, + "grad_norm": 0.8859849922750372, + "learning_rate": 1.9998241242475876e-05, + "loss": 0.4808, + "step": 141 + }, + { + "epoch": 0.027996845425867507, + "grad_norm": 9.232525438376628, + "learning_rate": 1.9998212052847955e-05, + "loss": 0.5524, + "step": 142 + }, + { + "epoch": 0.028194006309148267, + "grad_norm": 1.1971641286106405, + "learning_rate": 1.9998182623004935e-05, + "loss": 0.4861, + "step": 143 + }, + { + "epoch": 0.028391167192429023, + "grad_norm": 1.1014546150503517, + "learning_rate": 1.9998152952947526e-05, + "loss": 0.4994, + "step": 144 + }, + { + "epoch": 0.02858832807570978, + "grad_norm": 1.1251735110198886, + "learning_rate": 1.9998123042676444e-05, + "loss": 0.5235, + "step": 145 + }, + { + "epoch": 0.028785488958990538, + "grad_norm": 1.0757931397923415, + "learning_rate": 1.9998092892192403e-05, + "loss": 0.4805, + "step": 146 + }, + { + "epoch": 0.028982649842271294, + "grad_norm": 1.0601740904924608, + "learning_rate": 1.9998062501496126e-05, + "loss": 0.4652, + "step": 147 + }, + { + "epoch": 0.02917981072555205, + "grad_norm": 1.0857317459697904, + "learning_rate": 1.999803187058835e-05, + "loss": 0.4949, + "step": 148 + }, + { + "epoch": 0.02937697160883281, + "grad_norm": 1.2154921400920446, + "learning_rate": 1.99980009994698e-05, + "loss": 0.4988, + "step": 149 + }, + { + "epoch": 0.029574132492113565, + "grad_norm": 1.0127571620302098, + "learning_rate": 1.999796988814123e-05, + "loss": 0.4898, + "step": 150 + }, + { + "epoch": 0.02977129337539432, + "grad_norm": 0.9897255202956721, + "learning_rate": 1.9997938536603386e-05, + "loss": 0.5307, + "step": 151 + }, + { + "epoch": 0.02996845425867508, + "grad_norm": 20.686417789329063, + "learning_rate": 1.999790694485701e-05, + "loss": 0.551, + "step": 152 + }, + { + "epoch": 0.030165615141955836, + "grad_norm": 1.7162332300485896, + "learning_rate": 1.999787511290287e-05, + "loss": 0.5087, + "step": 153 + }, + { + "epoch": 0.03036277602523659, + "grad_norm": 1.3514560362581123, + "learning_rate": 1.999784304074173e-05, + "loss": 0.5216, + "step": 154 + }, + { + "epoch": 0.03055993690851735, + "grad_norm": 1.5606869153465206, + "learning_rate": 1.9997810728374362e-05, + "loss": 0.4768, + "step": 155 + }, + { + "epoch": 0.030757097791798107, + "grad_norm": 1.3648443694718702, + "learning_rate": 1.999777817580154e-05, + "loss": 0.494, + "step": 156 + }, + { + "epoch": 0.030954258675078863, + "grad_norm": 1.132691943500696, + "learning_rate": 1.9997745383024043e-05, + "loss": 0.4662, + "step": 157 + }, + { + "epoch": 0.031151419558359622, + "grad_norm": 1.2793599540460188, + "learning_rate": 1.9997712350042663e-05, + "loss": 0.4834, + "step": 158 + }, + { + "epoch": 0.03134858044164038, + "grad_norm": 1.1425965940998726, + "learning_rate": 1.9997679076858193e-05, + "loss": 0.4975, + "step": 159 + }, + { + "epoch": 0.031545741324921134, + "grad_norm": 1.1861645099171103, + "learning_rate": 1.9997645563471432e-05, + "loss": 0.4793, + "step": 160 + }, + { + "epoch": 0.03174290220820189, + "grad_norm": 1.0972295116283801, + "learning_rate": 1.9997611809883187e-05, + "loss": 0.4746, + "step": 161 + }, + { + "epoch": 0.03194006309148265, + "grad_norm": 1.066451222541386, + "learning_rate": 1.9997577816094266e-05, + "loss": 0.4907, + "step": 162 + }, + { + "epoch": 0.03213722397476341, + "grad_norm": 1.0553209971214357, + "learning_rate": 1.9997543582105484e-05, + "loss": 0.4623, + "step": 163 + }, + { + "epoch": 0.032334384858044164, + "grad_norm": 0.978588088915193, + "learning_rate": 1.999750910791767e-05, + "loss": 0.4758, + "step": 164 + }, + { + "epoch": 0.03253154574132492, + "grad_norm": 1.1746969879352085, + "learning_rate": 1.9997474393531648e-05, + "loss": 0.5184, + "step": 165 + }, + { + "epoch": 0.032728706624605676, + "grad_norm": 1.1684805326278327, + "learning_rate": 1.999743943894825e-05, + "loss": 0.5299, + "step": 166 + }, + { + "epoch": 0.03292586750788644, + "grad_norm": 1.0632876477663906, + "learning_rate": 1.999740424416832e-05, + "loss": 0.4858, + "step": 167 + }, + { + "epoch": 0.033123028391167195, + "grad_norm": 1.0392183455349258, + "learning_rate": 1.9997368809192704e-05, + "loss": 0.4637, + "step": 168 + }, + { + "epoch": 0.03332018927444795, + "grad_norm": 1.0283561237168466, + "learning_rate": 1.999733313402225e-05, + "loss": 0.5204, + "step": 169 + }, + { + "epoch": 0.033517350157728706, + "grad_norm": 1.1695473033049606, + "learning_rate": 1.999729721865782e-05, + "loss": 0.5182, + "step": 170 + }, + { + "epoch": 0.03371451104100946, + "grad_norm": 0.9872390177704469, + "learning_rate": 1.999726106310027e-05, + "loss": 0.4961, + "step": 171 + }, + { + "epoch": 0.03391167192429022, + "grad_norm": 2.3231197797384, + "learning_rate": 1.9997224667350474e-05, + "loss": 0.4806, + "step": 172 + }, + { + "epoch": 0.03410883280757098, + "grad_norm": 1.1146751680169542, + "learning_rate": 1.9997188031409302e-05, + "loss": 0.4741, + "step": 173 + }, + { + "epoch": 0.03430599369085174, + "grad_norm": 0.896181610550275, + "learning_rate": 1.9997151155277638e-05, + "loss": 0.4462, + "step": 174 + }, + { + "epoch": 0.03450315457413249, + "grad_norm": 0.9768565430868928, + "learning_rate": 1.9997114038956367e-05, + "loss": 0.4981, + "step": 175 + }, + { + "epoch": 0.03470031545741325, + "grad_norm": 0.9551378209936853, + "learning_rate": 1.999707668244638e-05, + "loss": 0.4674, + "step": 176 + }, + { + "epoch": 0.034897476340694004, + "grad_norm": 0.9851958907905455, + "learning_rate": 1.9997039085748576e-05, + "loss": 0.4604, + "step": 177 + }, + { + "epoch": 0.03509463722397476, + "grad_norm": 1.0161464160279716, + "learning_rate": 1.9997001248863858e-05, + "loss": 0.4541, + "step": 178 + }, + { + "epoch": 0.03529179810725552, + "grad_norm": 1.050476492771695, + "learning_rate": 1.9996963171793132e-05, + "loss": 0.4949, + "step": 179 + }, + { + "epoch": 0.03548895899053628, + "grad_norm": 1.1900294403214005, + "learning_rate": 1.999692485453732e-05, + "loss": 0.4815, + "step": 180 + }, + { + "epoch": 0.035686119873817035, + "grad_norm": 1.1031147141223552, + "learning_rate": 1.9996886297097335e-05, + "loss": 0.4862, + "step": 181 + }, + { + "epoch": 0.03588328075709779, + "grad_norm": 1.083256023575967, + "learning_rate": 1.9996847499474102e-05, + "loss": 0.4587, + "step": 182 + }, + { + "epoch": 0.03608044164037855, + "grad_norm": 0.9670193439952784, + "learning_rate": 1.9996808461668565e-05, + "loss": 0.4513, + "step": 183 + }, + { + "epoch": 0.03627760252365931, + "grad_norm": 0.8979692827898681, + "learning_rate": 1.999676918368165e-05, + "loss": 0.4517, + "step": 184 + }, + { + "epoch": 0.036474763406940065, + "grad_norm": 0.9969877135191793, + "learning_rate": 1.9996729665514306e-05, + "loss": 0.4933, + "step": 185 + }, + { + "epoch": 0.03667192429022082, + "grad_norm": 0.9692177718636533, + "learning_rate": 1.999668990716748e-05, + "loss": 0.5174, + "step": 186 + }, + { + "epoch": 0.03686908517350158, + "grad_norm": 0.8842091129334964, + "learning_rate": 1.999664990864213e-05, + "loss": 0.4742, + "step": 187 + }, + { + "epoch": 0.03706624605678233, + "grad_norm": 0.9359713100345171, + "learning_rate": 1.9996609669939214e-05, + "loss": 0.4754, + "step": 188 + }, + { + "epoch": 0.03726340694006309, + "grad_norm": 0.9191517387573663, + "learning_rate": 1.9996569191059705e-05, + "loss": 0.4416, + "step": 189 + }, + { + "epoch": 0.03746056782334385, + "grad_norm": 0.9570764893044853, + "learning_rate": 1.9996528472004567e-05, + "loss": 0.5044, + "step": 190 + }, + { + "epoch": 0.03765772870662461, + "grad_norm": 0.9708144686903228, + "learning_rate": 1.999648751277478e-05, + "loss": 0.4986, + "step": 191 + }, + { + "epoch": 0.03785488958990536, + "grad_norm": 0.9143041364090021, + "learning_rate": 1.9996446313371334e-05, + "loss": 0.4715, + "step": 192 + }, + { + "epoch": 0.03805205047318612, + "grad_norm": 1.0756442068587027, + "learning_rate": 1.9996404873795216e-05, + "loss": 0.466, + "step": 193 + }, + { + "epoch": 0.038249211356466875, + "grad_norm": 0.8509637591107831, + "learning_rate": 1.999636319404742e-05, + "loss": 0.4419, + "step": 194 + }, + { + "epoch": 0.03844637223974763, + "grad_norm": 0.8874791124952598, + "learning_rate": 1.9996321274128947e-05, + "loss": 0.4899, + "step": 195 + }, + { + "epoch": 0.038643533123028394, + "grad_norm": 0.9741286408649374, + "learning_rate": 1.9996279114040806e-05, + "loss": 0.4753, + "step": 196 + }, + { + "epoch": 0.03884069400630915, + "grad_norm": 0.9085224492157351, + "learning_rate": 1.999623671378401e-05, + "loss": 0.4701, + "step": 197 + }, + { + "epoch": 0.039037854889589906, + "grad_norm": 1.0258910989454444, + "learning_rate": 1.9996194073359576e-05, + "loss": 0.4888, + "step": 198 + }, + { + "epoch": 0.03923501577287066, + "grad_norm": 0.8126732806815207, + "learning_rate": 1.999615119276853e-05, + "loss": 0.4555, + "step": 199 + }, + { + "epoch": 0.03943217665615142, + "grad_norm": 0.8218884414629809, + "learning_rate": 1.99961080720119e-05, + "loss": 0.4769, + "step": 200 + }, + { + "epoch": 0.03962933753943218, + "grad_norm": 0.8991946813956051, + "learning_rate": 1.9996064711090727e-05, + "loss": 0.4588, + "step": 201 + }, + { + "epoch": 0.039826498422712936, + "grad_norm": 0.8566916147286883, + "learning_rate": 1.9996021110006046e-05, + "loss": 0.5019, + "step": 202 + }, + { + "epoch": 0.04002365930599369, + "grad_norm": 0.9756060213649459, + "learning_rate": 1.9995977268758912e-05, + "loss": 0.4746, + "step": 203 + }, + { + "epoch": 0.04022082018927445, + "grad_norm": 2.1550975546469253, + "learning_rate": 1.9995933187350372e-05, + "loss": 0.5151, + "step": 204 + }, + { + "epoch": 0.040417981072555204, + "grad_norm": 2.0312951209754386, + "learning_rate": 1.999588886578149e-05, + "loss": 0.4713, + "step": 205 + }, + { + "epoch": 0.04061514195583596, + "grad_norm": 1.0090851939886327, + "learning_rate": 1.9995844304053325e-05, + "loss": 0.4607, + "step": 206 + }, + { + "epoch": 0.04081230283911672, + "grad_norm": 1.2675954246551175, + "learning_rate": 1.9995799502166952e-05, + "loss": 0.4975, + "step": 207 + }, + { + "epoch": 0.04100946372239748, + "grad_norm": 1.218002669237388, + "learning_rate": 1.9995754460123445e-05, + "loss": 0.4877, + "step": 208 + }, + { + "epoch": 0.041206624605678234, + "grad_norm": 1.621757149431934, + "learning_rate": 1.999570917792389e-05, + "loss": 0.487, + "step": 209 + }, + { + "epoch": 0.04140378548895899, + "grad_norm": 1.0341497665720387, + "learning_rate": 1.999566365556937e-05, + "loss": 0.5016, + "step": 210 + }, + { + "epoch": 0.041600946372239746, + "grad_norm": 0.882195008783823, + "learning_rate": 1.9995617893060984e-05, + "loss": 0.4523, + "step": 211 + }, + { + "epoch": 0.0417981072555205, + "grad_norm": 0.9026396173054654, + "learning_rate": 1.9995571890399827e-05, + "loss": 0.4561, + "step": 212 + }, + { + "epoch": 0.041995268138801264, + "grad_norm": 1.2144739906269006, + "learning_rate": 1.9995525647587005e-05, + "loss": 0.5047, + "step": 213 + }, + { + "epoch": 0.04219242902208202, + "grad_norm": 0.975053328028896, + "learning_rate": 1.9995479164623633e-05, + "loss": 0.4736, + "step": 214 + }, + { + "epoch": 0.042389589905362776, + "grad_norm": 0.9231877842298208, + "learning_rate": 1.9995432441510824e-05, + "loss": 0.4898, + "step": 215 + }, + { + "epoch": 0.04258675078864353, + "grad_norm": 0.9144587574515857, + "learning_rate": 1.9995385478249697e-05, + "loss": 0.4796, + "step": 216 + }, + { + "epoch": 0.04278391167192429, + "grad_norm": 17.62666522642842, + "learning_rate": 1.999533827484139e-05, + "loss": 0.6332, + "step": 217 + }, + { + "epoch": 0.04298107255520505, + "grad_norm": 1.471356897407039, + "learning_rate": 1.9995290831287032e-05, + "loss": 0.4722, + "step": 218 + }, + { + "epoch": 0.04317823343848581, + "grad_norm": 11.254738838618625, + "learning_rate": 1.9995243147587758e-05, + "loss": 0.5691, + "step": 219 + }, + { + "epoch": 0.04337539432176656, + "grad_norm": 2.055362498950427, + "learning_rate": 1.999519522374472e-05, + "loss": 0.4621, + "step": 220 + }, + { + "epoch": 0.04357255520504732, + "grad_norm": 8.194523341432308, + "learning_rate": 1.999514705975907e-05, + "loss": 0.4652, + "step": 221 + }, + { + "epoch": 0.043769716088328074, + "grad_norm": 1.9182748476507985, + "learning_rate": 1.9995098655631957e-05, + "loss": 0.5171, + "step": 222 + }, + { + "epoch": 0.04396687697160883, + "grad_norm": 1.0480239186671922, + "learning_rate": 1.9995050011364557e-05, + "loss": 0.4751, + "step": 223 + }, + { + "epoch": 0.04416403785488959, + "grad_norm": 1.665247357622267, + "learning_rate": 1.9995001126958025e-05, + "loss": 0.4874, + "step": 224 + }, + { + "epoch": 0.04436119873817035, + "grad_norm": 1.2047598455545891, + "learning_rate": 1.999495200241355e-05, + "loss": 0.4712, + "step": 225 + }, + { + "epoch": 0.044558359621451105, + "grad_norm": 1.4064682020754653, + "learning_rate": 1.9994902637732295e-05, + "loss": 0.4641, + "step": 226 + }, + { + "epoch": 0.04475552050473186, + "grad_norm": 1.354286339165095, + "learning_rate": 1.999485303291546e-05, + "loss": 0.4793, + "step": 227 + }, + { + "epoch": 0.044952681388012616, + "grad_norm": 1.5121544057772933, + "learning_rate": 1.9994803187964233e-05, + "loss": 0.5025, + "step": 228 + }, + { + "epoch": 0.04514984227129337, + "grad_norm": 1.0667226038238984, + "learning_rate": 1.9994753102879807e-05, + "loss": 0.4352, + "step": 229 + }, + { + "epoch": 0.045347003154574135, + "grad_norm": 2.1283935213841714, + "learning_rate": 1.999470277766339e-05, + "loss": 0.4862, + "step": 230 + }, + { + "epoch": 0.04554416403785489, + "grad_norm": 0.9323160156698969, + "learning_rate": 1.9994652212316193e-05, + "loss": 0.4586, + "step": 231 + }, + { + "epoch": 0.04574132492113565, + "grad_norm": 1.2774644395318477, + "learning_rate": 1.9994601406839428e-05, + "loss": 0.5293, + "step": 232 + }, + { + "epoch": 0.0459384858044164, + "grad_norm": 0.8563504697504617, + "learning_rate": 1.9994550361234314e-05, + "loss": 0.4671, + "step": 233 + }, + { + "epoch": 0.04613564668769716, + "grad_norm": 1.2981705612870134, + "learning_rate": 1.9994499075502078e-05, + "loss": 0.4774, + "step": 234 + }, + { + "epoch": 0.04633280757097792, + "grad_norm": 0.9650905057900812, + "learning_rate": 1.999444754964395e-05, + "loss": 0.4785, + "step": 235 + }, + { + "epoch": 0.04652996845425868, + "grad_norm": 0.9892482279995838, + "learning_rate": 1.9994395783661177e-05, + "loss": 0.478, + "step": 236 + }, + { + "epoch": 0.04672712933753943, + "grad_norm": 1.0372935377294314, + "learning_rate": 1.9994343777554995e-05, + "loss": 0.4965, + "step": 237 + }, + { + "epoch": 0.04692429022082019, + "grad_norm": 1.2079564548732702, + "learning_rate": 1.9994291531326656e-05, + "loss": 0.4444, + "step": 238 + }, + { + "epoch": 0.047121451104100945, + "grad_norm": 0.9768363136576569, + "learning_rate": 1.999423904497741e-05, + "loss": 0.4985, + "step": 239 + }, + { + "epoch": 0.0473186119873817, + "grad_norm": 1.7614504439224654, + "learning_rate": 1.999418631850853e-05, + "loss": 0.4653, + "step": 240 + }, + { + "epoch": 0.047515772870662464, + "grad_norm": 1.1032925804972276, + "learning_rate": 1.9994133351921274e-05, + "loss": 0.4659, + "step": 241 + }, + { + "epoch": 0.04771293375394322, + "grad_norm": 1.2804412541262307, + "learning_rate": 1.9994080145216908e-05, + "loss": 0.4483, + "step": 242 + }, + { + "epoch": 0.047910094637223975, + "grad_norm": 2.294484377346188, + "learning_rate": 1.9994026698396727e-05, + "loss": 0.4887, + "step": 243 + }, + { + "epoch": 0.04810725552050473, + "grad_norm": 1.118613544316812, + "learning_rate": 1.9993973011462004e-05, + "loss": 0.4775, + "step": 244 + }, + { + "epoch": 0.04830441640378549, + "grad_norm": 0.8403959866676638, + "learning_rate": 1.999391908441403e-05, + "loss": 0.4442, + "step": 245 + }, + { + "epoch": 0.04850157728706624, + "grad_norm": 1.0300870576969843, + "learning_rate": 1.9993864917254103e-05, + "loss": 0.4462, + "step": 246 + }, + { + "epoch": 0.048698738170347006, + "grad_norm": 1.1077983845276296, + "learning_rate": 1.9993810509983524e-05, + "loss": 0.4789, + "step": 247 + }, + { + "epoch": 0.04889589905362776, + "grad_norm": 0.9138468109346035, + "learning_rate": 1.9993755862603597e-05, + "loss": 0.4638, + "step": 248 + }, + { + "epoch": 0.04909305993690852, + "grad_norm": 0.9955167957819017, + "learning_rate": 1.9993700975115636e-05, + "loss": 0.5144, + "step": 249 + }, + { + "epoch": 0.04929022082018927, + "grad_norm": 0.9520927314114271, + "learning_rate": 1.9993645847520965e-05, + "loss": 0.4976, + "step": 250 + }, + { + "epoch": 0.04948738170347003, + "grad_norm": 1.2526305722815652, + "learning_rate": 1.9993590479820906e-05, + "loss": 0.5092, + "step": 251 + }, + { + "epoch": 0.04968454258675079, + "grad_norm": 0.8493530635842205, + "learning_rate": 1.9993534872016784e-05, + "loss": 0.4517, + "step": 252 + }, + { + "epoch": 0.04988170347003155, + "grad_norm": 0.9146559563429385, + "learning_rate": 1.999347902410994e-05, + "loss": 0.5143, + "step": 253 + }, + { + "epoch": 0.050078864353312304, + "grad_norm": 1.2331946908319966, + "learning_rate": 1.9993422936101715e-05, + "loss": 0.5273, + "step": 254 + }, + { + "epoch": 0.05027602523659306, + "grad_norm": 0.9053104001937419, + "learning_rate": 1.9993366607993457e-05, + "loss": 0.4623, + "step": 255 + }, + { + "epoch": 0.050473186119873815, + "grad_norm": 1.0114462146919174, + "learning_rate": 1.999331003978652e-05, + "loss": 0.484, + "step": 256 + }, + { + "epoch": 0.05067034700315457, + "grad_norm": 0.8468019047992964, + "learning_rate": 1.9993253231482258e-05, + "loss": 0.477, + "step": 257 + }, + { + "epoch": 0.050867507886435334, + "grad_norm": 0.9113246734228752, + "learning_rate": 1.999319618308204e-05, + "loss": 0.4913, + "step": 258 + }, + { + "epoch": 0.05106466876971609, + "grad_norm": 0.8740898510256843, + "learning_rate": 1.999313889458724e-05, + "loss": 0.504, + "step": 259 + }, + { + "epoch": 0.051261829652996846, + "grad_norm": 0.8543091738206727, + "learning_rate": 1.9993081365999228e-05, + "loss": 0.4752, + "step": 260 + }, + { + "epoch": 0.0514589905362776, + "grad_norm": 0.8447390016252053, + "learning_rate": 1.9993023597319387e-05, + "loss": 0.4582, + "step": 261 + }, + { + "epoch": 0.05165615141955836, + "grad_norm": 0.8559956935990739, + "learning_rate": 1.999296558854911e-05, + "loss": 0.4567, + "step": 262 + }, + { + "epoch": 0.051853312302839114, + "grad_norm": 0.8497707119769089, + "learning_rate": 1.9992907339689786e-05, + "loss": 0.4681, + "step": 263 + }, + { + "epoch": 0.052050473186119876, + "grad_norm": 1.2959384495585367, + "learning_rate": 1.9992848850742817e-05, + "loss": 0.4612, + "step": 264 + }, + { + "epoch": 0.05224763406940063, + "grad_norm": 0.9252677800758691, + "learning_rate": 1.9992790121709604e-05, + "loss": 0.4969, + "step": 265 + }, + { + "epoch": 0.05244479495268139, + "grad_norm": 0.9986026073347362, + "learning_rate": 1.9992731152591563e-05, + "loss": 0.483, + "step": 266 + }, + { + "epoch": 0.052641955835962144, + "grad_norm": 0.8766211673811323, + "learning_rate": 1.999267194339011e-05, + "loss": 0.4952, + "step": 267 + }, + { + "epoch": 0.0528391167192429, + "grad_norm": 0.9062026471143431, + "learning_rate": 1.9992612494106666e-05, + "loss": 0.4922, + "step": 268 + }, + { + "epoch": 0.05303627760252366, + "grad_norm": 1.2051311308802126, + "learning_rate": 1.999255280474266e-05, + "loss": 0.4528, + "step": 269 + }, + { + "epoch": 0.05323343848580442, + "grad_norm": 0.9481095486211566, + "learning_rate": 1.9992492875299528e-05, + "loss": 0.4676, + "step": 270 + }, + { + "epoch": 0.053430599369085174, + "grad_norm": 1.0581042227575974, + "learning_rate": 1.9992432705778703e-05, + "loss": 0.5275, + "step": 271 + }, + { + "epoch": 0.05362776025236593, + "grad_norm": 1.0862738326883588, + "learning_rate": 1.9992372296181637e-05, + "loss": 0.4802, + "step": 272 + }, + { + "epoch": 0.053824921135646686, + "grad_norm": 1.1530116773068577, + "learning_rate": 1.999231164650978e-05, + "loss": 0.4963, + "step": 273 + }, + { + "epoch": 0.05402208201892744, + "grad_norm": 1.1406385581126395, + "learning_rate": 1.999225075676459e-05, + "loss": 0.5017, + "step": 274 + }, + { + "epoch": 0.054219242902208205, + "grad_norm": 0.9840133715884154, + "learning_rate": 1.9992189626947528e-05, + "loss": 0.4184, + "step": 275 + }, + { + "epoch": 0.05441640378548896, + "grad_norm": 0.8369535854434705, + "learning_rate": 1.9992128257060064e-05, + "loss": 0.4766, + "step": 276 + }, + { + "epoch": 0.05461356466876972, + "grad_norm": 0.9733589505757138, + "learning_rate": 1.999206664710367e-05, + "loss": 0.4783, + "step": 277 + }, + { + "epoch": 0.05481072555205047, + "grad_norm": 1.1016083321043935, + "learning_rate": 1.999200479707983e-05, + "loss": 0.5023, + "step": 278 + }, + { + "epoch": 0.05500788643533123, + "grad_norm": 0.9453559986774039, + "learning_rate": 1.9991942706990028e-05, + "loss": 0.4997, + "step": 279 + }, + { + "epoch": 0.055205047318611984, + "grad_norm": 1.1205398036178456, + "learning_rate": 1.999188037683576e-05, + "loss": 0.4918, + "step": 280 + }, + { + "epoch": 0.05540220820189275, + "grad_norm": 0.8547868381519064, + "learning_rate": 1.9991817806618512e-05, + "loss": 0.4691, + "step": 281 + }, + { + "epoch": 0.0555993690851735, + "grad_norm": 1.101644837851159, + "learning_rate": 1.99917549963398e-05, + "loss": 0.478, + "step": 282 + }, + { + "epoch": 0.05579652996845426, + "grad_norm": 0.9648045124396167, + "learning_rate": 1.9991691946001123e-05, + "loss": 0.451, + "step": 283 + }, + { + "epoch": 0.055993690851735015, + "grad_norm": 0.9890972584569301, + "learning_rate": 1.9991628655604006e-05, + "loss": 0.4688, + "step": 284 + }, + { + "epoch": 0.05619085173501577, + "grad_norm": 1.087262812823818, + "learning_rate": 1.999156512514996e-05, + "loss": 0.4974, + "step": 285 + }, + { + "epoch": 0.05638801261829653, + "grad_norm": 1.1458887553291872, + "learning_rate": 1.9991501354640517e-05, + "loss": 0.4982, + "step": 286 + }, + { + "epoch": 0.05658517350157729, + "grad_norm": 0.8822768555579248, + "learning_rate": 1.9991437344077212e-05, + "loss": 0.4486, + "step": 287 + }, + { + "epoch": 0.056782334384858045, + "grad_norm": 1.2313826346986292, + "learning_rate": 1.9991373093461574e-05, + "loss": 0.4873, + "step": 288 + }, + { + "epoch": 0.0569794952681388, + "grad_norm": 0.8882605229939141, + "learning_rate": 1.9991308602795156e-05, + "loss": 0.4553, + "step": 289 + }, + { + "epoch": 0.05717665615141956, + "grad_norm": 1.0115742101244216, + "learning_rate": 1.9991243872079495e-05, + "loss": 0.4561, + "step": 290 + }, + { + "epoch": 0.05737381703470031, + "grad_norm": 0.9908774101006372, + "learning_rate": 1.999117890131616e-05, + "loss": 0.4427, + "step": 291 + }, + { + "epoch": 0.057570977917981075, + "grad_norm": 0.8561287541214222, + "learning_rate": 1.9991113690506705e-05, + "loss": 0.4268, + "step": 292 + }, + { + "epoch": 0.05776813880126183, + "grad_norm": 1.0671041225310995, + "learning_rate": 1.99910482396527e-05, + "loss": 0.48, + "step": 293 + }, + { + "epoch": 0.05796529968454259, + "grad_norm": 0.9685606768815737, + "learning_rate": 1.9990982548755712e-05, + "loss": 0.5016, + "step": 294 + }, + { + "epoch": 0.05816246056782334, + "grad_norm": 0.9888136254935049, + "learning_rate": 1.9990916617817323e-05, + "loss": 0.5133, + "step": 295 + }, + { + "epoch": 0.0583596214511041, + "grad_norm": 0.9279140857322566, + "learning_rate": 1.9990850446839114e-05, + "loss": 0.4645, + "step": 296 + }, + { + "epoch": 0.058556782334384855, + "grad_norm": 0.9503914334506955, + "learning_rate": 1.999078403582268e-05, + "loss": 0.4442, + "step": 297 + }, + { + "epoch": 0.05875394321766562, + "grad_norm": 0.8179461934678155, + "learning_rate": 1.9990717384769617e-05, + "loss": 0.4266, + "step": 298 + }, + { + "epoch": 0.058951104100946373, + "grad_norm": 0.9592664796746861, + "learning_rate": 1.9990650493681517e-05, + "loss": 0.4843, + "step": 299 + }, + { + "epoch": 0.05914826498422713, + "grad_norm": 0.9196227211585776, + "learning_rate": 1.999058336256e-05, + "loss": 0.4678, + "step": 300 + }, + { + "epoch": 0.059345425867507885, + "grad_norm": 1.2796468769464633, + "learning_rate": 1.9990515991406666e-05, + "loss": 0.4968, + "step": 301 + }, + { + "epoch": 0.05954258675078864, + "grad_norm": 0.9210540437615927, + "learning_rate": 1.9990448380223145e-05, + "loss": 0.4315, + "step": 302 + }, + { + "epoch": 0.059739747634069404, + "grad_norm": 0.9150812147912519, + "learning_rate": 1.9990380529011056e-05, + "loss": 0.4992, + "step": 303 + }, + { + "epoch": 0.05993690851735016, + "grad_norm": 1.0184465463567445, + "learning_rate": 1.9990312437772025e-05, + "loss": 0.5149, + "step": 304 + }, + { + "epoch": 0.060134069400630916, + "grad_norm": 0.909544636631595, + "learning_rate": 1.99902441065077e-05, + "loss": 0.4802, + "step": 305 + }, + { + "epoch": 0.06033123028391167, + "grad_norm": 1.0603662944162957, + "learning_rate": 1.9990175535219708e-05, + "loss": 0.4475, + "step": 306 + }, + { + "epoch": 0.06052839116719243, + "grad_norm": 1.241117709798254, + "learning_rate": 1.999010672390971e-05, + "loss": 0.477, + "step": 307 + }, + { + "epoch": 0.06072555205047318, + "grad_norm": 1.035832208551567, + "learning_rate": 1.9990037672579347e-05, + "loss": 0.4822, + "step": 308 + }, + { + "epoch": 0.060922712933753946, + "grad_norm": 0.9408863869538917, + "learning_rate": 1.9989968381230288e-05, + "loss": 0.4862, + "step": 309 + }, + { + "epoch": 0.0611198738170347, + "grad_norm": 0.9296848076140912, + "learning_rate": 1.998989884986419e-05, + "loss": 0.5125, + "step": 310 + }, + { + "epoch": 0.06131703470031546, + "grad_norm": 0.8918476259653481, + "learning_rate": 1.998982907848273e-05, + "loss": 0.4937, + "step": 311 + }, + { + "epoch": 0.061514195583596214, + "grad_norm": 1.4848687664773468, + "learning_rate": 1.9989759067087582e-05, + "loss": 0.4865, + "step": 312 + }, + { + "epoch": 0.06171135646687697, + "grad_norm": 0.8732760816687123, + "learning_rate": 1.998968881568043e-05, + "loss": 0.4778, + "step": 313 + }, + { + "epoch": 0.061908517350157725, + "grad_norm": 1.1341625184193884, + "learning_rate": 1.998961832426295e-05, + "loss": 0.4798, + "step": 314 + }, + { + "epoch": 0.06210567823343849, + "grad_norm": 0.8993091141038897, + "learning_rate": 1.9989547592836853e-05, + "loss": 0.4292, + "step": 315 + }, + { + "epoch": 0.062302839116719244, + "grad_norm": 0.8850853933714552, + "learning_rate": 1.998947662140383e-05, + "loss": 0.4594, + "step": 316 + }, + { + "epoch": 0.0625, + "grad_norm": 1.1127472937800988, + "learning_rate": 1.9989405409965585e-05, + "loss": 0.4943, + "step": 317 + }, + { + "epoch": 0.06269716088328076, + "grad_norm": 0.866897163477894, + "learning_rate": 1.998933395852383e-05, + "loss": 0.4633, + "step": 318 + }, + { + "epoch": 0.06289432176656151, + "grad_norm": 0.9547891953542145, + "learning_rate": 1.9989262267080284e-05, + "loss": 0.5155, + "step": 319 + }, + { + "epoch": 0.06309148264984227, + "grad_norm": 5.818229069199477, + "learning_rate": 1.998919033563667e-05, + "loss": 0.5422, + "step": 320 + }, + { + "epoch": 0.06328864353312302, + "grad_norm": 1.5240274072221687, + "learning_rate": 1.998911816419471e-05, + "loss": 0.4835, + "step": 321 + }, + { + "epoch": 0.06348580441640378, + "grad_norm": 0.9717985560653184, + "learning_rate": 1.9989045752756145e-05, + "loss": 0.4848, + "step": 322 + }, + { + "epoch": 0.06368296529968455, + "grad_norm": 1.1194941379469816, + "learning_rate": 1.998897310132271e-05, + "loss": 0.465, + "step": 323 + }, + { + "epoch": 0.0638801261829653, + "grad_norm": 1.3015806776449268, + "learning_rate": 1.9988900209896148e-05, + "loss": 0.5076, + "step": 324 + }, + { + "epoch": 0.06407728706624606, + "grad_norm": 1.3495583310297246, + "learning_rate": 1.998882707847822e-05, + "loss": 0.4891, + "step": 325 + }, + { + "epoch": 0.06427444794952682, + "grad_norm": 1.094314000355448, + "learning_rate": 1.9988753707070675e-05, + "loss": 0.4603, + "step": 326 + }, + { + "epoch": 0.06447160883280757, + "grad_norm": 0.8976180746045599, + "learning_rate": 1.998868009567528e-05, + "loss": 0.4852, + "step": 327 + }, + { + "epoch": 0.06466876971608833, + "grad_norm": 1.176456935066941, + "learning_rate": 1.99886062442938e-05, + "loss": 0.4505, + "step": 328 + }, + { + "epoch": 0.06486593059936908, + "grad_norm": 0.8621831797378902, + "learning_rate": 1.9988532152928012e-05, + "loss": 0.4521, + "step": 329 + }, + { + "epoch": 0.06506309148264984, + "grad_norm": 1.1751922853480894, + "learning_rate": 1.9988457821579698e-05, + "loss": 0.4684, + "step": 330 + }, + { + "epoch": 0.0652602523659306, + "grad_norm": 1.214299413735334, + "learning_rate": 1.9988383250250636e-05, + "loss": 0.5136, + "step": 331 + }, + { + "epoch": 0.06545741324921135, + "grad_norm": 1.3935797324505006, + "learning_rate": 1.9988308438942626e-05, + "loss": 0.543, + "step": 332 + }, + { + "epoch": 0.06565457413249211, + "grad_norm": 1.3113325461742111, + "learning_rate": 1.9988233387657462e-05, + "loss": 0.469, + "step": 333 + }, + { + "epoch": 0.06585173501577288, + "grad_norm": 0.9832962911329318, + "learning_rate": 1.9988158096396945e-05, + "loss": 0.4977, + "step": 334 + }, + { + "epoch": 0.06604889589905363, + "grad_norm": 1.4696155006597584, + "learning_rate": 1.998808256516289e-05, + "loss": 0.5422, + "step": 335 + }, + { + "epoch": 0.06624605678233439, + "grad_norm": 1.1348175666726867, + "learning_rate": 1.9988006793957106e-05, + "loss": 0.4752, + "step": 336 + }, + { + "epoch": 0.06644321766561515, + "grad_norm": 1.5792191486101228, + "learning_rate": 1.9987930782781416e-05, + "loss": 0.5261, + "step": 337 + }, + { + "epoch": 0.0666403785488959, + "grad_norm": 1.0368934464451127, + "learning_rate": 1.9987854531637644e-05, + "loss": 0.4642, + "step": 338 + }, + { + "epoch": 0.06683753943217666, + "grad_norm": 1.0337884912885194, + "learning_rate": 1.9987778040527625e-05, + "loss": 0.4755, + "step": 339 + }, + { + "epoch": 0.06703470031545741, + "grad_norm": 1.133861117613178, + "learning_rate": 1.9987701309453195e-05, + "loss": 0.4828, + "step": 340 + }, + { + "epoch": 0.06723186119873817, + "grad_norm": 1.111075907788898, + "learning_rate": 1.99876243384162e-05, + "loss": 0.4941, + "step": 341 + }, + { + "epoch": 0.06742902208201892, + "grad_norm": 1.2284579964954567, + "learning_rate": 1.9987547127418485e-05, + "loss": 0.4999, + "step": 342 + }, + { + "epoch": 0.06762618296529968, + "grad_norm": 1.0071834128462238, + "learning_rate": 1.9987469676461904e-05, + "loss": 0.4132, + "step": 343 + }, + { + "epoch": 0.06782334384858044, + "grad_norm": 1.1346939479337024, + "learning_rate": 1.9987391985548326e-05, + "loss": 0.5326, + "step": 344 + }, + { + "epoch": 0.06802050473186119, + "grad_norm": 0.8817839707262805, + "learning_rate": 1.9987314054679615e-05, + "loss": 0.4856, + "step": 345 + }, + { + "epoch": 0.06821766561514196, + "grad_norm": 0.967996798695509, + "learning_rate": 1.998723588385764e-05, + "loss": 0.5015, + "step": 346 + }, + { + "epoch": 0.06841482649842272, + "grad_norm": 1.2535242900070593, + "learning_rate": 1.9987157473084276e-05, + "loss": 0.4721, + "step": 347 + }, + { + "epoch": 0.06861198738170347, + "grad_norm": 0.9971572384429465, + "learning_rate": 1.9987078822361412e-05, + "loss": 0.4866, + "step": 348 + }, + { + "epoch": 0.06880914826498423, + "grad_norm": 0.9880019131723238, + "learning_rate": 1.9986999931690937e-05, + "loss": 0.4217, + "step": 349 + }, + { + "epoch": 0.06900630914826499, + "grad_norm": 0.8510336475538133, + "learning_rate": 1.9986920801074747e-05, + "loss": 0.4997, + "step": 350 + }, + { + "epoch": 0.06920347003154574, + "grad_norm": 1.3697939147272762, + "learning_rate": 1.9986841430514743e-05, + "loss": 0.4791, + "step": 351 + }, + { + "epoch": 0.0694006309148265, + "grad_norm": 0.9037413372344649, + "learning_rate": 1.9986761820012833e-05, + "loss": 0.4642, + "step": 352 + }, + { + "epoch": 0.06959779179810725, + "grad_norm": 1.1537513355491777, + "learning_rate": 1.9986681969570924e-05, + "loss": 0.4693, + "step": 353 + }, + { + "epoch": 0.06979495268138801, + "grad_norm": 0.7969873717195902, + "learning_rate": 1.9986601879190938e-05, + "loss": 0.4309, + "step": 354 + }, + { + "epoch": 0.06999211356466876, + "grad_norm": 1.0506442175587962, + "learning_rate": 1.9986521548874802e-05, + "loss": 0.4575, + "step": 355 + }, + { + "epoch": 0.07018927444794952, + "grad_norm": 0.958839633619317, + "learning_rate": 1.9986440978624444e-05, + "loss": 0.4848, + "step": 356 + }, + { + "epoch": 0.07038643533123029, + "grad_norm": 0.8487815542536234, + "learning_rate": 1.9986360168441798e-05, + "loss": 0.4346, + "step": 357 + }, + { + "epoch": 0.07058359621451105, + "grad_norm": 0.9101190136837835, + "learning_rate": 1.998627911832881e-05, + "loss": 0.4798, + "step": 358 + }, + { + "epoch": 0.0707807570977918, + "grad_norm": 0.8359817460302178, + "learning_rate": 1.998619782828742e-05, + "loss": 0.4457, + "step": 359 + }, + { + "epoch": 0.07097791798107256, + "grad_norm": 0.9727976821516041, + "learning_rate": 1.9986116298319585e-05, + "loss": 0.5337, + "step": 360 + }, + { + "epoch": 0.07117507886435331, + "grad_norm": 0.7658661529985589, + "learning_rate": 1.998603452842727e-05, + "loss": 0.4475, + "step": 361 + }, + { + "epoch": 0.07137223974763407, + "grad_norm": 0.9267521801472242, + "learning_rate": 1.998595251861243e-05, + "loss": 0.5341, + "step": 362 + }, + { + "epoch": 0.07156940063091483, + "grad_norm": 1.5288633084722636, + "learning_rate": 1.998587026887704e-05, + "loss": 0.4811, + "step": 363 + }, + { + "epoch": 0.07176656151419558, + "grad_norm": 1.1549526328305526, + "learning_rate": 1.9985787779223073e-05, + "loss": 0.4785, + "step": 364 + }, + { + "epoch": 0.07196372239747634, + "grad_norm": 1.0840635546798056, + "learning_rate": 1.9985705049652513e-05, + "loss": 0.499, + "step": 365 + }, + { + "epoch": 0.0721608832807571, + "grad_norm": 0.779747962979529, + "learning_rate": 1.998562208016735e-05, + "loss": 0.4663, + "step": 366 + }, + { + "epoch": 0.07235804416403785, + "grad_norm": 1.1129287322836847, + "learning_rate": 1.9985538870769573e-05, + "loss": 0.4727, + "step": 367 + }, + { + "epoch": 0.07255520504731862, + "grad_norm": 0.853670529824468, + "learning_rate": 1.9985455421461183e-05, + "loss": 0.4947, + "step": 368 + }, + { + "epoch": 0.07275236593059937, + "grad_norm": 0.924674407680012, + "learning_rate": 1.9985371732244188e-05, + "loss": 0.4584, + "step": 369 + }, + { + "epoch": 0.07294952681388013, + "grad_norm": 0.8629571748054644, + "learning_rate": 1.9985287803120595e-05, + "loss": 0.4705, + "step": 370 + }, + { + "epoch": 0.07314668769716089, + "grad_norm": 0.8242683110246124, + "learning_rate": 1.998520363409242e-05, + "loss": 0.4579, + "step": 371 + }, + { + "epoch": 0.07334384858044164, + "grad_norm": 1.499462700491384, + "learning_rate": 1.9985119225161688e-05, + "loss": 0.4607, + "step": 372 + }, + { + "epoch": 0.0735410094637224, + "grad_norm": 0.9669915502441957, + "learning_rate": 1.9985034576330425e-05, + "loss": 0.4956, + "step": 373 + }, + { + "epoch": 0.07373817034700315, + "grad_norm": 1.0411892472276396, + "learning_rate": 1.998494968760067e-05, + "loss": 0.4557, + "step": 374 + }, + { + "epoch": 0.07393533123028391, + "grad_norm": 0.8567052653582087, + "learning_rate": 1.998486455897445e-05, + "loss": 0.5177, + "step": 375 + }, + { + "epoch": 0.07413249211356467, + "grad_norm": 1.1116739095705466, + "learning_rate": 1.998477919045382e-05, + "loss": 0.4716, + "step": 376 + }, + { + "epoch": 0.07432965299684542, + "grad_norm": 0.8626595222872029, + "learning_rate": 1.9984693582040834e-05, + "loss": 0.5426, + "step": 377 + }, + { + "epoch": 0.07452681388012618, + "grad_norm": 0.9587022633462172, + "learning_rate": 1.998460773373754e-05, + "loss": 0.462, + "step": 378 + }, + { + "epoch": 0.07472397476340693, + "grad_norm": 0.8941067614117472, + "learning_rate": 1.9984521645546007e-05, + "loss": 0.4989, + "step": 379 + }, + { + "epoch": 0.0749211356466877, + "grad_norm": 0.9295392099689723, + "learning_rate": 1.9984435317468298e-05, + "loss": 0.4745, + "step": 380 + }, + { + "epoch": 0.07511829652996846, + "grad_norm": 0.8793262590604104, + "learning_rate": 1.998434874950649e-05, + "loss": 0.4847, + "step": 381 + }, + { + "epoch": 0.07531545741324921, + "grad_norm": 0.9197548867944333, + "learning_rate": 1.9984261941662666e-05, + "loss": 0.4822, + "step": 382 + }, + { + "epoch": 0.07551261829652997, + "grad_norm": 0.918094619940462, + "learning_rate": 1.9984174893938908e-05, + "loss": 0.473, + "step": 383 + }, + { + "epoch": 0.07570977917981073, + "grad_norm": 0.834295051365838, + "learning_rate": 1.998408760633731e-05, + "loss": 0.4423, + "step": 384 + }, + { + "epoch": 0.07590694006309148, + "grad_norm": 0.8187104700252339, + "learning_rate": 1.998400007885996e-05, + "loss": 0.4582, + "step": 385 + }, + { + "epoch": 0.07610410094637224, + "grad_norm": 1.0466182710170804, + "learning_rate": 1.9983912311508977e-05, + "loss": 0.4694, + "step": 386 + }, + { + "epoch": 0.076301261829653, + "grad_norm": 0.8718063081983285, + "learning_rate": 1.998382430428645e-05, + "loss": 0.476, + "step": 387 + }, + { + "epoch": 0.07649842271293375, + "grad_norm": 0.7915900428994362, + "learning_rate": 1.9983736057194512e-05, + "loss": 0.416, + "step": 388 + }, + { + "epoch": 0.0766955835962145, + "grad_norm": 0.8158965365869196, + "learning_rate": 1.9983647570235274e-05, + "loss": 0.4667, + "step": 389 + }, + { + "epoch": 0.07689274447949526, + "grad_norm": 0.7997150055795212, + "learning_rate": 1.9983558843410863e-05, + "loss": 0.4466, + "step": 390 + }, + { + "epoch": 0.07708990536277603, + "grad_norm": 0.8940610724543107, + "learning_rate": 1.998346987672341e-05, + "loss": 0.4658, + "step": 391 + }, + { + "epoch": 0.07728706624605679, + "grad_norm": 1.294715926753004, + "learning_rate": 1.9983380670175054e-05, + "loss": 0.4642, + "step": 392 + }, + { + "epoch": 0.07748422712933754, + "grad_norm": 11.425972122218342, + "learning_rate": 1.998329122376794e-05, + "loss": 0.5511, + "step": 393 + }, + { + "epoch": 0.0776813880126183, + "grad_norm": 1.1867295493168513, + "learning_rate": 1.998320153750421e-05, + "loss": 0.4462, + "step": 394 + }, + { + "epoch": 0.07787854889589906, + "grad_norm": 0.9729316139359963, + "learning_rate": 1.998311161138603e-05, + "loss": 0.4976, + "step": 395 + }, + { + "epoch": 0.07807570977917981, + "grad_norm": 0.9157344759346018, + "learning_rate": 1.998302144541555e-05, + "loss": 0.4709, + "step": 396 + }, + { + "epoch": 0.07827287066246057, + "grad_norm": 0.934161687151528, + "learning_rate": 1.998293103959494e-05, + "loss": 0.4673, + "step": 397 + }, + { + "epoch": 0.07847003154574132, + "grad_norm": 1.4223262840464979, + "learning_rate": 1.9982840393926374e-05, + "loss": 0.5139, + "step": 398 + }, + { + "epoch": 0.07866719242902208, + "grad_norm": 1.131720070151689, + "learning_rate": 1.9982749508412026e-05, + "loss": 0.5327, + "step": 399 + }, + { + "epoch": 0.07886435331230283, + "grad_norm": 1.2074643005200005, + "learning_rate": 1.998265838305409e-05, + "loss": 0.4974, + "step": 400 + }, + { + "epoch": 0.07906151419558359, + "grad_norm": 0.9531948695115436, + "learning_rate": 1.998256701785474e-05, + "loss": 0.4841, + "step": 401 + }, + { + "epoch": 0.07925867507886436, + "grad_norm": 1.043133116018871, + "learning_rate": 1.998247541281618e-05, + "loss": 0.4506, + "step": 402 + }, + { + "epoch": 0.07945583596214512, + "grad_norm": 1.0246039260585653, + "learning_rate": 1.9982383567940606e-05, + "loss": 0.4445, + "step": 403 + }, + { + "epoch": 0.07965299684542587, + "grad_norm": 1.0452483998819864, + "learning_rate": 1.9982291483230232e-05, + "loss": 0.4851, + "step": 404 + }, + { + "epoch": 0.07985015772870663, + "grad_norm": 1.1683875566798174, + "learning_rate": 1.9982199158687266e-05, + "loss": 0.4548, + "step": 405 + }, + { + "epoch": 0.08004731861198738, + "grad_norm": 0.906125687513742, + "learning_rate": 1.9982106594313924e-05, + "loss": 0.4704, + "step": 406 + }, + { + "epoch": 0.08024447949526814, + "grad_norm": 5.213779615448294, + "learning_rate": 1.9982013790112437e-05, + "loss": 0.526, + "step": 407 + }, + { + "epoch": 0.0804416403785489, + "grad_norm": 1.9791118557774463, + "learning_rate": 1.9981920746085025e-05, + "loss": 0.4971, + "step": 408 + }, + { + "epoch": 0.08063880126182965, + "grad_norm": 8.292277341865951, + "learning_rate": 1.9981827462233932e-05, + "loss": 0.4924, + "step": 409 + }, + { + "epoch": 0.08083596214511041, + "grad_norm": 1.4694955541234114, + "learning_rate": 1.99817339385614e-05, + "loss": 0.5109, + "step": 410 + }, + { + "epoch": 0.08103312302839116, + "grad_norm": 1.0200131993418244, + "learning_rate": 1.9981640175069663e-05, + "loss": 0.4837, + "step": 411 + }, + { + "epoch": 0.08123028391167192, + "grad_norm": 1.241162294871943, + "learning_rate": 1.998154617176099e-05, + "loss": 0.4752, + "step": 412 + }, + { + "epoch": 0.08142744479495267, + "grad_norm": 1.6151902756221161, + "learning_rate": 1.9981451928637627e-05, + "loss": 0.4739, + "step": 413 + }, + { + "epoch": 0.08162460567823344, + "grad_norm": 1.2407308825596557, + "learning_rate": 1.9981357445701846e-05, + "loss": 0.4345, + "step": 414 + }, + { + "epoch": 0.0818217665615142, + "grad_norm": 0.9936264286526395, + "learning_rate": 1.9981262722955913e-05, + "loss": 0.5211, + "step": 415 + }, + { + "epoch": 0.08201892744479496, + "grad_norm": 1.3047191271197602, + "learning_rate": 1.9981167760402104e-05, + "loss": 0.4827, + "step": 416 + }, + { + "epoch": 0.08221608832807571, + "grad_norm": 0.8876022805465946, + "learning_rate": 1.9981072558042705e-05, + "loss": 0.4838, + "step": 417 + }, + { + "epoch": 0.08241324921135647, + "grad_norm": 1.3442900789849086, + "learning_rate": 1.9980977115879997e-05, + "loss": 0.4614, + "step": 418 + }, + { + "epoch": 0.08261041009463722, + "grad_norm": 1.5881293239590277, + "learning_rate": 1.998088143391628e-05, + "loss": 0.4813, + "step": 419 + }, + { + "epoch": 0.08280757097791798, + "grad_norm": 1.1153506722397977, + "learning_rate": 1.9980785512153846e-05, + "loss": 0.5095, + "step": 420 + }, + { + "epoch": 0.08300473186119874, + "grad_norm": 0.9249721852533636, + "learning_rate": 1.9980689350595004e-05, + "loss": 0.4764, + "step": 421 + }, + { + "epoch": 0.08320189274447949, + "grad_norm": 1.073860170389333, + "learning_rate": 1.9980592949242063e-05, + "loss": 0.4597, + "step": 422 + }, + { + "epoch": 0.08339905362776025, + "grad_norm": 0.8354298196477534, + "learning_rate": 1.998049630809734e-05, + "loss": 0.4805, + "step": 423 + }, + { + "epoch": 0.083596214511041, + "grad_norm": 0.9975582382283317, + "learning_rate": 1.9980399427163154e-05, + "loss": 0.4734, + "step": 424 + }, + { + "epoch": 0.08379337539432177, + "grad_norm": 0.7461081355848223, + "learning_rate": 1.9980302306441834e-05, + "loss": 0.4456, + "step": 425 + }, + { + "epoch": 0.08399053627760253, + "grad_norm": 0.8992637044263566, + "learning_rate": 1.9980204945935716e-05, + "loss": 0.4514, + "step": 426 + }, + { + "epoch": 0.08418769716088328, + "grad_norm": 0.9484738986938942, + "learning_rate": 1.9980107345647133e-05, + "loss": 0.5281, + "step": 427 + }, + { + "epoch": 0.08438485804416404, + "grad_norm": 0.8453814347866732, + "learning_rate": 1.9980009505578438e-05, + "loss": 0.4493, + "step": 428 + }, + { + "epoch": 0.0845820189274448, + "grad_norm": 0.8262276270862967, + "learning_rate": 1.9979911425731978e-05, + "loss": 0.4483, + "step": 429 + }, + { + "epoch": 0.08477917981072555, + "grad_norm": 0.856137009508442, + "learning_rate": 1.9979813106110108e-05, + "loss": 0.4635, + "step": 430 + }, + { + "epoch": 0.08497634069400631, + "grad_norm": 0.8889039446695481, + "learning_rate": 1.997971454671519e-05, + "loss": 0.4308, + "step": 431 + }, + { + "epoch": 0.08517350157728706, + "grad_norm": 8.27925993626678, + "learning_rate": 1.9979615747549594e-05, + "loss": 0.4852, + "step": 432 + }, + { + "epoch": 0.08537066246056782, + "grad_norm": 2.064710279361049, + "learning_rate": 1.9979516708615696e-05, + "loss": 0.4843, + "step": 433 + }, + { + "epoch": 0.08556782334384858, + "grad_norm": 0.9477116643465063, + "learning_rate": 1.997941742991587e-05, + "loss": 0.4534, + "step": 434 + }, + { + "epoch": 0.08576498422712933, + "grad_norm": 0.9584045524018094, + "learning_rate": 1.9979317911452503e-05, + "loss": 0.4787, + "step": 435 + }, + { + "epoch": 0.0859621451104101, + "grad_norm": 0.8809961453962474, + "learning_rate": 1.997921815322799e-05, + "loss": 0.4742, + "step": 436 + }, + { + "epoch": 0.08615930599369086, + "grad_norm": 1.0555900314632443, + "learning_rate": 1.997911815524472e-05, + "loss": 0.4854, + "step": 437 + }, + { + "epoch": 0.08635646687697161, + "grad_norm": 0.8568644478491275, + "learning_rate": 1.9979017917505102e-05, + "loss": 0.5062, + "step": 438 + }, + { + "epoch": 0.08655362776025237, + "grad_norm": 0.899069617822026, + "learning_rate": 1.997891744001155e-05, + "loss": 0.4572, + "step": 439 + }, + { + "epoch": 0.08675078864353312, + "grad_norm": 0.858458600280124, + "learning_rate": 1.997881672276646e-05, + "loss": 0.4561, + "step": 440 + }, + { + "epoch": 0.08694794952681388, + "grad_norm": 0.8872507068731488, + "learning_rate": 1.9978715765772266e-05, + "loss": 0.4893, + "step": 441 + }, + { + "epoch": 0.08714511041009464, + "grad_norm": 0.8001811452184983, + "learning_rate": 1.9978614569031388e-05, + "loss": 0.4473, + "step": 442 + }, + { + "epoch": 0.08734227129337539, + "grad_norm": 0.7719094886129825, + "learning_rate": 1.997851313254626e-05, + "loss": 0.4162, + "step": 443 + }, + { + "epoch": 0.08753943217665615, + "grad_norm": 0.8070187892782263, + "learning_rate": 1.997841145631932e-05, + "loss": 0.4357, + "step": 444 + }, + { + "epoch": 0.0877365930599369, + "grad_norm": 0.7876463515387281, + "learning_rate": 1.9978309540353013e-05, + "loss": 0.4511, + "step": 445 + }, + { + "epoch": 0.08793375394321766, + "grad_norm": 0.888526192109841, + "learning_rate": 1.9978207384649778e-05, + "loss": 0.482, + "step": 446 + }, + { + "epoch": 0.08813091482649842, + "grad_norm": 0.8684918565462284, + "learning_rate": 1.9978104989212078e-05, + "loss": 0.488, + "step": 447 + }, + { + "epoch": 0.08832807570977919, + "grad_norm": 0.9067510910757722, + "learning_rate": 1.997800235404237e-05, + "loss": 0.4546, + "step": 448 + }, + { + "epoch": 0.08852523659305994, + "grad_norm": 0.8445544456418074, + "learning_rate": 1.9977899479143117e-05, + "loss": 0.4509, + "step": 449 + }, + { + "epoch": 0.0887223974763407, + "grad_norm": 0.8441154566325766, + "learning_rate": 1.9977796364516796e-05, + "loss": 0.4794, + "step": 450 + }, + { + "epoch": 0.08891955835962145, + "grad_norm": 0.9219556253568325, + "learning_rate": 1.997769301016588e-05, + "loss": 0.4876, + "step": 451 + }, + { + "epoch": 0.08911671924290221, + "grad_norm": 0.8432925267438068, + "learning_rate": 1.997758941609286e-05, + "loss": 0.4947, + "step": 452 + }, + { + "epoch": 0.08931388012618297, + "grad_norm": 0.8912819991209812, + "learning_rate": 1.9977485582300215e-05, + "loss": 0.441, + "step": 453 + }, + { + "epoch": 0.08951104100946372, + "grad_norm": 0.8123986878321956, + "learning_rate": 1.9977381508790446e-05, + "loss": 0.461, + "step": 454 + }, + { + "epoch": 0.08970820189274448, + "grad_norm": 0.9178026802956625, + "learning_rate": 1.997727719556605e-05, + "loss": 0.5066, + "step": 455 + }, + { + "epoch": 0.08990536277602523, + "grad_norm": 0.8987063196006664, + "learning_rate": 1.9977172642629537e-05, + "loss": 0.44, + "step": 456 + }, + { + "epoch": 0.09010252365930599, + "grad_norm": 0.890159715184649, + "learning_rate": 1.9977067849983412e-05, + "loss": 0.4605, + "step": 457 + }, + { + "epoch": 0.09029968454258674, + "grad_norm": 0.95233992862221, + "learning_rate": 1.9976962817630202e-05, + "loss": 0.4727, + "step": 458 + }, + { + "epoch": 0.09049684542586751, + "grad_norm": 0.9459884973587354, + "learning_rate": 1.9976857545572425e-05, + "loss": 0.4619, + "step": 459 + }, + { + "epoch": 0.09069400630914827, + "grad_norm": 1.2642744480328538, + "learning_rate": 1.997675203381261e-05, + "loss": 0.5395, + "step": 460 + }, + { + "epoch": 0.09089116719242903, + "grad_norm": 1.0028131372893, + "learning_rate": 1.997664628235329e-05, + "loss": 0.4673, + "step": 461 + }, + { + "epoch": 0.09108832807570978, + "grad_norm": 0.9306510247064776, + "learning_rate": 1.9976540291197015e-05, + "loss": 0.4886, + "step": 462 + }, + { + "epoch": 0.09128548895899054, + "grad_norm": 0.905251801766755, + "learning_rate": 1.9976434060346324e-05, + "loss": 0.5161, + "step": 463 + }, + { + "epoch": 0.0914826498422713, + "grad_norm": 0.929602575181201, + "learning_rate": 1.9976327589803767e-05, + "loss": 0.4745, + "step": 464 + }, + { + "epoch": 0.09167981072555205, + "grad_norm": 3.0755405650687653, + "learning_rate": 1.997622087957191e-05, + "loss": 0.4565, + "step": 465 + }, + { + "epoch": 0.0918769716088328, + "grad_norm": 1.3035255174455254, + "learning_rate": 1.9976113929653312e-05, + "loss": 0.4869, + "step": 466 + }, + { + "epoch": 0.09207413249211356, + "grad_norm": 0.863482479091288, + "learning_rate": 1.997600674005054e-05, + "loss": 0.4533, + "step": 467 + }, + { + "epoch": 0.09227129337539432, + "grad_norm": 1.007184735840567, + "learning_rate": 1.9975899310766173e-05, + "loss": 0.4697, + "step": 468 + }, + { + "epoch": 0.09246845425867507, + "grad_norm": 1.1269484430615138, + "learning_rate": 1.997579164180279e-05, + "loss": 0.5163, + "step": 469 + }, + { + "epoch": 0.09266561514195584, + "grad_norm": 1.1397659345471354, + "learning_rate": 1.9975683733162987e-05, + "loss": 0.4959, + "step": 470 + }, + { + "epoch": 0.0928627760252366, + "grad_norm": 0.9860221015384155, + "learning_rate": 1.9975575584849346e-05, + "loss": 0.4788, + "step": 471 + }, + { + "epoch": 0.09305993690851735, + "grad_norm": 0.8672216442193951, + "learning_rate": 1.9975467196864465e-05, + "loss": 0.4846, + "step": 472 + }, + { + "epoch": 0.09325709779179811, + "grad_norm": 1.125980566236637, + "learning_rate": 1.9975358569210952e-05, + "loss": 0.467, + "step": 473 + }, + { + "epoch": 0.09345425867507887, + "grad_norm": 0.9201790577474425, + "learning_rate": 1.9975249701891414e-05, + "loss": 0.4767, + "step": 474 + }, + { + "epoch": 0.09365141955835962, + "grad_norm": 1.0505904231912129, + "learning_rate": 1.9975140594908472e-05, + "loss": 0.4671, + "step": 475 + }, + { + "epoch": 0.09384858044164038, + "grad_norm": 0.8373445019309896, + "learning_rate": 1.9975031248264746e-05, + "loss": 0.5008, + "step": 476 + }, + { + "epoch": 0.09404574132492113, + "grad_norm": 1.016203023349781, + "learning_rate": 1.997492166196286e-05, + "loss": 0.4614, + "step": 477 + }, + { + "epoch": 0.09424290220820189, + "grad_norm": 0.785117485057104, + "learning_rate": 1.9974811836005446e-05, + "loss": 0.465, + "step": 478 + }, + { + "epoch": 0.09444006309148265, + "grad_norm": 1.0204263244954062, + "learning_rate": 1.9974701770395147e-05, + "loss": 0.461, + "step": 479 + }, + { + "epoch": 0.0946372239747634, + "grad_norm": 2.1396373733832785, + "learning_rate": 1.9974591465134606e-05, + "loss": 0.4751, + "step": 480 + }, + { + "epoch": 0.09483438485804416, + "grad_norm": 1.112339191932461, + "learning_rate": 1.9974480920226472e-05, + "loss": 0.454, + "step": 481 + }, + { + "epoch": 0.09503154574132493, + "grad_norm": 1.0016882760903807, + "learning_rate": 1.9974370135673398e-05, + "loss": 0.4848, + "step": 482 + }, + { + "epoch": 0.09522870662460568, + "grad_norm": 1.0366723685296435, + "learning_rate": 1.9974259111478054e-05, + "loss": 0.4619, + "step": 483 + }, + { + "epoch": 0.09542586750788644, + "grad_norm": 0.7816273224252894, + "learning_rate": 1.9974147847643103e-05, + "loss": 0.4356, + "step": 484 + }, + { + "epoch": 0.0956230283911672, + "grad_norm": 2.2595110257069386, + "learning_rate": 1.9974036344171215e-05, + "loss": 0.528, + "step": 485 + }, + { + "epoch": 0.09582018927444795, + "grad_norm": 0.9462755974465362, + "learning_rate": 1.997392460106507e-05, + "loss": 0.4768, + "step": 486 + }, + { + "epoch": 0.0960173501577287, + "grad_norm": 0.9715227579239536, + "learning_rate": 1.997381261832736e-05, + "loss": 0.4725, + "step": 487 + }, + { + "epoch": 0.09621451104100946, + "grad_norm": 0.9626137085486789, + "learning_rate": 1.9973700395960765e-05, + "loss": 0.4768, + "step": 488 + }, + { + "epoch": 0.09641167192429022, + "grad_norm": 0.7957423308586874, + "learning_rate": 1.997358793396799e-05, + "loss": 0.4333, + "step": 489 + }, + { + "epoch": 0.09660883280757097, + "grad_norm": 0.8587376909161047, + "learning_rate": 1.9973475232351728e-05, + "loss": 0.4677, + "step": 490 + }, + { + "epoch": 0.09680599369085173, + "grad_norm": 0.8640406505371612, + "learning_rate": 1.9973362291114697e-05, + "loss": 0.4915, + "step": 491 + }, + { + "epoch": 0.09700315457413249, + "grad_norm": 22.479833252401775, + "learning_rate": 1.99732491102596e-05, + "loss": 0.5093, + "step": 492 + }, + { + "epoch": 0.09720031545741326, + "grad_norm": 1.7104531508622285, + "learning_rate": 1.9973135689789167e-05, + "loss": 0.4697, + "step": 493 + }, + { + "epoch": 0.09739747634069401, + "grad_norm": 0.8723153591836569, + "learning_rate": 1.9973022029706117e-05, + "loss": 0.4539, + "step": 494 + }, + { + "epoch": 0.09759463722397477, + "grad_norm": 1.1539417078528944, + "learning_rate": 1.997290813001318e-05, + "loss": 0.47, + "step": 495 + }, + { + "epoch": 0.09779179810725552, + "grad_norm": 0.8862353677994035, + "learning_rate": 1.9972793990713093e-05, + "loss": 0.4891, + "step": 496 + }, + { + "epoch": 0.09798895899053628, + "grad_norm": 1.1144995212251465, + "learning_rate": 1.9972679611808603e-05, + "loss": 0.4823, + "step": 497 + }, + { + "epoch": 0.09818611987381703, + "grad_norm": 1.4516257465643632, + "learning_rate": 1.997256499330245e-05, + "loss": 0.5077, + "step": 498 + }, + { + "epoch": 0.09838328075709779, + "grad_norm": 0.9004639995601033, + "learning_rate": 1.9972450135197397e-05, + "loss": 0.4404, + "step": 499 + }, + { + "epoch": 0.09858044164037855, + "grad_norm": 0.8035330529361184, + "learning_rate": 1.9972335037496195e-05, + "loss": 0.456, + "step": 500 + }, + { + "epoch": 0.0987776025236593, + "grad_norm": 0.8385329004850893, + "learning_rate": 1.9972219700201612e-05, + "loss": 0.4332, + "step": 501 + }, + { + "epoch": 0.09897476340694006, + "grad_norm": 0.8495426403609305, + "learning_rate": 1.9972104123316422e-05, + "loss": 0.4541, + "step": 502 + }, + { + "epoch": 0.09917192429022081, + "grad_norm": 0.7382887096139712, + "learning_rate": 1.9971988306843403e-05, + "loss": 0.4095, + "step": 503 + }, + { + "epoch": 0.09936908517350158, + "grad_norm": 0.8439026090011481, + "learning_rate": 1.997187225078533e-05, + "loss": 0.4443, + "step": 504 + }, + { + "epoch": 0.09956624605678234, + "grad_norm": 0.8075139072966796, + "learning_rate": 1.9971755955144995e-05, + "loss": 0.4608, + "step": 505 + }, + { + "epoch": 0.0997634069400631, + "grad_norm": 0.818587342922555, + "learning_rate": 1.9971639419925197e-05, + "loss": 0.4486, + "step": 506 + }, + { + "epoch": 0.09996056782334385, + "grad_norm": 0.8577985021487166, + "learning_rate": 1.997152264512873e-05, + "loss": 0.4494, + "step": 507 + }, + { + "epoch": 0.10015772870662461, + "grad_norm": 0.8628896861024081, + "learning_rate": 1.9971405630758402e-05, + "loss": 0.474, + "step": 508 + }, + { + "epoch": 0.10035488958990536, + "grad_norm": 0.799898383721641, + "learning_rate": 1.9971288376817023e-05, + "loss": 0.4588, + "step": 509 + }, + { + "epoch": 0.10055205047318612, + "grad_norm": 0.7095652696234459, + "learning_rate": 1.997117088330741e-05, + "loss": 0.4359, + "step": 510 + }, + { + "epoch": 0.10074921135646688, + "grad_norm": 0.9284886130598523, + "learning_rate": 1.9971053150232387e-05, + "loss": 0.4517, + "step": 511 + }, + { + "epoch": 0.10094637223974763, + "grad_norm": 0.8423011199925328, + "learning_rate": 1.9970935177594787e-05, + "loss": 0.4511, + "step": 512 + }, + { + "epoch": 0.10114353312302839, + "grad_norm": 0.7810718155478004, + "learning_rate": 1.9970816965397435e-05, + "loss": 0.5072, + "step": 513 + }, + { + "epoch": 0.10134069400630914, + "grad_norm": 0.8041359390874309, + "learning_rate": 1.9970698513643178e-05, + "loss": 0.4703, + "step": 514 + }, + { + "epoch": 0.1015378548895899, + "grad_norm": 1.53976548457417, + "learning_rate": 1.9970579822334856e-05, + "loss": 0.4518, + "step": 515 + }, + { + "epoch": 0.10173501577287067, + "grad_norm": 0.9907811278036276, + "learning_rate": 1.9970460891475328e-05, + "loss": 0.476, + "step": 516 + }, + { + "epoch": 0.10193217665615142, + "grad_norm": 0.8081338018995352, + "learning_rate": 1.997034172106745e-05, + "loss": 0.4691, + "step": 517 + }, + { + "epoch": 0.10212933753943218, + "grad_norm": 1.0051078776013984, + "learning_rate": 1.9970222311114078e-05, + "loss": 0.5074, + "step": 518 + }, + { + "epoch": 0.10232649842271294, + "grad_norm": 0.8105369088543491, + "learning_rate": 1.9970102661618088e-05, + "loss": 0.4502, + "step": 519 + }, + { + "epoch": 0.10252365930599369, + "grad_norm": 0.7202414114786588, + "learning_rate": 1.9969982772582354e-05, + "loss": 0.4452, + "step": 520 + }, + { + "epoch": 0.10272082018927445, + "grad_norm": 0.8648373894327899, + "learning_rate": 1.996986264400975e-05, + "loss": 0.4757, + "step": 521 + }, + { + "epoch": 0.1029179810725552, + "grad_norm": 0.7629352205940431, + "learning_rate": 1.9969742275903172e-05, + "loss": 0.477, + "step": 522 + }, + { + "epoch": 0.10311514195583596, + "grad_norm": 0.8526046130077743, + "learning_rate": 1.996962166826551e-05, + "loss": 0.4341, + "step": 523 + }, + { + "epoch": 0.10331230283911672, + "grad_norm": 5.257272773165352, + "learning_rate": 1.9969500821099654e-05, + "loss": 0.4958, + "step": 524 + }, + { + "epoch": 0.10350946372239747, + "grad_norm": 0.8477826266411921, + "learning_rate": 1.996937973440851e-05, + "loss": 0.43, + "step": 525 + }, + { + "epoch": 0.10370662460567823, + "grad_norm": 1.4050180036912192, + "learning_rate": 1.9969258408194997e-05, + "loss": 0.4446, + "step": 526 + }, + { + "epoch": 0.103903785488959, + "grad_norm": 0.976079511985993, + "learning_rate": 1.996913684246202e-05, + "loss": 0.4718, + "step": 527 + }, + { + "epoch": 0.10410094637223975, + "grad_norm": 1.460058823361328, + "learning_rate": 1.99690150372125e-05, + "loss": 0.4904, + "step": 528 + }, + { + "epoch": 0.10429810725552051, + "grad_norm": 1.2260644333639692, + "learning_rate": 1.9968892992449364e-05, + "loss": 0.4757, + "step": 529 + }, + { + "epoch": 0.10449526813880126, + "grad_norm": 0.820353657627711, + "learning_rate": 1.9968770708175552e-05, + "loss": 0.4823, + "step": 530 + }, + { + "epoch": 0.10469242902208202, + "grad_norm": 0.7536246666392749, + "learning_rate": 1.996864818439399e-05, + "loss": 0.4212, + "step": 531 + }, + { + "epoch": 0.10488958990536278, + "grad_norm": 0.8026979740475652, + "learning_rate": 1.9968525421107633e-05, + "loss": 0.4606, + "step": 532 + }, + { + "epoch": 0.10508675078864353, + "grad_norm": 0.8687858454813581, + "learning_rate": 1.996840241831942e-05, + "loss": 0.4862, + "step": 533 + }, + { + "epoch": 0.10528391167192429, + "grad_norm": 3.2595441481445944, + "learning_rate": 1.9968279176032314e-05, + "loss": 0.4919, + "step": 534 + }, + { + "epoch": 0.10548107255520504, + "grad_norm": 0.9637701357273671, + "learning_rate": 1.9968155694249274e-05, + "loss": 0.4574, + "step": 535 + }, + { + "epoch": 0.1056782334384858, + "grad_norm": 0.9015755854427224, + "learning_rate": 1.9968031972973266e-05, + "loss": 0.4892, + "step": 536 + }, + { + "epoch": 0.10587539432176656, + "grad_norm": 0.811289401193313, + "learning_rate": 1.996790801220726e-05, + "loss": 0.4393, + "step": 537 + }, + { + "epoch": 0.10607255520504733, + "grad_norm": 1.15420005303407, + "learning_rate": 1.996778381195424e-05, + "loss": 0.5066, + "step": 538 + }, + { + "epoch": 0.10626971608832808, + "grad_norm": 0.8200798308419749, + "learning_rate": 1.9967659372217187e-05, + "loss": 0.5189, + "step": 539 + }, + { + "epoch": 0.10646687697160884, + "grad_norm": 0.8867823143154604, + "learning_rate": 1.9967534692999085e-05, + "loss": 0.4365, + "step": 540 + }, + { + "epoch": 0.10666403785488959, + "grad_norm": 0.8669753830991764, + "learning_rate": 1.996740977430294e-05, + "loss": 0.457, + "step": 541 + }, + { + "epoch": 0.10686119873817035, + "grad_norm": 0.8954903277161942, + "learning_rate": 1.996728461613175e-05, + "loss": 0.4597, + "step": 542 + }, + { + "epoch": 0.1070583596214511, + "grad_norm": 0.8258256049709248, + "learning_rate": 1.9967159218488515e-05, + "loss": 0.428, + "step": 543 + }, + { + "epoch": 0.10725552050473186, + "grad_norm": 0.811522877066346, + "learning_rate": 1.996703358137626e-05, + "loss": 0.4708, + "step": 544 + }, + { + "epoch": 0.10745268138801262, + "grad_norm": 0.9746081819733365, + "learning_rate": 1.996690770479799e-05, + "loss": 0.5035, + "step": 545 + }, + { + "epoch": 0.10764984227129337, + "grad_norm": 0.8151447726161154, + "learning_rate": 1.9966781588756743e-05, + "loss": 0.4748, + "step": 546 + }, + { + "epoch": 0.10784700315457413, + "grad_norm": 0.9244734430462046, + "learning_rate": 1.996665523325554e-05, + "loss": 0.4811, + "step": 547 + }, + { + "epoch": 0.10804416403785488, + "grad_norm": 0.8967791767272244, + "learning_rate": 1.9966528638297415e-05, + "loss": 0.4329, + "step": 548 + }, + { + "epoch": 0.10824132492113564, + "grad_norm": 1.184825474138752, + "learning_rate": 1.9966401803885413e-05, + "loss": 0.4622, + "step": 549 + }, + { + "epoch": 0.10843848580441641, + "grad_norm": 0.8122484029135365, + "learning_rate": 1.9966274730022587e-05, + "loss": 0.473, + "step": 550 + }, + { + "epoch": 0.10863564668769717, + "grad_norm": 1.1993820808729505, + "learning_rate": 1.996614741671198e-05, + "loss": 0.4356, + "step": 551 + }, + { + "epoch": 0.10883280757097792, + "grad_norm": 0.767742171284427, + "learning_rate": 1.996601986395666e-05, + "loss": 0.4302, + "step": 552 + }, + { + "epoch": 0.10902996845425868, + "grad_norm": 1.1421804231314927, + "learning_rate": 1.9965892071759685e-05, + "loss": 0.4594, + "step": 553 + }, + { + "epoch": 0.10922712933753943, + "grad_norm": 0.8790816673408636, + "learning_rate": 1.9965764040124126e-05, + "loss": 0.4877, + "step": 554 + }, + { + "epoch": 0.10942429022082019, + "grad_norm": 1.2969027619478393, + "learning_rate": 1.9965635769053064e-05, + "loss": 0.474, + "step": 555 + }, + { + "epoch": 0.10962145110410094, + "grad_norm": 0.8359668233842006, + "learning_rate": 1.9965507258549573e-05, + "loss": 0.4586, + "step": 556 + }, + { + "epoch": 0.1098186119873817, + "grad_norm": 0.9318305367356925, + "learning_rate": 1.996537850861675e-05, + "loss": 0.4439, + "step": 557 + }, + { + "epoch": 0.11001577287066246, + "grad_norm": 0.9793179279182511, + "learning_rate": 1.9965249519257682e-05, + "loss": 0.4953, + "step": 558 + }, + { + "epoch": 0.11021293375394321, + "grad_norm": 0.7714402928720476, + "learning_rate": 1.9965120290475466e-05, + "loss": 0.4635, + "step": 559 + }, + { + "epoch": 0.11041009463722397, + "grad_norm": 1.0153326096876347, + "learning_rate": 1.996499082227321e-05, + "loss": 0.465, + "step": 560 + }, + { + "epoch": 0.11060725552050474, + "grad_norm": 0.7664943405741772, + "learning_rate": 1.996486111465403e-05, + "loss": 0.4849, + "step": 561 + }, + { + "epoch": 0.1108044164037855, + "grad_norm": 0.8468927415011162, + "learning_rate": 1.996473116762103e-05, + "loss": 0.4897, + "step": 562 + }, + { + "epoch": 0.11100157728706625, + "grad_norm": 0.7604431065724107, + "learning_rate": 1.9964600981177344e-05, + "loss": 0.4737, + "step": 563 + }, + { + "epoch": 0.111198738170347, + "grad_norm": 0.7359161844087391, + "learning_rate": 1.9964470555326096e-05, + "loss": 0.4508, + "step": 564 + }, + { + "epoch": 0.11139589905362776, + "grad_norm": 1.3795069909449265, + "learning_rate": 1.9964339890070415e-05, + "loss": 0.4826, + "step": 565 + }, + { + "epoch": 0.11159305993690852, + "grad_norm": 0.9524081314639894, + "learning_rate": 1.9964208985413448e-05, + "loss": 0.4497, + "step": 566 + }, + { + "epoch": 0.11179022082018927, + "grad_norm": 0.7435304463746423, + "learning_rate": 1.9964077841358333e-05, + "loss": 0.4797, + "step": 567 + }, + { + "epoch": 0.11198738170347003, + "grad_norm": 2.6303972907019326, + "learning_rate": 1.9963946457908223e-05, + "loss": 0.4954, + "step": 568 + }, + { + "epoch": 0.11218454258675079, + "grad_norm": 0.8886897209525809, + "learning_rate": 1.9963814835066274e-05, + "loss": 0.4991, + "step": 569 + }, + { + "epoch": 0.11238170347003154, + "grad_norm": 16.23439623915258, + "learning_rate": 1.9963682972835654e-05, + "loss": 0.4808, + "step": 570 + }, + { + "epoch": 0.1125788643533123, + "grad_norm": 1.0835312648993332, + "learning_rate": 1.9963550871219522e-05, + "loss": 0.5025, + "step": 571 + }, + { + "epoch": 0.11277602523659307, + "grad_norm": 2.8501049449687157, + "learning_rate": 1.996341853022106e-05, + "loss": 0.4398, + "step": 572 + }, + { + "epoch": 0.11297318611987382, + "grad_norm": 1.4391188096590775, + "learning_rate": 1.9963285949843446e-05, + "loss": 0.4883, + "step": 573 + }, + { + "epoch": 0.11317034700315458, + "grad_norm": 0.8468547462854358, + "learning_rate": 1.996315313008986e-05, + "loss": 0.4499, + "step": 574 + }, + { + "epoch": 0.11336750788643533, + "grad_norm": 1.1620214814742198, + "learning_rate": 1.99630200709635e-05, + "loss": 0.4489, + "step": 575 + }, + { + "epoch": 0.11356466876971609, + "grad_norm": 1.9853935899449358, + "learning_rate": 1.996288677246756e-05, + "loss": 0.4849, + "step": 576 + }, + { + "epoch": 0.11376182965299685, + "grad_norm": 1.8186145471704431, + "learning_rate": 1.996275323460524e-05, + "loss": 0.5109, + "step": 577 + }, + { + "epoch": 0.1139589905362776, + "grad_norm": 0.8478050994792304, + "learning_rate": 1.996261945737975e-05, + "loss": 0.5038, + "step": 578 + }, + { + "epoch": 0.11415615141955836, + "grad_norm": 1.1104246850094956, + "learning_rate": 1.9962485440794306e-05, + "loss": 0.4551, + "step": 579 + }, + { + "epoch": 0.11435331230283911, + "grad_norm": 2.048919052058303, + "learning_rate": 1.9962351184852123e-05, + "loss": 0.4642, + "step": 580 + }, + { + "epoch": 0.11455047318611987, + "grad_norm": 0.9669037613683414, + "learning_rate": 1.9962216689556435e-05, + "loss": 0.4547, + "step": 581 + }, + { + "epoch": 0.11474763406940063, + "grad_norm": 0.7967676382705094, + "learning_rate": 1.996208195491047e-05, + "loss": 0.4732, + "step": 582 + }, + { + "epoch": 0.11494479495268138, + "grad_norm": 0.973744486308741, + "learning_rate": 1.9961946980917457e-05, + "loss": 0.4612, + "step": 583 + }, + { + "epoch": 0.11514195583596215, + "grad_norm": 0.8065923330637983, + "learning_rate": 1.9961811767580646e-05, + "loss": 0.4467, + "step": 584 + }, + { + "epoch": 0.1153391167192429, + "grad_norm": 2.7671844074244127, + "learning_rate": 1.996167631490329e-05, + "loss": 0.493, + "step": 585 + }, + { + "epoch": 0.11553627760252366, + "grad_norm": 0.8616456402628487, + "learning_rate": 1.9961540622888637e-05, + "loss": 0.4738, + "step": 586 + }, + { + "epoch": 0.11573343848580442, + "grad_norm": 4.8009523346043945, + "learning_rate": 1.9961404691539947e-05, + "loss": 0.4741, + "step": 587 + }, + { + "epoch": 0.11593059936908517, + "grad_norm": 1.2289335426336983, + "learning_rate": 1.996126852086049e-05, + "loss": 0.4626, + "step": 588 + }, + { + "epoch": 0.11612776025236593, + "grad_norm": 0.8888193606458007, + "learning_rate": 1.996113211085353e-05, + "loss": 0.469, + "step": 589 + }, + { + "epoch": 0.11632492113564669, + "grad_norm": 1.020387123983096, + "learning_rate": 1.996099546152235e-05, + "loss": 0.4977, + "step": 590 + }, + { + "epoch": 0.11652208201892744, + "grad_norm": 1.5821331097425058, + "learning_rate": 1.9960858572870238e-05, + "loss": 0.5181, + "step": 591 + }, + { + "epoch": 0.1167192429022082, + "grad_norm": 1.406346825248364, + "learning_rate": 1.9960721444900475e-05, + "loss": 0.5002, + "step": 592 + }, + { + "epoch": 0.11691640378548895, + "grad_norm": 0.8333326596917328, + "learning_rate": 1.9960584077616356e-05, + "loss": 0.4882, + "step": 593 + }, + { + "epoch": 0.11711356466876971, + "grad_norm": 0.9769660099184305, + "learning_rate": 1.9960446471021187e-05, + "loss": 0.4375, + "step": 594 + }, + { + "epoch": 0.11731072555205048, + "grad_norm": 0.8908916939407923, + "learning_rate": 1.9960308625118265e-05, + "loss": 0.475, + "step": 595 + }, + { + "epoch": 0.11750788643533124, + "grad_norm": 0.8760104320431786, + "learning_rate": 1.996017053991091e-05, + "loss": 0.461, + "step": 596 + }, + { + "epoch": 0.11770504731861199, + "grad_norm": 1.0317263432759556, + "learning_rate": 1.9960032215402436e-05, + "loss": 0.4877, + "step": 597 + }, + { + "epoch": 0.11790220820189275, + "grad_norm": 1.3949799089229347, + "learning_rate": 1.995989365159617e-05, + "loss": 0.4721, + "step": 598 + }, + { + "epoch": 0.1180993690851735, + "grad_norm": 3.854325351616305, + "learning_rate": 1.9959754848495437e-05, + "loss": 0.4897, + "step": 599 + }, + { + "epoch": 0.11829652996845426, + "grad_norm": 0.9559253651339167, + "learning_rate": 1.9959615806103572e-05, + "loss": 0.4632, + "step": 600 + }, + { + "epoch": 0.11849369085173501, + "grad_norm": 1.864457347756339, + "learning_rate": 1.9959476524423917e-05, + "loss": 0.4725, + "step": 601 + }, + { + "epoch": 0.11869085173501577, + "grad_norm": 1.03878334430699, + "learning_rate": 1.9959337003459816e-05, + "loss": 0.4476, + "step": 602 + }, + { + "epoch": 0.11888801261829653, + "grad_norm": 0.9045258083599287, + "learning_rate": 1.995919724321463e-05, + "loss": 0.4791, + "step": 603 + }, + { + "epoch": 0.11908517350157728, + "grad_norm": 0.9747663255202897, + "learning_rate": 1.9959057243691707e-05, + "loss": 0.4271, + "step": 604 + }, + { + "epoch": 0.11928233438485804, + "grad_norm": 1.2080345292681676, + "learning_rate": 1.995891700489441e-05, + "loss": 0.4782, + "step": 605 + }, + { + "epoch": 0.11947949526813881, + "grad_norm": 0.8131818958865078, + "learning_rate": 1.9958776526826115e-05, + "loss": 0.4596, + "step": 606 + }, + { + "epoch": 0.11967665615141956, + "grad_norm": 0.9375228525284777, + "learning_rate": 1.9958635809490195e-05, + "loss": 0.4583, + "step": 607 + }, + { + "epoch": 0.11987381703470032, + "grad_norm": 0.9475574679425915, + "learning_rate": 1.995849485289003e-05, + "loss": 0.4766, + "step": 608 + }, + { + "epoch": 0.12007097791798108, + "grad_norm": 1.0121761969425598, + "learning_rate": 1.9958353657029007e-05, + "loss": 0.4816, + "step": 609 + }, + { + "epoch": 0.12026813880126183, + "grad_norm": 1.0654177765828532, + "learning_rate": 1.9958212221910514e-05, + "loss": 0.4907, + "step": 610 + }, + { + "epoch": 0.12046529968454259, + "grad_norm": 0.8477027332986309, + "learning_rate": 1.9958070547537956e-05, + "loss": 0.4706, + "step": 611 + }, + { + "epoch": 0.12066246056782334, + "grad_norm": 0.8677643052383288, + "learning_rate": 1.9957928633914735e-05, + "loss": 0.4858, + "step": 612 + }, + { + "epoch": 0.1208596214511041, + "grad_norm": 0.9201595715495308, + "learning_rate": 1.9957786481044253e-05, + "loss": 0.4733, + "step": 613 + }, + { + "epoch": 0.12105678233438485, + "grad_norm": 1.0931868890217196, + "learning_rate": 1.995764408892994e-05, + "loss": 0.4507, + "step": 614 + }, + { + "epoch": 0.12125394321766561, + "grad_norm": 0.9329354196032422, + "learning_rate": 1.9957501457575207e-05, + "loss": 0.481, + "step": 615 + }, + { + "epoch": 0.12145110410094637, + "grad_norm": 0.8796940111635481, + "learning_rate": 1.995735858698348e-05, + "loss": 0.4416, + "step": 616 + }, + { + "epoch": 0.12164826498422712, + "grad_norm": 0.9276302620532716, + "learning_rate": 1.9957215477158196e-05, + "loss": 0.5071, + "step": 617 + }, + { + "epoch": 0.12184542586750789, + "grad_norm": 0.9200272790666039, + "learning_rate": 1.9957072128102792e-05, + "loss": 0.4609, + "step": 618 + }, + { + "epoch": 0.12204258675078865, + "grad_norm": 0.8595424766519794, + "learning_rate": 1.995692853982071e-05, + "loss": 0.4935, + "step": 619 + }, + { + "epoch": 0.1222397476340694, + "grad_norm": 0.8508720329773822, + "learning_rate": 1.99567847123154e-05, + "loss": 0.4703, + "step": 620 + }, + { + "epoch": 0.12243690851735016, + "grad_norm": 0.7662885988170952, + "learning_rate": 1.9956640645590326e-05, + "loss": 0.4285, + "step": 621 + }, + { + "epoch": 0.12263406940063092, + "grad_norm": 0.9173829148052663, + "learning_rate": 1.9956496339648936e-05, + "loss": 0.4573, + "step": 622 + }, + { + "epoch": 0.12283123028391167, + "grad_norm": 0.7616629858014796, + "learning_rate": 1.9956351794494706e-05, + "loss": 0.5152, + "step": 623 + }, + { + "epoch": 0.12302839116719243, + "grad_norm": 0.8272088646379836, + "learning_rate": 1.9956207010131107e-05, + "loss": 0.477, + "step": 624 + }, + { + "epoch": 0.12322555205047318, + "grad_norm": 0.7612317285779934, + "learning_rate": 1.9956061986561615e-05, + "loss": 0.4668, + "step": 625 + }, + { + "epoch": 0.12342271293375394, + "grad_norm": 0.7612795525681632, + "learning_rate": 1.9955916723789718e-05, + "loss": 0.4562, + "step": 626 + }, + { + "epoch": 0.1236198738170347, + "grad_norm": 0.8179624963182074, + "learning_rate": 1.99557712218189e-05, + "loss": 0.4713, + "step": 627 + }, + { + "epoch": 0.12381703470031545, + "grad_norm": 0.8528861188969025, + "learning_rate": 1.995562548065267e-05, + "loss": 0.4664, + "step": 628 + }, + { + "epoch": 0.12401419558359622, + "grad_norm": 0.7690605067619397, + "learning_rate": 1.995547950029451e-05, + "loss": 0.455, + "step": 629 + }, + { + "epoch": 0.12421135646687698, + "grad_norm": 0.7483628212376229, + "learning_rate": 1.9955333280747944e-05, + "loss": 0.4652, + "step": 630 + }, + { + "epoch": 0.12440851735015773, + "grad_norm": 0.9221392473701614, + "learning_rate": 1.995518682201648e-05, + "loss": 0.4776, + "step": 631 + }, + { + "epoch": 0.12460567823343849, + "grad_norm": 0.851528134608556, + "learning_rate": 1.995504012410363e-05, + "loss": 0.4963, + "step": 632 + }, + { + "epoch": 0.12480283911671924, + "grad_norm": 0.7424830012420283, + "learning_rate": 1.9954893187012927e-05, + "loss": 0.4266, + "step": 633 + }, + { + "epoch": 0.125, + "grad_norm": 0.8242224350981598, + "learning_rate": 1.99547460107479e-05, + "loss": 0.4744, + "step": 634 + }, + { + "epoch": 0.12519716088328076, + "grad_norm": 0.7923912180491203, + "learning_rate": 1.9954598595312084e-05, + "loss": 0.4633, + "step": 635 + }, + { + "epoch": 0.1253943217665615, + "grad_norm": 0.8875572382982255, + "learning_rate": 1.9954450940709018e-05, + "loss": 0.4813, + "step": 636 + }, + { + "epoch": 0.12559148264984227, + "grad_norm": 0.7926285972835072, + "learning_rate": 1.9954303046942255e-05, + "loss": 0.4878, + "step": 637 + }, + { + "epoch": 0.12578864353312302, + "grad_norm": 0.7499862912600862, + "learning_rate": 1.995415491401534e-05, + "loss": 0.4734, + "step": 638 + }, + { + "epoch": 0.12598580441640378, + "grad_norm": 0.7691964904703829, + "learning_rate": 1.9954006541931844e-05, + "loss": 0.4509, + "step": 639 + }, + { + "epoch": 0.12618296529968454, + "grad_norm": 0.7532129288589031, + "learning_rate": 1.9953857930695318e-05, + "loss": 0.4733, + "step": 640 + }, + { + "epoch": 0.1263801261829653, + "grad_norm": 0.8835688736071808, + "learning_rate": 1.995370908030934e-05, + "loss": 0.4813, + "step": 641 + }, + { + "epoch": 0.12657728706624605, + "grad_norm": 0.8280331392534872, + "learning_rate": 1.995355999077749e-05, + "loss": 0.5209, + "step": 642 + }, + { + "epoch": 0.1267744479495268, + "grad_norm": 0.7790589964453112, + "learning_rate": 1.9953410662103346e-05, + "loss": 0.468, + "step": 643 + }, + { + "epoch": 0.12697160883280756, + "grad_norm": 1.1955818097157254, + "learning_rate": 1.995326109429049e-05, + "loss": 0.5219, + "step": 644 + }, + { + "epoch": 0.12716876971608831, + "grad_norm": 0.760756053231397, + "learning_rate": 1.9953111287342524e-05, + "loss": 0.4269, + "step": 645 + }, + { + "epoch": 0.1273659305993691, + "grad_norm": 0.801104959032305, + "learning_rate": 1.9952961241263047e-05, + "loss": 0.4947, + "step": 646 + }, + { + "epoch": 0.12756309148264985, + "grad_norm": 0.9451949931761754, + "learning_rate": 1.9952810956055656e-05, + "loss": 0.4904, + "step": 647 + }, + { + "epoch": 0.1277602523659306, + "grad_norm": 0.8441360454771364, + "learning_rate": 1.995266043172397e-05, + "loss": 0.4505, + "step": 648 + }, + { + "epoch": 0.12795741324921137, + "grad_norm": 1.975864774694993, + "learning_rate": 1.99525096682716e-05, + "loss": 0.4824, + "step": 649 + }, + { + "epoch": 0.12815457413249212, + "grad_norm": 0.9293646943980033, + "learning_rate": 1.995235866570217e-05, + "loss": 0.4646, + "step": 650 + }, + { + "epoch": 0.12835173501577288, + "grad_norm": 0.7685956450339866, + "learning_rate": 1.9952207424019314e-05, + "loss": 0.4259, + "step": 651 + }, + { + "epoch": 0.12854889589905363, + "grad_norm": 1.0613268056457563, + "learning_rate": 1.9952055943226656e-05, + "loss": 0.4783, + "step": 652 + }, + { + "epoch": 0.1287460567823344, + "grad_norm": 0.7669978348644308, + "learning_rate": 1.995190422332784e-05, + "loss": 0.4342, + "step": 653 + }, + { + "epoch": 0.12894321766561515, + "grad_norm": 1.3019510608457148, + "learning_rate": 1.995175226432651e-05, + "loss": 0.5192, + "step": 654 + }, + { + "epoch": 0.1291403785488959, + "grad_norm": 1.0625567624596923, + "learning_rate": 1.995160006622632e-05, + "loss": 0.483, + "step": 655 + }, + { + "epoch": 0.12933753943217666, + "grad_norm": 0.7982339743597725, + "learning_rate": 1.995144762903092e-05, + "loss": 0.4548, + "step": 656 + }, + { + "epoch": 0.1295347003154574, + "grad_norm": 2.15419728357655, + "learning_rate": 1.995129495274398e-05, + "loss": 0.4486, + "step": 657 + }, + { + "epoch": 0.12973186119873817, + "grad_norm": 0.7953532235642334, + "learning_rate": 1.9951142037369163e-05, + "loss": 0.5146, + "step": 658 + }, + { + "epoch": 0.12992902208201892, + "grad_norm": 0.7733286355003084, + "learning_rate": 1.995098888291015e-05, + "loss": 0.4509, + "step": 659 + }, + { + "epoch": 0.13012618296529968, + "grad_norm": 0.7219639448869223, + "learning_rate": 1.995083548937061e-05, + "loss": 0.4339, + "step": 660 + }, + { + "epoch": 0.13032334384858044, + "grad_norm": 0.6876863470552866, + "learning_rate": 1.9950681856754236e-05, + "loss": 0.4397, + "step": 661 + }, + { + "epoch": 0.1305205047318612, + "grad_norm": 0.7330567511407265, + "learning_rate": 1.9950527985064717e-05, + "loss": 0.4403, + "step": 662 + }, + { + "epoch": 0.13071766561514195, + "grad_norm": 0.6967061391482817, + "learning_rate": 1.9950373874305752e-05, + "loss": 0.4402, + "step": 663 + }, + { + "epoch": 0.1309148264984227, + "grad_norm": 1.2938777572194384, + "learning_rate": 1.9950219524481042e-05, + "loss": 0.5203, + "step": 664 + }, + { + "epoch": 0.13111198738170346, + "grad_norm": 0.9539568996661243, + "learning_rate": 1.995006493559429e-05, + "loss": 0.4834, + "step": 665 + }, + { + "epoch": 0.13130914826498422, + "grad_norm": 0.8042726712344785, + "learning_rate": 1.9949910107649218e-05, + "loss": 0.4626, + "step": 666 + }, + { + "epoch": 0.13150630914826497, + "grad_norm": 1.6472739683316433, + "learning_rate": 1.9949755040649545e-05, + "loss": 0.479, + "step": 667 + }, + { + "epoch": 0.13170347003154576, + "grad_norm": 1.0783243407313121, + "learning_rate": 1.9949599734598993e-05, + "loss": 0.4656, + "step": 668 + }, + { + "epoch": 0.1319006309148265, + "grad_norm": 0.8310407005161574, + "learning_rate": 1.9949444189501294e-05, + "loss": 0.4254, + "step": 669 + }, + { + "epoch": 0.13209779179810727, + "grad_norm": 0.7957824614600361, + "learning_rate": 1.9949288405360186e-05, + "loss": 0.4683, + "step": 670 + }, + { + "epoch": 0.13229495268138802, + "grad_norm": 0.7451707613081128, + "learning_rate": 1.9949132382179415e-05, + "loss": 0.4632, + "step": 671 + }, + { + "epoch": 0.13249211356466878, + "grad_norm": 0.7738134822360583, + "learning_rate": 1.9948976119962724e-05, + "loss": 0.4252, + "step": 672 + }, + { + "epoch": 0.13268927444794953, + "grad_norm": 1.2563956712818225, + "learning_rate": 1.9948819618713868e-05, + "loss": 0.4297, + "step": 673 + }, + { + "epoch": 0.1328864353312303, + "grad_norm": 0.7251684476228085, + "learning_rate": 1.994866287843661e-05, + "loss": 0.4308, + "step": 674 + }, + { + "epoch": 0.13308359621451105, + "grad_norm": 0.8186852540206319, + "learning_rate": 1.9948505899134717e-05, + "loss": 0.4646, + "step": 675 + }, + { + "epoch": 0.1332807570977918, + "grad_norm": 1.7860304743465076, + "learning_rate": 1.994834868081196e-05, + "loss": 0.4876, + "step": 676 + }, + { + "epoch": 0.13347791798107256, + "grad_norm": 0.850580467492657, + "learning_rate": 1.9948191223472108e-05, + "loss": 0.4828, + "step": 677 + }, + { + "epoch": 0.13367507886435331, + "grad_norm": 0.7130834372780571, + "learning_rate": 1.9948033527118954e-05, + "loss": 0.4874, + "step": 678 + }, + { + "epoch": 0.13387223974763407, + "grad_norm": 0.7595923377554168, + "learning_rate": 1.9947875591756286e-05, + "loss": 0.4508, + "step": 679 + }, + { + "epoch": 0.13406940063091483, + "grad_norm": 0.841937797265363, + "learning_rate": 1.9947717417387894e-05, + "loss": 0.4789, + "step": 680 + }, + { + "epoch": 0.13426656151419558, + "grad_norm": 0.7865575362210725, + "learning_rate": 1.994755900401758e-05, + "loss": 0.439, + "step": 681 + }, + { + "epoch": 0.13446372239747634, + "grad_norm": 0.7464667503739694, + "learning_rate": 1.9947400351649148e-05, + "loss": 0.4574, + "step": 682 + }, + { + "epoch": 0.1346608832807571, + "grad_norm": 0.765573758240295, + "learning_rate": 1.9947241460286414e-05, + "loss": 0.4661, + "step": 683 + }, + { + "epoch": 0.13485804416403785, + "grad_norm": 0.8053849370801137, + "learning_rate": 1.9947082329933192e-05, + "loss": 0.4902, + "step": 684 + }, + { + "epoch": 0.1350552050473186, + "grad_norm": 0.9079668954607747, + "learning_rate": 1.9946922960593307e-05, + "loss": 0.4344, + "step": 685 + }, + { + "epoch": 0.13525236593059936, + "grad_norm": 0.7910386751156379, + "learning_rate": 1.994676335227059e-05, + "loss": 0.4423, + "step": 686 + }, + { + "epoch": 0.13544952681388012, + "grad_norm": 1.5549413245532762, + "learning_rate": 1.994660350496887e-05, + "loss": 0.5308, + "step": 687 + }, + { + "epoch": 0.13564668769716087, + "grad_norm": 0.9255915463007742, + "learning_rate": 1.9946443418691994e-05, + "loss": 0.4618, + "step": 688 + }, + { + "epoch": 0.13584384858044163, + "grad_norm": 0.7804360864352426, + "learning_rate": 1.9946283093443803e-05, + "loss": 0.449, + "step": 689 + }, + { + "epoch": 0.13604100946372238, + "grad_norm": 0.7475663046747916, + "learning_rate": 1.9946122529228153e-05, + "loss": 0.4503, + "step": 690 + }, + { + "epoch": 0.13623817034700317, + "grad_norm": 1.2554380433560701, + "learning_rate": 1.9945961726048895e-05, + "loss": 0.5168, + "step": 691 + }, + { + "epoch": 0.13643533123028392, + "grad_norm": 0.7879053546798648, + "learning_rate": 1.9945800683909904e-05, + "loss": 0.4771, + "step": 692 + }, + { + "epoch": 0.13663249211356468, + "grad_norm": 0.7487562573568673, + "learning_rate": 1.9945639402815037e-05, + "loss": 0.4672, + "step": 693 + }, + { + "epoch": 0.13682965299684544, + "grad_norm": 0.7202621138555081, + "learning_rate": 1.9945477882768177e-05, + "loss": 0.447, + "step": 694 + }, + { + "epoch": 0.1370268138801262, + "grad_norm": 0.8717913765163078, + "learning_rate": 1.99453161237732e-05, + "loss": 0.4599, + "step": 695 + }, + { + "epoch": 0.13722397476340695, + "grad_norm": 0.8569531287456702, + "learning_rate": 1.9945154125833996e-05, + "loss": 0.4604, + "step": 696 + }, + { + "epoch": 0.1374211356466877, + "grad_norm": 0.8364641774305389, + "learning_rate": 1.9944991888954453e-05, + "loss": 0.48, + "step": 697 + }, + { + "epoch": 0.13761829652996846, + "grad_norm": 0.7618210867976017, + "learning_rate": 1.9944829413138472e-05, + "loss": 0.4546, + "step": 698 + }, + { + "epoch": 0.13781545741324921, + "grad_norm": 0.745175412220073, + "learning_rate": 1.9944666698389957e-05, + "loss": 0.465, + "step": 699 + }, + { + "epoch": 0.13801261829652997, + "grad_norm": 0.8186541094239488, + "learning_rate": 1.9944503744712814e-05, + "loss": 0.5081, + "step": 700 + }, + { + "epoch": 0.13820977917981073, + "grad_norm": 0.7800566021733093, + "learning_rate": 1.994434055211096e-05, + "loss": 0.4692, + "step": 701 + }, + { + "epoch": 0.13840694006309148, + "grad_norm": 2.337423114442513, + "learning_rate": 1.9944177120588318e-05, + "loss": 0.4922, + "step": 702 + }, + { + "epoch": 0.13860410094637224, + "grad_norm": 0.9603122253059109, + "learning_rate": 1.9944013450148812e-05, + "loss": 0.5075, + "step": 703 + }, + { + "epoch": 0.138801261829653, + "grad_norm": 0.7119216198791689, + "learning_rate": 1.9943849540796375e-05, + "loss": 0.4626, + "step": 704 + }, + { + "epoch": 0.13899842271293375, + "grad_norm": 0.8234426695741333, + "learning_rate": 1.9943685392534945e-05, + "loss": 0.4274, + "step": 705 + }, + { + "epoch": 0.1391955835962145, + "grad_norm": 0.8841914182356732, + "learning_rate": 1.9943521005368468e-05, + "loss": 0.5169, + "step": 706 + }, + { + "epoch": 0.13939274447949526, + "grad_norm": 0.9754273189473419, + "learning_rate": 1.994335637930089e-05, + "loss": 0.4822, + "step": 707 + }, + { + "epoch": 0.13958990536277602, + "grad_norm": 0.8312745222051547, + "learning_rate": 1.9943191514336164e-05, + "loss": 0.4614, + "step": 708 + }, + { + "epoch": 0.13978706624605677, + "grad_norm": 0.8430236639900083, + "learning_rate": 1.9943026410478258e-05, + "loss": 0.4339, + "step": 709 + }, + { + "epoch": 0.13998422712933753, + "grad_norm": 1.0814961430956602, + "learning_rate": 1.9942861067731135e-05, + "loss": 0.4825, + "step": 710 + }, + { + "epoch": 0.14018138801261829, + "grad_norm": 0.77685594526798, + "learning_rate": 1.994269548609877e-05, + "loss": 0.4801, + "step": 711 + }, + { + "epoch": 0.14037854889589904, + "grad_norm": 0.8376210695048821, + "learning_rate": 1.9942529665585134e-05, + "loss": 0.4742, + "step": 712 + }, + { + "epoch": 0.1405757097791798, + "grad_norm": 1.0443709227200295, + "learning_rate": 1.994236360619422e-05, + "loss": 0.4616, + "step": 713 + }, + { + "epoch": 0.14077287066246058, + "grad_norm": 0.7104447806333858, + "learning_rate": 1.9942197307930014e-05, + "loss": 0.4193, + "step": 714 + }, + { + "epoch": 0.14097003154574134, + "grad_norm": 0.7967791879544678, + "learning_rate": 1.994203077079651e-05, + "loss": 0.4377, + "step": 715 + }, + { + "epoch": 0.1411671924290221, + "grad_norm": 0.6852662543141802, + "learning_rate": 1.994186399479771e-05, + "loss": 0.4558, + "step": 716 + }, + { + "epoch": 0.14136435331230285, + "grad_norm": 0.7293696917628472, + "learning_rate": 1.9941696979937622e-05, + "loss": 0.4695, + "step": 717 + }, + { + "epoch": 0.1415615141955836, + "grad_norm": 1.0106248411363608, + "learning_rate": 1.994152972622026e-05, + "loss": 0.4496, + "step": 718 + }, + { + "epoch": 0.14175867507886436, + "grad_norm": 2.069396673415166, + "learning_rate": 1.994136223364964e-05, + "loss": 0.5479, + "step": 719 + }, + { + "epoch": 0.14195583596214512, + "grad_norm": 0.8625581928141414, + "learning_rate": 1.994119450222978e-05, + "loss": 0.5064, + "step": 720 + }, + { + "epoch": 0.14215299684542587, + "grad_norm": 0.9612101953431659, + "learning_rate": 1.9941026531964723e-05, + "loss": 0.4676, + "step": 721 + }, + { + "epoch": 0.14235015772870663, + "grad_norm": 0.775181863734654, + "learning_rate": 1.9940858322858493e-05, + "loss": 0.4736, + "step": 722 + }, + { + "epoch": 0.14254731861198738, + "grad_norm": 1.0608368229238194, + "learning_rate": 1.994068987491514e-05, + "loss": 0.4761, + "step": 723 + }, + { + "epoch": 0.14274447949526814, + "grad_norm": 0.7660677802463719, + "learning_rate": 1.9940521188138707e-05, + "loss": 0.4652, + "step": 724 + }, + { + "epoch": 0.1429416403785489, + "grad_norm": 0.8853653109759493, + "learning_rate": 1.9940352262533253e-05, + "loss": 0.4351, + "step": 725 + }, + { + "epoch": 0.14313880126182965, + "grad_norm": 0.8053400341791106, + "learning_rate": 1.9940183098102823e-05, + "loss": 0.4688, + "step": 726 + }, + { + "epoch": 0.1433359621451104, + "grad_norm": 0.8403515646943758, + "learning_rate": 1.9940013694851492e-05, + "loss": 0.4709, + "step": 727 + }, + { + "epoch": 0.14353312302839116, + "grad_norm": 0.8639620985700686, + "learning_rate": 1.9939844052783328e-05, + "loss": 0.44, + "step": 728 + }, + { + "epoch": 0.14373028391167192, + "grad_norm": 0.7864708085272261, + "learning_rate": 1.9939674171902406e-05, + "loss": 0.4835, + "step": 729 + }, + { + "epoch": 0.14392744479495267, + "grad_norm": 1.5378342164534347, + "learning_rate": 1.9939504052212807e-05, + "loss": 0.4371, + "step": 730 + }, + { + "epoch": 0.14412460567823343, + "grad_norm": 0.8960131433484737, + "learning_rate": 1.993933369371862e-05, + "loss": 0.4814, + "step": 731 + }, + { + "epoch": 0.1443217665615142, + "grad_norm": 1.2989860336198154, + "learning_rate": 1.9939163096423936e-05, + "loss": 0.4499, + "step": 732 + }, + { + "epoch": 0.14451892744479494, + "grad_norm": 0.8858611738667077, + "learning_rate": 1.9938992260332854e-05, + "loss": 0.4327, + "step": 733 + }, + { + "epoch": 0.1447160883280757, + "grad_norm": 1.1182251531253669, + "learning_rate": 1.993882118544948e-05, + "loss": 0.4575, + "step": 734 + }, + { + "epoch": 0.14491324921135645, + "grad_norm": 1.181366460583882, + "learning_rate": 1.993864987177792e-05, + "loss": 0.4997, + "step": 735 + }, + { + "epoch": 0.14511041009463724, + "grad_norm": 0.8730586316239893, + "learning_rate": 1.9938478319322296e-05, + "loss": 0.4279, + "step": 736 + }, + { + "epoch": 0.145307570977918, + "grad_norm": 0.8622249699446333, + "learning_rate": 1.9938306528086728e-05, + "loss": 0.4689, + "step": 737 + }, + { + "epoch": 0.14550473186119875, + "grad_norm": 0.856694039815468, + "learning_rate": 1.9938134498075344e-05, + "loss": 0.495, + "step": 738 + }, + { + "epoch": 0.1457018927444795, + "grad_norm": 1.076447016915175, + "learning_rate": 1.993796222929227e-05, + "loss": 0.4981, + "step": 739 + }, + { + "epoch": 0.14589905362776026, + "grad_norm": 1.1443003392978084, + "learning_rate": 1.9937789721741654e-05, + "loss": 0.4818, + "step": 740 + }, + { + "epoch": 0.14609621451104102, + "grad_norm": 0.7913177157915059, + "learning_rate": 1.9937616975427635e-05, + "loss": 0.4423, + "step": 741 + }, + { + "epoch": 0.14629337539432177, + "grad_norm": 0.8690811074204894, + "learning_rate": 1.993744399035437e-05, + "loss": 0.4583, + "step": 742 + }, + { + "epoch": 0.14649053627760253, + "grad_norm": 0.8018130690999904, + "learning_rate": 1.9937270766526007e-05, + "loss": 0.4581, + "step": 743 + }, + { + "epoch": 0.14668769716088328, + "grad_norm": 0.7068458870399968, + "learning_rate": 1.9937097303946712e-05, + "loss": 0.4365, + "step": 744 + }, + { + "epoch": 0.14688485804416404, + "grad_norm": 1.106160247073608, + "learning_rate": 1.993692360262065e-05, + "loss": 0.4621, + "step": 745 + }, + { + "epoch": 0.1470820189274448, + "grad_norm": 1.2660319824619375, + "learning_rate": 1.9936749662552e-05, + "loss": 0.4606, + "step": 746 + }, + { + "epoch": 0.14727917981072555, + "grad_norm": 1.0379124369378552, + "learning_rate": 1.9936575483744934e-05, + "loss": 0.5044, + "step": 747 + }, + { + "epoch": 0.1474763406940063, + "grad_norm": 2.817773387208584, + "learning_rate": 1.993640106620364e-05, + "loss": 0.4786, + "step": 748 + }, + { + "epoch": 0.14767350157728706, + "grad_norm": 1.2455773315165135, + "learning_rate": 1.993622640993231e-05, + "loss": 0.4406, + "step": 749 + }, + { + "epoch": 0.14787066246056782, + "grad_norm": 0.8090742340724816, + "learning_rate": 1.993605151493514e-05, + "loss": 0.4581, + "step": 750 + }, + { + "epoch": 0.14806782334384858, + "grad_norm": 1.079672829303502, + "learning_rate": 1.9935876381216327e-05, + "loss": 0.4869, + "step": 751 + }, + { + "epoch": 0.14826498422712933, + "grad_norm": 1.1108900962579644, + "learning_rate": 1.993570100878009e-05, + "loss": 0.4919, + "step": 752 + }, + { + "epoch": 0.1484621451104101, + "grad_norm": 0.9754359159817712, + "learning_rate": 1.993552539763063e-05, + "loss": 0.4823, + "step": 753 + }, + { + "epoch": 0.14865930599369084, + "grad_norm": 0.9888856468493855, + "learning_rate": 1.9935349547772168e-05, + "loss": 0.4685, + "step": 754 + }, + { + "epoch": 0.1488564668769716, + "grad_norm": 0.8379931331598425, + "learning_rate": 1.993517345920894e-05, + "loss": 0.4936, + "step": 755 + }, + { + "epoch": 0.14905362776025236, + "grad_norm": 1.0605810276155365, + "learning_rate": 1.9934997131945165e-05, + "loss": 0.5102, + "step": 756 + }, + { + "epoch": 0.1492507886435331, + "grad_norm": 0.7915803452960315, + "learning_rate": 1.993482056598508e-05, + "loss": 0.4434, + "step": 757 + }, + { + "epoch": 0.14944794952681387, + "grad_norm": 0.9745418084289177, + "learning_rate": 1.9934643761332933e-05, + "loss": 0.493, + "step": 758 + }, + { + "epoch": 0.14964511041009465, + "grad_norm": 0.8122670056237417, + "learning_rate": 1.993446671799297e-05, + "loss": 0.4966, + "step": 759 + }, + { + "epoch": 0.1498422712933754, + "grad_norm": 0.8242151071688861, + "learning_rate": 1.9934289435969443e-05, + "loss": 0.4771, + "step": 760 + }, + { + "epoch": 0.15003943217665616, + "grad_norm": 0.8669205650067113, + "learning_rate": 1.9934111915266614e-05, + "loss": 0.4658, + "step": 761 + }, + { + "epoch": 0.15023659305993692, + "grad_norm": 0.8210061083466603, + "learning_rate": 1.9933934155888745e-05, + "loss": 0.4503, + "step": 762 + }, + { + "epoch": 0.15043375394321767, + "grad_norm": 0.8739389286616153, + "learning_rate": 1.993375615784011e-05, + "loss": 0.4426, + "step": 763 + }, + { + "epoch": 0.15063091482649843, + "grad_norm": 0.8857407360915495, + "learning_rate": 1.993357792112498e-05, + "loss": 0.4337, + "step": 764 + }, + { + "epoch": 0.15082807570977919, + "grad_norm": 0.8810531719213002, + "learning_rate": 1.9933399445747645e-05, + "loss": 0.4614, + "step": 765 + }, + { + "epoch": 0.15102523659305994, + "grad_norm": 0.8157853115519594, + "learning_rate": 1.9933220731712385e-05, + "loss": 0.4531, + "step": 766 + }, + { + "epoch": 0.1512223974763407, + "grad_norm": 0.6694213590950058, + "learning_rate": 1.9933041779023502e-05, + "loss": 0.4429, + "step": 767 + }, + { + "epoch": 0.15141955835962145, + "grad_norm": 0.8193309247418029, + "learning_rate": 1.993286258768529e-05, + "loss": 0.4641, + "step": 768 + }, + { + "epoch": 0.1516167192429022, + "grad_norm": 1.13684291197781, + "learning_rate": 1.9932683157702054e-05, + "loss": 0.4807, + "step": 769 + }, + { + "epoch": 0.15181388012618297, + "grad_norm": 0.7712724217918977, + "learning_rate": 1.9932503489078105e-05, + "loss": 0.496, + "step": 770 + }, + { + "epoch": 0.15201104100946372, + "grad_norm": 0.7302957702591549, + "learning_rate": 1.993232358181776e-05, + "loss": 0.4562, + "step": 771 + }, + { + "epoch": 0.15220820189274448, + "grad_norm": 2.0853914712696477, + "learning_rate": 1.9932143435925346e-05, + "loss": 0.4809, + "step": 772 + }, + { + "epoch": 0.15240536277602523, + "grad_norm": 0.7477688111923385, + "learning_rate": 1.993196305140519e-05, + "loss": 0.4518, + "step": 773 + }, + { + "epoch": 0.152602523659306, + "grad_norm": 0.7646481115742062, + "learning_rate": 1.993178242826162e-05, + "loss": 0.4686, + "step": 774 + }, + { + "epoch": 0.15279968454258674, + "grad_norm": 0.7688921576232522, + "learning_rate": 1.9931601566498976e-05, + "loss": 0.4965, + "step": 775 + }, + { + "epoch": 0.1529968454258675, + "grad_norm": 0.749864201600683, + "learning_rate": 1.9931420466121613e-05, + "loss": 0.435, + "step": 776 + }, + { + "epoch": 0.15319400630914826, + "grad_norm": 0.856076678547743, + "learning_rate": 1.993123912713387e-05, + "loss": 0.5236, + "step": 777 + }, + { + "epoch": 0.153391167192429, + "grad_norm": 1.1208704076088636, + "learning_rate": 1.9931057549540114e-05, + "loss": 0.5461, + "step": 778 + }, + { + "epoch": 0.15358832807570977, + "grad_norm": 1.6176725348893128, + "learning_rate": 1.9930875733344698e-05, + "loss": 0.4927, + "step": 779 + }, + { + "epoch": 0.15378548895899052, + "grad_norm": 0.6955630821889409, + "learning_rate": 1.9930693678552e-05, + "loss": 0.4298, + "step": 780 + }, + { + "epoch": 0.15398264984227128, + "grad_norm": 0.7928697755514822, + "learning_rate": 1.9930511385166388e-05, + "loss": 0.4368, + "step": 781 + }, + { + "epoch": 0.15417981072555206, + "grad_norm": 0.9287503152149722, + "learning_rate": 1.9930328853192243e-05, + "loss": 0.4308, + "step": 782 + }, + { + "epoch": 0.15437697160883282, + "grad_norm": 0.9144564030468558, + "learning_rate": 1.993014608263395e-05, + "loss": 0.4802, + "step": 783 + }, + { + "epoch": 0.15457413249211358, + "grad_norm": 0.7843727384732332, + "learning_rate": 1.9929963073495896e-05, + "loss": 0.4635, + "step": 784 + }, + { + "epoch": 0.15477129337539433, + "grad_norm": 0.8191880589199644, + "learning_rate": 1.992977982578249e-05, + "loss": 0.4816, + "step": 785 + }, + { + "epoch": 0.1549684542586751, + "grad_norm": 0.8904807870661935, + "learning_rate": 1.9929596339498122e-05, + "loss": 0.47, + "step": 786 + }, + { + "epoch": 0.15516561514195584, + "grad_norm": 1.8183390384125857, + "learning_rate": 1.9929412614647207e-05, + "loss": 0.4631, + "step": 787 + }, + { + "epoch": 0.1553627760252366, + "grad_norm": 0.8216800794838671, + "learning_rate": 1.992922865123416e-05, + "loss": 0.4264, + "step": 788 + }, + { + "epoch": 0.15555993690851735, + "grad_norm": 0.7165148280953453, + "learning_rate": 1.9929044449263397e-05, + "loss": 0.4674, + "step": 789 + }, + { + "epoch": 0.1557570977917981, + "grad_norm": 1.52791161060643, + "learning_rate": 1.9928860008739343e-05, + "loss": 0.5036, + "step": 790 + }, + { + "epoch": 0.15595425867507887, + "grad_norm": 0.9923711638162429, + "learning_rate": 1.9928675329666435e-05, + "loss": 0.4778, + "step": 791 + }, + { + "epoch": 0.15615141955835962, + "grad_norm": 0.8551576282277356, + "learning_rate": 1.9928490412049108e-05, + "loss": 0.464, + "step": 792 + }, + { + "epoch": 0.15634858044164038, + "grad_norm": 0.7732867427931409, + "learning_rate": 1.99283052558918e-05, + "loss": 0.4623, + "step": 793 + }, + { + "epoch": 0.15654574132492113, + "grad_norm": 0.9096270847699062, + "learning_rate": 1.9928119861198962e-05, + "loss": 0.5047, + "step": 794 + }, + { + "epoch": 0.1567429022082019, + "grad_norm": 0.913028519166887, + "learning_rate": 1.9927934227975054e-05, + "loss": 0.4832, + "step": 795 + }, + { + "epoch": 0.15694006309148265, + "grad_norm": 0.7830739067460275, + "learning_rate": 1.9927748356224528e-05, + "loss": 0.4744, + "step": 796 + }, + { + "epoch": 0.1571372239747634, + "grad_norm": 0.8632625237500243, + "learning_rate": 1.9927562245951854e-05, + "loss": 0.4458, + "step": 797 + }, + { + "epoch": 0.15733438485804416, + "grad_norm": 0.7175002952446166, + "learning_rate": 1.9927375897161502e-05, + "loss": 0.4602, + "step": 798 + }, + { + "epoch": 0.1575315457413249, + "grad_norm": 0.8959680573938172, + "learning_rate": 1.9927189309857948e-05, + "loss": 0.4611, + "step": 799 + }, + { + "epoch": 0.15772870662460567, + "grad_norm": 0.8010317830596815, + "learning_rate": 1.9927002484045678e-05, + "loss": 0.4761, + "step": 800 + }, + { + "epoch": 0.15792586750788642, + "grad_norm": 0.704519354931981, + "learning_rate": 1.992681541972918e-05, + "loss": 0.4654, + "step": 801 + }, + { + "epoch": 0.15812302839116718, + "grad_norm": 1.0431620409267255, + "learning_rate": 1.9926628116912946e-05, + "loss": 0.519, + "step": 802 + }, + { + "epoch": 0.15832018927444794, + "grad_norm": 0.743437599527334, + "learning_rate": 1.992644057560148e-05, + "loss": 0.4452, + "step": 803 + }, + { + "epoch": 0.15851735015772872, + "grad_norm": 2.131396450457408, + "learning_rate": 1.992625279579928e-05, + "loss": 0.4587, + "step": 804 + }, + { + "epoch": 0.15871451104100948, + "grad_norm": 1.6197122809814877, + "learning_rate": 1.992606477751087e-05, + "loss": 0.4668, + "step": 805 + }, + { + "epoch": 0.15891167192429023, + "grad_norm": 0.9266315678990761, + "learning_rate": 1.9925876520740758e-05, + "loss": 0.5003, + "step": 806 + }, + { + "epoch": 0.159108832807571, + "grad_norm": 1.3138014974549306, + "learning_rate": 1.9925688025493468e-05, + "loss": 0.4816, + "step": 807 + }, + { + "epoch": 0.15930599369085174, + "grad_norm": 0.7340535577058777, + "learning_rate": 1.9925499291773528e-05, + "loss": 0.4804, + "step": 808 + }, + { + "epoch": 0.1595031545741325, + "grad_norm": 1.1487683565124704, + "learning_rate": 1.9925310319585475e-05, + "loss": 0.4804, + "step": 809 + }, + { + "epoch": 0.15970031545741326, + "grad_norm": 0.7305026939171207, + "learning_rate": 1.9925121108933852e-05, + "loss": 0.4774, + "step": 810 + }, + { + "epoch": 0.159897476340694, + "grad_norm": 0.9950230997430713, + "learning_rate": 1.99249316598232e-05, + "loss": 0.4638, + "step": 811 + }, + { + "epoch": 0.16009463722397477, + "grad_norm": 0.857287078084844, + "learning_rate": 1.9924741972258076e-05, + "loss": 0.4742, + "step": 812 + }, + { + "epoch": 0.16029179810725552, + "grad_norm": 0.8005250545507363, + "learning_rate": 1.9924552046243026e-05, + "loss": 0.488, + "step": 813 + }, + { + "epoch": 0.16048895899053628, + "grad_norm": 0.90398378601464, + "learning_rate": 1.9924361881782625e-05, + "loss": 0.4677, + "step": 814 + }, + { + "epoch": 0.16068611987381703, + "grad_norm": 0.7372247300869629, + "learning_rate": 1.992417147888144e-05, + "loss": 0.4529, + "step": 815 + }, + { + "epoch": 0.1608832807570978, + "grad_norm": 0.8887941980027961, + "learning_rate": 1.992398083754404e-05, + "loss": 0.4629, + "step": 816 + }, + { + "epoch": 0.16108044164037855, + "grad_norm": 0.6535394579923204, + "learning_rate": 1.992378995777501e-05, + "loss": 0.4515, + "step": 817 + }, + { + "epoch": 0.1612776025236593, + "grad_norm": 0.84734882482852, + "learning_rate": 1.9923598839578937e-05, + "loss": 0.4263, + "step": 818 + }, + { + "epoch": 0.16147476340694006, + "grad_norm": 0.7925717387308715, + "learning_rate": 1.9923407482960408e-05, + "loss": 0.4775, + "step": 819 + }, + { + "epoch": 0.16167192429022081, + "grad_norm": 0.8742951540598778, + "learning_rate": 1.9923215887924022e-05, + "loss": 0.4637, + "step": 820 + }, + { + "epoch": 0.16186908517350157, + "grad_norm": 0.7279557612080059, + "learning_rate": 1.9923024054474384e-05, + "loss": 0.4636, + "step": 821 + }, + { + "epoch": 0.16206624605678233, + "grad_norm": 0.8534513364783904, + "learning_rate": 1.99228319826161e-05, + "loss": 0.4857, + "step": 822 + }, + { + "epoch": 0.16226340694006308, + "grad_norm": 0.7402713942104387, + "learning_rate": 1.992263967235379e-05, + "loss": 0.481, + "step": 823 + }, + { + "epoch": 0.16246056782334384, + "grad_norm": 0.7153338719389193, + "learning_rate": 1.992244712369207e-05, + "loss": 0.4457, + "step": 824 + }, + { + "epoch": 0.1626577287066246, + "grad_norm": 1.4728339073471215, + "learning_rate": 1.9922254336635567e-05, + "loss": 0.4514, + "step": 825 + }, + { + "epoch": 0.16285488958990535, + "grad_norm": 0.8791659930736712, + "learning_rate": 1.9922061311188914e-05, + "loss": 0.4852, + "step": 826 + }, + { + "epoch": 0.16305205047318613, + "grad_norm": 0.7681632672109859, + "learning_rate": 1.9921868047356747e-05, + "loss": 0.4869, + "step": 827 + }, + { + "epoch": 0.1632492113564669, + "grad_norm": 0.8229171295277476, + "learning_rate": 1.992167454514371e-05, + "loss": 0.4828, + "step": 828 + }, + { + "epoch": 0.16344637223974764, + "grad_norm": 0.7529692381939216, + "learning_rate": 1.9921480804554453e-05, + "loss": 0.46, + "step": 829 + }, + { + "epoch": 0.1636435331230284, + "grad_norm": 0.9423934940595431, + "learning_rate": 1.9921286825593632e-05, + "loss": 0.4418, + "step": 830 + }, + { + "epoch": 0.16384069400630916, + "grad_norm": 0.8322161382525551, + "learning_rate": 1.9921092608265902e-05, + "loss": 0.4489, + "step": 831 + }, + { + "epoch": 0.1640378548895899, + "grad_norm": 0.8265863211632591, + "learning_rate": 1.9920898152575932e-05, + "loss": 0.457, + "step": 832 + }, + { + "epoch": 0.16423501577287067, + "grad_norm": 0.8016187387195418, + "learning_rate": 1.99207034585284e-05, + "loss": 0.4935, + "step": 833 + }, + { + "epoch": 0.16443217665615142, + "grad_norm": 0.7562870006268851, + "learning_rate": 1.992050852612797e-05, + "loss": 0.4901, + "step": 834 + }, + { + "epoch": 0.16462933753943218, + "grad_norm": 0.8037484145010387, + "learning_rate": 1.992031335537934e-05, + "loss": 0.4905, + "step": 835 + }, + { + "epoch": 0.16482649842271294, + "grad_norm": 0.8098114037544843, + "learning_rate": 1.9920117946287193e-05, + "loss": 0.465, + "step": 836 + }, + { + "epoch": 0.1650236593059937, + "grad_norm": 0.8269800376599079, + "learning_rate": 1.991992229885622e-05, + "loss": 0.4735, + "step": 837 + }, + { + "epoch": 0.16522082018927445, + "grad_norm": 0.7558534353840859, + "learning_rate": 1.9919726413091127e-05, + "loss": 0.4824, + "step": 838 + }, + { + "epoch": 0.1654179810725552, + "grad_norm": 0.766057843711096, + "learning_rate": 1.9919530288996617e-05, + "loss": 0.4509, + "step": 839 + }, + { + "epoch": 0.16561514195583596, + "grad_norm": 0.8748898217978727, + "learning_rate": 1.9919333926577406e-05, + "loss": 0.442, + "step": 840 + }, + { + "epoch": 0.16581230283911672, + "grad_norm": 0.8934360442693702, + "learning_rate": 1.9919137325838208e-05, + "loss": 0.5239, + "step": 841 + }, + { + "epoch": 0.16600946372239747, + "grad_norm": 0.8552477164679089, + "learning_rate": 1.9918940486783752e-05, + "loss": 0.4645, + "step": 842 + }, + { + "epoch": 0.16620662460567823, + "grad_norm": 0.7957360554655829, + "learning_rate": 1.9918743409418756e-05, + "loss": 0.4883, + "step": 843 + }, + { + "epoch": 0.16640378548895898, + "grad_norm": 0.6874695944020062, + "learning_rate": 1.9918546093747965e-05, + "loss": 0.4179, + "step": 844 + }, + { + "epoch": 0.16660094637223974, + "grad_norm": 0.7762889366838219, + "learning_rate": 1.991834853977612e-05, + "loss": 0.4905, + "step": 845 + }, + { + "epoch": 0.1667981072555205, + "grad_norm": 0.7696965320259346, + "learning_rate": 1.9918150747507963e-05, + "loss": 0.4404, + "step": 846 + }, + { + "epoch": 0.16699526813880125, + "grad_norm": 0.8072545032949946, + "learning_rate": 1.9917952716948243e-05, + "loss": 0.4681, + "step": 847 + }, + { + "epoch": 0.167192429022082, + "grad_norm": 0.8777353512586558, + "learning_rate": 1.9917754448101725e-05, + "loss": 0.4939, + "step": 848 + }, + { + "epoch": 0.16738958990536276, + "grad_norm": 0.7082853265302725, + "learning_rate": 1.991755594097317e-05, + "loss": 0.4543, + "step": 849 + }, + { + "epoch": 0.16758675078864355, + "grad_norm": 0.8604796366799073, + "learning_rate": 1.9917357195567347e-05, + "loss": 0.4454, + "step": 850 + }, + { + "epoch": 0.1677839116719243, + "grad_norm": 0.6898391954794304, + "learning_rate": 1.991715821188903e-05, + "loss": 0.4238, + "step": 851 + }, + { + "epoch": 0.16798107255520506, + "grad_norm": 0.8800493119077631, + "learning_rate": 1.9916958989943002e-05, + "loss": 0.4569, + "step": 852 + }, + { + "epoch": 0.1681782334384858, + "grad_norm": 0.8417073275173806, + "learning_rate": 1.9916759529734046e-05, + "loss": 0.4777, + "step": 853 + }, + { + "epoch": 0.16837539432176657, + "grad_norm": 0.7791244231651858, + "learning_rate": 1.9916559831266957e-05, + "loss": 0.4544, + "step": 854 + }, + { + "epoch": 0.16857255520504733, + "grad_norm": 0.8140536453201463, + "learning_rate": 1.9916359894546534e-05, + "loss": 0.4634, + "step": 855 + }, + { + "epoch": 0.16876971608832808, + "grad_norm": 0.7303022471726613, + "learning_rate": 1.9916159719577577e-05, + "loss": 0.4519, + "step": 856 + }, + { + "epoch": 0.16896687697160884, + "grad_norm": 0.7729780959678046, + "learning_rate": 1.9915959306364897e-05, + "loss": 0.422, + "step": 857 + }, + { + "epoch": 0.1691640378548896, + "grad_norm": 0.8034659806730786, + "learning_rate": 1.9915758654913313e-05, + "loss": 0.4942, + "step": 858 + }, + { + "epoch": 0.16936119873817035, + "grad_norm": 0.7897233162629976, + "learning_rate": 1.991555776522764e-05, + "loss": 0.4762, + "step": 859 + }, + { + "epoch": 0.1695583596214511, + "grad_norm": 0.7803962061080867, + "learning_rate": 1.9915356637312704e-05, + "loss": 0.4649, + "step": 860 + }, + { + "epoch": 0.16975552050473186, + "grad_norm": 0.9547947469437102, + "learning_rate": 1.991515527117334e-05, + "loss": 0.4652, + "step": 861 + }, + { + "epoch": 0.16995268138801262, + "grad_norm": 0.8072759008329805, + "learning_rate": 1.9914953666814392e-05, + "loss": 0.4665, + "step": 862 + }, + { + "epoch": 0.17014984227129337, + "grad_norm": 0.8077676711668577, + "learning_rate": 1.9914751824240694e-05, + "loss": 0.4617, + "step": 863 + }, + { + "epoch": 0.17034700315457413, + "grad_norm": 1.293222408833284, + "learning_rate": 1.9914549743457096e-05, + "loss": 0.4666, + "step": 864 + }, + { + "epoch": 0.17054416403785488, + "grad_norm": 0.7923729068722024, + "learning_rate": 1.991434742446846e-05, + "loss": 0.4261, + "step": 865 + }, + { + "epoch": 0.17074132492113564, + "grad_norm": 14.718649448920804, + "learning_rate": 1.9914144867279644e-05, + "loss": 0.5072, + "step": 866 + }, + { + "epoch": 0.1709384858044164, + "grad_norm": 0.831483953129002, + "learning_rate": 1.991394207189551e-05, + "loss": 0.4281, + "step": 867 + }, + { + "epoch": 0.17113564668769715, + "grad_norm": 0.8157367156824404, + "learning_rate": 1.9913739038320935e-05, + "loss": 0.4886, + "step": 868 + }, + { + "epoch": 0.1713328075709779, + "grad_norm": 0.747934401454802, + "learning_rate": 1.99135357665608e-05, + "loss": 0.464, + "step": 869 + }, + { + "epoch": 0.17152996845425866, + "grad_norm": 0.7940635147564067, + "learning_rate": 1.991333225661998e-05, + "loss": 0.446, + "step": 870 + }, + { + "epoch": 0.17172712933753942, + "grad_norm": 0.7556559635824, + "learning_rate": 1.9913128508503373e-05, + "loss": 0.4187, + "step": 871 + }, + { + "epoch": 0.1719242902208202, + "grad_norm": 0.937585969395132, + "learning_rate": 1.991292452221587e-05, + "loss": 0.4627, + "step": 872 + }, + { + "epoch": 0.17212145110410096, + "grad_norm": 0.7287727570911343, + "learning_rate": 1.9912720297762372e-05, + "loss": 0.4412, + "step": 873 + }, + { + "epoch": 0.17231861198738171, + "grad_norm": 1.4483761847244239, + "learning_rate": 1.9912515835147785e-05, + "loss": 0.4702, + "step": 874 + }, + { + "epoch": 0.17251577287066247, + "grad_norm": 0.7776251644712487, + "learning_rate": 1.9912311134377023e-05, + "loss": 0.4827, + "step": 875 + }, + { + "epoch": 0.17271293375394323, + "grad_norm": 0.811467179063533, + "learning_rate": 1.9912106195455002e-05, + "loss": 0.4303, + "step": 876 + }, + { + "epoch": 0.17291009463722398, + "grad_norm": 0.7347947732597557, + "learning_rate": 1.991190101838665e-05, + "loss": 0.48, + "step": 877 + }, + { + "epoch": 0.17310725552050474, + "grad_norm": 0.7700860751914179, + "learning_rate": 1.9911695603176896e-05, + "loss": 0.4418, + "step": 878 + }, + { + "epoch": 0.1733044164037855, + "grad_norm": 0.7424955632937327, + "learning_rate": 1.9911489949830665e-05, + "loss": 0.4778, + "step": 879 + }, + { + "epoch": 0.17350157728706625, + "grad_norm": 0.6988442966765853, + "learning_rate": 1.9911284058352916e-05, + "loss": 0.4955, + "step": 880 + }, + { + "epoch": 0.173698738170347, + "grad_norm": 1.201129785880802, + "learning_rate": 1.9911077928748577e-05, + "loss": 0.5094, + "step": 881 + }, + { + "epoch": 0.17389589905362776, + "grad_norm": 0.7077413544925765, + "learning_rate": 1.9910871561022617e-05, + "loss": 0.4489, + "step": 882 + }, + { + "epoch": 0.17409305993690852, + "grad_norm": 0.7332451772736706, + "learning_rate": 1.9910664955179983e-05, + "loss": 0.4479, + "step": 883 + }, + { + "epoch": 0.17429022082018927, + "grad_norm": 0.7177585522779164, + "learning_rate": 1.9910458111225645e-05, + "loss": 0.428, + "step": 884 + }, + { + "epoch": 0.17448738170347003, + "grad_norm": 0.6737153923574553, + "learning_rate": 1.9910251029164568e-05, + "loss": 0.4288, + "step": 885 + }, + { + "epoch": 0.17468454258675079, + "grad_norm": 0.8450869761757991, + "learning_rate": 1.9910043709001727e-05, + "loss": 0.4537, + "step": 886 + }, + { + "epoch": 0.17488170347003154, + "grad_norm": 0.6872411051621798, + "learning_rate": 1.990983615074211e-05, + "loss": 0.4449, + "step": 887 + }, + { + "epoch": 0.1750788643533123, + "grad_norm": 0.8721136747808786, + "learning_rate": 1.9909628354390697e-05, + "loss": 0.4752, + "step": 888 + }, + { + "epoch": 0.17527602523659305, + "grad_norm": 0.7655080513181773, + "learning_rate": 1.990942031995248e-05, + "loss": 0.4868, + "step": 889 + }, + { + "epoch": 0.1754731861198738, + "grad_norm": 0.7199528165114193, + "learning_rate": 1.9909212047432465e-05, + "loss": 0.4528, + "step": 890 + }, + { + "epoch": 0.17567034700315456, + "grad_norm": 0.7061497459915089, + "learning_rate": 1.990900353683565e-05, + "loss": 0.455, + "step": 891 + }, + { + "epoch": 0.17586750788643532, + "grad_norm": 2.291375288631725, + "learning_rate": 1.990879478816704e-05, + "loss": 0.4722, + "step": 892 + }, + { + "epoch": 0.17606466876971608, + "grad_norm": 0.8123591618774287, + "learning_rate": 1.9908585801431658e-05, + "loss": 0.4706, + "step": 893 + }, + { + "epoch": 0.17626182965299683, + "grad_norm": 0.785222829757174, + "learning_rate": 1.9908376576634526e-05, + "loss": 0.4608, + "step": 894 + }, + { + "epoch": 0.17645899053627762, + "grad_norm": 0.7633221752691605, + "learning_rate": 1.9908167113780665e-05, + "loss": 0.4663, + "step": 895 + }, + { + "epoch": 0.17665615141955837, + "grad_norm": 0.7931603437941884, + "learning_rate": 1.990795741287511e-05, + "loss": 0.4979, + "step": 896 + }, + { + "epoch": 0.17685331230283913, + "grad_norm": 0.740593821102, + "learning_rate": 1.99077474739229e-05, + "loss": 0.4595, + "step": 897 + }, + { + "epoch": 0.17705047318611988, + "grad_norm": 0.7137127142445181, + "learning_rate": 1.9907537296929077e-05, + "loss": 0.468, + "step": 898 + }, + { + "epoch": 0.17724763406940064, + "grad_norm": 0.7856236565024629, + "learning_rate": 1.9907326881898693e-05, + "loss": 0.4532, + "step": 899 + }, + { + "epoch": 0.1774447949526814, + "grad_norm": 0.8304106213478027, + "learning_rate": 1.99071162288368e-05, + "loss": 0.4835, + "step": 900 + }, + { + "epoch": 0.17764195583596215, + "grad_norm": 0.7075946238835652, + "learning_rate": 1.9906905337748466e-05, + "loss": 0.4415, + "step": 901 + }, + { + "epoch": 0.1778391167192429, + "grad_norm": 0.7255892333472377, + "learning_rate": 1.990669420863875e-05, + "loss": 0.462, + "step": 902 + }, + { + "epoch": 0.17803627760252366, + "grad_norm": 0.7551817312285064, + "learning_rate": 1.990648284151273e-05, + "loss": 0.4352, + "step": 903 + }, + { + "epoch": 0.17823343848580442, + "grad_norm": 0.7462637275359422, + "learning_rate": 1.9906271236375478e-05, + "loss": 0.4635, + "step": 904 + }, + { + "epoch": 0.17843059936908517, + "grad_norm": 0.7285489399185598, + "learning_rate": 1.9906059393232088e-05, + "loss": 0.4696, + "step": 905 + }, + { + "epoch": 0.17862776025236593, + "grad_norm": 0.789466645420734, + "learning_rate": 1.990584731208764e-05, + "loss": 0.4968, + "step": 906 + }, + { + "epoch": 0.17882492113564669, + "grad_norm": 2.968306758137648, + "learning_rate": 1.9905634992947235e-05, + "loss": 0.491, + "step": 907 + }, + { + "epoch": 0.17902208201892744, + "grad_norm": 0.7750070374318527, + "learning_rate": 1.990542243581597e-05, + "loss": 0.4703, + "step": 908 + }, + { + "epoch": 0.1792192429022082, + "grad_norm": 0.695028794150739, + "learning_rate": 1.9905209640698952e-05, + "loss": 0.4642, + "step": 909 + }, + { + "epoch": 0.17941640378548895, + "grad_norm": 0.7695918165395215, + "learning_rate": 1.9904996607601303e-05, + "loss": 0.4552, + "step": 910 + }, + { + "epoch": 0.1796135646687697, + "grad_norm": 0.7202892429327876, + "learning_rate": 1.9904783336528128e-05, + "loss": 0.4646, + "step": 911 + }, + { + "epoch": 0.17981072555205047, + "grad_norm": 0.8135132238328896, + "learning_rate": 1.9904569827484556e-05, + "loss": 0.4594, + "step": 912 + }, + { + "epoch": 0.18000788643533122, + "grad_norm": 0.7554729773822189, + "learning_rate": 1.990435608047572e-05, + "loss": 0.4008, + "step": 913 + }, + { + "epoch": 0.18020504731861198, + "grad_norm": 0.737373759645686, + "learning_rate": 1.9904142095506756e-05, + "loss": 0.4335, + "step": 914 + }, + { + "epoch": 0.18040220820189273, + "grad_norm": 0.8409703840361105, + "learning_rate": 1.99039278725828e-05, + "loss": 0.4612, + "step": 915 + }, + { + "epoch": 0.1805993690851735, + "grad_norm": 0.6825714924533132, + "learning_rate": 1.9903713411709003e-05, + "loss": 0.4225, + "step": 916 + }, + { + "epoch": 0.18079652996845424, + "grad_norm": 0.9821803862497531, + "learning_rate": 1.9903498712890516e-05, + "loss": 0.4981, + "step": 917 + }, + { + "epoch": 0.18099369085173503, + "grad_norm": 0.7273123016078351, + "learning_rate": 1.9903283776132495e-05, + "loss": 0.4461, + "step": 918 + }, + { + "epoch": 0.18119085173501578, + "grad_norm": 0.8665342753995058, + "learning_rate": 1.9903068601440106e-05, + "loss": 0.4452, + "step": 919 + }, + { + "epoch": 0.18138801261829654, + "grad_norm": 0.7822855298339977, + "learning_rate": 1.9902853188818518e-05, + "loss": 0.4655, + "step": 920 + }, + { + "epoch": 0.1815851735015773, + "grad_norm": 0.8357899870781356, + "learning_rate": 1.990263753827291e-05, + "loss": 0.4416, + "step": 921 + }, + { + "epoch": 0.18178233438485805, + "grad_norm": 0.6936838258206445, + "learning_rate": 1.990242164980846e-05, + "loss": 0.4612, + "step": 922 + }, + { + "epoch": 0.1819794952681388, + "grad_norm": 0.8050591566486436, + "learning_rate": 1.9902205523430353e-05, + "loss": 0.4465, + "step": 923 + }, + { + "epoch": 0.18217665615141956, + "grad_norm": 1.0415997120130642, + "learning_rate": 1.9901989159143786e-05, + "loss": 0.4404, + "step": 924 + }, + { + "epoch": 0.18237381703470032, + "grad_norm": 0.7976243404123774, + "learning_rate": 1.9901772556953958e-05, + "loss": 0.4767, + "step": 925 + }, + { + "epoch": 0.18257097791798108, + "grad_norm": 0.7036095691664724, + "learning_rate": 1.9901555716866067e-05, + "loss": 0.4479, + "step": 926 + }, + { + "epoch": 0.18276813880126183, + "grad_norm": 0.7746405790517535, + "learning_rate": 1.9901338638885327e-05, + "loss": 0.4514, + "step": 927 + }, + { + "epoch": 0.1829652996845426, + "grad_norm": 0.7905763277511705, + "learning_rate": 1.9901121323016955e-05, + "loss": 0.4947, + "step": 928 + }, + { + "epoch": 0.18316246056782334, + "grad_norm": 0.8011819612935877, + "learning_rate": 1.9900903769266167e-05, + "loss": 0.4711, + "step": 929 + }, + { + "epoch": 0.1833596214511041, + "grad_norm": 0.7885618953274046, + "learning_rate": 1.9900685977638194e-05, + "loss": 0.4474, + "step": 930 + }, + { + "epoch": 0.18355678233438485, + "grad_norm": 0.8628719762309652, + "learning_rate": 1.9900467948138266e-05, + "loss": 0.484, + "step": 931 + }, + { + "epoch": 0.1837539432176656, + "grad_norm": 0.8426852896992455, + "learning_rate": 1.9900249680771622e-05, + "loss": 0.4597, + "step": 932 + }, + { + "epoch": 0.18395110410094637, + "grad_norm": 0.8844622895009938, + "learning_rate": 1.990003117554351e-05, + "loss": 0.4837, + "step": 933 + }, + { + "epoch": 0.18414826498422712, + "grad_norm": 0.7099824020200362, + "learning_rate": 1.9899812432459175e-05, + "loss": 0.4425, + "step": 934 + }, + { + "epoch": 0.18434542586750788, + "grad_norm": 0.7629633321327819, + "learning_rate": 1.9899593451523875e-05, + "loss": 0.4526, + "step": 935 + }, + { + "epoch": 0.18454258675078863, + "grad_norm": 0.8792149544075311, + "learning_rate": 1.989937423274287e-05, + "loss": 0.4682, + "step": 936 + }, + { + "epoch": 0.1847397476340694, + "grad_norm": 0.6838639839900289, + "learning_rate": 1.9899154776121424e-05, + "loss": 0.4261, + "step": 937 + }, + { + "epoch": 0.18493690851735015, + "grad_norm": 0.706142676214522, + "learning_rate": 1.9898935081664814e-05, + "loss": 0.4805, + "step": 938 + }, + { + "epoch": 0.1851340694006309, + "grad_norm": 0.6859539690000207, + "learning_rate": 1.9898715149378317e-05, + "loss": 0.4706, + "step": 939 + }, + { + "epoch": 0.18533123028391169, + "grad_norm": 0.6812899026189113, + "learning_rate": 1.989849497926722e-05, + "loss": 0.4256, + "step": 940 + }, + { + "epoch": 0.18552839116719244, + "grad_norm": 0.6577900927420254, + "learning_rate": 1.989827457133681e-05, + "loss": 0.4463, + "step": 941 + }, + { + "epoch": 0.1857255520504732, + "grad_norm": 0.7514857045339612, + "learning_rate": 1.9898053925592376e-05, + "loss": 0.4591, + "step": 942 + }, + { + "epoch": 0.18592271293375395, + "grad_norm": 0.6549094044973097, + "learning_rate": 1.9897833042039233e-05, + "loss": 0.4623, + "step": 943 + }, + { + "epoch": 0.1861198738170347, + "grad_norm": 0.7310174851562488, + "learning_rate": 1.9897611920682676e-05, + "loss": 0.4487, + "step": 944 + }, + { + "epoch": 0.18631703470031546, + "grad_norm": 0.6963083495505222, + "learning_rate": 1.9897390561528024e-05, + "loss": 0.4947, + "step": 945 + }, + { + "epoch": 0.18651419558359622, + "grad_norm": 0.6460049210995387, + "learning_rate": 1.9897168964580594e-05, + "loss": 0.4631, + "step": 946 + }, + { + "epoch": 0.18671135646687698, + "grad_norm": 0.7279929742381652, + "learning_rate": 1.9896947129845707e-05, + "loss": 0.454, + "step": 947 + }, + { + "epoch": 0.18690851735015773, + "grad_norm": 0.7314455893730071, + "learning_rate": 1.9896725057328695e-05, + "loss": 0.468, + "step": 948 + }, + { + "epoch": 0.1871056782334385, + "grad_norm": 0.707927540023582, + "learning_rate": 1.9896502747034894e-05, + "loss": 0.475, + "step": 949 + }, + { + "epoch": 0.18730283911671924, + "grad_norm": 0.8331091743901959, + "learning_rate": 1.989628019896965e-05, + "loss": 0.467, + "step": 950 + }, + { + "epoch": 0.1875, + "grad_norm": 0.7225918783951755, + "learning_rate": 1.98960574131383e-05, + "loss": 0.477, + "step": 951 + }, + { + "epoch": 0.18769716088328076, + "grad_norm": 0.6757638681754824, + "learning_rate": 1.9895834389546204e-05, + "loss": 0.4572, + "step": 952 + }, + { + "epoch": 0.1878943217665615, + "grad_norm": 0.7206616057098645, + "learning_rate": 1.9895611128198714e-05, + "loss": 0.4644, + "step": 953 + }, + { + "epoch": 0.18809148264984227, + "grad_norm": 0.6474073350668482, + "learning_rate": 1.9895387629101203e-05, + "loss": 0.4484, + "step": 954 + }, + { + "epoch": 0.18828864353312302, + "grad_norm": 0.7027079831141624, + "learning_rate": 1.989516389225903e-05, + "loss": 0.4845, + "step": 955 + }, + { + "epoch": 0.18848580441640378, + "grad_norm": 0.6725377762031143, + "learning_rate": 1.9894939917677577e-05, + "loss": 0.4592, + "step": 956 + }, + { + "epoch": 0.18868296529968454, + "grad_norm": 0.6747407598105398, + "learning_rate": 1.9894715705362227e-05, + "loss": 0.4316, + "step": 957 + }, + { + "epoch": 0.1888801261829653, + "grad_norm": 0.990350135133828, + "learning_rate": 1.9894491255318362e-05, + "loss": 0.4676, + "step": 958 + }, + { + "epoch": 0.18907728706624605, + "grad_norm": 0.6877906835243988, + "learning_rate": 1.9894266567551378e-05, + "loss": 0.469, + "step": 959 + }, + { + "epoch": 0.1892744479495268, + "grad_norm": 0.6882946457373598, + "learning_rate": 1.989404164206667e-05, + "loss": 0.4932, + "step": 960 + }, + { + "epoch": 0.18947160883280756, + "grad_norm": 0.7061372635020192, + "learning_rate": 1.9893816478869646e-05, + "loss": 0.484, + "step": 961 + }, + { + "epoch": 0.18966876971608831, + "grad_norm": 0.6724470559392719, + "learning_rate": 1.989359107796571e-05, + "loss": 0.4332, + "step": 962 + }, + { + "epoch": 0.1898659305993691, + "grad_norm": 0.7625542937410119, + "learning_rate": 1.9893365439360285e-05, + "loss": 0.4763, + "step": 963 + }, + { + "epoch": 0.19006309148264985, + "grad_norm": 0.6953307451809908, + "learning_rate": 1.9893139563058786e-05, + "loss": 0.4958, + "step": 964 + }, + { + "epoch": 0.1902602523659306, + "grad_norm": 0.7466843034274665, + "learning_rate": 1.9892913449066643e-05, + "loss": 0.4958, + "step": 965 + }, + { + "epoch": 0.19045741324921137, + "grad_norm": 0.6585282349015559, + "learning_rate": 1.9892687097389288e-05, + "loss": 0.4224, + "step": 966 + }, + { + "epoch": 0.19065457413249212, + "grad_norm": 0.6883957756753183, + "learning_rate": 1.9892460508032158e-05, + "loss": 0.4603, + "step": 967 + }, + { + "epoch": 0.19085173501577288, + "grad_norm": 0.7020929418484416, + "learning_rate": 1.9892233681000696e-05, + "loss": 0.4644, + "step": 968 + }, + { + "epoch": 0.19104889589905363, + "grad_norm": 0.659651299875094, + "learning_rate": 1.9892006616300358e-05, + "loss": 0.4378, + "step": 969 + }, + { + "epoch": 0.1912460567823344, + "grad_norm": 0.6871160793712359, + "learning_rate": 1.989177931393659e-05, + "loss": 0.4432, + "step": 970 + }, + { + "epoch": 0.19144321766561515, + "grad_norm": 0.7875222747543351, + "learning_rate": 1.989155177391486e-05, + "loss": 0.4698, + "step": 971 + }, + { + "epoch": 0.1916403785488959, + "grad_norm": 0.6688379722075266, + "learning_rate": 1.9891323996240633e-05, + "loss": 0.4608, + "step": 972 + }, + { + "epoch": 0.19183753943217666, + "grad_norm": 0.7330583737954934, + "learning_rate": 1.9891095980919383e-05, + "loss": 0.4809, + "step": 973 + }, + { + "epoch": 0.1920347003154574, + "grad_norm": 0.6958413705522188, + "learning_rate": 1.9890867727956587e-05, + "loss": 0.4627, + "step": 974 + }, + { + "epoch": 0.19223186119873817, + "grad_norm": 1.5897290586527348, + "learning_rate": 1.9890639237357726e-05, + "loss": 0.5164, + "step": 975 + }, + { + "epoch": 0.19242902208201892, + "grad_norm": 0.7313960505685592, + "learning_rate": 1.98904105091283e-05, + "loss": 0.4792, + "step": 976 + }, + { + "epoch": 0.19262618296529968, + "grad_norm": 0.7063385670757377, + "learning_rate": 1.989018154327379e-05, + "loss": 0.4605, + "step": 977 + }, + { + "epoch": 0.19282334384858044, + "grad_norm": 0.708185416892424, + "learning_rate": 1.9889952339799704e-05, + "loss": 0.439, + "step": 978 + }, + { + "epoch": 0.1930205047318612, + "grad_norm": 0.6540312802061391, + "learning_rate": 1.9889722898711546e-05, + "loss": 0.4435, + "step": 979 + }, + { + "epoch": 0.19321766561514195, + "grad_norm": 0.8450940057465913, + "learning_rate": 1.9889493220014837e-05, + "loss": 0.4531, + "step": 980 + }, + { + "epoch": 0.1934148264984227, + "grad_norm": 0.7708701148620279, + "learning_rate": 1.9889263303715086e-05, + "loss": 0.4704, + "step": 981 + }, + { + "epoch": 0.19361198738170346, + "grad_norm": 0.621772319575249, + "learning_rate": 1.9889033149817823e-05, + "loss": 0.4514, + "step": 982 + }, + { + "epoch": 0.19380914826498422, + "grad_norm": 0.737304053648671, + "learning_rate": 1.9888802758328574e-05, + "loss": 0.458, + "step": 983 + }, + { + "epoch": 0.19400630914826497, + "grad_norm": 0.6741704564219441, + "learning_rate": 1.9888572129252875e-05, + "loss": 0.4394, + "step": 984 + }, + { + "epoch": 0.19420347003154576, + "grad_norm": 0.7931415567269723, + "learning_rate": 1.9888341262596266e-05, + "loss": 0.4729, + "step": 985 + }, + { + "epoch": 0.1944006309148265, + "grad_norm": 0.731130849168293, + "learning_rate": 1.9888110158364296e-05, + "loss": 0.4546, + "step": 986 + }, + { + "epoch": 0.19459779179810727, + "grad_norm": 0.6910048306098391, + "learning_rate": 1.988787881656252e-05, + "loss": 0.4799, + "step": 987 + }, + { + "epoch": 0.19479495268138802, + "grad_norm": 0.6876331628207631, + "learning_rate": 1.988764723719649e-05, + "loss": 0.4642, + "step": 988 + }, + { + "epoch": 0.19499211356466878, + "grad_norm": 0.6926517120324639, + "learning_rate": 1.988741542027177e-05, + "loss": 0.4482, + "step": 989 + }, + { + "epoch": 0.19518927444794953, + "grad_norm": 0.6914878618737181, + "learning_rate": 1.9887183365793935e-05, + "loss": 0.4126, + "step": 990 + }, + { + "epoch": 0.1953864353312303, + "grad_norm": 0.6866079399218992, + "learning_rate": 1.9886951073768557e-05, + "loss": 0.4373, + "step": 991 + }, + { + "epoch": 0.19558359621451105, + "grad_norm": 0.6833596593294384, + "learning_rate": 1.988671854420122e-05, + "loss": 0.4332, + "step": 992 + }, + { + "epoch": 0.1957807570977918, + "grad_norm": 0.7160110640102156, + "learning_rate": 1.9886485777097505e-05, + "loss": 0.472, + "step": 993 + }, + { + "epoch": 0.19597791798107256, + "grad_norm": 0.701325188265206, + "learning_rate": 1.9886252772463008e-05, + "loss": 0.4854, + "step": 994 + }, + { + "epoch": 0.19617507886435331, + "grad_norm": 0.6802161665541124, + "learning_rate": 1.9886019530303328e-05, + "loss": 0.4466, + "step": 995 + }, + { + "epoch": 0.19637223974763407, + "grad_norm": 0.7478910314629341, + "learning_rate": 1.9885786050624066e-05, + "loss": 0.5023, + "step": 996 + }, + { + "epoch": 0.19656940063091483, + "grad_norm": 0.7673069709477471, + "learning_rate": 1.9885552333430834e-05, + "loss": 0.5205, + "step": 997 + }, + { + "epoch": 0.19676656151419558, + "grad_norm": 0.7036229450957218, + "learning_rate": 1.9885318378729247e-05, + "loss": 0.4693, + "step": 998 + }, + { + "epoch": 0.19696372239747634, + "grad_norm": 0.7059304315094523, + "learning_rate": 1.9885084186524922e-05, + "loss": 0.4687, + "step": 999 + }, + { + "epoch": 0.1971608832807571, + "grad_norm": 0.701201649068646, + "learning_rate": 1.988484975682349e-05, + "loss": 0.4727, + "step": 1000 + }, + { + "epoch": 0.19735804416403785, + "grad_norm": 0.6840771386046793, + "learning_rate": 1.9884615089630584e-05, + "loss": 0.4636, + "step": 1001 + }, + { + "epoch": 0.1975552050473186, + "grad_norm": 0.7119449126029943, + "learning_rate": 1.988438018495184e-05, + "loss": 0.465, + "step": 1002 + }, + { + "epoch": 0.19775236593059936, + "grad_norm": 0.7547620511614711, + "learning_rate": 1.9884145042792905e-05, + "loss": 0.4701, + "step": 1003 + }, + { + "epoch": 0.19794952681388012, + "grad_norm": 0.6957906304621474, + "learning_rate": 1.9883909663159424e-05, + "loss": 0.4531, + "step": 1004 + }, + { + "epoch": 0.19814668769716087, + "grad_norm": 0.8261687307427324, + "learning_rate": 1.9883674046057054e-05, + "loss": 0.4989, + "step": 1005 + }, + { + "epoch": 0.19834384858044163, + "grad_norm": 0.6898115598695652, + "learning_rate": 1.9883438191491453e-05, + "loss": 0.4537, + "step": 1006 + }, + { + "epoch": 0.19854100946372238, + "grad_norm": 0.818376757438343, + "learning_rate": 1.9883202099468294e-05, + "loss": 0.4706, + "step": 1007 + }, + { + "epoch": 0.19873817034700317, + "grad_norm": 0.7807033836708956, + "learning_rate": 1.988296576999324e-05, + "loss": 0.4311, + "step": 1008 + }, + { + "epoch": 0.19893533123028392, + "grad_norm": 0.6424414464365055, + "learning_rate": 1.988272920307198e-05, + "loss": 0.4303, + "step": 1009 + }, + { + "epoch": 0.19913249211356468, + "grad_norm": 0.7799809923491292, + "learning_rate": 1.9882492398710192e-05, + "loss": 0.5081, + "step": 1010 + }, + { + "epoch": 0.19932965299684544, + "grad_norm": 0.7784674048585015, + "learning_rate": 1.9882255356913563e-05, + "loss": 0.5344, + "step": 1011 + }, + { + "epoch": 0.1995268138801262, + "grad_norm": 0.7082795210370787, + "learning_rate": 1.988201807768779e-05, + "loss": 0.4639, + "step": 1012 + }, + { + "epoch": 0.19972397476340695, + "grad_norm": 0.6999909023367687, + "learning_rate": 1.9881780561038583e-05, + "loss": 0.4525, + "step": 1013 + }, + { + "epoch": 0.1999211356466877, + "grad_norm": 0.7285326225615099, + "learning_rate": 1.988154280697163e-05, + "loss": 0.4278, + "step": 1014 + }, + { + "epoch": 0.20011829652996846, + "grad_norm": 0.7638833349383104, + "learning_rate": 1.988130481549266e-05, + "loss": 0.4622, + "step": 1015 + }, + { + "epoch": 0.20031545741324921, + "grad_norm": 0.6387623204916645, + "learning_rate": 1.9881066586607384e-05, + "loss": 0.4361, + "step": 1016 + }, + { + "epoch": 0.20051261829652997, + "grad_norm": 0.7537122531684395, + "learning_rate": 1.9880828120321523e-05, + "loss": 0.5057, + "step": 1017 + }, + { + "epoch": 0.20070977917981073, + "grad_norm": 0.6862604234144699, + "learning_rate": 1.988058941664081e-05, + "loss": 0.4416, + "step": 1018 + }, + { + "epoch": 0.20090694006309148, + "grad_norm": 0.7489284571072279, + "learning_rate": 1.988035047557098e-05, + "loss": 0.4998, + "step": 1019 + }, + { + "epoch": 0.20110410094637224, + "grad_norm": 0.7523322609228381, + "learning_rate": 1.9880111297117772e-05, + "loss": 0.4551, + "step": 1020 + }, + { + "epoch": 0.201301261829653, + "grad_norm": 0.7501370176061963, + "learning_rate": 1.9879871881286936e-05, + "loss": 0.4416, + "step": 1021 + }, + { + "epoch": 0.20149842271293375, + "grad_norm": 0.7351622592876997, + "learning_rate": 1.9879632228084224e-05, + "loss": 0.4639, + "step": 1022 + }, + { + "epoch": 0.2016955835962145, + "grad_norm": 0.7423971116335808, + "learning_rate": 1.9879392337515385e-05, + "loss": 0.488, + "step": 1023 + }, + { + "epoch": 0.20189274447949526, + "grad_norm": 0.6751923754084928, + "learning_rate": 1.9879152209586193e-05, + "loss": 0.3938, + "step": 1024 + }, + { + "epoch": 0.20208990536277602, + "grad_norm": 0.722495594329759, + "learning_rate": 1.987891184430241e-05, + "loss": 0.4837, + "step": 1025 + }, + { + "epoch": 0.20228706624605677, + "grad_norm": 0.6782801414805263, + "learning_rate": 1.9878671241669824e-05, + "loss": 0.4781, + "step": 1026 + }, + { + "epoch": 0.20248422712933753, + "grad_norm": 0.7053528722513924, + "learning_rate": 1.98784304016942e-05, + "loss": 0.4424, + "step": 1027 + }, + { + "epoch": 0.20268138801261829, + "grad_norm": 0.6877822028898056, + "learning_rate": 1.987818932438133e-05, + "loss": 0.4585, + "step": 1028 + }, + { + "epoch": 0.20287854889589904, + "grad_norm": 0.7414751335330734, + "learning_rate": 1.9877948009737006e-05, + "loss": 0.4581, + "step": 1029 + }, + { + "epoch": 0.2030757097791798, + "grad_norm": 0.6447511500376699, + "learning_rate": 1.9877706457767028e-05, + "loss": 0.4246, + "step": 1030 + }, + { + "epoch": 0.20327287066246058, + "grad_norm": 0.7084092307640751, + "learning_rate": 1.9877464668477195e-05, + "loss": 0.4779, + "step": 1031 + }, + { + "epoch": 0.20347003154574134, + "grad_norm": 0.753567669793824, + "learning_rate": 1.987722264187332e-05, + "loss": 0.4658, + "step": 1032 + }, + { + "epoch": 0.2036671924290221, + "grad_norm": 0.733202685122525, + "learning_rate": 1.987698037796122e-05, + "loss": 0.4506, + "step": 1033 + }, + { + "epoch": 0.20386435331230285, + "grad_norm": 0.6827619467500508, + "learning_rate": 1.987673787674671e-05, + "loss": 0.4124, + "step": 1034 + }, + { + "epoch": 0.2040615141955836, + "grad_norm": 0.7015654555624304, + "learning_rate": 1.987649513823562e-05, + "loss": 0.4498, + "step": 1035 + }, + { + "epoch": 0.20425867507886436, + "grad_norm": 0.6474610550353069, + "learning_rate": 1.987625216243378e-05, + "loss": 0.4092, + "step": 1036 + }, + { + "epoch": 0.20445583596214512, + "grad_norm": 0.7064160426450053, + "learning_rate": 1.987600894934703e-05, + "loss": 0.4379, + "step": 1037 + }, + { + "epoch": 0.20465299684542587, + "grad_norm": 0.6676749684842929, + "learning_rate": 1.987576549898121e-05, + "loss": 0.439, + "step": 1038 + }, + { + "epoch": 0.20485015772870663, + "grad_norm": 0.7077339162356132, + "learning_rate": 1.987552181134217e-05, + "loss": 0.4433, + "step": 1039 + }, + { + "epoch": 0.20504731861198738, + "grad_norm": 0.7923537698393385, + "learning_rate": 1.9875277886435768e-05, + "loss": 0.4747, + "step": 1040 + }, + { + "epoch": 0.20524447949526814, + "grad_norm": 0.7264062737086919, + "learning_rate": 1.9875033724267863e-05, + "loss": 0.4479, + "step": 1041 + }, + { + "epoch": 0.2054416403785489, + "grad_norm": 0.7252182386797893, + "learning_rate": 1.987478932484432e-05, + "loss": 0.4701, + "step": 1042 + }, + { + "epoch": 0.20563880126182965, + "grad_norm": 0.7474427366745425, + "learning_rate": 1.9874544688171008e-05, + "loss": 0.4749, + "step": 1043 + }, + { + "epoch": 0.2058359621451104, + "grad_norm": 0.8821899810782728, + "learning_rate": 1.9874299814253813e-05, + "loss": 0.512, + "step": 1044 + }, + { + "epoch": 0.20603312302839116, + "grad_norm": 0.7196304240914877, + "learning_rate": 1.9874054703098608e-05, + "loss": 0.44, + "step": 1045 + }, + { + "epoch": 0.20623028391167192, + "grad_norm": 0.6964106812809121, + "learning_rate": 1.987380935471129e-05, + "loss": 0.4283, + "step": 1046 + }, + { + "epoch": 0.20642744479495267, + "grad_norm": 0.7918473693269067, + "learning_rate": 1.9873563769097752e-05, + "loss": 0.4519, + "step": 1047 + }, + { + "epoch": 0.20662460567823343, + "grad_norm": 0.8505419812339252, + "learning_rate": 1.9873317946263892e-05, + "loss": 0.476, + "step": 1048 + }, + { + "epoch": 0.2068217665615142, + "grad_norm": 0.6864258063010205, + "learning_rate": 1.9873071886215616e-05, + "loss": 0.463, + "step": 1049 + }, + { + "epoch": 0.20701892744479494, + "grad_norm": 0.9553769219710571, + "learning_rate": 1.987282558895884e-05, + "loss": 0.4925, + "step": 1050 + }, + { + "epoch": 0.2072160883280757, + "grad_norm": 0.6980654478796005, + "learning_rate": 1.9872579054499478e-05, + "loss": 0.426, + "step": 1051 + }, + { + "epoch": 0.20741324921135645, + "grad_norm": 0.719479062368989, + "learning_rate": 1.987233228284345e-05, + "loss": 0.4504, + "step": 1052 + }, + { + "epoch": 0.20761041009463724, + "grad_norm": 0.7018789823838913, + "learning_rate": 1.9872085273996694e-05, + "loss": 0.4546, + "step": 1053 + }, + { + "epoch": 0.207807570977918, + "grad_norm": 0.7802910640814608, + "learning_rate": 1.9871838027965134e-05, + "loss": 0.4972, + "step": 1054 + }, + { + "epoch": 0.20800473186119875, + "grad_norm": 0.7844445505713344, + "learning_rate": 1.987159054475472e-05, + "loss": 0.4492, + "step": 1055 + }, + { + "epoch": 0.2082018927444795, + "grad_norm": 0.692043038159667, + "learning_rate": 1.9871342824371393e-05, + "loss": 0.4406, + "step": 1056 + }, + { + "epoch": 0.20839905362776026, + "grad_norm": 0.775005815750936, + "learning_rate": 1.9871094866821104e-05, + "loss": 0.4282, + "step": 1057 + }, + { + "epoch": 0.20859621451104102, + "grad_norm": 0.7449051373918187, + "learning_rate": 1.987084667210981e-05, + "loss": 0.46, + "step": 1058 + }, + { + "epoch": 0.20879337539432177, + "grad_norm": 0.7073489451850218, + "learning_rate": 1.987059824024348e-05, + "loss": 0.429, + "step": 1059 + }, + { + "epoch": 0.20899053627760253, + "grad_norm": 0.7309470583978874, + "learning_rate": 1.9870349571228075e-05, + "loss": 0.4798, + "step": 1060 + }, + { + "epoch": 0.20918769716088328, + "grad_norm": 0.7718451336210451, + "learning_rate": 1.9870100665069577e-05, + "loss": 0.4688, + "step": 1061 + }, + { + "epoch": 0.20938485804416404, + "grad_norm": 0.6817754353610815, + "learning_rate": 1.9869851521773956e-05, + "loss": 0.4495, + "step": 1062 + }, + { + "epoch": 0.2095820189274448, + "grad_norm": 0.7665443527171363, + "learning_rate": 1.986960214134721e-05, + "loss": 0.4802, + "step": 1063 + }, + { + "epoch": 0.20977917981072555, + "grad_norm": 0.6633898852299499, + "learning_rate": 1.986935252379532e-05, + "loss": 0.4538, + "step": 1064 + }, + { + "epoch": 0.2099763406940063, + "grad_norm": 0.68853230845626, + "learning_rate": 1.9869102669124293e-05, + "loss": 0.4882, + "step": 1065 + }, + { + "epoch": 0.21017350157728706, + "grad_norm": 0.7017829715484818, + "learning_rate": 1.986885257734012e-05, + "loss": 0.4855, + "step": 1066 + }, + { + "epoch": 0.21037066246056782, + "grad_norm": 0.7815202320893195, + "learning_rate": 1.986860224844882e-05, + "loss": 0.4763, + "step": 1067 + }, + { + "epoch": 0.21056782334384858, + "grad_norm": 0.7066784986762935, + "learning_rate": 1.9868351682456408e-05, + "loss": 0.481, + "step": 1068 + }, + { + "epoch": 0.21076498422712933, + "grad_norm": 0.7041493363838017, + "learning_rate": 1.986810087936889e-05, + "loss": 0.4637, + "step": 1069 + }, + { + "epoch": 0.2109621451104101, + "grad_norm": 0.72824829966811, + "learning_rate": 1.9867849839192313e-05, + "loss": 0.4651, + "step": 1070 + }, + { + "epoch": 0.21115930599369084, + "grad_norm": 0.6680455738505751, + "learning_rate": 1.986759856193269e-05, + "loss": 0.4665, + "step": 1071 + }, + { + "epoch": 0.2113564668769716, + "grad_norm": 0.9281048268306978, + "learning_rate": 1.9867347047596066e-05, + "loss": 0.471, + "step": 1072 + }, + { + "epoch": 0.21155362776025236, + "grad_norm": 0.7395272448325264, + "learning_rate": 1.9867095296188483e-05, + "loss": 0.4715, + "step": 1073 + }, + { + "epoch": 0.2117507886435331, + "grad_norm": 0.7830224178550704, + "learning_rate": 1.986684330771599e-05, + "loss": 0.4641, + "step": 1074 + }, + { + "epoch": 0.21194794952681387, + "grad_norm": 0.7574110213860703, + "learning_rate": 1.986659108218464e-05, + "loss": 0.4474, + "step": 1075 + }, + { + "epoch": 0.21214511041009465, + "grad_norm": 0.6927377615200532, + "learning_rate": 1.98663386196005e-05, + "loss": 0.4287, + "step": 1076 + }, + { + "epoch": 0.2123422712933754, + "grad_norm": 0.7232907585511054, + "learning_rate": 1.986608591996962e-05, + "loss": 0.471, + "step": 1077 + }, + { + "epoch": 0.21253943217665616, + "grad_norm": 0.6491519990201463, + "learning_rate": 1.9865832983298085e-05, + "loss": 0.4228, + "step": 1078 + }, + { + "epoch": 0.21273659305993692, + "grad_norm": 0.6831967457936221, + "learning_rate": 1.986557980959197e-05, + "loss": 0.4732, + "step": 1079 + }, + { + "epoch": 0.21293375394321767, + "grad_norm": 0.6954600579093602, + "learning_rate": 1.986532639885735e-05, + "loss": 0.4764, + "step": 1080 + }, + { + "epoch": 0.21313091482649843, + "grad_norm": 0.6995051067856928, + "learning_rate": 1.9865072751100324e-05, + "loss": 0.4769, + "step": 1081 + }, + { + "epoch": 0.21332807570977919, + "grad_norm": 0.6775668825484679, + "learning_rate": 1.9864818866326978e-05, + "loss": 0.4461, + "step": 1082 + }, + { + "epoch": 0.21352523659305994, + "grad_norm": 0.7519792388180498, + "learning_rate": 1.9864564744543412e-05, + "loss": 0.4812, + "step": 1083 + }, + { + "epoch": 0.2137223974763407, + "grad_norm": 0.7107462899081852, + "learning_rate": 1.986431038575574e-05, + "loss": 0.5028, + "step": 1084 + }, + { + "epoch": 0.21391955835962145, + "grad_norm": 0.6753961183203763, + "learning_rate": 1.9864055789970064e-05, + "loss": 0.4237, + "step": 1085 + }, + { + "epoch": 0.2141167192429022, + "grad_norm": 0.6737584152485184, + "learning_rate": 1.9863800957192504e-05, + "loss": 0.4461, + "step": 1086 + }, + { + "epoch": 0.21431388012618297, + "grad_norm": 0.6556918876340901, + "learning_rate": 1.9863545887429185e-05, + "loss": 0.4506, + "step": 1087 + }, + { + "epoch": 0.21451104100946372, + "grad_norm": 0.6877170697107042, + "learning_rate": 1.9863290580686228e-05, + "loss": 0.4058, + "step": 1088 + }, + { + "epoch": 0.21470820189274448, + "grad_norm": 0.7037870318662259, + "learning_rate": 1.9863035036969775e-05, + "loss": 0.4576, + "step": 1089 + }, + { + "epoch": 0.21490536277602523, + "grad_norm": 0.7399278087323496, + "learning_rate": 1.9862779256285964e-05, + "loss": 0.4907, + "step": 1090 + }, + { + "epoch": 0.215102523659306, + "grad_norm": 0.688883167018551, + "learning_rate": 1.986252323864094e-05, + "loss": 0.4597, + "step": 1091 + }, + { + "epoch": 0.21529968454258674, + "grad_norm": 0.7010288184037431, + "learning_rate": 1.9862266984040847e-05, + "loss": 0.4519, + "step": 1092 + }, + { + "epoch": 0.2154968454258675, + "grad_norm": 0.6241563971147653, + "learning_rate": 1.9862010492491852e-05, + "loss": 0.3987, + "step": 1093 + }, + { + "epoch": 0.21569400630914826, + "grad_norm": 0.6700336161149536, + "learning_rate": 1.9861753764000115e-05, + "loss": 0.4728, + "step": 1094 + }, + { + "epoch": 0.215891167192429, + "grad_norm": 0.7471341036922323, + "learning_rate": 1.98614967985718e-05, + "loss": 0.446, + "step": 1095 + }, + { + "epoch": 0.21608832807570977, + "grad_norm": 0.6506229824272028, + "learning_rate": 1.986123959621308e-05, + "loss": 0.416, + "step": 1096 + }, + { + "epoch": 0.21628548895899052, + "grad_norm": 1.2484822940321072, + "learning_rate": 1.986098215693014e-05, + "loss": 0.4903, + "step": 1097 + }, + { + "epoch": 0.21648264984227128, + "grad_norm": 0.9678245848808749, + "learning_rate": 1.986072448072916e-05, + "loss": 0.4802, + "step": 1098 + }, + { + "epoch": 0.21667981072555206, + "grad_norm": 0.7448588276706483, + "learning_rate": 1.9860466567616335e-05, + "loss": 0.4231, + "step": 1099 + }, + { + "epoch": 0.21687697160883282, + "grad_norm": 0.7225398921155327, + "learning_rate": 1.9860208417597863e-05, + "loss": 0.4666, + "step": 1100 + }, + { + "epoch": 0.21707413249211358, + "grad_norm": 0.8881220700758276, + "learning_rate": 1.9859950030679943e-05, + "loss": 0.4587, + "step": 1101 + }, + { + "epoch": 0.21727129337539433, + "grad_norm": 0.6824253304142237, + "learning_rate": 1.985969140686878e-05, + "loss": 0.4671, + "step": 1102 + }, + { + "epoch": 0.2174684542586751, + "grad_norm": 0.822081525030592, + "learning_rate": 1.9859432546170594e-05, + "loss": 0.4309, + "step": 1103 + }, + { + "epoch": 0.21766561514195584, + "grad_norm": 0.7177176238573195, + "learning_rate": 1.98591734485916e-05, + "loss": 0.4662, + "step": 1104 + }, + { + "epoch": 0.2178627760252366, + "grad_norm": 23.933597214678645, + "learning_rate": 1.9858914114138024e-05, + "loss": 0.4718, + "step": 1105 + }, + { + "epoch": 0.21805993690851735, + "grad_norm": 0.9295012594770319, + "learning_rate": 1.9858654542816098e-05, + "loss": 0.4374, + "step": 1106 + }, + { + "epoch": 0.2182570977917981, + "grad_norm": 0.7110886368537057, + "learning_rate": 1.9858394734632054e-05, + "loss": 0.4292, + "step": 1107 + }, + { + "epoch": 0.21845425867507887, + "grad_norm": 0.745899081526924, + "learning_rate": 1.9858134689592143e-05, + "loss": 0.4494, + "step": 1108 + }, + { + "epoch": 0.21865141955835962, + "grad_norm": 3.2936984686195667, + "learning_rate": 1.9857874407702606e-05, + "loss": 0.436, + "step": 1109 + }, + { + "epoch": 0.21884858044164038, + "grad_norm": 0.9426735384515953, + "learning_rate": 1.9857613888969694e-05, + "loss": 0.4761, + "step": 1110 + }, + { + "epoch": 0.21904574132492113, + "grad_norm": 0.9274354892950216, + "learning_rate": 1.9857353133399675e-05, + "loss": 0.4885, + "step": 1111 + }, + { + "epoch": 0.2192429022082019, + "grad_norm": 0.6753675988750968, + "learning_rate": 1.9857092140998807e-05, + "loss": 0.4496, + "step": 1112 + }, + { + "epoch": 0.21944006309148265, + "grad_norm": 0.9644822228389445, + "learning_rate": 1.985683091177336e-05, + "loss": 0.4677, + "step": 1113 + }, + { + "epoch": 0.2196372239747634, + "grad_norm": 0.7186407032609544, + "learning_rate": 1.9856569445729615e-05, + "loss": 0.4894, + "step": 1114 + }, + { + "epoch": 0.21983438485804416, + "grad_norm": 0.7974174876233592, + "learning_rate": 1.9856307742873852e-05, + "loss": 0.4593, + "step": 1115 + }, + { + "epoch": 0.2200315457413249, + "grad_norm": 0.7199784472630298, + "learning_rate": 1.9856045803212356e-05, + "loss": 0.4488, + "step": 1116 + }, + { + "epoch": 0.22022870662460567, + "grad_norm": 0.7059488613258683, + "learning_rate": 1.9855783626751425e-05, + "loss": 0.423, + "step": 1117 + }, + { + "epoch": 0.22042586750788642, + "grad_norm": 0.6784311746671168, + "learning_rate": 1.9855521213497355e-05, + "loss": 0.4455, + "step": 1118 + }, + { + "epoch": 0.22062302839116718, + "grad_norm": 0.6987440905493829, + "learning_rate": 1.9855258563456448e-05, + "loss": 0.4458, + "step": 1119 + }, + { + "epoch": 0.22082018927444794, + "grad_norm": 0.7453334954062374, + "learning_rate": 1.985499567663502e-05, + "loss": 0.472, + "step": 1120 + }, + { + "epoch": 0.22101735015772872, + "grad_norm": 0.8208236623644202, + "learning_rate": 1.9854732553039388e-05, + "loss": 0.4327, + "step": 1121 + }, + { + "epoch": 0.22121451104100948, + "grad_norm": 0.8076041122842882, + "learning_rate": 1.9854469192675868e-05, + "loss": 0.4488, + "step": 1122 + }, + { + "epoch": 0.22141167192429023, + "grad_norm": 0.6509600595961608, + "learning_rate": 1.9854205595550787e-05, + "loss": 0.4253, + "step": 1123 + }, + { + "epoch": 0.221608832807571, + "grad_norm": 0.8076858203915139, + "learning_rate": 1.9853941761670483e-05, + "loss": 0.4793, + "step": 1124 + }, + { + "epoch": 0.22180599369085174, + "grad_norm": 0.6925472433506414, + "learning_rate": 1.9853677691041293e-05, + "loss": 0.4703, + "step": 1125 + }, + { + "epoch": 0.2220031545741325, + "grad_norm": 0.68014215224375, + "learning_rate": 1.985341338366956e-05, + "loss": 0.4668, + "step": 1126 + }, + { + "epoch": 0.22220031545741326, + "grad_norm": 0.7501920154620356, + "learning_rate": 1.9853148839561638e-05, + "loss": 0.4611, + "step": 1127 + }, + { + "epoch": 0.222397476340694, + "grad_norm": 0.7031267527114912, + "learning_rate": 1.985288405872388e-05, + "loss": 0.4603, + "step": 1128 + }, + { + "epoch": 0.22259463722397477, + "grad_norm": 0.6372019068678325, + "learning_rate": 1.9852619041162646e-05, + "loss": 0.41, + "step": 1129 + }, + { + "epoch": 0.22279179810725552, + "grad_norm": 0.6979715600361694, + "learning_rate": 1.9852353786884306e-05, + "loss": 0.4666, + "step": 1130 + }, + { + "epoch": 0.22298895899053628, + "grad_norm": 0.7323097747430808, + "learning_rate": 1.9852088295895232e-05, + "loss": 0.4468, + "step": 1131 + }, + { + "epoch": 0.22318611987381703, + "grad_norm": 0.6991485532926531, + "learning_rate": 1.9851822568201806e-05, + "loss": 0.4634, + "step": 1132 + }, + { + "epoch": 0.2233832807570978, + "grad_norm": 0.7591783665648759, + "learning_rate": 1.9851556603810406e-05, + "loss": 0.4419, + "step": 1133 + }, + { + "epoch": 0.22358044164037855, + "grad_norm": 1.1746824112571197, + "learning_rate": 1.9851290402727426e-05, + "loss": 0.4639, + "step": 1134 + }, + { + "epoch": 0.2237776025236593, + "grad_norm": 0.7081731495754315, + "learning_rate": 1.985102396495926e-05, + "loss": 0.4881, + "step": 1135 + }, + { + "epoch": 0.22397476340694006, + "grad_norm": 0.6590193639944613, + "learning_rate": 1.9850757290512313e-05, + "loss": 0.4536, + "step": 1136 + }, + { + "epoch": 0.22417192429022081, + "grad_norm": 8.480821491410778, + "learning_rate": 1.9850490379392988e-05, + "loss": 0.4433, + "step": 1137 + }, + { + "epoch": 0.22436908517350157, + "grad_norm": 0.7990748970455287, + "learning_rate": 1.9850223231607696e-05, + "loss": 0.4704, + "step": 1138 + }, + { + "epoch": 0.22456624605678233, + "grad_norm": 0.6257113394097338, + "learning_rate": 1.984995584716286e-05, + "loss": 0.4377, + "step": 1139 + }, + { + "epoch": 0.22476340694006308, + "grad_norm": 4.598690932717382, + "learning_rate": 1.9849688226064906e-05, + "loss": 0.5025, + "step": 1140 + }, + { + "epoch": 0.22496056782334384, + "grad_norm": 0.8210582553644494, + "learning_rate": 1.9849420368320254e-05, + "loss": 0.4854, + "step": 1141 + }, + { + "epoch": 0.2251577287066246, + "grad_norm": 0.7646003921775568, + "learning_rate": 1.9849152273935353e-05, + "loss": 0.4344, + "step": 1142 + }, + { + "epoch": 0.22535488958990535, + "grad_norm": 0.7228017400656176, + "learning_rate": 1.9848883942916632e-05, + "loss": 0.4785, + "step": 1143 + }, + { + "epoch": 0.22555205047318613, + "grad_norm": 0.7985110474365666, + "learning_rate": 1.9848615375270547e-05, + "loss": 0.4461, + "step": 1144 + }, + { + "epoch": 0.2257492113564669, + "grad_norm": 0.8935580670763312, + "learning_rate": 1.984834657100354e-05, + "loss": 0.4658, + "step": 1145 + }, + { + "epoch": 0.22594637223974764, + "grad_norm": 0.8552039095318358, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.4574, + "step": 1146 + }, + { + "epoch": 0.2261435331230284, + "grad_norm": 1.2094865238135273, + "learning_rate": 1.984780825263263e-05, + "loss": 0.4461, + "step": 1147 + }, + { + "epoch": 0.22634069400630916, + "grad_norm": 0.8645616019682253, + "learning_rate": 1.984753873854165e-05, + "loss": 0.5116, + "step": 1148 + }, + { + "epoch": 0.2265378548895899, + "grad_norm": 3.127099963117224, + "learning_rate": 1.984726898785563e-05, + "loss": 0.4398, + "step": 1149 + }, + { + "epoch": 0.22673501577287067, + "grad_norm": 1.1378328088647094, + "learning_rate": 1.9846999000581033e-05, + "loss": 0.4529, + "step": 1150 + }, + { + "epoch": 0.22693217665615142, + "grad_norm": 1.2150716840072067, + "learning_rate": 1.9846728776724358e-05, + "loss": 0.5032, + "step": 1151 + }, + { + "epoch": 0.22712933753943218, + "grad_norm": 1.0286587973333643, + "learning_rate": 1.98464583162921e-05, + "loss": 0.4525, + "step": 1152 + }, + { + "epoch": 0.22732649842271294, + "grad_norm": 1.126989898873081, + "learning_rate": 1.9846187619290746e-05, + "loss": 0.4292, + "step": 1153 + }, + { + "epoch": 0.2275236593059937, + "grad_norm": 1.2199916483600795, + "learning_rate": 1.9845916685726808e-05, + "loss": 0.4732, + "step": 1154 + }, + { + "epoch": 0.22772082018927445, + "grad_norm": 0.9371106577012303, + "learning_rate": 1.9845645515606792e-05, + "loss": 0.4872, + "step": 1155 + }, + { + "epoch": 0.2279179810725552, + "grad_norm": 0.9056546747302068, + "learning_rate": 1.9845374108937213e-05, + "loss": 0.4975, + "step": 1156 + }, + { + "epoch": 0.22811514195583596, + "grad_norm": 1.5812769964252382, + "learning_rate": 1.9845102465724593e-05, + "loss": 0.4733, + "step": 1157 + }, + { + "epoch": 0.22831230283911672, + "grad_norm": 1.1843221653360212, + "learning_rate": 1.984483058597546e-05, + "loss": 0.4624, + "step": 1158 + }, + { + "epoch": 0.22850946372239747, + "grad_norm": 0.9821603418098558, + "learning_rate": 1.9844558469696342e-05, + "loss": 0.4413, + "step": 1159 + }, + { + "epoch": 0.22870662460567823, + "grad_norm": 0.9806072784995279, + "learning_rate": 1.984428611689378e-05, + "loss": 0.4868, + "step": 1160 + }, + { + "epoch": 0.22890378548895898, + "grad_norm": 0.8793984774617989, + "learning_rate": 1.9844013527574316e-05, + "loss": 0.4267, + "step": 1161 + }, + { + "epoch": 0.22910094637223974, + "grad_norm": 0.8495785803021494, + "learning_rate": 1.9843740701744497e-05, + "loss": 0.4354, + "step": 1162 + }, + { + "epoch": 0.2292981072555205, + "grad_norm": 1.7057039922514965, + "learning_rate": 1.9843467639410885e-05, + "loss": 0.4447, + "step": 1163 + }, + { + "epoch": 0.22949526813880125, + "grad_norm": 1.075911659015585, + "learning_rate": 1.9843194340580032e-05, + "loss": 0.4962, + "step": 1164 + }, + { + "epoch": 0.229692429022082, + "grad_norm": 0.8457228409931461, + "learning_rate": 1.9842920805258513e-05, + "loss": 0.4552, + "step": 1165 + }, + { + "epoch": 0.22988958990536276, + "grad_norm": 1.008444001726399, + "learning_rate": 1.9842647033452893e-05, + "loss": 0.4462, + "step": 1166 + }, + { + "epoch": 0.23008675078864355, + "grad_norm": 0.8092829859053057, + "learning_rate": 1.984237302516975e-05, + "loss": 0.4559, + "step": 1167 + }, + { + "epoch": 0.2302839116719243, + "grad_norm": 0.9687384781928995, + "learning_rate": 1.984209878041567e-05, + "loss": 0.5053, + "step": 1168 + }, + { + "epoch": 0.23048107255520506, + "grad_norm": 0.8297737211847974, + "learning_rate": 1.984182429919724e-05, + "loss": 0.462, + "step": 1169 + }, + { + "epoch": 0.2306782334384858, + "grad_norm": 0.7460078380279647, + "learning_rate": 1.9841549581521058e-05, + "loss": 0.4538, + "step": 1170 + }, + { + "epoch": 0.23087539432176657, + "grad_norm": 0.9658528590503032, + "learning_rate": 1.984127462739372e-05, + "loss": 0.4759, + "step": 1171 + }, + { + "epoch": 0.23107255520504733, + "grad_norm": 0.7016223428827921, + "learning_rate": 1.9840999436821836e-05, + "loss": 0.4328, + "step": 1172 + }, + { + "epoch": 0.23126971608832808, + "grad_norm": 0.6671082456539488, + "learning_rate": 1.9840724009812013e-05, + "loss": 0.4259, + "step": 1173 + }, + { + "epoch": 0.23146687697160884, + "grad_norm": 0.7264492795866501, + "learning_rate": 1.9840448346370873e-05, + "loss": 0.4251, + "step": 1174 + }, + { + "epoch": 0.2316640378548896, + "grad_norm": 0.726746284722804, + "learning_rate": 1.9840172446505036e-05, + "loss": 0.4838, + "step": 1175 + }, + { + "epoch": 0.23186119873817035, + "grad_norm": 1.151370972769961, + "learning_rate": 1.9839896310221133e-05, + "loss": 0.4793, + "step": 1176 + }, + { + "epoch": 0.2320583596214511, + "grad_norm": 0.8135672403953714, + "learning_rate": 1.9839619937525794e-05, + "loss": 0.4496, + "step": 1177 + }, + { + "epoch": 0.23225552050473186, + "grad_norm": 0.6927163568862448, + "learning_rate": 1.9839343328425668e-05, + "loss": 0.4628, + "step": 1178 + }, + { + "epoch": 0.23245268138801262, + "grad_norm": 0.6732578753958156, + "learning_rate": 1.983906648292739e-05, + "loss": 0.43, + "step": 1179 + }, + { + "epoch": 0.23264984227129337, + "grad_norm": 0.7086446749133889, + "learning_rate": 1.9838789401037616e-05, + "loss": 0.4291, + "step": 1180 + }, + { + "epoch": 0.23284700315457413, + "grad_norm": 0.7334824055009613, + "learning_rate": 1.9838512082763002e-05, + "loss": 0.4342, + "step": 1181 + }, + { + "epoch": 0.23304416403785488, + "grad_norm": 0.7653457648798104, + "learning_rate": 1.983823452811022e-05, + "loss": 0.4805, + "step": 1182 + }, + { + "epoch": 0.23324132492113564, + "grad_norm": 0.9262252445526374, + "learning_rate": 1.9837956737085924e-05, + "loss": 0.4788, + "step": 1183 + }, + { + "epoch": 0.2334384858044164, + "grad_norm": 0.7504820328186347, + "learning_rate": 1.98376787096968e-05, + "loss": 0.4555, + "step": 1184 + }, + { + "epoch": 0.23363564668769715, + "grad_norm": 0.721933776706364, + "learning_rate": 1.983740044594952e-05, + "loss": 0.4848, + "step": 1185 + }, + { + "epoch": 0.2338328075709779, + "grad_norm": 2.846732366688398, + "learning_rate": 1.9837121945850766e-05, + "loss": 0.4638, + "step": 1186 + }, + { + "epoch": 0.23402996845425866, + "grad_norm": 0.8306542747582979, + "learning_rate": 1.9836843209407247e-05, + "loss": 0.4082, + "step": 1187 + }, + { + "epoch": 0.23422712933753942, + "grad_norm": 0.736142320584185, + "learning_rate": 1.983656423662564e-05, + "loss": 0.4569, + "step": 1188 + }, + { + "epoch": 0.2344242902208202, + "grad_norm": 0.7039751490055907, + "learning_rate": 1.983628502751266e-05, + "loss": 0.4182, + "step": 1189 + }, + { + "epoch": 0.23462145110410096, + "grad_norm": 0.9525090224636836, + "learning_rate": 1.983600558207501e-05, + "loss": 0.4414, + "step": 1190 + }, + { + "epoch": 0.23481861198738171, + "grad_norm": 1.4964168121471113, + "learning_rate": 1.9835725900319406e-05, + "loss": 0.4468, + "step": 1191 + }, + { + "epoch": 0.23501577287066247, + "grad_norm": 0.8670187458357161, + "learning_rate": 1.9835445982252565e-05, + "loss": 0.4159, + "step": 1192 + }, + { + "epoch": 0.23521293375394323, + "grad_norm": 0.6948661600832843, + "learning_rate": 1.983516582788121e-05, + "loss": 0.4451, + "step": 1193 + }, + { + "epoch": 0.23541009463722398, + "grad_norm": 0.7812431346409022, + "learning_rate": 1.9834885437212083e-05, + "loss": 0.4613, + "step": 1194 + }, + { + "epoch": 0.23560725552050474, + "grad_norm": 0.7970118513981926, + "learning_rate": 1.983460481025191e-05, + "loss": 0.4761, + "step": 1195 + }, + { + "epoch": 0.2358044164037855, + "grad_norm": 0.7490679736689834, + "learning_rate": 1.9834323947007433e-05, + "loss": 0.4931, + "step": 1196 + }, + { + "epoch": 0.23600157728706625, + "grad_norm": 0.7282970951897094, + "learning_rate": 1.983404284748541e-05, + "loss": 0.4184, + "step": 1197 + }, + { + "epoch": 0.236198738170347, + "grad_norm": 0.7024789178171861, + "learning_rate": 1.9833761511692583e-05, + "loss": 0.4604, + "step": 1198 + }, + { + "epoch": 0.23639589905362776, + "grad_norm": 0.7562914768111765, + "learning_rate": 1.9833479939635724e-05, + "loss": 0.4835, + "step": 1199 + }, + { + "epoch": 0.23659305993690852, + "grad_norm": 0.6905900737165961, + "learning_rate": 1.9833198131321582e-05, + "loss": 0.4362, + "step": 1200 + }, + { + "epoch": 0.23679022082018927, + "grad_norm": 0.666922032665526, + "learning_rate": 1.9832916086756938e-05, + "loss": 0.4619, + "step": 1201 + }, + { + "epoch": 0.23698738170347003, + "grad_norm": 0.6837662204239521, + "learning_rate": 1.983263380594857e-05, + "loss": 0.4373, + "step": 1202 + }, + { + "epoch": 0.23718454258675079, + "grad_norm": 0.6414003295820169, + "learning_rate": 1.9832351288903256e-05, + "loss": 0.4317, + "step": 1203 + }, + { + "epoch": 0.23738170347003154, + "grad_norm": 0.7460692979898119, + "learning_rate": 1.9832068535627785e-05, + "loss": 0.479, + "step": 1204 + }, + { + "epoch": 0.2375788643533123, + "grad_norm": 0.6428078211747335, + "learning_rate": 1.983178554612895e-05, + "loss": 0.4469, + "step": 1205 + }, + { + "epoch": 0.23777602523659305, + "grad_norm": 0.7517122154912249, + "learning_rate": 1.9831502320413543e-05, + "loss": 0.4858, + "step": 1206 + }, + { + "epoch": 0.2379731861198738, + "grad_norm": 1.3463826441881264, + "learning_rate": 1.983121885848838e-05, + "loss": 0.4735, + "step": 1207 + }, + { + "epoch": 0.23817034700315456, + "grad_norm": 0.7102923990366657, + "learning_rate": 1.983093516036027e-05, + "loss": 0.487, + "step": 1208 + }, + { + "epoch": 0.23836750788643532, + "grad_norm": 2.290103684996036, + "learning_rate": 1.9830651226036023e-05, + "loss": 0.4727, + "step": 1209 + }, + { + "epoch": 0.23856466876971608, + "grad_norm": 0.6811005595914935, + "learning_rate": 1.9830367055522463e-05, + "loss": 0.4263, + "step": 1210 + }, + { + "epoch": 0.23876182965299683, + "grad_norm": 0.799809715831858, + "learning_rate": 1.983008264882642e-05, + "loss": 0.4097, + "step": 1211 + }, + { + "epoch": 0.23895899053627762, + "grad_norm": 0.7354195020597284, + "learning_rate": 1.9829798005954725e-05, + "loss": 0.4724, + "step": 1212 + }, + { + "epoch": 0.23915615141955837, + "grad_norm": 0.8296463030390158, + "learning_rate": 1.982951312691422e-05, + "loss": 0.4904, + "step": 1213 + }, + { + "epoch": 0.23935331230283913, + "grad_norm": 0.6229789713014955, + "learning_rate": 1.9829228011711738e-05, + "loss": 0.4451, + "step": 1214 + }, + { + "epoch": 0.23955047318611988, + "grad_norm": 0.7037507970309669, + "learning_rate": 1.9828942660354144e-05, + "loss": 0.4382, + "step": 1215 + }, + { + "epoch": 0.23974763406940064, + "grad_norm": 1.7862177754634971, + "learning_rate": 1.9828657072848284e-05, + "loss": 0.4677, + "step": 1216 + }, + { + "epoch": 0.2399447949526814, + "grad_norm": 0.7511599646811729, + "learning_rate": 1.9828371249201025e-05, + "loss": 0.4432, + "step": 1217 + }, + { + "epoch": 0.24014195583596215, + "grad_norm": 0.6663595319040087, + "learning_rate": 1.982808518941923e-05, + "loss": 0.4535, + "step": 1218 + }, + { + "epoch": 0.2403391167192429, + "grad_norm": 0.7505465175906688, + "learning_rate": 1.982779889350978e-05, + "loss": 0.483, + "step": 1219 + }, + { + "epoch": 0.24053627760252366, + "grad_norm": 0.7052786101408759, + "learning_rate": 1.982751236147954e-05, + "loss": 0.4568, + "step": 1220 + }, + { + "epoch": 0.24073343848580442, + "grad_norm": 1.0567060689641148, + "learning_rate": 1.9827225593335403e-05, + "loss": 0.4725, + "step": 1221 + }, + { + "epoch": 0.24093059936908517, + "grad_norm": 0.668419640423508, + "learning_rate": 1.9826938589084258e-05, + "loss": 0.462, + "step": 1222 + }, + { + "epoch": 0.24112776025236593, + "grad_norm": 1.2570484391370291, + "learning_rate": 1.9826651348733e-05, + "loss": 0.5211, + "step": 1223 + }, + { + "epoch": 0.24132492113564669, + "grad_norm": 0.7349089292621471, + "learning_rate": 1.982636387228853e-05, + "loss": 0.4528, + "step": 1224 + }, + { + "epoch": 0.24152208201892744, + "grad_norm": 0.8253540404906965, + "learning_rate": 1.9826076159757753e-05, + "loss": 0.5093, + "step": 1225 + }, + { + "epoch": 0.2417192429022082, + "grad_norm": 2.1092492780074745, + "learning_rate": 1.9825788211147587e-05, + "loss": 0.4845, + "step": 1226 + }, + { + "epoch": 0.24191640378548895, + "grad_norm": 0.8850457162579004, + "learning_rate": 1.9825500026464947e-05, + "loss": 0.4805, + "step": 1227 + }, + { + "epoch": 0.2421135646687697, + "grad_norm": 0.6781748488422774, + "learning_rate": 1.9825211605716748e-05, + "loss": 0.4738, + "step": 1228 + }, + { + "epoch": 0.24231072555205047, + "grad_norm": 1.0845607789046796, + "learning_rate": 1.9824922948909937e-05, + "loss": 0.4449, + "step": 1229 + }, + { + "epoch": 0.24250788643533122, + "grad_norm": 1.2815030508313785, + "learning_rate": 1.9824634056051436e-05, + "loss": 0.473, + "step": 1230 + }, + { + "epoch": 0.24270504731861198, + "grad_norm": 0.8080992979642333, + "learning_rate": 1.9824344927148193e-05, + "loss": 0.4731, + "step": 1231 + }, + { + "epoch": 0.24290220820189273, + "grad_norm": 0.7437272267133168, + "learning_rate": 1.982405556220715e-05, + "loss": 0.4519, + "step": 1232 + }, + { + "epoch": 0.2430993690851735, + "grad_norm": 0.7480610069853911, + "learning_rate": 1.982376596123526e-05, + "loss": 0.4417, + "step": 1233 + }, + { + "epoch": 0.24329652996845424, + "grad_norm": 0.7550320130527832, + "learning_rate": 1.982347612423948e-05, + "loss": 0.439, + "step": 1234 + }, + { + "epoch": 0.24349369085173503, + "grad_norm": 0.6749020950952507, + "learning_rate": 1.9823186051226783e-05, + "loss": 0.4201, + "step": 1235 + }, + { + "epoch": 0.24369085173501578, + "grad_norm": 0.8142599734476462, + "learning_rate": 1.9822895742204122e-05, + "loss": 0.4621, + "step": 1236 + }, + { + "epoch": 0.24388801261829654, + "grad_norm": 0.6700713256447526, + "learning_rate": 1.9822605197178485e-05, + "loss": 0.4387, + "step": 1237 + }, + { + "epoch": 0.2440851735015773, + "grad_norm": 0.7745146153654806, + "learning_rate": 1.9822314416156848e-05, + "loss": 0.4605, + "step": 1238 + }, + { + "epoch": 0.24428233438485805, + "grad_norm": 0.9942732385767507, + "learning_rate": 1.9822023399146194e-05, + "loss": 0.4773, + "step": 1239 + }, + { + "epoch": 0.2444794952681388, + "grad_norm": 0.7228782340223426, + "learning_rate": 1.9821732146153517e-05, + "loss": 0.4754, + "step": 1240 + }, + { + "epoch": 0.24467665615141956, + "grad_norm": 0.7905102450931651, + "learning_rate": 1.9821440657185822e-05, + "loss": 0.4596, + "step": 1241 + }, + { + "epoch": 0.24487381703470032, + "grad_norm": 0.8374670342807091, + "learning_rate": 1.9821148932250103e-05, + "loss": 0.4102, + "step": 1242 + }, + { + "epoch": 0.24507097791798108, + "grad_norm": 0.9557468893603367, + "learning_rate": 1.9820856971353374e-05, + "loss": 0.441, + "step": 1243 + }, + { + "epoch": 0.24526813880126183, + "grad_norm": 0.9884255566988156, + "learning_rate": 1.9820564774502644e-05, + "loss": 0.4866, + "step": 1244 + }, + { + "epoch": 0.2454652996845426, + "grad_norm": 0.6985509946646631, + "learning_rate": 1.9820272341704937e-05, + "loss": 0.4441, + "step": 1245 + }, + { + "epoch": 0.24566246056782334, + "grad_norm": 0.7274501226462767, + "learning_rate": 1.981997967296728e-05, + "loss": 0.4387, + "step": 1246 + }, + { + "epoch": 0.2458596214511041, + "grad_norm": 0.7388406161273386, + "learning_rate": 1.9819686768296706e-05, + "loss": 0.4613, + "step": 1247 + }, + { + "epoch": 0.24605678233438485, + "grad_norm": 0.7240588593494661, + "learning_rate": 1.9819393627700247e-05, + "loss": 0.4782, + "step": 1248 + }, + { + "epoch": 0.2462539432176656, + "grad_norm": 0.7023054870349664, + "learning_rate": 1.9819100251184945e-05, + "loss": 0.4687, + "step": 1249 + }, + { + "epoch": 0.24645110410094637, + "grad_norm": 0.7018924187358255, + "learning_rate": 1.9818806638757856e-05, + "loss": 0.4643, + "step": 1250 + }, + { + "epoch": 0.24664826498422712, + "grad_norm": 0.7507947096176073, + "learning_rate": 1.981851279042603e-05, + "loss": 0.4471, + "step": 1251 + }, + { + "epoch": 0.24684542586750788, + "grad_norm": 0.7236171294016701, + "learning_rate": 1.9818218706196527e-05, + "loss": 0.486, + "step": 1252 + }, + { + "epoch": 0.24704258675078863, + "grad_norm": 0.7074399537871575, + "learning_rate": 1.9817924386076416e-05, + "loss": 0.4696, + "step": 1253 + }, + { + "epoch": 0.2472397476340694, + "grad_norm": 0.6990187430370988, + "learning_rate": 1.981762983007276e-05, + "loss": 0.4539, + "step": 1254 + }, + { + "epoch": 0.24743690851735015, + "grad_norm": 0.8201430818139063, + "learning_rate": 1.9817335038192644e-05, + "loss": 0.4644, + "step": 1255 + }, + { + "epoch": 0.2476340694006309, + "grad_norm": 0.6857424544658502, + "learning_rate": 1.981704001044315e-05, + "loss": 0.4737, + "step": 1256 + }, + { + "epoch": 0.24783123028391169, + "grad_norm": 0.6640652585450276, + "learning_rate": 1.981674474683136e-05, + "loss": 0.4464, + "step": 1257 + }, + { + "epoch": 0.24802839116719244, + "grad_norm": 0.7345704399108267, + "learning_rate": 1.9816449247364374e-05, + "loss": 0.4625, + "step": 1258 + }, + { + "epoch": 0.2482255520504732, + "grad_norm": 0.6686587208589408, + "learning_rate": 1.981615351204929e-05, + "loss": 0.4369, + "step": 1259 + }, + { + "epoch": 0.24842271293375395, + "grad_norm": 0.6762802311646812, + "learning_rate": 1.9815857540893214e-05, + "loss": 0.432, + "step": 1260 + }, + { + "epoch": 0.2486198738170347, + "grad_norm": 0.6562041100893717, + "learning_rate": 1.9815561333903255e-05, + "loss": 0.43, + "step": 1261 + }, + { + "epoch": 0.24881703470031546, + "grad_norm": 0.6532239829668289, + "learning_rate": 1.981526489108653e-05, + "loss": 0.4351, + "step": 1262 + }, + { + "epoch": 0.24901419558359622, + "grad_norm": 0.6996616378046743, + "learning_rate": 1.981496821245016e-05, + "loss": 0.4881, + "step": 1263 + }, + { + "epoch": 0.24921135646687698, + "grad_norm": 0.6705242832654518, + "learning_rate": 1.9814671298001274e-05, + "loss": 0.4536, + "step": 1264 + }, + { + "epoch": 0.24940851735015773, + "grad_norm": 0.7366373896084766, + "learning_rate": 1.9814374147747012e-05, + "loss": 0.4733, + "step": 1265 + }, + { + "epoch": 0.2496056782334385, + "grad_norm": 0.6716024398128168, + "learning_rate": 1.98140767616945e-05, + "loss": 0.4879, + "step": 1266 + }, + { + "epoch": 0.24980283911671924, + "grad_norm": 0.7567180353841482, + "learning_rate": 1.98137791398509e-05, + "loss": 0.4714, + "step": 1267 + }, + { + "epoch": 0.25, + "grad_norm": 0.6280236993661752, + "learning_rate": 1.981348128222335e-05, + "loss": 0.4467, + "step": 1268 + }, + { + "epoch": 0.25, + "eval_loss": 0.45942220091819763, + "eval_runtime": 344.9533, + "eval_samples_per_second": 23.568, + "eval_steps_per_second": 1.476, + "step": 1268 + }, + { + "epoch": 0.25019716088328076, + "grad_norm": 0.7858090304991342, + "learning_rate": 1.9813183188819005e-05, + "loss": 0.515, + "step": 1269 + }, + { + "epoch": 0.2503943217665615, + "grad_norm": 0.7486528699456562, + "learning_rate": 1.981288485964503e-05, + "loss": 0.482, + "step": 1270 + }, + { + "epoch": 0.25059148264984227, + "grad_norm": 0.6374094423910737, + "learning_rate": 1.98125862947086e-05, + "loss": 0.4309, + "step": 1271 + }, + { + "epoch": 0.250788643533123, + "grad_norm": 0.7096340783400006, + "learning_rate": 1.981228749401688e-05, + "loss": 0.483, + "step": 1272 + }, + { + "epoch": 0.2509858044164038, + "grad_norm": 0.7444158649998527, + "learning_rate": 1.9811988457577054e-05, + "loss": 0.4592, + "step": 1273 + }, + { + "epoch": 0.25118296529968454, + "grad_norm": 0.7501735940555235, + "learning_rate": 1.98116891853963e-05, + "loss": 0.4499, + "step": 1274 + }, + { + "epoch": 0.2513801261829653, + "grad_norm": 0.6782176587394123, + "learning_rate": 1.9811389677481815e-05, + "loss": 0.4538, + "step": 1275 + }, + { + "epoch": 0.25157728706624605, + "grad_norm": 0.6415201673231513, + "learning_rate": 1.9811089933840788e-05, + "loss": 0.3966, + "step": 1276 + }, + { + "epoch": 0.2517744479495268, + "grad_norm": 0.7902039683370593, + "learning_rate": 1.9810789954480425e-05, + "loss": 0.4445, + "step": 1277 + }, + { + "epoch": 0.25197160883280756, + "grad_norm": 0.7144540560155214, + "learning_rate": 1.9810489739407934e-05, + "loss": 0.4711, + "step": 1278 + }, + { + "epoch": 0.2521687697160883, + "grad_norm": 0.6516750917925597, + "learning_rate": 1.9810189288630524e-05, + "loss": 0.4704, + "step": 1279 + }, + { + "epoch": 0.25236593059936907, + "grad_norm": 0.6422474397006303, + "learning_rate": 1.980988860215542e-05, + "loss": 0.4565, + "step": 1280 + }, + { + "epoch": 0.2525630914826498, + "grad_norm": 0.6600661011809978, + "learning_rate": 1.9809587679989843e-05, + "loss": 0.4603, + "step": 1281 + }, + { + "epoch": 0.2527602523659306, + "grad_norm": 0.6621495554696686, + "learning_rate": 1.980928652214102e-05, + "loss": 0.4539, + "step": 1282 + }, + { + "epoch": 0.25295741324921134, + "grad_norm": 0.6210665496532219, + "learning_rate": 1.980898512861619e-05, + "loss": 0.388, + "step": 1283 + }, + { + "epoch": 0.2531545741324921, + "grad_norm": 0.6372480467697316, + "learning_rate": 1.9808683499422595e-05, + "loss": 0.4625, + "step": 1284 + }, + { + "epoch": 0.25335173501577285, + "grad_norm": 0.7040049125741846, + "learning_rate": 1.9808381634567478e-05, + "loss": 0.4644, + "step": 1285 + }, + { + "epoch": 0.2535488958990536, + "grad_norm": 0.6788863637030159, + "learning_rate": 1.9808079534058092e-05, + "loss": 0.4233, + "step": 1286 + }, + { + "epoch": 0.25374605678233436, + "grad_norm": 0.7084313524121193, + "learning_rate": 1.9807777197901697e-05, + "loss": 0.4692, + "step": 1287 + }, + { + "epoch": 0.2539432176656151, + "grad_norm": 0.7122952126645031, + "learning_rate": 1.9807474626105557e-05, + "loss": 0.4687, + "step": 1288 + }, + { + "epoch": 0.2541403785488959, + "grad_norm": 0.9877933281913743, + "learning_rate": 1.9807171818676944e-05, + "loss": 0.4496, + "step": 1289 + }, + { + "epoch": 0.25433753943217663, + "grad_norm": 0.6865217783581581, + "learning_rate": 1.9806868775623127e-05, + "loss": 0.4448, + "step": 1290 + }, + { + "epoch": 0.25453470031545744, + "grad_norm": 0.6823194099522876, + "learning_rate": 1.980656549695139e-05, + "loss": 0.4556, + "step": 1291 + }, + { + "epoch": 0.2547318611987382, + "grad_norm": 0.6647345972741581, + "learning_rate": 1.9806261982669025e-05, + "loss": 0.4491, + "step": 1292 + }, + { + "epoch": 0.25492902208201895, + "grad_norm": 0.6302502151295573, + "learning_rate": 1.9805958232783314e-05, + "loss": 0.4464, + "step": 1293 + }, + { + "epoch": 0.2551261829652997, + "grad_norm": 0.6579780881631992, + "learning_rate": 1.980565424730156e-05, + "loss": 0.4485, + "step": 1294 + }, + { + "epoch": 0.25532334384858046, + "grad_norm": 0.7655010402371665, + "learning_rate": 1.9805350026231067e-05, + "loss": 0.4669, + "step": 1295 + }, + { + "epoch": 0.2555205047318612, + "grad_norm": 0.6625750789917858, + "learning_rate": 1.9805045569579144e-05, + "loss": 0.463, + "step": 1296 + }, + { + "epoch": 0.255717665615142, + "grad_norm": 0.6662591258788151, + "learning_rate": 1.9804740877353105e-05, + "loss": 0.4367, + "step": 1297 + }, + { + "epoch": 0.25591482649842273, + "grad_norm": 0.6108948522622889, + "learning_rate": 1.980443594956027e-05, + "loss": 0.4241, + "step": 1298 + }, + { + "epoch": 0.2561119873817035, + "grad_norm": 0.7914240432743016, + "learning_rate": 1.9804130786207966e-05, + "loss": 0.4388, + "step": 1299 + }, + { + "epoch": 0.25630914826498424, + "grad_norm": 0.6627711788576277, + "learning_rate": 1.9803825387303525e-05, + "loss": 0.4356, + "step": 1300 + }, + { + "epoch": 0.256506309148265, + "grad_norm": 0.6421042720478715, + "learning_rate": 1.9803519752854284e-05, + "loss": 0.4434, + "step": 1301 + }, + { + "epoch": 0.25670347003154576, + "grad_norm": 0.9154766392932525, + "learning_rate": 1.9803213882867583e-05, + "loss": 0.5037, + "step": 1302 + }, + { + "epoch": 0.2569006309148265, + "grad_norm": 0.6412059503472081, + "learning_rate": 1.9802907777350778e-05, + "loss": 0.4555, + "step": 1303 + }, + { + "epoch": 0.25709779179810727, + "grad_norm": 0.7328818019918474, + "learning_rate": 1.9802601436311223e-05, + "loss": 0.4795, + "step": 1304 + }, + { + "epoch": 0.257294952681388, + "grad_norm": 1.290582323709545, + "learning_rate": 1.980229485975627e-05, + "loss": 0.4574, + "step": 1305 + }, + { + "epoch": 0.2574921135646688, + "grad_norm": 0.7047549042877954, + "learning_rate": 1.980198804769329e-05, + "loss": 0.4441, + "step": 1306 + }, + { + "epoch": 0.25768927444794953, + "grad_norm": 0.6450143398070649, + "learning_rate": 1.9801681000129652e-05, + "loss": 0.4523, + "step": 1307 + }, + { + "epoch": 0.2578864353312303, + "grad_norm": 0.7236989507032727, + "learning_rate": 1.9801373717072732e-05, + "loss": 0.4811, + "step": 1308 + }, + { + "epoch": 0.25808359621451105, + "grad_norm": 0.5982776267715486, + "learning_rate": 1.980106619852992e-05, + "loss": 0.4257, + "step": 1309 + }, + { + "epoch": 0.2582807570977918, + "grad_norm": 1.5437331705738089, + "learning_rate": 1.98007584445086e-05, + "loss": 0.4755, + "step": 1310 + }, + { + "epoch": 0.25847791798107256, + "grad_norm": 0.665819086891947, + "learning_rate": 1.9800450455016163e-05, + "loss": 0.426, + "step": 1311 + }, + { + "epoch": 0.2586750788643533, + "grad_norm": 0.6986032017043294, + "learning_rate": 1.9800142230060012e-05, + "loss": 0.4677, + "step": 1312 + }, + { + "epoch": 0.25887223974763407, + "grad_norm": 0.7076440147684866, + "learning_rate": 1.9799833769647553e-05, + "loss": 0.4849, + "step": 1313 + }, + { + "epoch": 0.2590694006309148, + "grad_norm": 0.6993982811871999, + "learning_rate": 1.9799525073786196e-05, + "loss": 0.4636, + "step": 1314 + }, + { + "epoch": 0.2592665615141956, + "grad_norm": 0.6663223299060342, + "learning_rate": 1.9799216142483358e-05, + "loss": 0.4374, + "step": 1315 + }, + { + "epoch": 0.25946372239747634, + "grad_norm": 0.7322544020861116, + "learning_rate": 1.9798906975746462e-05, + "loss": 0.4616, + "step": 1316 + }, + { + "epoch": 0.2596608832807571, + "grad_norm": 4.868764964094287, + "learning_rate": 1.9798597573582935e-05, + "loss": 0.5068, + "step": 1317 + }, + { + "epoch": 0.25985804416403785, + "grad_norm": 1.6121049926148328, + "learning_rate": 1.9798287936000203e-05, + "loss": 0.4858, + "step": 1318 + }, + { + "epoch": 0.2600552050473186, + "grad_norm": 0.7881657434369355, + "learning_rate": 1.979797806300572e-05, + "loss": 0.4707, + "step": 1319 + }, + { + "epoch": 0.26025236593059936, + "grad_norm": 0.6719396543796176, + "learning_rate": 1.9797667954606923e-05, + "loss": 0.4375, + "step": 1320 + }, + { + "epoch": 0.2604495268138801, + "grad_norm": 0.677646758521938, + "learning_rate": 1.9797357610811264e-05, + "loss": 0.3941, + "step": 1321 + }, + { + "epoch": 0.2606466876971609, + "grad_norm": 0.6629789255172303, + "learning_rate": 1.9797047031626197e-05, + "loss": 0.4666, + "step": 1322 + }, + { + "epoch": 0.26084384858044163, + "grad_norm": 0.7016369371591297, + "learning_rate": 1.9796736217059184e-05, + "loss": 0.4176, + "step": 1323 + }, + { + "epoch": 0.2610410094637224, + "grad_norm": 0.7392651241817684, + "learning_rate": 1.9796425167117697e-05, + "loss": 0.4569, + "step": 1324 + }, + { + "epoch": 0.26123817034700314, + "grad_norm": 3.4278364452599943, + "learning_rate": 1.9796113881809207e-05, + "loss": 0.4897, + "step": 1325 + }, + { + "epoch": 0.2614353312302839, + "grad_norm": 0.7430113958710807, + "learning_rate": 1.979580236114119e-05, + "loss": 0.4281, + "step": 1326 + }, + { + "epoch": 0.26163249211356465, + "grad_norm": 0.7574764251542814, + "learning_rate": 1.9795490605121133e-05, + "loss": 0.4285, + "step": 1327 + }, + { + "epoch": 0.2618296529968454, + "grad_norm": 0.7573150663369252, + "learning_rate": 1.9795178613756526e-05, + "loss": 0.4214, + "step": 1328 + }, + { + "epoch": 0.26202681388012616, + "grad_norm": 0.7233219537322484, + "learning_rate": 1.9794866387054866e-05, + "loss": 0.4333, + "step": 1329 + }, + { + "epoch": 0.2622239747634069, + "grad_norm": 0.6536227209487654, + "learning_rate": 1.9794553925023648e-05, + "loss": 0.508, + "step": 1330 + }, + { + "epoch": 0.2624211356466877, + "grad_norm": 0.697150695998406, + "learning_rate": 1.979424122767039e-05, + "loss": 0.4402, + "step": 1331 + }, + { + "epoch": 0.26261829652996843, + "grad_norm": 1.1510688800982456, + "learning_rate": 1.97939282950026e-05, + "loss": 0.4935, + "step": 1332 + }, + { + "epoch": 0.2628154574132492, + "grad_norm": 0.6936122849738883, + "learning_rate": 1.979361512702779e-05, + "loss": 0.4369, + "step": 1333 + }, + { + "epoch": 0.26301261829652994, + "grad_norm": 0.7316037732825653, + "learning_rate": 1.9793301723753494e-05, + "loss": 0.4774, + "step": 1334 + }, + { + "epoch": 0.2632097791798107, + "grad_norm": 0.7046667225142733, + "learning_rate": 1.9792988085187237e-05, + "loss": 0.4586, + "step": 1335 + }, + { + "epoch": 0.2634069400630915, + "grad_norm": 0.7576252182556432, + "learning_rate": 1.9792674211336557e-05, + "loss": 0.4444, + "step": 1336 + }, + { + "epoch": 0.26360410094637227, + "grad_norm": 0.702436953338292, + "learning_rate": 1.9792360102208987e-05, + "loss": 0.4508, + "step": 1337 + }, + { + "epoch": 0.263801261829653, + "grad_norm": 0.8581168449077242, + "learning_rate": 1.9792045757812083e-05, + "loss": 0.4536, + "step": 1338 + }, + { + "epoch": 0.2639984227129338, + "grad_norm": 0.7421141473102278, + "learning_rate": 1.9791731178153398e-05, + "loss": 0.4884, + "step": 1339 + }, + { + "epoch": 0.26419558359621453, + "grad_norm": 0.8029593819817289, + "learning_rate": 1.979141636324048e-05, + "loss": 0.5005, + "step": 1340 + }, + { + "epoch": 0.2643927444794953, + "grad_norm": 0.7626034173756989, + "learning_rate": 1.97911013130809e-05, + "loss": 0.4701, + "step": 1341 + }, + { + "epoch": 0.26458990536277605, + "grad_norm": 0.6947755523676065, + "learning_rate": 1.979078602768223e-05, + "loss": 0.4649, + "step": 1342 + }, + { + "epoch": 0.2647870662460568, + "grad_norm": 0.7173606924037355, + "learning_rate": 1.9790470507052043e-05, + "loss": 0.4636, + "step": 1343 + }, + { + "epoch": 0.26498422712933756, + "grad_norm": 0.7174536980291077, + "learning_rate": 1.979015475119791e-05, + "loss": 0.4722, + "step": 1344 + }, + { + "epoch": 0.2651813880126183, + "grad_norm": 0.7091313571434286, + "learning_rate": 1.978983876012743e-05, + "loss": 0.4604, + "step": 1345 + }, + { + "epoch": 0.26537854889589907, + "grad_norm": 0.7938427813870562, + "learning_rate": 1.978952253384819e-05, + "loss": 0.4683, + "step": 1346 + }, + { + "epoch": 0.2655757097791798, + "grad_norm": 0.8665000592475282, + "learning_rate": 1.9789206072367788e-05, + "loss": 0.4651, + "step": 1347 + }, + { + "epoch": 0.2657728706624606, + "grad_norm": 0.7476112206662554, + "learning_rate": 1.9788889375693826e-05, + "loss": 0.4501, + "step": 1348 + }, + { + "epoch": 0.26597003154574134, + "grad_norm": 0.6411119915064069, + "learning_rate": 1.978857244383391e-05, + "loss": 0.4332, + "step": 1349 + }, + { + "epoch": 0.2661671924290221, + "grad_norm": 0.7501426692435633, + "learning_rate": 1.9788255276795665e-05, + "loss": 0.4482, + "step": 1350 + }, + { + "epoch": 0.26636435331230285, + "grad_norm": 0.6924050647879918, + "learning_rate": 1.97879378745867e-05, + "loss": 0.4781, + "step": 1351 + }, + { + "epoch": 0.2665615141955836, + "grad_norm": 0.7692951834407704, + "learning_rate": 1.9787620237214648e-05, + "loss": 0.4492, + "step": 1352 + }, + { + "epoch": 0.26675867507886436, + "grad_norm": 0.7874785369772107, + "learning_rate": 1.9787302364687137e-05, + "loss": 0.4625, + "step": 1353 + }, + { + "epoch": 0.2669558359621451, + "grad_norm": 0.7706538665841983, + "learning_rate": 1.9786984257011804e-05, + "loss": 0.5065, + "step": 1354 + }, + { + "epoch": 0.26715299684542587, + "grad_norm": 0.9023943582130787, + "learning_rate": 1.9786665914196293e-05, + "loss": 0.4269, + "step": 1355 + }, + { + "epoch": 0.26735015772870663, + "grad_norm": 0.7444077480799488, + "learning_rate": 1.978634733624825e-05, + "loss": 0.4632, + "step": 1356 + }, + { + "epoch": 0.2675473186119874, + "grad_norm": 0.750725217959198, + "learning_rate": 1.9786028523175334e-05, + "loss": 0.4813, + "step": 1357 + }, + { + "epoch": 0.26774447949526814, + "grad_norm": 0.6159139836796179, + "learning_rate": 1.9785709474985205e-05, + "loss": 0.4647, + "step": 1358 + }, + { + "epoch": 0.2679416403785489, + "grad_norm": 0.7129947513385421, + "learning_rate": 1.978539019168552e-05, + "loss": 0.4177, + "step": 1359 + }, + { + "epoch": 0.26813880126182965, + "grad_norm": 0.6516268614502504, + "learning_rate": 1.9785070673283958e-05, + "loss": 0.4365, + "step": 1360 + }, + { + "epoch": 0.2683359621451104, + "grad_norm": 0.7106387839347237, + "learning_rate": 1.9784750919788192e-05, + "loss": 0.4908, + "step": 1361 + }, + { + "epoch": 0.26853312302839116, + "grad_norm": 0.6645867370230059, + "learning_rate": 1.978443093120591e-05, + "loss": 0.4646, + "step": 1362 + }, + { + "epoch": 0.2687302839116719, + "grad_norm": 0.6609654446353767, + "learning_rate": 1.978411070754479e-05, + "loss": 0.4464, + "step": 1363 + }, + { + "epoch": 0.2689274447949527, + "grad_norm": 0.6509684679600655, + "learning_rate": 1.9783790248812535e-05, + "loss": 0.4378, + "step": 1364 + }, + { + "epoch": 0.26912460567823343, + "grad_norm": 0.7410255193953944, + "learning_rate": 1.9783469555016838e-05, + "loss": 0.4883, + "step": 1365 + }, + { + "epoch": 0.2693217665615142, + "grad_norm": 0.6410457849255176, + "learning_rate": 1.9783148626165408e-05, + "loss": 0.4566, + "step": 1366 + }, + { + "epoch": 0.26951892744479494, + "grad_norm": 0.7212189325599111, + "learning_rate": 1.978282746226595e-05, + "loss": 0.4748, + "step": 1367 + }, + { + "epoch": 0.2697160883280757, + "grad_norm": 0.6386044383277297, + "learning_rate": 1.9782506063326188e-05, + "loss": 0.4525, + "step": 1368 + }, + { + "epoch": 0.26991324921135645, + "grad_norm": 0.6183024303790638, + "learning_rate": 1.978218442935384e-05, + "loss": 0.4419, + "step": 1369 + }, + { + "epoch": 0.2701104100946372, + "grad_norm": 1.0311747282997497, + "learning_rate": 1.9781862560356632e-05, + "loss": 0.4705, + "step": 1370 + }, + { + "epoch": 0.27030757097791797, + "grad_norm": 0.8876601008055358, + "learning_rate": 1.97815404563423e-05, + "loss": 0.4952, + "step": 1371 + }, + { + "epoch": 0.2705047318611987, + "grad_norm": 0.6707553837334197, + "learning_rate": 1.978121811731858e-05, + "loss": 0.4727, + "step": 1372 + }, + { + "epoch": 0.2707018927444795, + "grad_norm": 0.6324861998325033, + "learning_rate": 1.978089554329322e-05, + "loss": 0.4512, + "step": 1373 + }, + { + "epoch": 0.27089905362776023, + "grad_norm": 0.7208718808928518, + "learning_rate": 1.9780572734273965e-05, + "loss": 0.4708, + "step": 1374 + }, + { + "epoch": 0.271096214511041, + "grad_norm": 0.6289916939191427, + "learning_rate": 1.9780249690268577e-05, + "loss": 0.46, + "step": 1375 + }, + { + "epoch": 0.27129337539432175, + "grad_norm": 0.6255312002347381, + "learning_rate": 1.977992641128481e-05, + "loss": 0.4113, + "step": 1376 + }, + { + "epoch": 0.2714905362776025, + "grad_norm": 0.7538032993514353, + "learning_rate": 1.977960289733044e-05, + "loss": 0.5171, + "step": 1377 + }, + { + "epoch": 0.27168769716088326, + "grad_norm": 0.6617762757068099, + "learning_rate": 1.977927914841323e-05, + "loss": 0.4574, + "step": 1378 + }, + { + "epoch": 0.271884858044164, + "grad_norm": 0.6317670145204217, + "learning_rate": 1.9778955164540966e-05, + "loss": 0.4416, + "step": 1379 + }, + { + "epoch": 0.27208201892744477, + "grad_norm": 0.6625339760031199, + "learning_rate": 1.977863094572143e-05, + "loss": 0.4695, + "step": 1380 + }, + { + "epoch": 0.2722791798107255, + "grad_norm": 0.8851653534578081, + "learning_rate": 1.977830649196241e-05, + "loss": 0.469, + "step": 1381 + }, + { + "epoch": 0.27247634069400634, + "grad_norm": 0.6525212576179507, + "learning_rate": 1.9777981803271702e-05, + "loss": 0.4607, + "step": 1382 + }, + { + "epoch": 0.2726735015772871, + "grad_norm": 0.687794207764544, + "learning_rate": 1.9777656879657104e-05, + "loss": 0.4329, + "step": 1383 + }, + { + "epoch": 0.27287066246056785, + "grad_norm": 0.6477780040394759, + "learning_rate": 1.9777331721126432e-05, + "loss": 0.455, + "step": 1384 + }, + { + "epoch": 0.2730678233438486, + "grad_norm": 0.6704764509430108, + "learning_rate": 1.9777006327687486e-05, + "loss": 0.4756, + "step": 1385 + }, + { + "epoch": 0.27326498422712936, + "grad_norm": 0.6673187768283187, + "learning_rate": 1.9776680699348093e-05, + "loss": 0.4525, + "step": 1386 + }, + { + "epoch": 0.2734621451104101, + "grad_norm": 0.6725761328570758, + "learning_rate": 1.977635483611607e-05, + "loss": 0.4465, + "step": 1387 + }, + { + "epoch": 0.27365930599369087, + "grad_norm": 0.6036295529404659, + "learning_rate": 1.9776028737999256e-05, + "loss": 0.4371, + "step": 1388 + }, + { + "epoch": 0.2738564668769716, + "grad_norm": 0.649936023702026, + "learning_rate": 1.9775702405005473e-05, + "loss": 0.4419, + "step": 1389 + }, + { + "epoch": 0.2740536277602524, + "grad_norm": 0.6259329731865518, + "learning_rate": 1.977537583714257e-05, + "loss": 0.4607, + "step": 1390 + }, + { + "epoch": 0.27425078864353314, + "grad_norm": 0.9337258788421762, + "learning_rate": 1.9775049034418384e-05, + "loss": 0.4314, + "step": 1391 + }, + { + "epoch": 0.2744479495268139, + "grad_norm": 0.7553643495249596, + "learning_rate": 1.977472199684078e-05, + "loss": 0.4019, + "step": 1392 + }, + { + "epoch": 0.27464511041009465, + "grad_norm": 0.6681682048717218, + "learning_rate": 1.9774394724417608e-05, + "loss": 0.4669, + "step": 1393 + }, + { + "epoch": 0.2748422712933754, + "grad_norm": 0.6660846989527527, + "learning_rate": 1.977406721715673e-05, + "loss": 0.4592, + "step": 1394 + }, + { + "epoch": 0.27503943217665616, + "grad_norm": 1.030125307014766, + "learning_rate": 1.9773739475066015e-05, + "loss": 0.4519, + "step": 1395 + }, + { + "epoch": 0.2752365930599369, + "grad_norm": 0.6165906390246885, + "learning_rate": 1.977341149815334e-05, + "loss": 0.4611, + "step": 1396 + }, + { + "epoch": 0.2754337539432177, + "grad_norm": 0.6301071810608176, + "learning_rate": 1.977308328642658e-05, + "loss": 0.4728, + "step": 1397 + }, + { + "epoch": 0.27563091482649843, + "grad_norm": 0.603328270231981, + "learning_rate": 1.9772754839893627e-05, + "loss": 0.434, + "step": 1398 + }, + { + "epoch": 0.2758280757097792, + "grad_norm": 0.6648768357451612, + "learning_rate": 1.9772426158562367e-05, + "loss": 0.4612, + "step": 1399 + }, + { + "epoch": 0.27602523659305994, + "grad_norm": 0.6209334691515703, + "learning_rate": 1.9772097242440703e-05, + "loss": 0.4629, + "step": 1400 + }, + { + "epoch": 0.2762223974763407, + "grad_norm": 0.6615598121420424, + "learning_rate": 1.9771768091536528e-05, + "loss": 0.4546, + "step": 1401 + }, + { + "epoch": 0.27641955835962145, + "grad_norm": 0.6824317459563148, + "learning_rate": 1.977143870585776e-05, + "loss": 0.4428, + "step": 1402 + }, + { + "epoch": 0.2766167192429022, + "grad_norm": 0.6670787550837416, + "learning_rate": 1.9771109085412304e-05, + "loss": 0.4997, + "step": 1403 + }, + { + "epoch": 0.27681388012618297, + "grad_norm": 0.8229581570264787, + "learning_rate": 1.9770779230208088e-05, + "loss": 0.4375, + "step": 1404 + }, + { + "epoch": 0.2770110410094637, + "grad_norm": 0.637951151429009, + "learning_rate": 1.977044914025303e-05, + "loss": 0.4322, + "step": 1405 + }, + { + "epoch": 0.2772082018927445, + "grad_norm": 0.607442608155513, + "learning_rate": 1.9770118815555063e-05, + "loss": 0.4242, + "step": 1406 + }, + { + "epoch": 0.27740536277602523, + "grad_norm": 0.6993192647998349, + "learning_rate": 1.9769788256122125e-05, + "loss": 0.4741, + "step": 1407 + }, + { + "epoch": 0.277602523659306, + "grad_norm": 0.6691537581034921, + "learning_rate": 1.9769457461962154e-05, + "loss": 0.4504, + "step": 1408 + }, + { + "epoch": 0.27779968454258674, + "grad_norm": 0.7485422674380534, + "learning_rate": 1.9769126433083102e-05, + "loss": 0.4825, + "step": 1409 + }, + { + "epoch": 0.2779968454258675, + "grad_norm": 0.6486895377627935, + "learning_rate": 1.976879516949292e-05, + "loss": 0.459, + "step": 1410 + }, + { + "epoch": 0.27819400630914826, + "grad_norm": 0.669976702587913, + "learning_rate": 1.976846367119957e-05, + "loss": 0.4618, + "step": 1411 + }, + { + "epoch": 0.278391167192429, + "grad_norm": 0.6947541489605775, + "learning_rate": 1.976813193821101e-05, + "loss": 0.4782, + "step": 1412 + }, + { + "epoch": 0.27858832807570977, + "grad_norm": 0.7279085002000397, + "learning_rate": 1.9767799970535214e-05, + "loss": 0.4584, + "step": 1413 + }, + { + "epoch": 0.2787854889589905, + "grad_norm": 0.6324612775847488, + "learning_rate": 1.9767467768180163e-05, + "loss": 0.4429, + "step": 1414 + }, + { + "epoch": 0.2789826498422713, + "grad_norm": 0.7075686828831383, + "learning_rate": 1.9767135331153827e-05, + "loss": 0.497, + "step": 1415 + }, + { + "epoch": 0.27917981072555204, + "grad_norm": 0.677366770732698, + "learning_rate": 1.97668026594642e-05, + "loss": 0.4739, + "step": 1416 + }, + { + "epoch": 0.2793769716088328, + "grad_norm": 0.6413070384790073, + "learning_rate": 1.9766469753119274e-05, + "loss": 0.4609, + "step": 1417 + }, + { + "epoch": 0.27957413249211355, + "grad_norm": 0.6875558936981296, + "learning_rate": 1.976613661212705e-05, + "loss": 0.5027, + "step": 1418 + }, + { + "epoch": 0.2797712933753943, + "grad_norm": 0.7069012738884121, + "learning_rate": 1.976580323649553e-05, + "loss": 0.4675, + "step": 1419 + }, + { + "epoch": 0.27996845425867506, + "grad_norm": 0.6999152312103506, + "learning_rate": 1.976546962623272e-05, + "loss": 0.479, + "step": 1420 + }, + { + "epoch": 0.2801656151419558, + "grad_norm": 0.632966968386772, + "learning_rate": 1.9765135781346637e-05, + "loss": 0.4458, + "step": 1421 + }, + { + "epoch": 0.28036277602523657, + "grad_norm": 0.680032576952019, + "learning_rate": 1.9764801701845307e-05, + "loss": 0.4904, + "step": 1422 + }, + { + "epoch": 0.2805599369085173, + "grad_norm": 0.6793474835680345, + "learning_rate": 1.9764467387736748e-05, + "loss": 0.4594, + "step": 1423 + }, + { + "epoch": 0.2807570977917981, + "grad_norm": 0.6990046169084455, + "learning_rate": 1.9764132839029e-05, + "loss": 0.4825, + "step": 1424 + }, + { + "epoch": 0.28095425867507884, + "grad_norm": 0.7012958434364689, + "learning_rate": 1.9763798055730096e-05, + "loss": 0.4394, + "step": 1425 + }, + { + "epoch": 0.2811514195583596, + "grad_norm": 0.649381673277496, + "learning_rate": 1.9763463037848082e-05, + "loss": 0.4832, + "step": 1426 + }, + { + "epoch": 0.2813485804416404, + "grad_norm": 0.7051628624005888, + "learning_rate": 1.9763127785391007e-05, + "loss": 0.4962, + "step": 1427 + }, + { + "epoch": 0.28154574132492116, + "grad_norm": 0.6757095316378556, + "learning_rate": 1.976279229836692e-05, + "loss": 0.4448, + "step": 1428 + }, + { + "epoch": 0.2817429022082019, + "grad_norm": 0.7505169408393606, + "learning_rate": 1.976245657678389e-05, + "loss": 0.4875, + "step": 1429 + }, + { + "epoch": 0.2819400630914827, + "grad_norm": 0.6482256189057635, + "learning_rate": 1.9762120620649978e-05, + "loss": 0.4457, + "step": 1430 + }, + { + "epoch": 0.28213722397476343, + "grad_norm": 0.6341748454996773, + "learning_rate": 1.9761784429973257e-05, + "loss": 0.4298, + "step": 1431 + }, + { + "epoch": 0.2823343848580442, + "grad_norm": 0.6426740598337791, + "learning_rate": 1.9761448004761804e-05, + "loss": 0.4351, + "step": 1432 + }, + { + "epoch": 0.28253154574132494, + "grad_norm": 0.6893390849187452, + "learning_rate": 1.9761111345023702e-05, + "loss": 0.4528, + "step": 1433 + }, + { + "epoch": 0.2827287066246057, + "grad_norm": 0.6931202899611968, + "learning_rate": 1.976077445076704e-05, + "loss": 0.4504, + "step": 1434 + }, + { + "epoch": 0.28292586750788645, + "grad_norm": 0.6396492394646054, + "learning_rate": 1.976043732199991e-05, + "loss": 0.5051, + "step": 1435 + }, + { + "epoch": 0.2831230283911672, + "grad_norm": 0.6068638024331832, + "learning_rate": 1.9760099958730414e-05, + "loss": 0.3965, + "step": 1436 + }, + { + "epoch": 0.28332018927444796, + "grad_norm": 0.6793441643898733, + "learning_rate": 1.9759762360966658e-05, + "loss": 0.4565, + "step": 1437 + }, + { + "epoch": 0.2835173501577287, + "grad_norm": 0.6338152033705464, + "learning_rate": 1.9759424528716748e-05, + "loss": 0.3908, + "step": 1438 + }, + { + "epoch": 0.2837145110410095, + "grad_norm": 0.6375149590618007, + "learning_rate": 1.975908646198881e-05, + "loss": 0.4457, + "step": 1439 + }, + { + "epoch": 0.28391167192429023, + "grad_norm": 0.6912444261554432, + "learning_rate": 1.9758748160790956e-05, + "loss": 0.4556, + "step": 1440 + }, + { + "epoch": 0.284108832807571, + "grad_norm": 0.7308297162675815, + "learning_rate": 1.975840962513132e-05, + "loss": 0.484, + "step": 1441 + }, + { + "epoch": 0.28430599369085174, + "grad_norm": 0.7358253544207752, + "learning_rate": 1.9758070855018033e-05, + "loss": 0.4667, + "step": 1442 + }, + { + "epoch": 0.2845031545741325, + "grad_norm": 0.6240361195624918, + "learning_rate": 1.975773185045924e-05, + "loss": 0.4549, + "step": 1443 + }, + { + "epoch": 0.28470031545741326, + "grad_norm": 0.6895402014641637, + "learning_rate": 1.975739261146308e-05, + "loss": 0.4234, + "step": 1444 + }, + { + "epoch": 0.284897476340694, + "grad_norm": 2.677764220478418, + "learning_rate": 1.97570531380377e-05, + "loss": 0.4986, + "step": 1445 + }, + { + "epoch": 0.28509463722397477, + "grad_norm": 0.7134750571138294, + "learning_rate": 1.975671343019126e-05, + "loss": 0.4143, + "step": 1446 + }, + { + "epoch": 0.2852917981072555, + "grad_norm": 0.7374402505710781, + "learning_rate": 1.9756373487931932e-05, + "loss": 0.478, + "step": 1447 + }, + { + "epoch": 0.2854889589905363, + "grad_norm": 0.7841802091710707, + "learning_rate": 1.975603331126787e-05, + "loss": 0.4791, + "step": 1448 + }, + { + "epoch": 0.28568611987381703, + "grad_norm": 0.7851659427951097, + "learning_rate": 1.975569290020725e-05, + "loss": 0.4445, + "step": 1449 + }, + { + "epoch": 0.2858832807570978, + "grad_norm": 0.7016574816261865, + "learning_rate": 1.9755352254758253e-05, + "loss": 0.4718, + "step": 1450 + }, + { + "epoch": 0.28608044164037855, + "grad_norm": 1.360154947822406, + "learning_rate": 1.975501137492906e-05, + "loss": 0.4778, + "step": 1451 + }, + { + "epoch": 0.2862776025236593, + "grad_norm": 0.7088540961818571, + "learning_rate": 1.9754670260727865e-05, + "loss": 0.4956, + "step": 1452 + }, + { + "epoch": 0.28647476340694006, + "grad_norm": 0.6877943073595973, + "learning_rate": 1.9754328912162864e-05, + "loss": 0.4666, + "step": 1453 + }, + { + "epoch": 0.2866719242902208, + "grad_norm": 0.7632821008022346, + "learning_rate": 1.975398732924225e-05, + "loss": 0.4624, + "step": 1454 + }, + { + "epoch": 0.28686908517350157, + "grad_norm": 0.649955168958308, + "learning_rate": 1.975364551197424e-05, + "loss": 0.4327, + "step": 1455 + }, + { + "epoch": 0.2870662460567823, + "grad_norm": 1.5748278853806321, + "learning_rate": 1.975330346036704e-05, + "loss": 0.4463, + "step": 1456 + }, + { + "epoch": 0.2872634069400631, + "grad_norm": 0.6418235523968513, + "learning_rate": 1.975296117442887e-05, + "loss": 0.4426, + "step": 1457 + }, + { + "epoch": 0.28746056782334384, + "grad_norm": 0.7529789256896934, + "learning_rate": 1.9752618654167954e-05, + "loss": 0.4525, + "step": 1458 + }, + { + "epoch": 0.2876577287066246, + "grad_norm": 0.6837140757349681, + "learning_rate": 1.975227589959252e-05, + "loss": 0.4664, + "step": 1459 + }, + { + "epoch": 0.28785488958990535, + "grad_norm": 0.7302417926946705, + "learning_rate": 1.9751932910710808e-05, + "loss": 0.4344, + "step": 1460 + }, + { + "epoch": 0.2880520504731861, + "grad_norm": 0.6549023096371199, + "learning_rate": 1.9751589687531052e-05, + "loss": 0.4637, + "step": 1461 + }, + { + "epoch": 0.28824921135646686, + "grad_norm": 0.7572816037486845, + "learning_rate": 1.97512462300615e-05, + "loss": 0.4808, + "step": 1462 + }, + { + "epoch": 0.2884463722397476, + "grad_norm": 0.7438685296075004, + "learning_rate": 1.9750902538310407e-05, + "loss": 0.4554, + "step": 1463 + }, + { + "epoch": 0.2886435331230284, + "grad_norm": 0.7921043989775165, + "learning_rate": 1.9750558612286025e-05, + "loss": 0.4669, + "step": 1464 + }, + { + "epoch": 0.28884069400630913, + "grad_norm": 1.7620430759198427, + "learning_rate": 1.9750214451996623e-05, + "loss": 0.4356, + "step": 1465 + }, + { + "epoch": 0.2890378548895899, + "grad_norm": 0.7869312505011672, + "learning_rate": 1.9749870057450464e-05, + "loss": 0.4502, + "step": 1466 + }, + { + "epoch": 0.28923501577287064, + "grad_norm": 0.7845532647336225, + "learning_rate": 1.974952542865583e-05, + "loss": 0.4835, + "step": 1467 + }, + { + "epoch": 0.2894321766561514, + "grad_norm": 0.7515938063415738, + "learning_rate": 1.9749180565620995e-05, + "loss": 0.4507, + "step": 1468 + }, + { + "epoch": 0.28962933753943215, + "grad_norm": 0.8913235246679625, + "learning_rate": 1.9748835468354243e-05, + "loss": 0.4729, + "step": 1469 + }, + { + "epoch": 0.2898264984227129, + "grad_norm": 0.8870979739457769, + "learning_rate": 1.9748490136863867e-05, + "loss": 0.4391, + "step": 1470 + }, + { + "epoch": 0.29002365930599366, + "grad_norm": 0.7727887896133023, + "learning_rate": 1.9748144571158167e-05, + "loss": 0.501, + "step": 1471 + }, + { + "epoch": 0.2902208201892745, + "grad_norm": 0.6725824982109148, + "learning_rate": 1.9747798771245446e-05, + "loss": 0.4284, + "step": 1472 + }, + { + "epoch": 0.29041798107255523, + "grad_norm": 0.7636840941258795, + "learning_rate": 1.9747452737134006e-05, + "loss": 0.4975, + "step": 1473 + }, + { + "epoch": 0.290615141955836, + "grad_norm": 0.695534933075417, + "learning_rate": 1.974710646883217e-05, + "loss": 0.4367, + "step": 1474 + }, + { + "epoch": 0.29081230283911674, + "grad_norm": 0.7808275437629759, + "learning_rate": 1.9746759966348244e-05, + "loss": 0.4821, + "step": 1475 + }, + { + "epoch": 0.2910094637223975, + "grad_norm": 0.6839691016507893, + "learning_rate": 1.9746413229690565e-05, + "loss": 0.4496, + "step": 1476 + }, + { + "epoch": 0.29120662460567825, + "grad_norm": 0.7291630955736097, + "learning_rate": 1.974606625886746e-05, + "loss": 0.467, + "step": 1477 + }, + { + "epoch": 0.291403785488959, + "grad_norm": 0.7415071271998195, + "learning_rate": 1.9745719053887265e-05, + "loss": 0.4563, + "step": 1478 + }, + { + "epoch": 0.29160094637223977, + "grad_norm": 0.721134342027533, + "learning_rate": 1.974537161475832e-05, + "loss": 0.4157, + "step": 1479 + }, + { + "epoch": 0.2917981072555205, + "grad_norm": 0.6815829464296383, + "learning_rate": 1.9745023941488974e-05, + "loss": 0.4959, + "step": 1480 + }, + { + "epoch": 0.2919952681388013, + "grad_norm": 0.6455560207981099, + "learning_rate": 1.974467603408758e-05, + "loss": 0.4131, + "step": 1481 + }, + { + "epoch": 0.29219242902208203, + "grad_norm": 0.7495965931748421, + "learning_rate": 1.9744327892562497e-05, + "loss": 0.4536, + "step": 1482 + }, + { + "epoch": 0.2923895899053628, + "grad_norm": 4.622108646120691, + "learning_rate": 1.974397951692209e-05, + "loss": 0.4472, + "step": 1483 + }, + { + "epoch": 0.29258675078864355, + "grad_norm": 0.7671628175372462, + "learning_rate": 1.9743630907174727e-05, + "loss": 0.4463, + "step": 1484 + }, + { + "epoch": 0.2927839116719243, + "grad_norm": 0.6306822829326908, + "learning_rate": 1.9743282063328786e-05, + "loss": 0.4321, + "step": 1485 + }, + { + "epoch": 0.29298107255520506, + "grad_norm": 0.8026977169526986, + "learning_rate": 1.9742932985392646e-05, + "loss": 0.44, + "step": 1486 + }, + { + "epoch": 0.2931782334384858, + "grad_norm": 0.6458962336741704, + "learning_rate": 1.97425836733747e-05, + "loss": 0.415, + "step": 1487 + }, + { + "epoch": 0.29337539432176657, + "grad_norm": 0.8366441003492903, + "learning_rate": 1.9742234127283328e-05, + "loss": 0.5109, + "step": 1488 + }, + { + "epoch": 0.2935725552050473, + "grad_norm": 3.206384511793217, + "learning_rate": 1.9741884347126937e-05, + "loss": 0.4654, + "step": 1489 + }, + { + "epoch": 0.2937697160883281, + "grad_norm": 1.0905491570369534, + "learning_rate": 1.9741534332913934e-05, + "loss": 0.4734, + "step": 1490 + }, + { + "epoch": 0.29396687697160884, + "grad_norm": 0.7932149212181082, + "learning_rate": 1.9741184084652723e-05, + "loss": 0.4809, + "step": 1491 + }, + { + "epoch": 0.2941640378548896, + "grad_norm": 0.907352399489384, + "learning_rate": 1.9740833602351718e-05, + "loss": 0.4856, + "step": 1492 + }, + { + "epoch": 0.29436119873817035, + "grad_norm": 0.8172339427555916, + "learning_rate": 1.9740482886019342e-05, + "loss": 0.4348, + "step": 1493 + }, + { + "epoch": 0.2945583596214511, + "grad_norm": 0.7379386874605652, + "learning_rate": 1.974013193566402e-05, + "loss": 0.4315, + "step": 1494 + }, + { + "epoch": 0.29475552050473186, + "grad_norm": 0.7315832037117764, + "learning_rate": 1.9739780751294188e-05, + "loss": 0.4302, + "step": 1495 + }, + { + "epoch": 0.2949526813880126, + "grad_norm": 0.7363421312105899, + "learning_rate": 1.9739429332918276e-05, + "loss": 0.4428, + "step": 1496 + }, + { + "epoch": 0.29514984227129337, + "grad_norm": 0.7954453286705275, + "learning_rate": 1.973907768054473e-05, + "loss": 0.4826, + "step": 1497 + }, + { + "epoch": 0.29534700315457413, + "grad_norm": 0.6920665293293689, + "learning_rate": 1.9738725794182004e-05, + "loss": 0.45, + "step": 1498 + }, + { + "epoch": 0.2955441640378549, + "grad_norm": 0.7476711485594634, + "learning_rate": 1.9738373673838545e-05, + "loss": 0.47, + "step": 1499 + }, + { + "epoch": 0.29574132492113564, + "grad_norm": 0.6378986468629754, + "learning_rate": 1.9738021319522817e-05, + "loss": 0.446, + "step": 1500 + }, + { + "epoch": 0.2959384858044164, + "grad_norm": 0.7449299773188399, + "learning_rate": 1.9737668731243284e-05, + "loss": 0.4622, + "step": 1501 + }, + { + "epoch": 0.29613564668769715, + "grad_norm": 0.6648614812967949, + "learning_rate": 1.973731590900842e-05, + "loss": 0.4512, + "step": 1502 + }, + { + "epoch": 0.2963328075709779, + "grad_norm": 0.7321775927414682, + "learning_rate": 1.97369628528267e-05, + "loss": 0.423, + "step": 1503 + }, + { + "epoch": 0.29652996845425866, + "grad_norm": 0.6801575733467247, + "learning_rate": 1.9736609562706604e-05, + "loss": 0.4424, + "step": 1504 + }, + { + "epoch": 0.2967271293375394, + "grad_norm": 0.6497325650317418, + "learning_rate": 1.9736256038656624e-05, + "loss": 0.4494, + "step": 1505 + }, + { + "epoch": 0.2969242902208202, + "grad_norm": 0.7063493502541413, + "learning_rate": 1.9735902280685252e-05, + "loss": 0.4375, + "step": 1506 + }, + { + "epoch": 0.29712145110410093, + "grad_norm": 0.6907055254378035, + "learning_rate": 1.9735548288800988e-05, + "loss": 0.4723, + "step": 1507 + }, + { + "epoch": 0.2973186119873817, + "grad_norm": 0.6955235821677667, + "learning_rate": 1.9735194063012337e-05, + "loss": 0.4377, + "step": 1508 + }, + { + "epoch": 0.29751577287066244, + "grad_norm": 0.6505153342650509, + "learning_rate": 1.9734839603327805e-05, + "loss": 0.4741, + "step": 1509 + }, + { + "epoch": 0.2977129337539432, + "grad_norm": 1.21645699588291, + "learning_rate": 1.973448490975592e-05, + "loss": 0.488, + "step": 1510 + }, + { + "epoch": 0.29791009463722395, + "grad_norm": 5.633561734216787, + "learning_rate": 1.9734129982305187e-05, + "loss": 0.5188, + "step": 1511 + }, + { + "epoch": 0.2981072555205047, + "grad_norm": 5.960949833229553, + "learning_rate": 1.9733774820984146e-05, + "loss": 0.4864, + "step": 1512 + }, + { + "epoch": 0.29830441640378547, + "grad_norm": 0.8936987496926002, + "learning_rate": 1.9733419425801326e-05, + "loss": 0.4904, + "step": 1513 + }, + { + "epoch": 0.2985015772870662, + "grad_norm": 0.8736184927683982, + "learning_rate": 1.9733063796765267e-05, + "loss": 0.4807, + "step": 1514 + }, + { + "epoch": 0.298698738170347, + "grad_norm": 0.8389206829886541, + "learning_rate": 1.9732707933884508e-05, + "loss": 0.4708, + "step": 1515 + }, + { + "epoch": 0.29889589905362773, + "grad_norm": 0.877518939540052, + "learning_rate": 1.973235183716761e-05, + "loss": 0.4064, + "step": 1516 + }, + { + "epoch": 0.2990930599369085, + "grad_norm": 1.0406389624738088, + "learning_rate": 1.9731995506623118e-05, + "loss": 0.4824, + "step": 1517 + }, + { + "epoch": 0.2992902208201893, + "grad_norm": 0.8112544551340672, + "learning_rate": 1.9731638942259596e-05, + "loss": 0.4748, + "step": 1518 + }, + { + "epoch": 0.29948738170347006, + "grad_norm": 0.7560476486010564, + "learning_rate": 1.9731282144085613e-05, + "loss": 0.4459, + "step": 1519 + }, + { + "epoch": 0.2996845425867508, + "grad_norm": 0.8150605495041554, + "learning_rate": 1.973092511210974e-05, + "loss": 0.4697, + "step": 1520 + }, + { + "epoch": 0.29988170347003157, + "grad_norm": 0.7564808190893473, + "learning_rate": 1.9730567846340552e-05, + "loss": 0.4705, + "step": 1521 + }, + { + "epoch": 0.3000788643533123, + "grad_norm": 0.7262145297310585, + "learning_rate": 1.973021034678664e-05, + "loss": 0.3978, + "step": 1522 + }, + { + "epoch": 0.3002760252365931, + "grad_norm": 0.7038980359439974, + "learning_rate": 1.9729852613456586e-05, + "loss": 0.4584, + "step": 1523 + }, + { + "epoch": 0.30047318611987384, + "grad_norm": 0.7588921782146265, + "learning_rate": 1.972949464635899e-05, + "loss": 0.4819, + "step": 1524 + }, + { + "epoch": 0.3006703470031546, + "grad_norm": 0.7168024227685293, + "learning_rate": 1.9729136445502446e-05, + "loss": 0.4864, + "step": 1525 + }, + { + "epoch": 0.30086750788643535, + "grad_norm": 0.6493407202654529, + "learning_rate": 1.9728778010895567e-05, + "loss": 0.4107, + "step": 1526 + }, + { + "epoch": 0.3010646687697161, + "grad_norm": 0.6952700541408848, + "learning_rate": 1.9728419342546962e-05, + "loss": 0.495, + "step": 1527 + }, + { + "epoch": 0.30126182965299686, + "grad_norm": 1.1339617708695726, + "learning_rate": 1.9728060440465246e-05, + "loss": 0.4262, + "step": 1528 + }, + { + "epoch": 0.3014589905362776, + "grad_norm": 0.6443436669507463, + "learning_rate": 1.9727701304659046e-05, + "loss": 0.4323, + "step": 1529 + }, + { + "epoch": 0.30165615141955837, + "grad_norm": 0.7193540103567199, + "learning_rate": 1.9727341935136987e-05, + "loss": 0.4688, + "step": 1530 + }, + { + "epoch": 0.3018533123028391, + "grad_norm": 0.6808229865935738, + "learning_rate": 1.9726982331907706e-05, + "loss": 0.4637, + "step": 1531 + }, + { + "epoch": 0.3020504731861199, + "grad_norm": 0.6469885340908015, + "learning_rate": 1.972662249497984e-05, + "loss": 0.4533, + "step": 1532 + }, + { + "epoch": 0.30224763406940064, + "grad_norm": 0.8050241386096714, + "learning_rate": 1.9726262424362033e-05, + "loss": 0.4629, + "step": 1533 + }, + { + "epoch": 0.3024447949526814, + "grad_norm": 1.1279481880937756, + "learning_rate": 1.9725902120062942e-05, + "loss": 0.4769, + "step": 1534 + }, + { + "epoch": 0.30264195583596215, + "grad_norm": 0.9362607518901818, + "learning_rate": 1.972554158209122e-05, + "loss": 0.4499, + "step": 1535 + }, + { + "epoch": 0.3028391167192429, + "grad_norm": 0.6311429250558253, + "learning_rate": 1.972518081045553e-05, + "loss": 0.4707, + "step": 1536 + }, + { + "epoch": 0.30303627760252366, + "grad_norm": 0.6908119659769627, + "learning_rate": 1.9724819805164542e-05, + "loss": 0.459, + "step": 1537 + }, + { + "epoch": 0.3032334384858044, + "grad_norm": 1.3140084577134628, + "learning_rate": 1.972445856622692e-05, + "loss": 0.4604, + "step": 1538 + }, + { + "epoch": 0.3034305993690852, + "grad_norm": 1.4336266646144273, + "learning_rate": 1.9724097093651356e-05, + "loss": 0.5054, + "step": 1539 + }, + { + "epoch": 0.30362776025236593, + "grad_norm": 0.6266235358734742, + "learning_rate": 1.9723735387446526e-05, + "loss": 0.4195, + "step": 1540 + }, + { + "epoch": 0.3038249211356467, + "grad_norm": 0.664253082557445, + "learning_rate": 1.9723373447621125e-05, + "loss": 0.4466, + "step": 1541 + }, + { + "epoch": 0.30402208201892744, + "grad_norm": 0.6631894672531906, + "learning_rate": 1.9723011274183844e-05, + "loss": 0.4613, + "step": 1542 + }, + { + "epoch": 0.3042192429022082, + "grad_norm": 0.9552950397058307, + "learning_rate": 1.9722648867143384e-05, + "loss": 0.4328, + "step": 1543 + }, + { + "epoch": 0.30441640378548895, + "grad_norm": 0.6534330380880412, + "learning_rate": 1.972228622650846e-05, + "loss": 0.472, + "step": 1544 + }, + { + "epoch": 0.3046135646687697, + "grad_norm": 0.6792854850313309, + "learning_rate": 1.972192335228778e-05, + "loss": 0.4639, + "step": 1545 + }, + { + "epoch": 0.30481072555205047, + "grad_norm": 0.8246065548514672, + "learning_rate": 1.972156024449006e-05, + "loss": 0.4678, + "step": 1546 + }, + { + "epoch": 0.3050078864353312, + "grad_norm": 0.6805673028086667, + "learning_rate": 1.972119690312403e-05, + "loss": 0.4392, + "step": 1547 + }, + { + "epoch": 0.305205047318612, + "grad_norm": 1.1098363616208942, + "learning_rate": 1.9720833328198416e-05, + "loss": 0.457, + "step": 1548 + }, + { + "epoch": 0.30540220820189273, + "grad_norm": 1.9882754170201284, + "learning_rate": 1.972046951972195e-05, + "loss": 0.4782, + "step": 1549 + }, + { + "epoch": 0.3055993690851735, + "grad_norm": 0.6655712983680275, + "learning_rate": 1.972010547770338e-05, + "loss": 0.4654, + "step": 1550 + }, + { + "epoch": 0.30579652996845424, + "grad_norm": 0.6357244449306213, + "learning_rate": 1.9719741202151442e-05, + "loss": 0.4337, + "step": 1551 + }, + { + "epoch": 0.305993690851735, + "grad_norm": 0.6432848249907509, + "learning_rate": 1.9719376693074898e-05, + "loss": 0.4656, + "step": 1552 + }, + { + "epoch": 0.30619085173501576, + "grad_norm": 0.6490834027479839, + "learning_rate": 1.97190119504825e-05, + "loss": 0.4442, + "step": 1553 + }, + { + "epoch": 0.3063880126182965, + "grad_norm": 0.9593162347229904, + "learning_rate": 1.9718646974383016e-05, + "loss": 0.417, + "step": 1554 + }, + { + "epoch": 0.30658517350157727, + "grad_norm": 0.8582061855411773, + "learning_rate": 1.9718281764785213e-05, + "loss": 0.4533, + "step": 1555 + }, + { + "epoch": 0.306782334384858, + "grad_norm": 0.6829232907025286, + "learning_rate": 1.9717916321697862e-05, + "loss": 0.4531, + "step": 1556 + }, + { + "epoch": 0.3069794952681388, + "grad_norm": 0.6946751006171858, + "learning_rate": 1.9717550645129745e-05, + "loss": 0.4332, + "step": 1557 + }, + { + "epoch": 0.30717665615141954, + "grad_norm": 0.7343732521374964, + "learning_rate": 1.971718473508965e-05, + "loss": 0.4563, + "step": 1558 + }, + { + "epoch": 0.3073738170347003, + "grad_norm": 0.7092985413488407, + "learning_rate": 1.9716818591586367e-05, + "loss": 0.4427, + "step": 1559 + }, + { + "epoch": 0.30757097791798105, + "grad_norm": 0.6920199000103088, + "learning_rate": 1.9716452214628688e-05, + "loss": 0.4867, + "step": 1560 + }, + { + "epoch": 0.3077681388012618, + "grad_norm": 0.7518768098411801, + "learning_rate": 1.9716085604225425e-05, + "loss": 0.4288, + "step": 1561 + }, + { + "epoch": 0.30796529968454256, + "grad_norm": 0.9921646912510204, + "learning_rate": 1.9715718760385377e-05, + "loss": 0.458, + "step": 1562 + }, + { + "epoch": 0.30816246056782337, + "grad_norm": 0.723626350748604, + "learning_rate": 1.9715351683117364e-05, + "loss": 0.4865, + "step": 1563 + }, + { + "epoch": 0.3083596214511041, + "grad_norm": 0.6699114054096069, + "learning_rate": 1.9714984372430205e-05, + "loss": 0.4719, + "step": 1564 + }, + { + "epoch": 0.3085567823343849, + "grad_norm": 0.7208356036921162, + "learning_rate": 1.971461682833272e-05, + "loss": 0.4499, + "step": 1565 + }, + { + "epoch": 0.30875394321766564, + "grad_norm": 0.696266504957194, + "learning_rate": 1.9714249050833743e-05, + "loss": 0.4685, + "step": 1566 + }, + { + "epoch": 0.3089511041009464, + "grad_norm": 0.7045591753323159, + "learning_rate": 1.971388103994211e-05, + "loss": 0.4693, + "step": 1567 + }, + { + "epoch": 0.30914826498422715, + "grad_norm": 0.9972061656036422, + "learning_rate": 1.9713512795666663e-05, + "loss": 0.4655, + "step": 1568 + }, + { + "epoch": 0.3093454258675079, + "grad_norm": 0.6343439339945319, + "learning_rate": 1.971314431801625e-05, + "loss": 0.4607, + "step": 1569 + }, + { + "epoch": 0.30954258675078866, + "grad_norm": 1.3214688571967796, + "learning_rate": 1.9712775606999718e-05, + "loss": 0.5253, + "step": 1570 + }, + { + "epoch": 0.3097397476340694, + "grad_norm": 1.1643733839819645, + "learning_rate": 1.9712406662625934e-05, + "loss": 0.4845, + "step": 1571 + }, + { + "epoch": 0.3099369085173502, + "grad_norm": 0.6395778049163567, + "learning_rate": 1.9712037484903758e-05, + "loss": 0.4646, + "step": 1572 + }, + { + "epoch": 0.31013406940063093, + "grad_norm": 0.728804625037493, + "learning_rate": 1.971166807384206e-05, + "loss": 0.4829, + "step": 1573 + }, + { + "epoch": 0.3103312302839117, + "grad_norm": 0.6692642203500485, + "learning_rate": 1.9711298429449716e-05, + "loss": 0.4628, + "step": 1574 + }, + { + "epoch": 0.31052839116719244, + "grad_norm": 0.6801411829078509, + "learning_rate": 1.9710928551735606e-05, + "loss": 0.4806, + "step": 1575 + }, + { + "epoch": 0.3107255520504732, + "grad_norm": 0.9147951787329351, + "learning_rate": 1.971055844070862e-05, + "loss": 0.4823, + "step": 1576 + }, + { + "epoch": 0.31092271293375395, + "grad_norm": 0.6704142482301383, + "learning_rate": 1.9710188096377645e-05, + "loss": 0.493, + "step": 1577 + }, + { + "epoch": 0.3111198738170347, + "grad_norm": 0.7934181252723825, + "learning_rate": 1.9709817518751585e-05, + "loss": 0.4711, + "step": 1578 + }, + { + "epoch": 0.31131703470031546, + "grad_norm": 0.9638101618156172, + "learning_rate": 1.9709446707839336e-05, + "loss": 0.4734, + "step": 1579 + }, + { + "epoch": 0.3115141955835962, + "grad_norm": 0.8058930264480674, + "learning_rate": 1.9709075663649812e-05, + "loss": 0.4672, + "step": 1580 + }, + { + "epoch": 0.311711356466877, + "grad_norm": 0.648347607286343, + "learning_rate": 1.9708704386191924e-05, + "loss": 0.4387, + "step": 1581 + }, + { + "epoch": 0.31190851735015773, + "grad_norm": 0.677310939518794, + "learning_rate": 1.97083328754746e-05, + "loss": 0.4441, + "step": 1582 + }, + { + "epoch": 0.3121056782334385, + "grad_norm": 1.102580749565265, + "learning_rate": 1.9707961131506756e-05, + "loss": 0.4659, + "step": 1583 + }, + { + "epoch": 0.31230283911671924, + "grad_norm": 0.7198499814401207, + "learning_rate": 1.9707589154297328e-05, + "loss": 0.4805, + "step": 1584 + }, + { + "epoch": 0.3125, + "grad_norm": 0.6444043482783797, + "learning_rate": 1.9707216943855258e-05, + "loss": 0.4437, + "step": 1585 + }, + { + "epoch": 0.31269716088328076, + "grad_norm": 0.8244715296197911, + "learning_rate": 1.970684450018948e-05, + "loss": 0.4363, + "step": 1586 + }, + { + "epoch": 0.3128943217665615, + "grad_norm": 0.6445385325086617, + "learning_rate": 1.9706471823308946e-05, + "loss": 0.4599, + "step": 1587 + }, + { + "epoch": 0.31309148264984227, + "grad_norm": 0.699849167744654, + "learning_rate": 1.9706098913222608e-05, + "loss": 0.5236, + "step": 1588 + }, + { + "epoch": 0.313288643533123, + "grad_norm": 0.7229780587068287, + "learning_rate": 1.970572576993943e-05, + "loss": 0.412, + "step": 1589 + }, + { + "epoch": 0.3134858044164038, + "grad_norm": 0.6479915833004943, + "learning_rate": 1.9705352393468374e-05, + "loss": 0.4697, + "step": 1590 + }, + { + "epoch": 0.31368296529968454, + "grad_norm": 0.650021857370611, + "learning_rate": 1.9704978783818413e-05, + "loss": 0.4326, + "step": 1591 + }, + { + "epoch": 0.3138801261829653, + "grad_norm": 0.6861324174548465, + "learning_rate": 1.970460494099852e-05, + "loss": 0.4125, + "step": 1592 + }, + { + "epoch": 0.31407728706624605, + "grad_norm": 0.6801449769884733, + "learning_rate": 1.9704230865017675e-05, + "loss": 0.5069, + "step": 1593 + }, + { + "epoch": 0.3142744479495268, + "grad_norm": 0.7205207674630276, + "learning_rate": 1.970385655588487e-05, + "loss": 0.4757, + "step": 1594 + }, + { + "epoch": 0.31447160883280756, + "grad_norm": 0.6629052999763434, + "learning_rate": 1.9703482013609098e-05, + "loss": 0.4351, + "step": 1595 + }, + { + "epoch": 0.3146687697160883, + "grad_norm": 0.7088009223433047, + "learning_rate": 1.9703107238199356e-05, + "loss": 0.4652, + "step": 1596 + }, + { + "epoch": 0.31486593059936907, + "grad_norm": 0.9099735140613966, + "learning_rate": 1.9702732229664653e-05, + "loss": 0.4856, + "step": 1597 + }, + { + "epoch": 0.3150630914826498, + "grad_norm": 0.744041311387149, + "learning_rate": 1.9702356988013988e-05, + "loss": 0.4675, + "step": 1598 + }, + { + "epoch": 0.3152602523659306, + "grad_norm": 0.6721850270536037, + "learning_rate": 1.970198151325639e-05, + "loss": 0.4876, + "step": 1599 + }, + { + "epoch": 0.31545741324921134, + "grad_norm": 0.7183831817138312, + "learning_rate": 1.9701605805400866e-05, + "loss": 0.4715, + "step": 1600 + }, + { + "epoch": 0.3156545741324921, + "grad_norm": 0.6393423636617389, + "learning_rate": 1.9701229864456452e-05, + "loss": 0.4591, + "step": 1601 + }, + { + "epoch": 0.31585173501577285, + "grad_norm": 0.6363184919995188, + "learning_rate": 1.970085369043218e-05, + "loss": 0.4134, + "step": 1602 + }, + { + "epoch": 0.3160488958990536, + "grad_norm": 0.703734451223413, + "learning_rate": 1.9700477283337084e-05, + "loss": 0.4589, + "step": 1603 + }, + { + "epoch": 0.31624605678233436, + "grad_norm": 0.6584994125713037, + "learning_rate": 1.9700100643180213e-05, + "loss": 0.4482, + "step": 1604 + }, + { + "epoch": 0.3164432176656151, + "grad_norm": 0.6078730627425877, + "learning_rate": 1.9699723769970608e-05, + "loss": 0.4399, + "step": 1605 + }, + { + "epoch": 0.3166403785488959, + "grad_norm": 0.6961873263732445, + "learning_rate": 1.969934666371733e-05, + "loss": 0.4775, + "step": 1606 + }, + { + "epoch": 0.31683753943217663, + "grad_norm": 0.724397941037564, + "learning_rate": 1.969896932442944e-05, + "loss": 0.4981, + "step": 1607 + }, + { + "epoch": 0.31703470031545744, + "grad_norm": 0.7312028182555175, + "learning_rate": 1.9698591752115997e-05, + "loss": 0.4688, + "step": 1608 + }, + { + "epoch": 0.3172318611987382, + "grad_norm": 0.6505489803123736, + "learning_rate": 1.969821394678608e-05, + "loss": 0.4298, + "step": 1609 + }, + { + "epoch": 0.31742902208201895, + "grad_norm": 0.6779628156309334, + "learning_rate": 1.969783590844876e-05, + "loss": 0.4806, + "step": 1610 + }, + { + "epoch": 0.3176261829652997, + "grad_norm": 0.7180529086461475, + "learning_rate": 1.9697457637113126e-05, + "loss": 0.5021, + "step": 1611 + }, + { + "epoch": 0.31782334384858046, + "grad_norm": 0.6009368475283028, + "learning_rate": 1.969707913278826e-05, + "loss": 0.4411, + "step": 1612 + }, + { + "epoch": 0.3180205047318612, + "grad_norm": 0.6337105416643031, + "learning_rate": 1.969670039548326e-05, + "loss": 0.4554, + "step": 1613 + }, + { + "epoch": 0.318217665615142, + "grad_norm": 0.7204205483322568, + "learning_rate": 1.9696321425207227e-05, + "loss": 0.4912, + "step": 1614 + }, + { + "epoch": 0.31841482649842273, + "grad_norm": 0.6061996997155742, + "learning_rate": 1.969594222196926e-05, + "loss": 0.4442, + "step": 1615 + }, + { + "epoch": 0.3186119873817035, + "grad_norm": 0.6433833110520849, + "learning_rate": 1.9695562785778473e-05, + "loss": 0.4289, + "step": 1616 + }, + { + "epoch": 0.31880914826498424, + "grad_norm": 0.5906452185455755, + "learning_rate": 1.9695183116643983e-05, + "loss": 0.408, + "step": 1617 + }, + { + "epoch": 0.319006309148265, + "grad_norm": 0.6795767414105931, + "learning_rate": 1.9694803214574914e-05, + "loss": 0.4325, + "step": 1618 + }, + { + "epoch": 0.31920347003154576, + "grad_norm": 0.6337536961202738, + "learning_rate": 1.9694423079580387e-05, + "loss": 0.4426, + "step": 1619 + }, + { + "epoch": 0.3194006309148265, + "grad_norm": 0.6793514535697415, + "learning_rate": 1.969404271166954e-05, + "loss": 0.4756, + "step": 1620 + }, + { + "epoch": 0.31959779179810727, + "grad_norm": 0.5997185524987442, + "learning_rate": 1.9693662110851507e-05, + "loss": 0.4417, + "step": 1621 + }, + { + "epoch": 0.319794952681388, + "grad_norm": 0.6223038049361463, + "learning_rate": 1.969328127713544e-05, + "loss": 0.4317, + "step": 1622 + }, + { + "epoch": 0.3199921135646688, + "grad_norm": 0.6472194848635398, + "learning_rate": 1.9692900210530482e-05, + "loss": 0.4702, + "step": 1623 + }, + { + "epoch": 0.32018927444794953, + "grad_norm": 0.5962256875505306, + "learning_rate": 1.9692518911045793e-05, + "loss": 0.4499, + "step": 1624 + }, + { + "epoch": 0.3203864353312303, + "grad_norm": 0.6998497786192203, + "learning_rate": 1.969213737869053e-05, + "loss": 0.4395, + "step": 1625 + }, + { + "epoch": 0.32058359621451105, + "grad_norm": 0.6753566663287628, + "learning_rate": 1.969175561347386e-05, + "loss": 0.4741, + "step": 1626 + }, + { + "epoch": 0.3207807570977918, + "grad_norm": 0.6364586995100593, + "learning_rate": 1.969137361540496e-05, + "loss": 0.425, + "step": 1627 + }, + { + "epoch": 0.32097791798107256, + "grad_norm": 0.6316173890013236, + "learning_rate": 1.9690991384493002e-05, + "loss": 0.426, + "step": 1628 + }, + { + "epoch": 0.3211750788643533, + "grad_norm": 0.6592976432535022, + "learning_rate": 1.969060892074717e-05, + "loss": 0.4435, + "step": 1629 + }, + { + "epoch": 0.32137223974763407, + "grad_norm": 0.6057087728738385, + "learning_rate": 1.969022622417666e-05, + "loss": 0.4434, + "step": 1630 + }, + { + "epoch": 0.3215694006309148, + "grad_norm": 0.6341926570691082, + "learning_rate": 1.968984329479066e-05, + "loss": 0.4479, + "step": 1631 + }, + { + "epoch": 0.3217665615141956, + "grad_norm": 0.6729916605171744, + "learning_rate": 1.9689460132598372e-05, + "loss": 0.4832, + "step": 1632 + }, + { + "epoch": 0.32196372239747634, + "grad_norm": 0.8010790117508052, + "learning_rate": 1.9689076737608998e-05, + "loss": 0.4815, + "step": 1633 + }, + { + "epoch": 0.3221608832807571, + "grad_norm": 0.6342908894048237, + "learning_rate": 1.9688693109831755e-05, + "loss": 0.4412, + "step": 1634 + }, + { + "epoch": 0.32235804416403785, + "grad_norm": 0.6574543494629974, + "learning_rate": 1.9688309249275857e-05, + "loss": 0.4337, + "step": 1635 + }, + { + "epoch": 0.3225552050473186, + "grad_norm": 0.6700412322589491, + "learning_rate": 1.9687925155950526e-05, + "loss": 0.4612, + "step": 1636 + }, + { + "epoch": 0.32275236593059936, + "grad_norm": 0.6880798043825762, + "learning_rate": 1.9687540829864996e-05, + "loss": 0.4633, + "step": 1637 + }, + { + "epoch": 0.3229495268138801, + "grad_norm": 0.5858751037584682, + "learning_rate": 1.9687156271028493e-05, + "loss": 0.4147, + "step": 1638 + }, + { + "epoch": 0.3231466876971609, + "grad_norm": 1.1648261296219227, + "learning_rate": 1.968677147945026e-05, + "loss": 0.4663, + "step": 1639 + }, + { + "epoch": 0.32334384858044163, + "grad_norm": 0.6393378112398456, + "learning_rate": 1.9686386455139544e-05, + "loss": 0.4392, + "step": 1640 + }, + { + "epoch": 0.3235410094637224, + "grad_norm": 0.6736562294968578, + "learning_rate": 1.9686001198105587e-05, + "loss": 0.459, + "step": 1641 + }, + { + "epoch": 0.32373817034700314, + "grad_norm": 0.6291265298426209, + "learning_rate": 1.9685615708357656e-05, + "loss": 0.455, + "step": 1642 + }, + { + "epoch": 0.3239353312302839, + "grad_norm": 0.6956696368636405, + "learning_rate": 1.9685229985905007e-05, + "loss": 0.4735, + "step": 1643 + }, + { + "epoch": 0.32413249211356465, + "grad_norm": 0.7248600768104795, + "learning_rate": 1.9684844030756907e-05, + "loss": 0.4599, + "step": 1644 + }, + { + "epoch": 0.3243296529968454, + "grad_norm": 0.8868152265197408, + "learning_rate": 1.9684457842922632e-05, + "loss": 0.4353, + "step": 1645 + }, + { + "epoch": 0.32452681388012616, + "grad_norm": 0.7520432959600722, + "learning_rate": 1.9684071422411456e-05, + "loss": 0.4873, + "step": 1646 + }, + { + "epoch": 0.3247239747634069, + "grad_norm": 0.7377406807353308, + "learning_rate": 1.968368476923267e-05, + "loss": 0.461, + "step": 1647 + }, + { + "epoch": 0.3249211356466877, + "grad_norm": 0.708141024519448, + "learning_rate": 1.968329788339555e-05, + "loss": 0.467, + "step": 1648 + }, + { + "epoch": 0.32511829652996843, + "grad_norm": 0.6848367537784837, + "learning_rate": 1.9682910764909405e-05, + "loss": 0.496, + "step": 1649 + }, + { + "epoch": 0.3253154574132492, + "grad_norm": 0.6665467412273421, + "learning_rate": 1.9682523413783533e-05, + "loss": 0.4732, + "step": 1650 + }, + { + "epoch": 0.32551261829652994, + "grad_norm": 0.6953139108715455, + "learning_rate": 1.968213583002724e-05, + "loss": 0.4518, + "step": 1651 + }, + { + "epoch": 0.3257097791798107, + "grad_norm": 0.6427842151833715, + "learning_rate": 1.9681748013649834e-05, + "loss": 0.4153, + "step": 1652 + }, + { + "epoch": 0.3259069400630915, + "grad_norm": 1.183472131884011, + "learning_rate": 1.968135996466064e-05, + "loss": 0.4764, + "step": 1653 + }, + { + "epoch": 0.32610410094637227, + "grad_norm": 0.617392531814384, + "learning_rate": 1.968097168306897e-05, + "loss": 0.4081, + "step": 1654 + }, + { + "epoch": 0.326301261829653, + "grad_norm": 0.6869167388782502, + "learning_rate": 1.9680583168884163e-05, + "loss": 0.4505, + "step": 1655 + }, + { + "epoch": 0.3264984227129338, + "grad_norm": 0.8700641935343537, + "learning_rate": 1.9680194422115548e-05, + "loss": 0.5009, + "step": 1656 + }, + { + "epoch": 0.32669558359621453, + "grad_norm": 0.696926074465654, + "learning_rate": 1.9679805442772464e-05, + "loss": 0.4871, + "step": 1657 + }, + { + "epoch": 0.3268927444794953, + "grad_norm": 0.7205797312738156, + "learning_rate": 1.9679416230864265e-05, + "loss": 0.487, + "step": 1658 + }, + { + "epoch": 0.32708990536277605, + "grad_norm": 0.7034418967452446, + "learning_rate": 1.967902678640029e-05, + "loss": 0.4442, + "step": 1659 + }, + { + "epoch": 0.3272870662460568, + "grad_norm": 0.6205957767275062, + "learning_rate": 1.9678637109389903e-05, + "loss": 0.4606, + "step": 1660 + }, + { + "epoch": 0.32748422712933756, + "grad_norm": 0.6767913682774107, + "learning_rate": 1.967824719984247e-05, + "loss": 0.4449, + "step": 1661 + }, + { + "epoch": 0.3276813880126183, + "grad_norm": 0.7628685405323068, + "learning_rate": 1.967785705776735e-05, + "loss": 0.4912, + "step": 1662 + }, + { + "epoch": 0.32787854889589907, + "grad_norm": 0.6449573923061704, + "learning_rate": 1.9677466683173922e-05, + "loss": 0.4424, + "step": 1663 + }, + { + "epoch": 0.3280757097791798, + "grad_norm": 0.6596295374884681, + "learning_rate": 1.9677076076071568e-05, + "loss": 0.4491, + "step": 1664 + }, + { + "epoch": 0.3282728706624606, + "grad_norm": 0.6027707481490366, + "learning_rate": 1.967668523646966e-05, + "loss": 0.4272, + "step": 1665 + }, + { + "epoch": 0.32847003154574134, + "grad_norm": 0.6919043992802008, + "learning_rate": 1.9676294164377603e-05, + "loss": 0.4597, + "step": 1666 + }, + { + "epoch": 0.3286671924290221, + "grad_norm": 0.6975499115349002, + "learning_rate": 1.9675902859804786e-05, + "loss": 0.4978, + "step": 1667 + }, + { + "epoch": 0.32886435331230285, + "grad_norm": 0.6526383582936955, + "learning_rate": 1.967551132276061e-05, + "loss": 0.4506, + "step": 1668 + }, + { + "epoch": 0.3290615141955836, + "grad_norm": 0.6951437907766501, + "learning_rate": 1.9675119553254477e-05, + "loss": 0.4678, + "step": 1669 + }, + { + "epoch": 0.32925867507886436, + "grad_norm": 0.6405156580206511, + "learning_rate": 1.9674727551295812e-05, + "loss": 0.4657, + "step": 1670 + }, + { + "epoch": 0.3294558359621451, + "grad_norm": 0.6686701711260229, + "learning_rate": 1.9674335316894024e-05, + "loss": 0.4424, + "step": 1671 + }, + { + "epoch": 0.32965299684542587, + "grad_norm": 0.6364715264193335, + "learning_rate": 1.9673942850058542e-05, + "loss": 0.4423, + "step": 1672 + }, + { + "epoch": 0.32985015772870663, + "grad_norm": 0.6132748220879023, + "learning_rate": 1.9673550150798787e-05, + "loss": 0.4247, + "step": 1673 + }, + { + "epoch": 0.3300473186119874, + "grad_norm": 0.6645141377088348, + "learning_rate": 1.9673157219124207e-05, + "loss": 0.466, + "step": 1674 + }, + { + "epoch": 0.33024447949526814, + "grad_norm": 0.6213535676703986, + "learning_rate": 1.967276405504423e-05, + "loss": 0.4326, + "step": 1675 + }, + { + "epoch": 0.3304416403785489, + "grad_norm": 1.0248391227718814, + "learning_rate": 1.9672370658568306e-05, + "loss": 0.5144, + "step": 1676 + }, + { + "epoch": 0.33063880126182965, + "grad_norm": 0.6705592825340083, + "learning_rate": 1.967197702970589e-05, + "loss": 0.4504, + "step": 1677 + }, + { + "epoch": 0.3308359621451104, + "grad_norm": 0.6159547925544582, + "learning_rate": 1.967158316846644e-05, + "loss": 0.4262, + "step": 1678 + }, + { + "epoch": 0.33103312302839116, + "grad_norm": 0.6625745295364192, + "learning_rate": 1.9671189074859412e-05, + "loss": 0.463, + "step": 1679 + }, + { + "epoch": 0.3312302839116719, + "grad_norm": 0.6192642001745661, + "learning_rate": 1.967079474889428e-05, + "loss": 0.4283, + "step": 1680 + }, + { + "epoch": 0.3314274447949527, + "grad_norm": 0.659809255678557, + "learning_rate": 1.9670400190580516e-05, + "loss": 0.4443, + "step": 1681 + }, + { + "epoch": 0.33162460567823343, + "grad_norm": 0.60127325810791, + "learning_rate": 1.9670005399927602e-05, + "loss": 0.4579, + "step": 1682 + }, + { + "epoch": 0.3318217665615142, + "grad_norm": 0.7413899493606506, + "learning_rate": 1.9669610376945013e-05, + "loss": 0.487, + "step": 1683 + }, + { + "epoch": 0.33201892744479494, + "grad_norm": 0.6582626993097654, + "learning_rate": 1.9669215121642255e-05, + "loss": 0.4734, + "step": 1684 + }, + { + "epoch": 0.3322160883280757, + "grad_norm": 0.7151538898984898, + "learning_rate": 1.9668819634028816e-05, + "loss": 0.4446, + "step": 1685 + }, + { + "epoch": 0.33241324921135645, + "grad_norm": 0.6913048824848471, + "learning_rate": 1.96684239141142e-05, + "loss": 0.4253, + "step": 1686 + }, + { + "epoch": 0.3326104100946372, + "grad_norm": 0.6414540359989238, + "learning_rate": 1.966802796190791e-05, + "loss": 0.4317, + "step": 1687 + }, + { + "epoch": 0.33280757097791797, + "grad_norm": 0.9207986171363428, + "learning_rate": 1.9667631777419466e-05, + "loss": 0.4627, + "step": 1688 + }, + { + "epoch": 0.3330047318611987, + "grad_norm": 0.616029959208574, + "learning_rate": 1.966723536065838e-05, + "loss": 0.4655, + "step": 1689 + }, + { + "epoch": 0.3332018927444795, + "grad_norm": 0.6883300541317802, + "learning_rate": 1.9666838711634182e-05, + "loss": 0.4618, + "step": 1690 + }, + { + "epoch": 0.33339905362776023, + "grad_norm": 0.650342941791099, + "learning_rate": 1.9666441830356397e-05, + "loss": 0.444, + "step": 1691 + }, + { + "epoch": 0.333596214511041, + "grad_norm": 0.6491057879424494, + "learning_rate": 1.9666044716834566e-05, + "loss": 0.4624, + "step": 1692 + }, + { + "epoch": 0.33379337539432175, + "grad_norm": 0.6311071726696637, + "learning_rate": 1.9665647371078225e-05, + "loss": 0.4329, + "step": 1693 + }, + { + "epoch": 0.3339905362776025, + "grad_norm": 0.7246225471538845, + "learning_rate": 1.966524979309692e-05, + "loss": 0.5341, + "step": 1694 + }, + { + "epoch": 0.33418769716088326, + "grad_norm": 0.6987766778208062, + "learning_rate": 1.966485198290021e-05, + "loss": 0.48, + "step": 1695 + }, + { + "epoch": 0.334384858044164, + "grad_norm": 0.6138348348118343, + "learning_rate": 1.9664453940497642e-05, + "loss": 0.4447, + "step": 1696 + }, + { + "epoch": 0.33458201892744477, + "grad_norm": 0.6567138637243869, + "learning_rate": 1.966405566589879e-05, + "loss": 0.4991, + "step": 1697 + }, + { + "epoch": 0.3347791798107255, + "grad_norm": 0.6352662846456245, + "learning_rate": 1.9663657159113217e-05, + "loss": 0.4652, + "step": 1698 + }, + { + "epoch": 0.33497634069400634, + "grad_norm": 0.8883699236623674, + "learning_rate": 1.96632584201505e-05, + "loss": 0.4797, + "step": 1699 + }, + { + "epoch": 0.3351735015772871, + "grad_norm": 0.6256688722873787, + "learning_rate": 1.9662859449020214e-05, + "loss": 0.4771, + "step": 1700 + }, + { + "epoch": 0.33537066246056785, + "grad_norm": 0.6792106596589087, + "learning_rate": 1.966246024573195e-05, + "loss": 0.4561, + "step": 1701 + }, + { + "epoch": 0.3355678233438486, + "grad_norm": 0.5975526642006157, + "learning_rate": 1.96620608102953e-05, + "loss": 0.4372, + "step": 1702 + }, + { + "epoch": 0.33576498422712936, + "grad_norm": 0.6481968588062909, + "learning_rate": 1.9661661142719856e-05, + "loss": 0.4679, + "step": 1703 + }, + { + "epoch": 0.3359621451104101, + "grad_norm": 0.625117310912806, + "learning_rate": 1.966126124301522e-05, + "loss": 0.4722, + "step": 1704 + }, + { + "epoch": 0.33615930599369087, + "grad_norm": 0.6779932286491652, + "learning_rate": 1.9660861111191004e-05, + "loss": 0.4548, + "step": 1705 + }, + { + "epoch": 0.3363564668769716, + "grad_norm": 0.5897827532389534, + "learning_rate": 1.9660460747256823e-05, + "loss": 0.4221, + "step": 1706 + }, + { + "epoch": 0.3365536277602524, + "grad_norm": 0.600745494776992, + "learning_rate": 1.9660060151222292e-05, + "loss": 0.4359, + "step": 1707 + }, + { + "epoch": 0.33675078864353314, + "grad_norm": 0.6263245381562469, + "learning_rate": 1.9659659323097037e-05, + "loss": 0.4537, + "step": 1708 + }, + { + "epoch": 0.3369479495268139, + "grad_norm": 0.6059447411600158, + "learning_rate": 1.9659258262890683e-05, + "loss": 0.4557, + "step": 1709 + }, + { + "epoch": 0.33714511041009465, + "grad_norm": 0.6276333332545005, + "learning_rate": 1.9658856970612878e-05, + "loss": 0.4539, + "step": 1710 + }, + { + "epoch": 0.3373422712933754, + "grad_norm": 0.61826217784891, + "learning_rate": 1.965845544627325e-05, + "loss": 0.441, + "step": 1711 + }, + { + "epoch": 0.33753943217665616, + "grad_norm": 0.6053223097223316, + "learning_rate": 1.9658053689881453e-05, + "loss": 0.4813, + "step": 1712 + }, + { + "epoch": 0.3377365930599369, + "grad_norm": 0.5730530402368444, + "learning_rate": 1.965765170144714e-05, + "loss": 0.4452, + "step": 1713 + }, + { + "epoch": 0.3379337539432177, + "grad_norm": 4.5489083531739265, + "learning_rate": 1.9657249480979968e-05, + "loss": 0.4672, + "step": 1714 + }, + { + "epoch": 0.33813091482649843, + "grad_norm": 0.6468829198177741, + "learning_rate": 1.9656847028489597e-05, + "loss": 0.447, + "step": 1715 + }, + { + "epoch": 0.3383280757097792, + "grad_norm": 0.6197378264065176, + "learning_rate": 1.9656444343985705e-05, + "loss": 0.4681, + "step": 1716 + }, + { + "epoch": 0.33852523659305994, + "grad_norm": 0.7104123212898853, + "learning_rate": 1.9656041427477957e-05, + "loss": 0.4457, + "step": 1717 + }, + { + "epoch": 0.3387223974763407, + "grad_norm": 0.7070319729266388, + "learning_rate": 1.965563827897604e-05, + "loss": 0.4517, + "step": 1718 + }, + { + "epoch": 0.33891955835962145, + "grad_norm": 0.6119590288529166, + "learning_rate": 1.9655234898489634e-05, + "loss": 0.4346, + "step": 1719 + }, + { + "epoch": 0.3391167192429022, + "grad_norm": 0.6245842477692922, + "learning_rate": 1.965483128602844e-05, + "loss": 0.4604, + "step": 1720 + }, + { + "epoch": 0.33931388012618297, + "grad_norm": 1.2443841217690506, + "learning_rate": 1.9654427441602145e-05, + "loss": 0.479, + "step": 1721 + }, + { + "epoch": 0.3395110410094637, + "grad_norm": 0.6732229801004805, + "learning_rate": 1.9654023365220456e-05, + "loss": 0.4513, + "step": 1722 + }, + { + "epoch": 0.3397082018927445, + "grad_norm": 0.6568708942903064, + "learning_rate": 1.9653619056893082e-05, + "loss": 0.4406, + "step": 1723 + }, + { + "epoch": 0.33990536277602523, + "grad_norm": 0.663524577710516, + "learning_rate": 1.9653214516629737e-05, + "loss": 0.4763, + "step": 1724 + }, + { + "epoch": 0.340102523659306, + "grad_norm": 0.6675810755775939, + "learning_rate": 1.965280974444014e-05, + "loss": 0.4489, + "step": 1725 + }, + { + "epoch": 0.34029968454258674, + "grad_norm": 0.6563303589975222, + "learning_rate": 1.9652404740334015e-05, + "loss": 0.4309, + "step": 1726 + }, + { + "epoch": 0.3404968454258675, + "grad_norm": 0.7598373649184584, + "learning_rate": 1.9651999504321094e-05, + "loss": 0.4272, + "step": 1727 + }, + { + "epoch": 0.34069400630914826, + "grad_norm": 0.651084326470726, + "learning_rate": 1.9651594036411107e-05, + "loss": 0.4475, + "step": 1728 + }, + { + "epoch": 0.340891167192429, + "grad_norm": 0.6799855930682017, + "learning_rate": 1.9651188336613807e-05, + "loss": 0.4524, + "step": 1729 + }, + { + "epoch": 0.34108832807570977, + "grad_norm": 0.8147708964689288, + "learning_rate": 1.9650782404938933e-05, + "loss": 0.4748, + "step": 1730 + }, + { + "epoch": 0.3412854889589905, + "grad_norm": 1.9599798652041918, + "learning_rate": 1.965037624139624e-05, + "loss": 0.4634, + "step": 1731 + }, + { + "epoch": 0.3414826498422713, + "grad_norm": 0.9750946996661596, + "learning_rate": 1.9649969845995486e-05, + "loss": 0.4522, + "step": 1732 + }, + { + "epoch": 0.34167981072555204, + "grad_norm": 0.7830583556096765, + "learning_rate": 1.9649563218746436e-05, + "loss": 0.4837, + "step": 1733 + }, + { + "epoch": 0.3418769716088328, + "grad_norm": 1.8522800709343359, + "learning_rate": 1.9649156359658857e-05, + "loss": 0.4907, + "step": 1734 + }, + { + "epoch": 0.34207413249211355, + "grad_norm": 0.7087784601927121, + "learning_rate": 1.964874926874253e-05, + "loss": 0.4537, + "step": 1735 + }, + { + "epoch": 0.3422712933753943, + "grad_norm": 0.9263867760746262, + "learning_rate": 1.9648341946007228e-05, + "loss": 0.4481, + "step": 1736 + }, + { + "epoch": 0.34246845425867506, + "grad_norm": 0.7131592632953431, + "learning_rate": 1.9647934391462743e-05, + "loss": 0.4859, + "step": 1737 + }, + { + "epoch": 0.3426656151419558, + "grad_norm": 0.6812545398255945, + "learning_rate": 1.9647526605118863e-05, + "loss": 0.4419, + "step": 1738 + }, + { + "epoch": 0.34286277602523657, + "grad_norm": 0.8324328718435976, + "learning_rate": 1.964711858698539e-05, + "loss": 0.4495, + "step": 1739 + }, + { + "epoch": 0.3430599369085173, + "grad_norm": 0.7569149729504874, + "learning_rate": 1.964671033707212e-05, + "loss": 0.4779, + "step": 1740 + }, + { + "epoch": 0.3432570977917981, + "grad_norm": 0.7011727969185421, + "learning_rate": 1.9646301855388868e-05, + "loss": 0.486, + "step": 1741 + }, + { + "epoch": 0.34345425867507884, + "grad_norm": 0.6487547062067612, + "learning_rate": 1.9645893141945444e-05, + "loss": 0.4265, + "step": 1742 + }, + { + "epoch": 0.3436514195583596, + "grad_norm": 0.7657920846629185, + "learning_rate": 1.9645484196751676e-05, + "loss": 0.455, + "step": 1743 + }, + { + "epoch": 0.3438485804416404, + "grad_norm": 0.696631675500191, + "learning_rate": 1.9645075019817374e-05, + "loss": 0.4743, + "step": 1744 + }, + { + "epoch": 0.34404574132492116, + "grad_norm": 0.6735160176131795, + "learning_rate": 1.9644665611152384e-05, + "loss": 0.458, + "step": 1745 + }, + { + "epoch": 0.3442429022082019, + "grad_norm": 0.7721263917052835, + "learning_rate": 1.964425597076653e-05, + "loss": 0.4765, + "step": 1746 + }, + { + "epoch": 0.3444400630914827, + "grad_norm": 0.7497548232948051, + "learning_rate": 1.9643846098669664e-05, + "loss": 0.4772, + "step": 1747 + }, + { + "epoch": 0.34463722397476343, + "grad_norm": 0.6847650811355418, + "learning_rate": 1.9643435994871626e-05, + "loss": 0.4417, + "step": 1748 + }, + { + "epoch": 0.3448343848580442, + "grad_norm": 0.6433657002066264, + "learning_rate": 1.9643025659382274e-05, + "loss": 0.415, + "step": 1749 + }, + { + "epoch": 0.34503154574132494, + "grad_norm": 0.6855538495003168, + "learning_rate": 1.9642615092211468e-05, + "loss": 0.4665, + "step": 1750 + }, + { + "epoch": 0.3452287066246057, + "grad_norm": 0.6890543003455781, + "learning_rate": 1.9642204293369066e-05, + "loss": 0.4571, + "step": 1751 + }, + { + "epoch": 0.34542586750788645, + "grad_norm": 0.8006291058144541, + "learning_rate": 1.9641793262864942e-05, + "loss": 0.4617, + "step": 1752 + }, + { + "epoch": 0.3456230283911672, + "grad_norm": 0.7454470217038145, + "learning_rate": 1.9641382000708972e-05, + "loss": 0.4697, + "step": 1753 + }, + { + "epoch": 0.34582018927444796, + "grad_norm": 0.6090095712411363, + "learning_rate": 1.9640970506911033e-05, + "loss": 0.4375, + "step": 1754 + }, + { + "epoch": 0.3460173501577287, + "grad_norm": 0.7015624149655699, + "learning_rate": 1.9640558781481015e-05, + "loss": 0.4219, + "step": 1755 + }, + { + "epoch": 0.3462145110410095, + "grad_norm": 0.7670637934340552, + "learning_rate": 1.9640146824428807e-05, + "loss": 0.4899, + "step": 1756 + }, + { + "epoch": 0.34641167192429023, + "grad_norm": 0.7363863325209535, + "learning_rate": 1.963973463576431e-05, + "loss": 0.4664, + "step": 1757 + }, + { + "epoch": 0.346608832807571, + "grad_norm": 0.6131339077035906, + "learning_rate": 1.9639322215497423e-05, + "loss": 0.4072, + "step": 1758 + }, + { + "epoch": 0.34680599369085174, + "grad_norm": 0.6707877768295646, + "learning_rate": 1.963890956363806e-05, + "loss": 0.4662, + "step": 1759 + }, + { + "epoch": 0.3470031545741325, + "grad_norm": 0.677317205988406, + "learning_rate": 1.9638496680196135e-05, + "loss": 0.4106, + "step": 1760 + }, + { + "epoch": 0.34720031545741326, + "grad_norm": 0.6492787706072033, + "learning_rate": 1.963808356518156e-05, + "loss": 0.4077, + "step": 1761 + }, + { + "epoch": 0.347397476340694, + "grad_norm": 0.7823981215932518, + "learning_rate": 1.9637670218604267e-05, + "loss": 0.4573, + "step": 1762 + }, + { + "epoch": 0.34759463722397477, + "grad_norm": 0.5953310114048408, + "learning_rate": 1.9637256640474187e-05, + "loss": 0.4539, + "step": 1763 + }, + { + "epoch": 0.3477917981072555, + "grad_norm": 0.759527295176903, + "learning_rate": 1.9636842830801255e-05, + "loss": 0.4354, + "step": 1764 + }, + { + "epoch": 0.3479889589905363, + "grad_norm": 0.8180906973487583, + "learning_rate": 1.9636428789595413e-05, + "loss": 0.4312, + "step": 1765 + }, + { + "epoch": 0.34818611987381703, + "grad_norm": 4.382079962402747, + "learning_rate": 1.963601451686661e-05, + "loss": 0.4406, + "step": 1766 + }, + { + "epoch": 0.3483832807570978, + "grad_norm": 0.7560610305482834, + "learning_rate": 1.9635600012624798e-05, + "loss": 0.4313, + "step": 1767 + }, + { + "epoch": 0.34858044164037855, + "grad_norm": 0.7418288958717549, + "learning_rate": 1.9635185276879936e-05, + "loss": 0.4793, + "step": 1768 + }, + { + "epoch": 0.3487776025236593, + "grad_norm": 1.072851573671909, + "learning_rate": 1.963477030964199e-05, + "loss": 0.485, + "step": 1769 + }, + { + "epoch": 0.34897476340694006, + "grad_norm": 0.7920038346678471, + "learning_rate": 1.963435511092093e-05, + "loss": 0.4696, + "step": 1770 + }, + { + "epoch": 0.3491719242902208, + "grad_norm": 0.7093452794943754, + "learning_rate": 1.9633939680726724e-05, + "loss": 0.4727, + "step": 1771 + }, + { + "epoch": 0.34936908517350157, + "grad_norm": 0.8112163451470932, + "learning_rate": 1.9633524019069365e-05, + "loss": 0.5013, + "step": 1772 + }, + { + "epoch": 0.3495662460567823, + "grad_norm": 3.917584519790698, + "learning_rate": 1.963310812595883e-05, + "loss": 0.5098, + "step": 1773 + }, + { + "epoch": 0.3497634069400631, + "grad_norm": 1.000199323692884, + "learning_rate": 1.9632692001405113e-05, + "loss": 0.4594, + "step": 1774 + }, + { + "epoch": 0.34996056782334384, + "grad_norm": 0.8133085348010322, + "learning_rate": 1.9632275645418218e-05, + "loss": 0.4881, + "step": 1775 + }, + { + "epoch": 0.3501577287066246, + "grad_norm": 0.9472822270595948, + "learning_rate": 1.963185905800814e-05, + "loss": 0.4973, + "step": 1776 + }, + { + "epoch": 0.35035488958990535, + "grad_norm": 0.9453081319239314, + "learning_rate": 1.9631442239184894e-05, + "loss": 0.4266, + "step": 1777 + }, + { + "epoch": 0.3505520504731861, + "grad_norm": 0.7602206983614878, + "learning_rate": 1.9631025188958492e-05, + "loss": 0.4783, + "step": 1778 + }, + { + "epoch": 0.35074921135646686, + "grad_norm": 0.8604246184240586, + "learning_rate": 1.9630607907338953e-05, + "loss": 0.4293, + "step": 1779 + }, + { + "epoch": 0.3509463722397476, + "grad_norm": 0.7616354505596147, + "learning_rate": 1.9630190394336304e-05, + "loss": 0.5098, + "step": 1780 + }, + { + "epoch": 0.3511435331230284, + "grad_norm": 0.819713708870105, + "learning_rate": 1.9629772649960574e-05, + "loss": 0.4656, + "step": 1781 + }, + { + "epoch": 0.35134069400630913, + "grad_norm": 0.6981497101426539, + "learning_rate": 1.9629354674221803e-05, + "loss": 0.4555, + "step": 1782 + }, + { + "epoch": 0.3515378548895899, + "grad_norm": 0.8239076401594878, + "learning_rate": 1.962893646713003e-05, + "loss": 0.4619, + "step": 1783 + }, + { + "epoch": 0.35173501577287064, + "grad_norm": 0.6349622015636447, + "learning_rate": 1.9628518028695307e-05, + "loss": 0.4508, + "step": 1784 + }, + { + "epoch": 0.3519321766561514, + "grad_norm": 0.7787614675612314, + "learning_rate": 1.9628099358927684e-05, + "loss": 0.4399, + "step": 1785 + }, + { + "epoch": 0.35212933753943215, + "grad_norm": 0.6213692400849354, + "learning_rate": 1.962768045783722e-05, + "loss": 0.4302, + "step": 1786 + }, + { + "epoch": 0.3523264984227129, + "grad_norm": 0.8089918611577429, + "learning_rate": 1.9627261325433976e-05, + "loss": 0.4447, + "step": 1787 + }, + { + "epoch": 0.35252365930599366, + "grad_norm": 0.8637850568343813, + "learning_rate": 1.962684196172803e-05, + "loss": 0.4683, + "step": 1788 + }, + { + "epoch": 0.3527208201892745, + "grad_norm": 0.7190171498413642, + "learning_rate": 1.9626422366729453e-05, + "loss": 0.4591, + "step": 1789 + }, + { + "epoch": 0.35291798107255523, + "grad_norm": 0.7746352347399187, + "learning_rate": 1.9626002540448325e-05, + "loss": 0.4545, + "step": 1790 + }, + { + "epoch": 0.353115141955836, + "grad_norm": 0.7705463288801266, + "learning_rate": 1.9625582482894735e-05, + "loss": 0.4505, + "step": 1791 + }, + { + "epoch": 0.35331230283911674, + "grad_norm": 0.6701387506672304, + "learning_rate": 1.9625162194078775e-05, + "loss": 0.4823, + "step": 1792 + }, + { + "epoch": 0.3535094637223975, + "grad_norm": 0.7068914753075993, + "learning_rate": 1.9624741674010544e-05, + "loss": 0.4654, + "step": 1793 + }, + { + "epoch": 0.35370662460567825, + "grad_norm": 0.6064047478157758, + "learning_rate": 1.9624320922700138e-05, + "loss": 0.4057, + "step": 1794 + }, + { + "epoch": 0.353903785488959, + "grad_norm": 0.6898360183497596, + "learning_rate": 1.9623899940157675e-05, + "loss": 0.4221, + "step": 1795 + }, + { + "epoch": 0.35410094637223977, + "grad_norm": 0.6629057087304656, + "learning_rate": 1.9623478726393266e-05, + "loss": 0.5155, + "step": 1796 + }, + { + "epoch": 0.3542981072555205, + "grad_norm": 0.6794292374990497, + "learning_rate": 1.9623057281417028e-05, + "loss": 0.4402, + "step": 1797 + }, + { + "epoch": 0.3544952681388013, + "grad_norm": 0.6122789190306797, + "learning_rate": 1.9622635605239095e-05, + "loss": 0.4431, + "step": 1798 + }, + { + "epoch": 0.35469242902208203, + "grad_norm": 0.6459796432517372, + "learning_rate": 1.9622213697869587e-05, + "loss": 0.4388, + "step": 1799 + }, + { + "epoch": 0.3548895899053628, + "grad_norm": 0.6623695426508792, + "learning_rate": 1.9621791559318648e-05, + "loss": 0.4812, + "step": 1800 + }, + { + "epoch": 0.35508675078864355, + "grad_norm": 0.6324064996658313, + "learning_rate": 1.962136918959642e-05, + "loss": 0.4675, + "step": 1801 + }, + { + "epoch": 0.3552839116719243, + "grad_norm": 0.5963028662353506, + "learning_rate": 1.9620946588713048e-05, + "loss": 0.4412, + "step": 1802 + }, + { + "epoch": 0.35548107255520506, + "grad_norm": 0.6744345317892008, + "learning_rate": 1.9620523756678685e-05, + "loss": 0.4177, + "step": 1803 + }, + { + "epoch": 0.3556782334384858, + "grad_norm": 0.645630963745389, + "learning_rate": 1.9620100693503494e-05, + "loss": 0.4511, + "step": 1804 + }, + { + "epoch": 0.35587539432176657, + "grad_norm": 0.6672682741456414, + "learning_rate": 1.9619677399197634e-05, + "loss": 0.4382, + "step": 1805 + }, + { + "epoch": 0.3560725552050473, + "grad_norm": 0.7740606862475723, + "learning_rate": 1.961925387377128e-05, + "loss": 0.4462, + "step": 1806 + }, + { + "epoch": 0.3562697160883281, + "grad_norm": 0.6497305621170518, + "learning_rate": 1.9618830117234603e-05, + "loss": 0.4725, + "step": 1807 + }, + { + "epoch": 0.35646687697160884, + "grad_norm": 0.640815305534194, + "learning_rate": 1.9618406129597787e-05, + "loss": 0.4651, + "step": 1808 + }, + { + "epoch": 0.3566640378548896, + "grad_norm": 0.6155816175110397, + "learning_rate": 1.961798191087102e-05, + "loss": 0.4281, + "step": 1809 + }, + { + "epoch": 0.35686119873817035, + "grad_norm": 0.633839320877963, + "learning_rate": 1.9617557461064495e-05, + "loss": 0.4938, + "step": 1810 + }, + { + "epoch": 0.3570583596214511, + "grad_norm": 0.635922394462134, + "learning_rate": 1.96171327801884e-05, + "loss": 0.4393, + "step": 1811 + }, + { + "epoch": 0.35725552050473186, + "grad_norm": 0.6410247655705709, + "learning_rate": 1.961670786825295e-05, + "loss": 0.4629, + "step": 1812 + }, + { + "epoch": 0.3574526813880126, + "grad_norm": 1.9279852657649488, + "learning_rate": 1.9616282725268347e-05, + "loss": 0.4694, + "step": 1813 + }, + { + "epoch": 0.35764984227129337, + "grad_norm": 0.6710119594137577, + "learning_rate": 1.961585735124481e-05, + "loss": 0.4594, + "step": 1814 + }, + { + "epoch": 0.35784700315457413, + "grad_norm": 0.7397595757858347, + "learning_rate": 1.9615431746192553e-05, + "loss": 0.4534, + "step": 1815 + }, + { + "epoch": 0.3580441640378549, + "grad_norm": 0.8225787141793098, + "learning_rate": 1.9615005910121806e-05, + "loss": 0.4832, + "step": 1816 + }, + { + "epoch": 0.35824132492113564, + "grad_norm": 0.6983577011027828, + "learning_rate": 1.96145798430428e-05, + "loss": 0.4377, + "step": 1817 + }, + { + "epoch": 0.3584384858044164, + "grad_norm": 0.7361587716765654, + "learning_rate": 1.9614153544965773e-05, + "loss": 0.4893, + "step": 1818 + }, + { + "epoch": 0.35863564668769715, + "grad_norm": 0.6203265764684508, + "learning_rate": 1.9613727015900962e-05, + "loss": 0.3938, + "step": 1819 + }, + { + "epoch": 0.3588328075709779, + "grad_norm": 0.6980283936814929, + "learning_rate": 1.9613300255858615e-05, + "loss": 0.4953, + "step": 1820 + }, + { + "epoch": 0.35902996845425866, + "grad_norm": 0.7026401510766678, + "learning_rate": 1.9612873264848994e-05, + "loss": 0.4594, + "step": 1821 + }, + { + "epoch": 0.3592271293375394, + "grad_norm": 0.6405607709544402, + "learning_rate": 1.9612446042882345e-05, + "loss": 0.4146, + "step": 1822 + }, + { + "epoch": 0.3594242902208202, + "grad_norm": 0.8137624725348499, + "learning_rate": 1.961201858996894e-05, + "loss": 0.4603, + "step": 1823 + }, + { + "epoch": 0.35962145110410093, + "grad_norm": 0.6238197035171101, + "learning_rate": 1.9611590906119055e-05, + "loss": 0.4301, + "step": 1824 + }, + { + "epoch": 0.3598186119873817, + "grad_norm": 0.7765705822707861, + "learning_rate": 1.9611162991342952e-05, + "loss": 0.4622, + "step": 1825 + }, + { + "epoch": 0.36001577287066244, + "grad_norm": 0.8129569752866966, + "learning_rate": 1.961073484565092e-05, + "loss": 0.4634, + "step": 1826 + }, + { + "epoch": 0.3602129337539432, + "grad_norm": 0.7674729263580945, + "learning_rate": 1.9610306469053243e-05, + "loss": 0.4797, + "step": 1827 + }, + { + "epoch": 0.36041009463722395, + "grad_norm": 0.6433782160365153, + "learning_rate": 1.9609877861560213e-05, + "loss": 0.4427, + "step": 1828 + }, + { + "epoch": 0.3606072555205047, + "grad_norm": 0.8189751748622848, + "learning_rate": 1.9609449023182133e-05, + "loss": 0.4925, + "step": 1829 + }, + { + "epoch": 0.36080441640378547, + "grad_norm": 0.6603921192162837, + "learning_rate": 1.9609019953929298e-05, + "loss": 0.4328, + "step": 1830 + }, + { + "epoch": 0.3610015772870662, + "grad_norm": 0.7020104873363077, + "learning_rate": 1.960859065381202e-05, + "loss": 0.4532, + "step": 1831 + }, + { + "epoch": 0.361198738170347, + "grad_norm": 0.6514968871084654, + "learning_rate": 1.9608161122840614e-05, + "loss": 0.4262, + "step": 1832 + }, + { + "epoch": 0.36139589905362773, + "grad_norm": 1.3363500449543768, + "learning_rate": 1.9607731361025402e-05, + "loss": 0.4738, + "step": 1833 + }, + { + "epoch": 0.3615930599369085, + "grad_norm": 0.6353691592363815, + "learning_rate": 1.9607301368376706e-05, + "loss": 0.4918, + "step": 1834 + }, + { + "epoch": 0.3617902208201893, + "grad_norm": 0.6784040176469287, + "learning_rate": 1.9606871144904855e-05, + "loss": 0.4827, + "step": 1835 + }, + { + "epoch": 0.36198738170347006, + "grad_norm": 0.6264342538936625, + "learning_rate": 1.960644069062019e-05, + "loss": 0.4172, + "step": 1836 + }, + { + "epoch": 0.3621845425867508, + "grad_norm": 3.5120658505114277, + "learning_rate": 1.9606010005533055e-05, + "loss": 0.4948, + "step": 1837 + }, + { + "epoch": 0.36238170347003157, + "grad_norm": 0.9363587178765608, + "learning_rate": 1.960557908965379e-05, + "loss": 0.468, + "step": 1838 + }, + { + "epoch": 0.3625788643533123, + "grad_norm": 0.6948665441777858, + "learning_rate": 1.9605147942992752e-05, + "loss": 0.4493, + "step": 1839 + }, + { + "epoch": 0.3627760252365931, + "grad_norm": 0.7506323131912949, + "learning_rate": 1.9604716565560303e-05, + "loss": 0.4361, + "step": 1840 + }, + { + "epoch": 0.36297318611987384, + "grad_norm": 0.6646663396446123, + "learning_rate": 1.96042849573668e-05, + "loss": 0.4565, + "step": 1841 + }, + { + "epoch": 0.3631703470031546, + "grad_norm": 0.7021081874099678, + "learning_rate": 1.9603853118422618e-05, + "loss": 0.4791, + "step": 1842 + }, + { + "epoch": 0.36336750788643535, + "grad_norm": 0.7405667369095288, + "learning_rate": 1.960342104873813e-05, + "loss": 0.4507, + "step": 1843 + }, + { + "epoch": 0.3635646687697161, + "grad_norm": 0.8937542644763167, + "learning_rate": 1.9602988748323718e-05, + "loss": 0.4739, + "step": 1844 + }, + { + "epoch": 0.36376182965299686, + "grad_norm": 0.647962965489675, + "learning_rate": 1.960255621718977e-05, + "loss": 0.4505, + "step": 1845 + }, + { + "epoch": 0.3639589905362776, + "grad_norm": 0.6679476483113629, + "learning_rate": 1.9602123455346677e-05, + "loss": 0.4594, + "step": 1846 + }, + { + "epoch": 0.36415615141955837, + "grad_norm": 0.6248714424291023, + "learning_rate": 1.960169046280483e-05, + "loss": 0.4452, + "step": 1847 + }, + { + "epoch": 0.3643533123028391, + "grad_norm": 0.5629821941055176, + "learning_rate": 1.960125723957464e-05, + "loss": 0.4021, + "step": 1848 + }, + { + "epoch": 0.3645504731861199, + "grad_norm": 0.6196696074939825, + "learning_rate": 1.9600823785666515e-05, + "loss": 0.4365, + "step": 1849 + }, + { + "epoch": 0.36474763406940064, + "grad_norm": 0.6115029014398423, + "learning_rate": 1.9600390101090867e-05, + "loss": 0.4279, + "step": 1850 + }, + { + "epoch": 0.3649447949526814, + "grad_norm": 0.6120747583019098, + "learning_rate": 1.9599956185858112e-05, + "loss": 0.4548, + "step": 1851 + }, + { + "epoch": 0.36514195583596215, + "grad_norm": 0.7496535911475231, + "learning_rate": 1.959952203997868e-05, + "loss": 0.4646, + "step": 1852 + }, + { + "epoch": 0.3653391167192429, + "grad_norm": 0.6382300755564924, + "learning_rate": 1.9599087663463003e-05, + "loss": 0.4355, + "step": 1853 + }, + { + "epoch": 0.36553627760252366, + "grad_norm": 0.6602319601851913, + "learning_rate": 1.9598653056321512e-05, + "loss": 0.4632, + "step": 1854 + }, + { + "epoch": 0.3657334384858044, + "grad_norm": 0.6191066463069663, + "learning_rate": 1.9598218218564656e-05, + "loss": 0.4238, + "step": 1855 + }, + { + "epoch": 0.3659305993690852, + "grad_norm": 0.6072202775912791, + "learning_rate": 1.9597783150202873e-05, + "loss": 0.4683, + "step": 1856 + }, + { + "epoch": 0.36612776025236593, + "grad_norm": 0.5689905034550738, + "learning_rate": 1.9597347851246623e-05, + "loss": 0.4102, + "step": 1857 + }, + { + "epoch": 0.3663249211356467, + "grad_norm": 0.6023153592091132, + "learning_rate": 1.959691232170636e-05, + "loss": 0.4748, + "step": 1858 + }, + { + "epoch": 0.36652208201892744, + "grad_norm": 0.6035103505300697, + "learning_rate": 1.9596476561592553e-05, + "loss": 0.429, + "step": 1859 + }, + { + "epoch": 0.3667192429022082, + "grad_norm": 0.6246219996358824, + "learning_rate": 1.9596040570915666e-05, + "loss": 0.4179, + "step": 1860 + }, + { + "epoch": 0.36691640378548895, + "grad_norm": 0.6764471802280777, + "learning_rate": 1.959560434968618e-05, + "loss": 0.5002, + "step": 1861 + }, + { + "epoch": 0.3671135646687697, + "grad_norm": 0.6218434897962343, + "learning_rate": 1.959516789791457e-05, + "loss": 0.4436, + "step": 1862 + }, + { + "epoch": 0.36731072555205047, + "grad_norm": 0.6206678178233864, + "learning_rate": 1.959473121561132e-05, + "loss": 0.4553, + "step": 1863 + }, + { + "epoch": 0.3675078864353312, + "grad_norm": 0.5901550517207009, + "learning_rate": 1.9594294302786933e-05, + "loss": 0.4377, + "step": 1864 + }, + { + "epoch": 0.367705047318612, + "grad_norm": 0.6112016781481284, + "learning_rate": 1.9593857159451897e-05, + "loss": 0.4622, + "step": 1865 + }, + { + "epoch": 0.36790220820189273, + "grad_norm": 0.6154064090788753, + "learning_rate": 1.9593419785616716e-05, + "loss": 0.4587, + "step": 1866 + }, + { + "epoch": 0.3680993690851735, + "grad_norm": 0.6350370151618049, + "learning_rate": 1.95929821812919e-05, + "loss": 0.4683, + "step": 1867 + }, + { + "epoch": 0.36829652996845424, + "grad_norm": 1.6840479608127483, + "learning_rate": 1.9592544346487958e-05, + "loss": 0.4761, + "step": 1868 + }, + { + "epoch": 0.368493690851735, + "grad_norm": 0.586315544529278, + "learning_rate": 1.9592106281215418e-05, + "loss": 0.4171, + "step": 1869 + }, + { + "epoch": 0.36869085173501576, + "grad_norm": 0.6098943093456206, + "learning_rate": 1.95916679854848e-05, + "loss": 0.4483, + "step": 1870 + }, + { + "epoch": 0.3688880126182965, + "grad_norm": 0.6030006559111836, + "learning_rate": 1.959122945930663e-05, + "loss": 0.4379, + "step": 1871 + }, + { + "epoch": 0.36908517350157727, + "grad_norm": 0.6788148490316216, + "learning_rate": 1.9590790702691453e-05, + "loss": 0.4691, + "step": 1872 + }, + { + "epoch": 0.369282334384858, + "grad_norm": 0.6629668903573882, + "learning_rate": 1.9590351715649803e-05, + "loss": 0.4551, + "step": 1873 + }, + { + "epoch": 0.3694794952681388, + "grad_norm": 0.7978043884977309, + "learning_rate": 1.9589912498192233e-05, + "loss": 0.4405, + "step": 1874 + }, + { + "epoch": 0.36967665615141954, + "grad_norm": 0.7049104610666904, + "learning_rate": 1.958947305032929e-05, + "loss": 0.4673, + "step": 1875 + }, + { + "epoch": 0.3698738170347003, + "grad_norm": 0.6949512497820435, + "learning_rate": 1.9589033372071537e-05, + "loss": 0.4147, + "step": 1876 + }, + { + "epoch": 0.37007097791798105, + "grad_norm": 0.7972703080682879, + "learning_rate": 1.9588593463429532e-05, + "loss": 0.4636, + "step": 1877 + }, + { + "epoch": 0.3702681388012618, + "grad_norm": 0.6733154115167292, + "learning_rate": 1.958815332441385e-05, + "loss": 0.4556, + "step": 1878 + }, + { + "epoch": 0.37046529968454256, + "grad_norm": 0.633932691284626, + "learning_rate": 1.9587712955035064e-05, + "loss": 0.457, + "step": 1879 + }, + { + "epoch": 0.37066246056782337, + "grad_norm": 0.6626177934715735, + "learning_rate": 1.958727235530375e-05, + "loss": 0.4682, + "step": 1880 + }, + { + "epoch": 0.3708596214511041, + "grad_norm": 0.6359733697611749, + "learning_rate": 1.9586831525230496e-05, + "loss": 0.4482, + "step": 1881 + }, + { + "epoch": 0.3710567823343849, + "grad_norm": 0.8674782473950938, + "learning_rate": 1.9586390464825896e-05, + "loss": 0.447, + "step": 1882 + }, + { + "epoch": 0.37125394321766564, + "grad_norm": 0.7212842297017934, + "learning_rate": 1.958594917410055e-05, + "loss": 0.4928, + "step": 1883 + }, + { + "epoch": 0.3714511041009464, + "grad_norm": 0.5994454432815358, + "learning_rate": 1.958550765306505e-05, + "loss": 0.4354, + "step": 1884 + }, + { + "epoch": 0.37164826498422715, + "grad_norm": 0.5881488736842267, + "learning_rate": 1.9585065901730013e-05, + "loss": 0.4114, + "step": 1885 + }, + { + "epoch": 0.3718454258675079, + "grad_norm": 0.5729776216743723, + "learning_rate": 1.9584623920106044e-05, + "loss": 0.3996, + "step": 1886 + }, + { + "epoch": 0.37204258675078866, + "grad_norm": 0.6292024150236367, + "learning_rate": 1.9584181708203772e-05, + "loss": 0.4452, + "step": 1887 + }, + { + "epoch": 0.3722397476340694, + "grad_norm": 0.6434967873079545, + "learning_rate": 1.958373926603381e-05, + "loss": 0.4864, + "step": 1888 + }, + { + "epoch": 0.3724369085173502, + "grad_norm": 0.6189683570278633, + "learning_rate": 1.95832965936068e-05, + "loss": 0.4288, + "step": 1889 + }, + { + "epoch": 0.37263406940063093, + "grad_norm": 0.6925481427276183, + "learning_rate": 1.958285369093337e-05, + "loss": 0.4982, + "step": 1890 + }, + { + "epoch": 0.3728312302839117, + "grad_norm": 0.5954823776943323, + "learning_rate": 1.9582410558024162e-05, + "loss": 0.4159, + "step": 1891 + }, + { + "epoch": 0.37302839116719244, + "grad_norm": 0.6671992939469052, + "learning_rate": 1.9581967194889826e-05, + "loss": 0.4524, + "step": 1892 + }, + { + "epoch": 0.3732255520504732, + "grad_norm": 0.6322337439769842, + "learning_rate": 1.9581523601541012e-05, + "loss": 0.4909, + "step": 1893 + }, + { + "epoch": 0.37342271293375395, + "grad_norm": 0.6712642730299693, + "learning_rate": 1.9581079777988375e-05, + "loss": 0.4483, + "step": 1894 + }, + { + "epoch": 0.3736198738170347, + "grad_norm": 0.6673822790768886, + "learning_rate": 1.958063572424258e-05, + "loss": 0.4431, + "step": 1895 + }, + { + "epoch": 0.37381703470031546, + "grad_norm": 2.28916156009429, + "learning_rate": 1.9580191440314304e-05, + "loss": 0.5002, + "step": 1896 + }, + { + "epoch": 0.3740141955835962, + "grad_norm": 0.6439890915303407, + "learning_rate": 1.9579746926214205e-05, + "loss": 0.4325, + "step": 1897 + }, + { + "epoch": 0.374211356466877, + "grad_norm": 0.7309784046021603, + "learning_rate": 1.9579302181952977e-05, + "loss": 0.4615, + "step": 1898 + }, + { + "epoch": 0.37440851735015773, + "grad_norm": 0.5998628834963219, + "learning_rate": 1.9578857207541296e-05, + "loss": 0.4563, + "step": 1899 + }, + { + "epoch": 0.3746056782334385, + "grad_norm": 0.6925977755644835, + "learning_rate": 1.957841200298986e-05, + "loss": 0.4705, + "step": 1900 + }, + { + "epoch": 0.37480283911671924, + "grad_norm": 0.7297338025869237, + "learning_rate": 1.9577966568309358e-05, + "loss": 0.4749, + "step": 1901 + }, + { + "epoch": 0.375, + "grad_norm": 0.7547412508318946, + "learning_rate": 1.9577520903510497e-05, + "loss": 0.4826, + "step": 1902 + }, + { + "epoch": 0.37519716088328076, + "grad_norm": 0.6220978689967857, + "learning_rate": 1.957707500860399e-05, + "loss": 0.4348, + "step": 1903 + }, + { + "epoch": 0.3753943217665615, + "grad_norm": 0.6472282954783044, + "learning_rate": 1.9576628883600533e-05, + "loss": 0.4102, + "step": 1904 + }, + { + "epoch": 0.37559148264984227, + "grad_norm": 0.6841423764886676, + "learning_rate": 1.9576182528510864e-05, + "loss": 0.4776, + "step": 1905 + }, + { + "epoch": 0.375788643533123, + "grad_norm": 0.5900859903843134, + "learning_rate": 1.957573594334569e-05, + "loss": 0.4334, + "step": 1906 + }, + { + "epoch": 0.3759858044164038, + "grad_norm": 0.6428081199582281, + "learning_rate": 1.9575289128115758e-05, + "loss": 0.4459, + "step": 1907 + }, + { + "epoch": 0.37618296529968454, + "grad_norm": 0.6123822425477046, + "learning_rate": 1.9574842082831788e-05, + "loss": 0.4674, + "step": 1908 + }, + { + "epoch": 0.3763801261829653, + "grad_norm": 0.7460401880955564, + "learning_rate": 1.957439480750453e-05, + "loss": 0.4364, + "step": 1909 + }, + { + "epoch": 0.37657728706624605, + "grad_norm": 0.6613935489626954, + "learning_rate": 1.957394730214472e-05, + "loss": 0.4688, + "step": 1910 + }, + { + "epoch": 0.3767744479495268, + "grad_norm": 1.1706846880413952, + "learning_rate": 1.9573499566763124e-05, + "loss": 0.4627, + "step": 1911 + }, + { + "epoch": 0.37697160883280756, + "grad_norm": 1.4249063300767362, + "learning_rate": 1.9573051601370485e-05, + "loss": 0.4546, + "step": 1912 + }, + { + "epoch": 0.3771687697160883, + "grad_norm": 0.7269862868607329, + "learning_rate": 1.9572603405977573e-05, + "loss": 0.4929, + "step": 1913 + }, + { + "epoch": 0.37736593059936907, + "grad_norm": 0.7060411924408715, + "learning_rate": 1.957215498059516e-05, + "loss": 0.4406, + "step": 1914 + }, + { + "epoch": 0.3775630914826498, + "grad_norm": 0.6494093122802824, + "learning_rate": 1.957170632523401e-05, + "loss": 0.4622, + "step": 1915 + }, + { + "epoch": 0.3777602523659306, + "grad_norm": 0.6622590367838171, + "learning_rate": 1.957125743990491e-05, + "loss": 0.5108, + "step": 1916 + }, + { + "epoch": 0.37795741324921134, + "grad_norm": 0.6744814131396353, + "learning_rate": 1.9570808324618646e-05, + "loss": 0.4583, + "step": 1917 + }, + { + "epoch": 0.3781545741324921, + "grad_norm": 3.197758463117182, + "learning_rate": 1.9570358979386e-05, + "loss": 0.4479, + "step": 1918 + }, + { + "epoch": 0.37835173501577285, + "grad_norm": 0.6725127235656607, + "learning_rate": 1.956990940421777e-05, + "loss": 0.4218, + "step": 1919 + }, + { + "epoch": 0.3785488958990536, + "grad_norm": 0.6774677552967868, + "learning_rate": 1.9569459599124765e-05, + "loss": 0.4654, + "step": 1920 + }, + { + "epoch": 0.37874605678233436, + "grad_norm": 2.1023918912871715, + "learning_rate": 1.9569009564117783e-05, + "loss": 0.4601, + "step": 1921 + }, + { + "epoch": 0.3789432176656151, + "grad_norm": 0.8729529815113425, + "learning_rate": 1.9568559299207645e-05, + "loss": 0.4609, + "step": 1922 + }, + { + "epoch": 0.3791403785488959, + "grad_norm": 0.6586715530887697, + "learning_rate": 1.9568108804405162e-05, + "loss": 0.454, + "step": 1923 + }, + { + "epoch": 0.37933753943217663, + "grad_norm": 0.6950591232290906, + "learning_rate": 1.956765807972116e-05, + "loss": 0.4587, + "step": 1924 + }, + { + "epoch": 0.37953470031545744, + "grad_norm": 0.5859553428507086, + "learning_rate": 1.9567207125166466e-05, + "loss": 0.4559, + "step": 1925 + }, + { + "epoch": 0.3797318611987382, + "grad_norm": 0.654061925363833, + "learning_rate": 1.9566755940751916e-05, + "loss": 0.4539, + "step": 1926 + }, + { + "epoch": 0.37992902208201895, + "grad_norm": 0.7394858768473541, + "learning_rate": 1.956630452648835e-05, + "loss": 0.4135, + "step": 1927 + }, + { + "epoch": 0.3801261829652997, + "grad_norm": 0.7096797120999595, + "learning_rate": 1.956585288238662e-05, + "loss": 0.4421, + "step": 1928 + }, + { + "epoch": 0.38032334384858046, + "grad_norm": 0.8603308022382222, + "learning_rate": 1.9565401008457567e-05, + "loss": 0.4694, + "step": 1929 + }, + { + "epoch": 0.3805205047318612, + "grad_norm": 0.6288376088275459, + "learning_rate": 1.956494890471205e-05, + "loss": 0.4178, + "step": 1930 + }, + { + "epoch": 0.380717665615142, + "grad_norm": 0.6656485291036316, + "learning_rate": 1.9564496571160935e-05, + "loss": 0.4571, + "step": 1931 + }, + { + "epoch": 0.38091482649842273, + "grad_norm": 0.6168171412436806, + "learning_rate": 1.9564044007815087e-05, + "loss": 0.4142, + "step": 1932 + }, + { + "epoch": 0.3811119873817035, + "grad_norm": 0.7052282174762466, + "learning_rate": 1.956359121468538e-05, + "loss": 0.4445, + "step": 1933 + }, + { + "epoch": 0.38130914826498424, + "grad_norm": 0.7120124227298402, + "learning_rate": 1.9563138191782692e-05, + "loss": 0.4557, + "step": 1934 + }, + { + "epoch": 0.381506309148265, + "grad_norm": 0.6317595911299724, + "learning_rate": 1.956268493911791e-05, + "loss": 0.4587, + "step": 1935 + }, + { + "epoch": 0.38170347003154576, + "grad_norm": 0.6798856792123635, + "learning_rate": 1.9562231456701922e-05, + "loss": 0.4612, + "step": 1936 + }, + { + "epoch": 0.3819006309148265, + "grad_norm": 1.1002750501181588, + "learning_rate": 1.9561777744545616e-05, + "loss": 0.4516, + "step": 1937 + }, + { + "epoch": 0.38209779179810727, + "grad_norm": 0.7801889450244034, + "learning_rate": 1.9561323802659908e-05, + "loss": 0.4882, + "step": 1938 + }, + { + "epoch": 0.382294952681388, + "grad_norm": 0.6312650051830359, + "learning_rate": 1.9560869631055693e-05, + "loss": 0.4616, + "step": 1939 + }, + { + "epoch": 0.3824921135646688, + "grad_norm": 0.802737212887057, + "learning_rate": 1.9560415229743885e-05, + "loss": 0.4417, + "step": 1940 + }, + { + "epoch": 0.38268927444794953, + "grad_norm": 0.7357740131308222, + "learning_rate": 1.9559960598735403e-05, + "loss": 0.4701, + "step": 1941 + }, + { + "epoch": 0.3828864353312303, + "grad_norm": 0.6284042286654438, + "learning_rate": 1.9559505738041167e-05, + "loss": 0.4459, + "step": 1942 + }, + { + "epoch": 0.38308359621451105, + "grad_norm": 0.6451867717261572, + "learning_rate": 1.955905064767211e-05, + "loss": 0.4818, + "step": 1943 + }, + { + "epoch": 0.3832807570977918, + "grad_norm": 0.9841866252584824, + "learning_rate": 1.955859532763916e-05, + "loss": 0.412, + "step": 1944 + }, + { + "epoch": 0.38347791798107256, + "grad_norm": 0.7692893239650029, + "learning_rate": 1.955813977795326e-05, + "loss": 0.4328, + "step": 1945 + }, + { + "epoch": 0.3836750788643533, + "grad_norm": 0.740511429503883, + "learning_rate": 1.955768399862536e-05, + "loss": 0.4491, + "step": 1946 + }, + { + "epoch": 0.38387223974763407, + "grad_norm": 0.6674798746464747, + "learning_rate": 1.95572279896664e-05, + "loss": 0.499, + "step": 1947 + }, + { + "epoch": 0.3840694006309148, + "grad_norm": 0.8324473358303699, + "learning_rate": 1.9556771751087343e-05, + "loss": 0.4653, + "step": 1948 + }, + { + "epoch": 0.3842665615141956, + "grad_norm": 0.7000802793104866, + "learning_rate": 1.955631528289915e-05, + "loss": 0.4386, + "step": 1949 + }, + { + "epoch": 0.38446372239747634, + "grad_norm": 0.7005116186670807, + "learning_rate": 1.9555858585112784e-05, + "loss": 0.4615, + "step": 1950 + }, + { + "epoch": 0.3846608832807571, + "grad_norm": 0.7023205716896679, + "learning_rate": 1.9555401657739222e-05, + "loss": 0.4507, + "step": 1951 + }, + { + "epoch": 0.38485804416403785, + "grad_norm": 0.6267525065077723, + "learning_rate": 1.9554944500789438e-05, + "loss": 0.4291, + "step": 1952 + }, + { + "epoch": 0.3850552050473186, + "grad_norm": 0.6719159244755133, + "learning_rate": 1.955448711427442e-05, + "loss": 0.4891, + "step": 1953 + }, + { + "epoch": 0.38525236593059936, + "grad_norm": 0.6908923759078575, + "learning_rate": 1.9554029498205154e-05, + "loss": 0.4753, + "step": 1954 + }, + { + "epoch": 0.3854495268138801, + "grad_norm": 0.6560268963325017, + "learning_rate": 1.9553571652592637e-05, + "loss": 0.4271, + "step": 1955 + }, + { + "epoch": 0.3856466876971609, + "grad_norm": 0.8242232954813055, + "learning_rate": 1.9553113577447866e-05, + "loss": 0.4337, + "step": 1956 + }, + { + "epoch": 0.38584384858044163, + "grad_norm": 0.5786503721027892, + "learning_rate": 1.9552655272781848e-05, + "loss": 0.4491, + "step": 1957 + }, + { + "epoch": 0.3860410094637224, + "grad_norm": 0.6178851746400039, + "learning_rate": 1.9552196738605596e-05, + "loss": 0.4676, + "step": 1958 + }, + { + "epoch": 0.38623817034700314, + "grad_norm": 0.6433261446448528, + "learning_rate": 1.9551737974930124e-05, + "loss": 0.4405, + "step": 1959 + }, + { + "epoch": 0.3864353312302839, + "grad_norm": 0.657702346180845, + "learning_rate": 1.9551278981766453e-05, + "loss": 0.4697, + "step": 1960 + }, + { + "epoch": 0.38663249211356465, + "grad_norm": 0.5617264106547595, + "learning_rate": 1.9550819759125613e-05, + "loss": 0.4263, + "step": 1961 + }, + { + "epoch": 0.3868296529968454, + "grad_norm": 0.6078848131857936, + "learning_rate": 1.955036030701864e-05, + "loss": 0.4693, + "step": 1962 + }, + { + "epoch": 0.38702681388012616, + "grad_norm": 0.6098840021530041, + "learning_rate": 1.954990062545657e-05, + "loss": 0.4535, + "step": 1963 + }, + { + "epoch": 0.3872239747634069, + "grad_norm": 0.6666148228118189, + "learning_rate": 1.9549440714450447e-05, + "loss": 0.4376, + "step": 1964 + }, + { + "epoch": 0.3874211356466877, + "grad_norm": 0.6007383801169028, + "learning_rate": 1.9548980574011315e-05, + "loss": 0.4488, + "step": 1965 + }, + { + "epoch": 0.38761829652996843, + "grad_norm": 1.1165576178172691, + "learning_rate": 1.954852020415024e-05, + "loss": 0.4622, + "step": 1966 + }, + { + "epoch": 0.3878154574132492, + "grad_norm": 0.6069174688430108, + "learning_rate": 1.9548059604878277e-05, + "loss": 0.414, + "step": 1967 + }, + { + "epoch": 0.38801261829652994, + "grad_norm": 0.6426555376380005, + "learning_rate": 1.9547598776206492e-05, + "loss": 0.4859, + "step": 1968 + }, + { + "epoch": 0.3882097791798107, + "grad_norm": 0.643323906059335, + "learning_rate": 1.954713771814596e-05, + "loss": 0.476, + "step": 1969 + }, + { + "epoch": 0.3884069400630915, + "grad_norm": 0.6214422402840049, + "learning_rate": 1.9546676430707758e-05, + "loss": 0.456, + "step": 1970 + }, + { + "epoch": 0.38860410094637227, + "grad_norm": 0.5933166205495725, + "learning_rate": 1.954621491390296e-05, + "loss": 0.4282, + "step": 1971 + }, + { + "epoch": 0.388801261829653, + "grad_norm": 0.6469075563555098, + "learning_rate": 1.9545753167742664e-05, + "loss": 0.4395, + "step": 1972 + }, + { + "epoch": 0.3889984227129338, + "grad_norm": 0.575746232416297, + "learning_rate": 1.9545291192237962e-05, + "loss": 0.4426, + "step": 1973 + }, + { + "epoch": 0.38919558359621453, + "grad_norm": 0.8936881259787512, + "learning_rate": 1.954482898739995e-05, + "loss": 0.4218, + "step": 1974 + }, + { + "epoch": 0.3893927444794953, + "grad_norm": 0.5907560428830476, + "learning_rate": 1.9544366553239738e-05, + "loss": 0.4408, + "step": 1975 + }, + { + "epoch": 0.38958990536277605, + "grad_norm": 0.6531849605246557, + "learning_rate": 1.9543903889768435e-05, + "loss": 0.4718, + "step": 1976 + }, + { + "epoch": 0.3897870662460568, + "grad_norm": 0.5847696666259158, + "learning_rate": 1.9543440996997152e-05, + "loss": 0.4284, + "step": 1977 + }, + { + "epoch": 0.38998422712933756, + "grad_norm": 0.6685285347892534, + "learning_rate": 1.9542977874937014e-05, + "loss": 0.3962, + "step": 1978 + }, + { + "epoch": 0.3901813880126183, + "grad_norm": 0.5988664018630944, + "learning_rate": 1.954251452359915e-05, + "loss": 0.4522, + "step": 1979 + }, + { + "epoch": 0.39037854889589907, + "grad_norm": 0.683915321776376, + "learning_rate": 1.9542050942994686e-05, + "loss": 0.4443, + "step": 1980 + }, + { + "epoch": 0.3905757097791798, + "grad_norm": 0.5965994237087414, + "learning_rate": 1.9541587133134766e-05, + "loss": 0.4836, + "step": 1981 + }, + { + "epoch": 0.3907728706624606, + "grad_norm": 0.6874247628508368, + "learning_rate": 1.9541123094030528e-05, + "loss": 0.4478, + "step": 1982 + }, + { + "epoch": 0.39097003154574134, + "grad_norm": 0.6164585865243617, + "learning_rate": 1.954065882569313e-05, + "loss": 0.4458, + "step": 1983 + }, + { + "epoch": 0.3911671924290221, + "grad_norm": 0.6370965104931681, + "learning_rate": 1.954019432813372e-05, + "loss": 0.421, + "step": 1984 + }, + { + "epoch": 0.39136435331230285, + "grad_norm": 0.6849623127670889, + "learning_rate": 1.9539729601363456e-05, + "loss": 0.4799, + "step": 1985 + }, + { + "epoch": 0.3915615141955836, + "grad_norm": 0.6885520124409992, + "learning_rate": 1.9539264645393508e-05, + "loss": 0.4901, + "step": 1986 + }, + { + "epoch": 0.39175867507886436, + "grad_norm": 0.6261317061235863, + "learning_rate": 1.9538799460235044e-05, + "loss": 0.4266, + "step": 1987 + }, + { + "epoch": 0.3919558359621451, + "grad_norm": 0.6441025202938847, + "learning_rate": 1.953833404589924e-05, + "loss": 0.4428, + "step": 1988 + }, + { + "epoch": 0.39215299684542587, + "grad_norm": 0.7945693849241009, + "learning_rate": 1.953786840239728e-05, + "loss": 0.4498, + "step": 1989 + }, + { + "epoch": 0.39235015772870663, + "grad_norm": 0.6236426936646374, + "learning_rate": 1.953740252974035e-05, + "loss": 0.4433, + "step": 1990 + }, + { + "epoch": 0.3925473186119874, + "grad_norm": 0.7354383225024252, + "learning_rate": 1.9536936427939647e-05, + "loss": 0.4586, + "step": 1991 + }, + { + "epoch": 0.39274447949526814, + "grad_norm": 0.7831532504579899, + "learning_rate": 1.9536470097006363e-05, + "loss": 0.4887, + "step": 1992 + }, + { + "epoch": 0.3929416403785489, + "grad_norm": 0.6872639246256627, + "learning_rate": 1.9536003536951708e-05, + "loss": 0.4405, + "step": 1993 + }, + { + "epoch": 0.39313880126182965, + "grad_norm": 0.6736696437372744, + "learning_rate": 1.9535536747786884e-05, + "loss": 0.441, + "step": 1994 + }, + { + "epoch": 0.3933359621451104, + "grad_norm": 0.7082789400644977, + "learning_rate": 1.953506972952312e-05, + "loss": 0.4596, + "step": 1995 + }, + { + "epoch": 0.39353312302839116, + "grad_norm": 0.7181262186981547, + "learning_rate": 1.9534602482171618e-05, + "loss": 0.4626, + "step": 1996 + }, + { + "epoch": 0.3937302839116719, + "grad_norm": 0.7063551366210826, + "learning_rate": 1.9534135005743614e-05, + "loss": 0.4472, + "step": 1997 + }, + { + "epoch": 0.3939274447949527, + "grad_norm": 0.7562123798608619, + "learning_rate": 1.9533667300250343e-05, + "loss": 0.4832, + "step": 1998 + }, + { + "epoch": 0.39412460567823343, + "grad_norm": 0.6940975477174752, + "learning_rate": 1.9533199365703035e-05, + "loss": 0.4811, + "step": 1999 + }, + { + "epoch": 0.3943217665615142, + "grad_norm": 0.6260579790621669, + "learning_rate": 1.9532731202112935e-05, + "loss": 0.4616, + "step": 2000 + }, + { + "epoch": 0.39451892744479494, + "grad_norm": 0.6557298785672231, + "learning_rate": 1.9532262809491294e-05, + "loss": 0.4785, + "step": 2001 + }, + { + "epoch": 0.3947160883280757, + "grad_norm": 0.6533941335015038, + "learning_rate": 1.953179418784936e-05, + "loss": 0.4448, + "step": 2002 + }, + { + "epoch": 0.39491324921135645, + "grad_norm": 0.6709234409900684, + "learning_rate": 1.9531325337198394e-05, + "loss": 0.4481, + "step": 2003 + }, + { + "epoch": 0.3951104100946372, + "grad_norm": 1.528459654964046, + "learning_rate": 1.9530856257549664e-05, + "loss": 0.4861, + "step": 2004 + }, + { + "epoch": 0.39530757097791797, + "grad_norm": 0.6914635119888252, + "learning_rate": 1.9530386948914436e-05, + "loss": 0.4359, + "step": 2005 + }, + { + "epoch": 0.3955047318611987, + "grad_norm": 0.8898433190590056, + "learning_rate": 1.9529917411303984e-05, + "loss": 0.4643, + "step": 2006 + }, + { + "epoch": 0.3957018927444795, + "grad_norm": 0.6810967882315899, + "learning_rate": 1.95294476447296e-05, + "loss": 0.458, + "step": 2007 + }, + { + "epoch": 0.39589905362776023, + "grad_norm": 0.6805859515758788, + "learning_rate": 1.9528977649202554e-05, + "loss": 0.4199, + "step": 2008 + }, + { + "epoch": 0.396096214511041, + "grad_norm": 0.5998994373884956, + "learning_rate": 1.9528507424734148e-05, + "loss": 0.4118, + "step": 2009 + }, + { + "epoch": 0.39629337539432175, + "grad_norm": 0.8444604178654211, + "learning_rate": 1.9528036971335678e-05, + "loss": 0.4933, + "step": 2010 + }, + { + "epoch": 0.3964905362776025, + "grad_norm": 1.2768307772665541, + "learning_rate": 1.952756628901845e-05, + "loss": 0.4256, + "step": 2011 + }, + { + "epoch": 0.39668769716088326, + "grad_norm": 0.7512293164398522, + "learning_rate": 1.9527095377793765e-05, + "loss": 0.4721, + "step": 2012 + }, + { + "epoch": 0.396884858044164, + "grad_norm": 0.9372345492820653, + "learning_rate": 1.9526624237672945e-05, + "loss": 0.4319, + "step": 2013 + }, + { + "epoch": 0.39708201892744477, + "grad_norm": 3.610388601557778, + "learning_rate": 1.9526152868667302e-05, + "loss": 0.511, + "step": 2014 + }, + { + "epoch": 0.3972791798107255, + "grad_norm": 0.9133527465332176, + "learning_rate": 1.952568127078817e-05, + "loss": 0.4852, + "step": 2015 + }, + { + "epoch": 0.39747634069400634, + "grad_norm": 0.801460794349205, + "learning_rate": 1.952520944404687e-05, + "loss": 0.4679, + "step": 2016 + }, + { + "epoch": 0.3976735015772871, + "grad_norm": 0.6184192490263251, + "learning_rate": 1.9524737388454745e-05, + "loss": 0.4306, + "step": 2017 + }, + { + "epoch": 0.39787066246056785, + "grad_norm": 0.7074190008986967, + "learning_rate": 1.9524265104023133e-05, + "loss": 0.4233, + "step": 2018 + }, + { + "epoch": 0.3980678233438486, + "grad_norm": 0.7029283154800328, + "learning_rate": 1.9523792590763382e-05, + "loss": 0.4627, + "step": 2019 + }, + { + "epoch": 0.39826498422712936, + "grad_norm": 0.851532590870496, + "learning_rate": 1.9523319848686845e-05, + "loss": 0.4506, + "step": 2020 + }, + { + "epoch": 0.3984621451104101, + "grad_norm": 0.7525221362585885, + "learning_rate": 1.952284687780488e-05, + "loss": 0.4434, + "step": 2021 + }, + { + "epoch": 0.39865930599369087, + "grad_norm": 0.8241310038169775, + "learning_rate": 1.952237367812885e-05, + "loss": 0.4552, + "step": 2022 + }, + { + "epoch": 0.3988564668769716, + "grad_norm": 0.6291466115452881, + "learning_rate": 1.952190024967012e-05, + "loss": 0.441, + "step": 2023 + }, + { + "epoch": 0.3990536277602524, + "grad_norm": 0.8316147796097879, + "learning_rate": 1.9521426592440075e-05, + "loss": 0.4394, + "step": 2024 + }, + { + "epoch": 0.39925078864353314, + "grad_norm": 0.6195605062293554, + "learning_rate": 1.9520952706450083e-05, + "loss": 0.4547, + "step": 2025 + }, + { + "epoch": 0.3994479495268139, + "grad_norm": 1.0728222016893565, + "learning_rate": 1.952047859171154e-05, + "loss": 0.4847, + "step": 2026 + }, + { + "epoch": 0.39964511041009465, + "grad_norm": 0.6543029079987064, + "learning_rate": 1.9520004248235826e-05, + "loss": 0.4412, + "step": 2027 + }, + { + "epoch": 0.3998422712933754, + "grad_norm": 0.8215362022698259, + "learning_rate": 1.9519529676034347e-05, + "loss": 0.4413, + "step": 2028 + }, + { + "epoch": 0.40003943217665616, + "grad_norm": 0.6394436878246803, + "learning_rate": 1.95190548751185e-05, + "loss": 0.4091, + "step": 2029 + }, + { + "epoch": 0.4002365930599369, + "grad_norm": 0.8545402719858461, + "learning_rate": 1.9518579845499698e-05, + "loss": 0.4713, + "step": 2030 + }, + { + "epoch": 0.4004337539432177, + "grad_norm": 0.6930745549889441, + "learning_rate": 1.9518104587189348e-05, + "loss": 0.4474, + "step": 2031 + }, + { + "epoch": 0.40063091482649843, + "grad_norm": 0.8730350351232401, + "learning_rate": 1.951762910019887e-05, + "loss": 0.4678, + "step": 2032 + }, + { + "epoch": 0.4008280757097792, + "grad_norm": 0.7083655229046756, + "learning_rate": 1.9517153384539685e-05, + "loss": 0.4452, + "step": 2033 + }, + { + "epoch": 0.40102523659305994, + "grad_norm": 0.9617195558090388, + "learning_rate": 1.9516677440223232e-05, + "loss": 0.4631, + "step": 2034 + }, + { + "epoch": 0.4012223974763407, + "grad_norm": 0.7580517534109538, + "learning_rate": 1.9516201267260935e-05, + "loss": 0.4548, + "step": 2035 + }, + { + "epoch": 0.40141955835962145, + "grad_norm": 1.0525747613425824, + "learning_rate": 1.9515724865664242e-05, + "loss": 0.4953, + "step": 2036 + }, + { + "epoch": 0.4016167192429022, + "grad_norm": 0.7820613688472805, + "learning_rate": 1.9515248235444595e-05, + "loss": 0.4339, + "step": 2037 + }, + { + "epoch": 0.40181388012618297, + "grad_norm": 0.7403224002007617, + "learning_rate": 1.9514771376613446e-05, + "loss": 0.4766, + "step": 2038 + }, + { + "epoch": 0.4020110410094637, + "grad_norm": 0.8359871396836571, + "learning_rate": 1.9514294289182253e-05, + "loss": 0.5007, + "step": 2039 + }, + { + "epoch": 0.4022082018927445, + "grad_norm": 51.06976499844091, + "learning_rate": 1.9513816973162475e-05, + "loss": 0.7096, + "step": 2040 + }, + { + "epoch": 0.40240536277602523, + "grad_norm": 0.9110749969850523, + "learning_rate": 1.9513339428565588e-05, + "loss": 0.4644, + "step": 2041 + }, + { + "epoch": 0.402602523659306, + "grad_norm": 0.7178947368074214, + "learning_rate": 1.9512861655403057e-05, + "loss": 0.4586, + "step": 2042 + }, + { + "epoch": 0.40279968454258674, + "grad_norm": 0.6762975622616101, + "learning_rate": 1.9512383653686364e-05, + "loss": 0.4217, + "step": 2043 + }, + { + "epoch": 0.4029968454258675, + "grad_norm": 0.7412728040347322, + "learning_rate": 1.9511905423426992e-05, + "loss": 0.5072, + "step": 2044 + }, + { + "epoch": 0.40319400630914826, + "grad_norm": 0.8922820307079298, + "learning_rate": 1.9511426964636437e-05, + "loss": 0.4553, + "step": 2045 + }, + { + "epoch": 0.403391167192429, + "grad_norm": 0.6604569080074368, + "learning_rate": 1.9510948277326188e-05, + "loss": 0.4388, + "step": 2046 + }, + { + "epoch": 0.40358832807570977, + "grad_norm": 0.7657116520818048, + "learning_rate": 1.9510469361507747e-05, + "loss": 0.4703, + "step": 2047 + }, + { + "epoch": 0.4037854889589905, + "grad_norm": 0.6403472748402534, + "learning_rate": 1.950999021719262e-05, + "loss": 0.4426, + "step": 2048 + }, + { + "epoch": 0.4039826498422713, + "grad_norm": 0.7942729480613527, + "learning_rate": 1.950951084439232e-05, + "loss": 0.4722, + "step": 2049 + }, + { + "epoch": 0.40417981072555204, + "grad_norm": 0.6777646106474043, + "learning_rate": 1.9509031243118365e-05, + "loss": 0.4596, + "step": 2050 + }, + { + "epoch": 0.4043769716088328, + "grad_norm": 0.6657983977712271, + "learning_rate": 1.9508551413382274e-05, + "loss": 0.4649, + "step": 2051 + }, + { + "epoch": 0.40457413249211355, + "grad_norm": 0.7409227648041087, + "learning_rate": 1.950807135519558e-05, + "loss": 0.4949, + "step": 2052 + }, + { + "epoch": 0.4047712933753943, + "grad_norm": 0.6979782066122284, + "learning_rate": 1.9507591068569812e-05, + "loss": 0.519, + "step": 2053 + }, + { + "epoch": 0.40496845425867506, + "grad_norm": 0.6505454029015588, + "learning_rate": 1.9507110553516518e-05, + "loss": 0.4148, + "step": 2054 + }, + { + "epoch": 0.4051656151419558, + "grad_norm": 0.6905829191499583, + "learning_rate": 1.9506629810047233e-05, + "loss": 0.4269, + "step": 2055 + }, + { + "epoch": 0.40536277602523657, + "grad_norm": 0.6726973855018206, + "learning_rate": 1.9506148838173512e-05, + "loss": 0.4714, + "step": 2056 + }, + { + "epoch": 0.4055599369085173, + "grad_norm": 0.6449181697582981, + "learning_rate": 1.950566763790691e-05, + "loss": 0.4611, + "step": 2057 + }, + { + "epoch": 0.4057570977917981, + "grad_norm": 0.6541125973886486, + "learning_rate": 1.9505186209258987e-05, + "loss": 0.4657, + "step": 2058 + }, + { + "epoch": 0.40595425867507884, + "grad_norm": 0.6492031501355185, + "learning_rate": 1.950470455224131e-05, + "loss": 0.419, + "step": 2059 + }, + { + "epoch": 0.4061514195583596, + "grad_norm": 0.6958217286443721, + "learning_rate": 1.9504222666865457e-05, + "loss": 0.4559, + "step": 2060 + }, + { + "epoch": 0.4063485804416404, + "grad_norm": 0.6660544919270931, + "learning_rate": 1.9503740553142995e-05, + "loss": 0.4614, + "step": 2061 + }, + { + "epoch": 0.40654574132492116, + "grad_norm": 0.6441549325271921, + "learning_rate": 1.9503258211085515e-05, + "loss": 0.4832, + "step": 2062 + }, + { + "epoch": 0.4067429022082019, + "grad_norm": 1.5150959012964547, + "learning_rate": 1.9502775640704606e-05, + "loss": 0.434, + "step": 2063 + }, + { + "epoch": 0.4069400630914827, + "grad_norm": 0.6244719601196084, + "learning_rate": 1.9502292842011857e-05, + "loss": 0.4173, + "step": 2064 + }, + { + "epoch": 0.40713722397476343, + "grad_norm": 0.6628533283416533, + "learning_rate": 1.950180981501887e-05, + "loss": 0.4414, + "step": 2065 + }, + { + "epoch": 0.4073343848580442, + "grad_norm": 0.7233298009650888, + "learning_rate": 1.950132655973725e-05, + "loss": 0.4296, + "step": 2066 + }, + { + "epoch": 0.40753154574132494, + "grad_norm": 0.6648377046224346, + "learning_rate": 1.9500843076178612e-05, + "loss": 0.4455, + "step": 2067 + }, + { + "epoch": 0.4077287066246057, + "grad_norm": 0.6564892733023274, + "learning_rate": 1.9500359364354565e-05, + "loss": 0.4545, + "step": 2068 + }, + { + "epoch": 0.40792586750788645, + "grad_norm": 0.6435233243344421, + "learning_rate": 1.9499875424276734e-05, + "loss": 0.4551, + "step": 2069 + }, + { + "epoch": 0.4081230283911672, + "grad_norm": 0.6478000940884499, + "learning_rate": 1.9499391255956745e-05, + "loss": 0.4262, + "step": 2070 + }, + { + "epoch": 0.40832018927444796, + "grad_norm": 2.6194886693717487, + "learning_rate": 1.949890685940623e-05, + "loss": 0.4949, + "step": 2071 + }, + { + "epoch": 0.4085173501577287, + "grad_norm": 0.7796501553012304, + "learning_rate": 1.949842223463683e-05, + "loss": 0.4521, + "step": 2072 + }, + { + "epoch": 0.4087145110410095, + "grad_norm": 1.1718471052149675, + "learning_rate": 1.9497937381660188e-05, + "loss": 0.43, + "step": 2073 + }, + { + "epoch": 0.40891167192429023, + "grad_norm": 0.7069733785890165, + "learning_rate": 1.949745230048795e-05, + "loss": 0.4567, + "step": 2074 + }, + { + "epoch": 0.409108832807571, + "grad_norm": 0.7134975969386784, + "learning_rate": 1.9496966991131775e-05, + "loss": 0.3937, + "step": 2075 + }, + { + "epoch": 0.40930599369085174, + "grad_norm": 0.6204131343159649, + "learning_rate": 1.9496481453603318e-05, + "loss": 0.4501, + "step": 2076 + }, + { + "epoch": 0.4095031545741325, + "grad_norm": 0.6233092334560004, + "learning_rate": 1.9495995687914244e-05, + "loss": 0.4122, + "step": 2077 + }, + { + "epoch": 0.40970031545741326, + "grad_norm": 0.6614510142665343, + "learning_rate": 1.949550969407623e-05, + "loss": 0.4269, + "step": 2078 + }, + { + "epoch": 0.409897476340694, + "grad_norm": 0.6647109624994366, + "learning_rate": 1.949502347210095e-05, + "loss": 0.464, + "step": 2079 + }, + { + "epoch": 0.41009463722397477, + "grad_norm": 0.6704844205652758, + "learning_rate": 1.949453702200008e-05, + "loss": 0.471, + "step": 2080 + }, + { + "epoch": 0.4102917981072555, + "grad_norm": 0.6946952276485558, + "learning_rate": 1.9494050343785317e-05, + "loss": 0.461, + "step": 2081 + }, + { + "epoch": 0.4104889589905363, + "grad_norm": 0.6724334955144293, + "learning_rate": 1.9493563437468344e-05, + "loss": 0.4693, + "step": 2082 + }, + { + "epoch": 0.41068611987381703, + "grad_norm": 0.7089092005187811, + "learning_rate": 1.9493076303060866e-05, + "loss": 0.4482, + "step": 2083 + }, + { + "epoch": 0.4108832807570978, + "grad_norm": 0.5996155537534351, + "learning_rate": 1.9492588940574588e-05, + "loss": 0.445, + "step": 2084 + }, + { + "epoch": 0.41108044164037855, + "grad_norm": 0.6632658160746489, + "learning_rate": 1.9492101350021216e-05, + "loss": 0.42, + "step": 2085 + }, + { + "epoch": 0.4112776025236593, + "grad_norm": 0.6093300916331452, + "learning_rate": 1.9491613531412463e-05, + "loss": 0.4507, + "step": 2086 + }, + { + "epoch": 0.41147476340694006, + "grad_norm": 0.6358547202310659, + "learning_rate": 1.949112548476005e-05, + "loss": 0.4588, + "step": 2087 + }, + { + "epoch": 0.4116719242902208, + "grad_norm": 0.6371219707058599, + "learning_rate": 1.9490637210075708e-05, + "loss": 0.4772, + "step": 2088 + }, + { + "epoch": 0.41186908517350157, + "grad_norm": 0.5999352611847947, + "learning_rate": 1.9490148707371163e-05, + "loss": 0.4604, + "step": 2089 + }, + { + "epoch": 0.4120662460567823, + "grad_norm": 0.653380052430745, + "learning_rate": 1.9489659976658152e-05, + "loss": 0.4377, + "step": 2090 + }, + { + "epoch": 0.4122634069400631, + "grad_norm": 0.6174059523962631, + "learning_rate": 1.948917101794842e-05, + "loss": 0.4545, + "step": 2091 + }, + { + "epoch": 0.41246056782334384, + "grad_norm": 0.6989529903953323, + "learning_rate": 1.9488681831253706e-05, + "loss": 0.4615, + "step": 2092 + }, + { + "epoch": 0.4126577287066246, + "grad_norm": 0.6751206895361119, + "learning_rate": 1.9488192416585775e-05, + "loss": 0.4749, + "step": 2093 + }, + { + "epoch": 0.41285488958990535, + "grad_norm": 0.6067234928335872, + "learning_rate": 1.948770277395638e-05, + "loss": 0.4495, + "step": 2094 + }, + { + "epoch": 0.4130520504731861, + "grad_norm": 0.6038861471266463, + "learning_rate": 1.9487212903377286e-05, + "loss": 0.4318, + "step": 2095 + }, + { + "epoch": 0.41324921135646686, + "grad_norm": 0.6099951703516203, + "learning_rate": 1.9486722804860262e-05, + "loss": 0.4232, + "step": 2096 + }, + { + "epoch": 0.4134463722397476, + "grad_norm": 0.6018481400930096, + "learning_rate": 1.948623247841708e-05, + "loss": 0.4583, + "step": 2097 + }, + { + "epoch": 0.4136435331230284, + "grad_norm": 0.6300586347763767, + "learning_rate": 1.948574192405953e-05, + "loss": 0.4805, + "step": 2098 + }, + { + "epoch": 0.41384069400630913, + "grad_norm": 0.6261753743075679, + "learning_rate": 1.9485251141799387e-05, + "loss": 0.4929, + "step": 2099 + }, + { + "epoch": 0.4140378548895899, + "grad_norm": 0.5502783264260944, + "learning_rate": 1.9484760131648447e-05, + "loss": 0.4201, + "step": 2100 + }, + { + "epoch": 0.41423501577287064, + "grad_norm": 0.6373152594526875, + "learning_rate": 1.9484268893618504e-05, + "loss": 0.5034, + "step": 2101 + }, + { + "epoch": 0.4144321766561514, + "grad_norm": 0.550511658521269, + "learning_rate": 1.9483777427721367e-05, + "loss": 0.4215, + "step": 2102 + }, + { + "epoch": 0.41462933753943215, + "grad_norm": 0.600033811832785, + "learning_rate": 1.948328573396884e-05, + "loss": 0.4536, + "step": 2103 + }, + { + "epoch": 0.4148264984227129, + "grad_norm": 0.591267981029353, + "learning_rate": 1.9482793812372732e-05, + "loss": 0.4195, + "step": 2104 + }, + { + "epoch": 0.41502365930599366, + "grad_norm": 0.6355093625177733, + "learning_rate": 1.9482301662944872e-05, + "loss": 0.45, + "step": 2105 + }, + { + "epoch": 0.4152208201892745, + "grad_norm": 0.6066083667212502, + "learning_rate": 1.9481809285697076e-05, + "loss": 0.438, + "step": 2106 + }, + { + "epoch": 0.41541798107255523, + "grad_norm": 0.6268489081451546, + "learning_rate": 1.9481316680641175e-05, + "loss": 0.4482, + "step": 2107 + }, + { + "epoch": 0.415615141955836, + "grad_norm": 0.6292327520694636, + "learning_rate": 1.9480823847789007e-05, + "loss": 0.4657, + "step": 2108 + }, + { + "epoch": 0.41581230283911674, + "grad_norm": 0.5829052402848848, + "learning_rate": 1.9480330787152413e-05, + "loss": 0.429, + "step": 2109 + }, + { + "epoch": 0.4160094637223975, + "grad_norm": 1.3748165275525082, + "learning_rate": 1.9479837498743236e-05, + "loss": 0.4193, + "step": 2110 + }, + { + "epoch": 0.41620662460567825, + "grad_norm": 0.5682917102110292, + "learning_rate": 1.9479343982573326e-05, + "loss": 0.4422, + "step": 2111 + }, + { + "epoch": 0.416403785488959, + "grad_norm": 0.5947136497114629, + "learning_rate": 1.9478850238654546e-05, + "loss": 0.4614, + "step": 2112 + }, + { + "epoch": 0.41660094637223977, + "grad_norm": 0.7037292516298015, + "learning_rate": 1.9478356266998757e-05, + "loss": 0.5048, + "step": 2113 + }, + { + "epoch": 0.4167981072555205, + "grad_norm": 0.6478893055908316, + "learning_rate": 1.947786206761782e-05, + "loss": 0.4463, + "step": 2114 + }, + { + "epoch": 0.4169952681388013, + "grad_norm": 0.6013780035939855, + "learning_rate": 1.9477367640523622e-05, + "loss": 0.4595, + "step": 2115 + }, + { + "epoch": 0.41719242902208203, + "grad_norm": 0.5984366322194706, + "learning_rate": 1.947687298572803e-05, + "loss": 0.4312, + "step": 2116 + }, + { + "epoch": 0.4173895899053628, + "grad_norm": 0.6057400198881939, + "learning_rate": 1.9476378103242934e-05, + "loss": 0.4423, + "step": 2117 + }, + { + "epoch": 0.41758675078864355, + "grad_norm": 0.5985571921288283, + "learning_rate": 1.9475882993080223e-05, + "loss": 0.4192, + "step": 2118 + }, + { + "epoch": 0.4177839116719243, + "grad_norm": 0.6313104451272632, + "learning_rate": 1.947538765525179e-05, + "loss": 0.4382, + "step": 2119 + }, + { + "epoch": 0.41798107255520506, + "grad_norm": 0.6434996318524585, + "learning_rate": 1.9474892089769538e-05, + "loss": 0.4643, + "step": 2120 + }, + { + "epoch": 0.4181782334384858, + "grad_norm": 0.5983507556937149, + "learning_rate": 1.947439629664538e-05, + "loss": 0.4306, + "step": 2121 + }, + { + "epoch": 0.41837539432176657, + "grad_norm": 0.6227894910009829, + "learning_rate": 1.9473900275891214e-05, + "loss": 0.4185, + "step": 2122 + }, + { + "epoch": 0.4185725552050473, + "grad_norm": 0.6249459358371175, + "learning_rate": 1.9473404027518965e-05, + "loss": 0.4422, + "step": 2123 + }, + { + "epoch": 0.4187697160883281, + "grad_norm": 0.5581857659991755, + "learning_rate": 1.9472907551540557e-05, + "loss": 0.3957, + "step": 2124 + }, + { + "epoch": 0.41896687697160884, + "grad_norm": 0.6086539772510696, + "learning_rate": 1.9472410847967917e-05, + "loss": 0.4538, + "step": 2125 + }, + { + "epoch": 0.4191640378548896, + "grad_norm": 0.6001107475689484, + "learning_rate": 1.947191391681298e-05, + "loss": 0.4105, + "step": 2126 + }, + { + "epoch": 0.41936119873817035, + "grad_norm": 0.66517888101447, + "learning_rate": 1.947141675808768e-05, + "loss": 0.4399, + "step": 2127 + }, + { + "epoch": 0.4195583596214511, + "grad_norm": 0.5752990303617271, + "learning_rate": 1.9470919371803966e-05, + "loss": 0.4208, + "step": 2128 + }, + { + "epoch": 0.41975552050473186, + "grad_norm": 0.6156099256870735, + "learning_rate": 1.947042175797379e-05, + "loss": 0.4578, + "step": 2129 + }, + { + "epoch": 0.4199526813880126, + "grad_norm": 0.5825517100633804, + "learning_rate": 1.94699239166091e-05, + "loss": 0.45, + "step": 2130 + }, + { + "epoch": 0.42014984227129337, + "grad_norm": 0.6011744479891673, + "learning_rate": 1.9469425847721865e-05, + "loss": 0.4734, + "step": 2131 + }, + { + "epoch": 0.42034700315457413, + "grad_norm": 0.5606446063107278, + "learning_rate": 1.9468927551324045e-05, + "loss": 0.4242, + "step": 2132 + }, + { + "epoch": 0.4205441640378549, + "grad_norm": 0.5730880385070863, + "learning_rate": 1.946842902742762e-05, + "loss": 0.4439, + "step": 2133 + }, + { + "epoch": 0.42074132492113564, + "grad_norm": 0.6125765323790818, + "learning_rate": 1.9467930276044557e-05, + "loss": 0.4655, + "step": 2134 + }, + { + "epoch": 0.4209384858044164, + "grad_norm": 0.5806851297859154, + "learning_rate": 1.946743129718685e-05, + "loss": 0.441, + "step": 2135 + }, + { + "epoch": 0.42113564668769715, + "grad_norm": 0.6077365774621822, + "learning_rate": 1.946693209086648e-05, + "loss": 0.4479, + "step": 2136 + }, + { + "epoch": 0.4213328075709779, + "grad_norm": 0.6092599482658368, + "learning_rate": 1.9466432657095443e-05, + "loss": 0.4439, + "step": 2137 + }, + { + "epoch": 0.42152996845425866, + "grad_norm": 0.5972879209489058, + "learning_rate": 1.9465932995885737e-05, + "loss": 0.4174, + "step": 2138 + }, + { + "epoch": 0.4217271293375394, + "grad_norm": 0.6448412899637194, + "learning_rate": 1.946543310724937e-05, + "loss": 0.4802, + "step": 2139 + }, + { + "epoch": 0.4219242902208202, + "grad_norm": 0.5919021815947381, + "learning_rate": 1.946493299119835e-05, + "loss": 0.428, + "step": 2140 + }, + { + "epoch": 0.42212145110410093, + "grad_norm": 0.5832897545671972, + "learning_rate": 1.9464432647744693e-05, + "loss": 0.4152, + "step": 2141 + }, + { + "epoch": 0.4223186119873817, + "grad_norm": 0.7594410044159404, + "learning_rate": 1.9463932076900416e-05, + "loss": 0.4921, + "step": 2142 + }, + { + "epoch": 0.42251577287066244, + "grad_norm": 0.5750713155262214, + "learning_rate": 1.9463431278677552e-05, + "loss": 0.4064, + "step": 2143 + }, + { + "epoch": 0.4227129337539432, + "grad_norm": 0.6628109681634553, + "learning_rate": 1.946293025308813e-05, + "loss": 0.4372, + "step": 2144 + }, + { + "epoch": 0.42291009463722395, + "grad_norm": 0.6309603811899096, + "learning_rate": 1.946242900014419e-05, + "loss": 0.4862, + "step": 2145 + }, + { + "epoch": 0.4231072555205047, + "grad_norm": 0.6540676115612354, + "learning_rate": 1.9461927519857772e-05, + "loss": 0.4509, + "step": 2146 + }, + { + "epoch": 0.42330441640378547, + "grad_norm": 0.6395341421094025, + "learning_rate": 1.9461425812240925e-05, + "loss": 0.4526, + "step": 2147 + }, + { + "epoch": 0.4235015772870662, + "grad_norm": 0.6394713150271438, + "learning_rate": 1.9460923877305706e-05, + "loss": 0.4616, + "step": 2148 + }, + { + "epoch": 0.423698738170347, + "grad_norm": 0.6069065769872102, + "learning_rate": 1.9460421715064172e-05, + "loss": 0.4289, + "step": 2149 + }, + { + "epoch": 0.42389589905362773, + "grad_norm": 0.6064632740648996, + "learning_rate": 1.9459919325528384e-05, + "loss": 0.4635, + "step": 2150 + }, + { + "epoch": 0.4240930599369085, + "grad_norm": 0.616873552338082, + "learning_rate": 1.945941670871042e-05, + "loss": 0.4648, + "step": 2151 + }, + { + "epoch": 0.4242902208201893, + "grad_norm": 0.5805094134522223, + "learning_rate": 1.945891386462235e-05, + "loss": 0.4254, + "step": 2152 + }, + { + "epoch": 0.42448738170347006, + "grad_norm": 0.6016630618010061, + "learning_rate": 1.9458410793276256e-05, + "loss": 0.4537, + "step": 2153 + }, + { + "epoch": 0.4246845425867508, + "grad_norm": 0.6221143855785904, + "learning_rate": 1.9457907494684227e-05, + "loss": 0.4689, + "step": 2154 + }, + { + "epoch": 0.42488170347003157, + "grad_norm": 0.786382722019145, + "learning_rate": 1.9457403968858358e-05, + "loss": 0.4614, + "step": 2155 + }, + { + "epoch": 0.4250788643533123, + "grad_norm": 0.5847320244340704, + "learning_rate": 1.9456900215810737e-05, + "loss": 0.4293, + "step": 2156 + }, + { + "epoch": 0.4252760252365931, + "grad_norm": 0.695033418240993, + "learning_rate": 1.9456396235553474e-05, + "loss": 0.4215, + "step": 2157 + }, + { + "epoch": 0.42547318611987384, + "grad_norm": 0.6844045999840678, + "learning_rate": 1.9455892028098677e-05, + "loss": 0.5193, + "step": 2158 + }, + { + "epoch": 0.4256703470031546, + "grad_norm": 0.6184226453391258, + "learning_rate": 1.945538759345846e-05, + "loss": 0.486, + "step": 2159 + }, + { + "epoch": 0.42586750788643535, + "grad_norm": 0.6324228885355624, + "learning_rate": 1.9454882931644942e-05, + "loss": 0.4585, + "step": 2160 + }, + { + "epoch": 0.4260646687697161, + "grad_norm": 0.6165127471239128, + "learning_rate": 1.9454378042670245e-05, + "loss": 0.46, + "step": 2161 + }, + { + "epoch": 0.42626182965299686, + "grad_norm": 0.6347931854232857, + "learning_rate": 1.9453872926546505e-05, + "loss": 0.4598, + "step": 2162 + }, + { + "epoch": 0.4264589905362776, + "grad_norm": 0.6539072568560251, + "learning_rate": 1.9453367583285853e-05, + "loss": 0.4291, + "step": 2163 + }, + { + "epoch": 0.42665615141955837, + "grad_norm": 0.6184889881753938, + "learning_rate": 1.945286201290043e-05, + "loss": 0.4624, + "step": 2164 + }, + { + "epoch": 0.4268533123028391, + "grad_norm": 0.619366970782029, + "learning_rate": 1.945235621540239e-05, + "loss": 0.4838, + "step": 2165 + }, + { + "epoch": 0.4270504731861199, + "grad_norm": 0.5940273163643314, + "learning_rate": 1.9451850190803877e-05, + "loss": 0.4219, + "step": 2166 + }, + { + "epoch": 0.42724763406940064, + "grad_norm": 0.6055311252553117, + "learning_rate": 1.9451343939117052e-05, + "loss": 0.4413, + "step": 2167 + }, + { + "epoch": 0.4274447949526814, + "grad_norm": 0.6326779232777279, + "learning_rate": 1.9450837460354073e-05, + "loss": 0.4506, + "step": 2168 + }, + { + "epoch": 0.42764195583596215, + "grad_norm": 0.6835756822811451, + "learning_rate": 1.9450330754527118e-05, + "loss": 0.4916, + "step": 2169 + }, + { + "epoch": 0.4278391167192429, + "grad_norm": 0.6561532078291141, + "learning_rate": 1.9449823821648357e-05, + "loss": 0.461, + "step": 2170 + }, + { + "epoch": 0.42803627760252366, + "grad_norm": 0.7237386411398435, + "learning_rate": 1.944931666172997e-05, + "loss": 0.4694, + "step": 2171 + }, + { + "epoch": 0.4282334384858044, + "grad_norm": 0.615056095691249, + "learning_rate": 1.9448809274784136e-05, + "loss": 0.4374, + "step": 2172 + }, + { + "epoch": 0.4284305993690852, + "grad_norm": 0.5965296460751116, + "learning_rate": 1.944830166082305e-05, + "loss": 0.4651, + "step": 2173 + }, + { + "epoch": 0.42862776025236593, + "grad_norm": 0.6166157125058764, + "learning_rate": 1.9447793819858912e-05, + "loss": 0.4427, + "step": 2174 + }, + { + "epoch": 0.4288249211356467, + "grad_norm": 0.5827660473699446, + "learning_rate": 1.944728575190392e-05, + "loss": 0.4428, + "step": 2175 + }, + { + "epoch": 0.42902208201892744, + "grad_norm": 0.5867427682120541, + "learning_rate": 1.9446777456970276e-05, + "loss": 0.4346, + "step": 2176 + }, + { + "epoch": 0.4292192429022082, + "grad_norm": 0.5549950519191269, + "learning_rate": 1.9446268935070197e-05, + "loss": 0.4119, + "step": 2177 + }, + { + "epoch": 0.42941640378548895, + "grad_norm": 0.6125305517287657, + "learning_rate": 1.94457601862159e-05, + "loss": 0.4829, + "step": 2178 + }, + { + "epoch": 0.4296135646687697, + "grad_norm": 0.6921490966397458, + "learning_rate": 1.944525121041961e-05, + "loss": 0.4758, + "step": 2179 + }, + { + "epoch": 0.42981072555205047, + "grad_norm": 0.585352142286126, + "learning_rate": 1.944474200769355e-05, + "loss": 0.4497, + "step": 2180 + }, + { + "epoch": 0.4300078864353312, + "grad_norm": 0.6266546137858079, + "learning_rate": 1.944423257804996e-05, + "loss": 0.4481, + "step": 2181 + }, + { + "epoch": 0.430205047318612, + "grad_norm": 0.6516582163069267, + "learning_rate": 1.9443722921501074e-05, + "loss": 0.4473, + "step": 2182 + }, + { + "epoch": 0.43040220820189273, + "grad_norm": 0.6099356219217796, + "learning_rate": 1.9443213038059145e-05, + "loss": 0.4546, + "step": 2183 + }, + { + "epoch": 0.4305993690851735, + "grad_norm": 0.6526605435968623, + "learning_rate": 1.944270292773641e-05, + "loss": 0.4225, + "step": 2184 + }, + { + "epoch": 0.43079652996845424, + "grad_norm": 0.7044967095269495, + "learning_rate": 1.944219259054514e-05, + "loss": 0.4483, + "step": 2185 + }, + { + "epoch": 0.430993690851735, + "grad_norm": 0.6635399522882317, + "learning_rate": 1.9441682026497587e-05, + "loss": 0.4396, + "step": 2186 + }, + { + "epoch": 0.43119085173501576, + "grad_norm": 0.7462506165933283, + "learning_rate": 1.944117123560602e-05, + "loss": 0.4184, + "step": 2187 + }, + { + "epoch": 0.4313880126182965, + "grad_norm": 0.630701248717489, + "learning_rate": 1.944066021788271e-05, + "loss": 0.4723, + "step": 2188 + }, + { + "epoch": 0.43158517350157727, + "grad_norm": 0.6815590189822918, + "learning_rate": 1.9440148973339937e-05, + "loss": 0.461, + "step": 2189 + }, + { + "epoch": 0.431782334384858, + "grad_norm": 0.5608877290281826, + "learning_rate": 1.9439637501989984e-05, + "loss": 0.4261, + "step": 2190 + }, + { + "epoch": 0.4319794952681388, + "grad_norm": 0.5678482544734272, + "learning_rate": 1.9439125803845136e-05, + "loss": 0.4343, + "step": 2191 + }, + { + "epoch": 0.43217665615141954, + "grad_norm": 0.6142946467397039, + "learning_rate": 1.9438613878917693e-05, + "loss": 0.4446, + "step": 2192 + }, + { + "epoch": 0.4323738170347003, + "grad_norm": 0.5554804879141013, + "learning_rate": 1.9438101727219946e-05, + "loss": 0.4212, + "step": 2193 + }, + { + "epoch": 0.43257097791798105, + "grad_norm": 0.7084789316007144, + "learning_rate": 1.943758934876421e-05, + "loss": 0.4591, + "step": 2194 + }, + { + "epoch": 0.4327681388012618, + "grad_norm": 0.615339701668569, + "learning_rate": 1.9437076743562785e-05, + "loss": 0.4361, + "step": 2195 + }, + { + "epoch": 0.43296529968454256, + "grad_norm": 0.790061368854975, + "learning_rate": 1.943656391162799e-05, + "loss": 0.4531, + "step": 2196 + }, + { + "epoch": 0.43316246056782337, + "grad_norm": 0.6198525493844337, + "learning_rate": 1.943605085297215e-05, + "loss": 0.4438, + "step": 2197 + }, + { + "epoch": 0.4333596214511041, + "grad_norm": 0.6215188641437128, + "learning_rate": 1.943553756760759e-05, + "loss": 0.4106, + "step": 2198 + }, + { + "epoch": 0.4335567823343849, + "grad_norm": 0.6051622234791362, + "learning_rate": 1.9435024055546644e-05, + "loss": 0.4059, + "step": 2199 + }, + { + "epoch": 0.43375394321766564, + "grad_norm": 0.6070081944728062, + "learning_rate": 1.9434510316801644e-05, + "loss": 0.4448, + "step": 2200 + }, + { + "epoch": 0.4339511041009464, + "grad_norm": 0.6052231890529391, + "learning_rate": 1.9433996351384936e-05, + "loss": 0.4059, + "step": 2201 + }, + { + "epoch": 0.43414826498422715, + "grad_norm": 0.654268310381098, + "learning_rate": 1.943348215930887e-05, + "loss": 0.4578, + "step": 2202 + }, + { + "epoch": 0.4343454258675079, + "grad_norm": 0.5896345157357851, + "learning_rate": 1.9432967740585797e-05, + "loss": 0.4017, + "step": 2203 + }, + { + "epoch": 0.43454258675078866, + "grad_norm": 0.7026359807455905, + "learning_rate": 1.9432453095228078e-05, + "loss": 0.4789, + "step": 2204 + }, + { + "epoch": 0.4347397476340694, + "grad_norm": 0.9860127509953628, + "learning_rate": 1.9431938223248076e-05, + "loss": 0.4495, + "step": 2205 + }, + { + "epoch": 0.4349369085173502, + "grad_norm": 0.5609344606036918, + "learning_rate": 1.9431423124658165e-05, + "loss": 0.4384, + "step": 2206 + }, + { + "epoch": 0.43513406940063093, + "grad_norm": 0.6369183512758136, + "learning_rate": 1.943090779947072e-05, + "loss": 0.4943, + "step": 2207 + }, + { + "epoch": 0.4353312302839117, + "grad_norm": 0.5765316628970028, + "learning_rate": 1.9430392247698117e-05, + "loss": 0.4622, + "step": 2208 + }, + { + "epoch": 0.43552839116719244, + "grad_norm": 0.5960810357622368, + "learning_rate": 1.9429876469352746e-05, + "loss": 0.4399, + "step": 2209 + }, + { + "epoch": 0.4357255520504732, + "grad_norm": 0.6096812488671478, + "learning_rate": 1.9429360464447e-05, + "loss": 0.4627, + "step": 2210 + }, + { + "epoch": 0.43592271293375395, + "grad_norm": 0.5776557605282492, + "learning_rate": 1.9428844232993275e-05, + "loss": 0.411, + "step": 2211 + }, + { + "epoch": 0.4361198738170347, + "grad_norm": 0.6052522052120832, + "learning_rate": 1.9428327775003978e-05, + "loss": 0.4448, + "step": 2212 + }, + { + "epoch": 0.43631703470031546, + "grad_norm": 0.5994361538832661, + "learning_rate": 1.942781109049151e-05, + "loss": 0.4331, + "step": 2213 + }, + { + "epoch": 0.4365141955835962, + "grad_norm": 0.6941586070053137, + "learning_rate": 1.9427294179468287e-05, + "loss": 0.457, + "step": 2214 + }, + { + "epoch": 0.436711356466877, + "grad_norm": 0.6424583665064718, + "learning_rate": 1.942677704194673e-05, + "loss": 0.4651, + "step": 2215 + }, + { + "epoch": 0.43690851735015773, + "grad_norm": 0.6074306424288837, + "learning_rate": 1.9426259677939264e-05, + "loss": 0.4518, + "step": 2216 + }, + { + "epoch": 0.4371056782334385, + "grad_norm": 0.6404681725403938, + "learning_rate": 1.9425742087458318e-05, + "loss": 0.4514, + "step": 2217 + }, + { + "epoch": 0.43730283911671924, + "grad_norm": 0.656741496330048, + "learning_rate": 1.942522427051633e-05, + "loss": 0.4355, + "step": 2218 + }, + { + "epoch": 0.4375, + "grad_norm": 0.6260232241706599, + "learning_rate": 1.942470622712574e-05, + "loss": 0.4314, + "step": 2219 + }, + { + "epoch": 0.43769716088328076, + "grad_norm": 0.6238147702128097, + "learning_rate": 1.942418795729899e-05, + "loss": 0.3956, + "step": 2220 + }, + { + "epoch": 0.4378943217665615, + "grad_norm": 0.6274596830509961, + "learning_rate": 1.9423669461048534e-05, + "loss": 0.4483, + "step": 2221 + }, + { + "epoch": 0.43809148264984227, + "grad_norm": 0.6495347166026656, + "learning_rate": 1.942315073838683e-05, + "loss": 0.4945, + "step": 2222 + }, + { + "epoch": 0.438288643533123, + "grad_norm": 0.6005997995972219, + "learning_rate": 1.942263178932634e-05, + "loss": 0.4541, + "step": 2223 + }, + { + "epoch": 0.4384858044164038, + "grad_norm": 0.5741164876920474, + "learning_rate": 1.942211261387954e-05, + "loss": 0.4174, + "step": 2224 + }, + { + "epoch": 0.43868296529968454, + "grad_norm": 0.6438713859666909, + "learning_rate": 1.9421593212058894e-05, + "loss": 0.4801, + "step": 2225 + }, + { + "epoch": 0.4388801261829653, + "grad_norm": 0.6248222261459098, + "learning_rate": 1.9421073583876882e-05, + "loss": 0.4509, + "step": 2226 + }, + { + "epoch": 0.43907728706624605, + "grad_norm": 0.6012676799031479, + "learning_rate": 1.9420553729345993e-05, + "loss": 0.4304, + "step": 2227 + }, + { + "epoch": 0.4392744479495268, + "grad_norm": 0.6083239790788654, + "learning_rate": 1.942003364847871e-05, + "loss": 0.4619, + "step": 2228 + }, + { + "epoch": 0.43947160883280756, + "grad_norm": 0.624739792282654, + "learning_rate": 1.9419513341287537e-05, + "loss": 0.4327, + "step": 2229 + }, + { + "epoch": 0.4396687697160883, + "grad_norm": 0.6099088599325223, + "learning_rate": 1.9418992807784967e-05, + "loss": 0.4361, + "step": 2230 + }, + { + "epoch": 0.43986593059936907, + "grad_norm": 0.6184689759512897, + "learning_rate": 1.9418472047983512e-05, + "loss": 0.4254, + "step": 2231 + }, + { + "epoch": 0.4400630914826498, + "grad_norm": 0.6336984491426801, + "learning_rate": 1.941795106189568e-05, + "loss": 0.4799, + "step": 2232 + }, + { + "epoch": 0.4402602523659306, + "grad_norm": 0.6363183171959824, + "learning_rate": 1.9417429849533992e-05, + "loss": 0.4786, + "step": 2233 + }, + { + "epoch": 0.44045741324921134, + "grad_norm": 0.634185491973065, + "learning_rate": 1.9416908410910965e-05, + "loss": 0.4645, + "step": 2234 + }, + { + "epoch": 0.4406545741324921, + "grad_norm": 0.5890796293233364, + "learning_rate": 1.941638674603913e-05, + "loss": 0.4377, + "step": 2235 + }, + { + "epoch": 0.44085173501577285, + "grad_norm": 0.7416948900678109, + "learning_rate": 1.9415864854931024e-05, + "loss": 0.4953, + "step": 2236 + }, + { + "epoch": 0.4410488958990536, + "grad_norm": 0.6100278936853679, + "learning_rate": 1.941534273759918e-05, + "loss": 0.4839, + "step": 2237 + }, + { + "epoch": 0.44124605678233436, + "grad_norm": 0.679877854057984, + "learning_rate": 1.9414820394056143e-05, + "loss": 0.4574, + "step": 2238 + }, + { + "epoch": 0.4414432176656151, + "grad_norm": 0.6337313987110674, + "learning_rate": 1.9414297824314466e-05, + "loss": 0.4798, + "step": 2239 + }, + { + "epoch": 0.4416403785488959, + "grad_norm": 0.602381577533317, + "learning_rate": 1.9413775028386702e-05, + "loss": 0.4421, + "step": 2240 + }, + { + "epoch": 0.44183753943217663, + "grad_norm": 0.7468962543654029, + "learning_rate": 1.9413252006285416e-05, + "loss": 0.4569, + "step": 2241 + }, + { + "epoch": 0.44203470031545744, + "grad_norm": 0.6131581613338967, + "learning_rate": 1.9412728758023166e-05, + "loss": 0.469, + "step": 2242 + }, + { + "epoch": 0.4422318611987382, + "grad_norm": 0.6247638637761316, + "learning_rate": 1.9412205283612527e-05, + "loss": 0.4668, + "step": 2243 + }, + { + "epoch": 0.44242902208201895, + "grad_norm": 0.6074735973961677, + "learning_rate": 1.9411681583066077e-05, + "loss": 0.4293, + "step": 2244 + }, + { + "epoch": 0.4426261829652997, + "grad_norm": 0.5960980998564303, + "learning_rate": 1.94111576563964e-05, + "loss": 0.3827, + "step": 2245 + }, + { + "epoch": 0.44282334384858046, + "grad_norm": 0.5711582764920223, + "learning_rate": 1.9410633503616077e-05, + "loss": 0.4133, + "step": 2246 + }, + { + "epoch": 0.4430205047318612, + "grad_norm": 0.572834729047988, + "learning_rate": 1.9410109124737708e-05, + "loss": 0.4487, + "step": 2247 + }, + { + "epoch": 0.443217665615142, + "grad_norm": 0.5941534079486432, + "learning_rate": 1.940958451977389e-05, + "loss": 0.439, + "step": 2248 + }, + { + "epoch": 0.44341482649842273, + "grad_norm": 0.6057683342246314, + "learning_rate": 1.9409059688737226e-05, + "loss": 0.466, + "step": 2249 + }, + { + "epoch": 0.4436119873817035, + "grad_norm": 0.6128435575114525, + "learning_rate": 1.9408534631640328e-05, + "loss": 0.4392, + "step": 2250 + }, + { + "epoch": 0.44380914826498424, + "grad_norm": 0.593633225129246, + "learning_rate": 1.9408009348495808e-05, + "loss": 0.4084, + "step": 2251 + }, + { + "epoch": 0.444006309148265, + "grad_norm": 0.656165917461389, + "learning_rate": 1.9407483839316284e-05, + "loss": 0.4554, + "step": 2252 + }, + { + "epoch": 0.44420347003154576, + "grad_norm": 0.5874000065946977, + "learning_rate": 1.9406958104114387e-05, + "loss": 0.4118, + "step": 2253 + }, + { + "epoch": 0.4444006309148265, + "grad_norm": 0.6161464173752417, + "learning_rate": 1.9406432142902748e-05, + "loss": 0.4291, + "step": 2254 + }, + { + "epoch": 0.44459779179810727, + "grad_norm": 0.6113342594946223, + "learning_rate": 1.9405905955694e-05, + "loss": 0.4582, + "step": 2255 + }, + { + "epoch": 0.444794952681388, + "grad_norm": 0.6825339340881406, + "learning_rate": 1.9405379542500786e-05, + "loss": 0.4354, + "step": 2256 + }, + { + "epoch": 0.4449921135646688, + "grad_norm": 0.6663414389872298, + "learning_rate": 1.9404852903335752e-05, + "loss": 0.4717, + "step": 2257 + }, + { + "epoch": 0.44518927444794953, + "grad_norm": 0.5510169867193097, + "learning_rate": 1.9404326038211558e-05, + "loss": 0.397, + "step": 2258 + }, + { + "epoch": 0.4453864353312303, + "grad_norm": 0.6117720610593416, + "learning_rate": 1.9403798947140857e-05, + "loss": 0.4518, + "step": 2259 + }, + { + "epoch": 0.44558359621451105, + "grad_norm": 0.6131687458644133, + "learning_rate": 1.9403271630136312e-05, + "loss": 0.4612, + "step": 2260 + }, + { + "epoch": 0.4457807570977918, + "grad_norm": 0.5958608772380412, + "learning_rate": 1.9402744087210594e-05, + "loss": 0.4574, + "step": 2261 + }, + { + "epoch": 0.44597791798107256, + "grad_norm": 0.6397300220383113, + "learning_rate": 1.9402216318376377e-05, + "loss": 0.4359, + "step": 2262 + }, + { + "epoch": 0.4461750788643533, + "grad_norm": 0.6132315582223438, + "learning_rate": 1.940168832364634e-05, + "loss": 0.4525, + "step": 2263 + }, + { + "epoch": 0.44637223974763407, + "grad_norm": 0.6126178978812497, + "learning_rate": 1.9401160103033173e-05, + "loss": 0.449, + "step": 2264 + }, + { + "epoch": 0.4465694006309148, + "grad_norm": 0.6024472555100853, + "learning_rate": 1.9400631656549566e-05, + "loss": 0.4552, + "step": 2265 + }, + { + "epoch": 0.4467665615141956, + "grad_norm": 0.6325409183391781, + "learning_rate": 1.9400102984208208e-05, + "loss": 0.4723, + "step": 2266 + }, + { + "epoch": 0.44696372239747634, + "grad_norm": 0.5774023003574291, + "learning_rate": 1.939957408602181e-05, + "loss": 0.43, + "step": 2267 + }, + { + "epoch": 0.4471608832807571, + "grad_norm": 0.5773695869045778, + "learning_rate": 1.939904496200307e-05, + "loss": 0.4507, + "step": 2268 + }, + { + "epoch": 0.44735804416403785, + "grad_norm": 0.5873923629131955, + "learning_rate": 1.939851561216471e-05, + "loss": 0.426, + "step": 2269 + }, + { + "epoch": 0.4475552050473186, + "grad_norm": 0.6124825928799854, + "learning_rate": 1.939798603651944e-05, + "loss": 0.4621, + "step": 2270 + }, + { + "epoch": 0.44775236593059936, + "grad_norm": 0.6004472428898958, + "learning_rate": 1.939745623507999e-05, + "loss": 0.4213, + "step": 2271 + }, + { + "epoch": 0.4479495268138801, + "grad_norm": 0.584595466954028, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.4066, + "step": 2272 + }, + { + "epoch": 0.4481466876971609, + "grad_norm": 0.6100782446987316, + "learning_rate": 1.9396395954869463e-05, + "loss": 0.4527, + "step": 2273 + }, + { + "epoch": 0.44834384858044163, + "grad_norm": 0.5988478613742948, + "learning_rate": 1.939586547612386e-05, + "loss": 0.4301, + "step": 2274 + }, + { + "epoch": 0.4485410094637224, + "grad_norm": 0.5892383164078985, + "learning_rate": 1.939533477163502e-05, + "loss": 0.4637, + "step": 2275 + }, + { + "epoch": 0.44873817034700314, + "grad_norm": 0.6188258893089192, + "learning_rate": 1.93948038414157e-05, + "loss": 0.4286, + "step": 2276 + }, + { + "epoch": 0.4489353312302839, + "grad_norm": 0.5859833886607909, + "learning_rate": 1.9394272685478646e-05, + "loss": 0.4393, + "step": 2277 + }, + { + "epoch": 0.44913249211356465, + "grad_norm": 0.6243646580192782, + "learning_rate": 1.9393741303836633e-05, + "loss": 0.4472, + "step": 2278 + }, + { + "epoch": 0.4493296529968454, + "grad_norm": 0.597880669331921, + "learning_rate": 1.9393209696502416e-05, + "loss": 0.4429, + "step": 2279 + }, + { + "epoch": 0.44952681388012616, + "grad_norm": 0.6092038363303693, + "learning_rate": 1.9392677863488773e-05, + "loss": 0.4427, + "step": 2280 + }, + { + "epoch": 0.4497239747634069, + "grad_norm": 0.5945045969064071, + "learning_rate": 1.9392145804808484e-05, + "loss": 0.4521, + "step": 2281 + }, + { + "epoch": 0.4499211356466877, + "grad_norm": 0.5341262843103938, + "learning_rate": 1.939161352047432e-05, + "loss": 0.4044, + "step": 2282 + }, + { + "epoch": 0.45011829652996843, + "grad_norm": 0.6003095082379514, + "learning_rate": 1.9391081010499085e-05, + "loss": 0.4407, + "step": 2283 + }, + { + "epoch": 0.4503154574132492, + "grad_norm": 0.6015907908122791, + "learning_rate": 1.9390548274895563e-05, + "loss": 0.4399, + "step": 2284 + }, + { + "epoch": 0.45051261829652994, + "grad_norm": 0.5889743466466599, + "learning_rate": 1.9390015313676558e-05, + "loss": 0.4748, + "step": 2285 + }, + { + "epoch": 0.4507097791798107, + "grad_norm": 0.589434443034765, + "learning_rate": 1.938948212685487e-05, + "loss": 0.4502, + "step": 2286 + }, + { + "epoch": 0.4509069400630915, + "grad_norm": 0.5794383012877807, + "learning_rate": 1.9388948714443317e-05, + "loss": 0.4249, + "step": 2287 + }, + { + "epoch": 0.45110410094637227, + "grad_norm": 0.5975560713887517, + "learning_rate": 1.938841507645471e-05, + "loss": 0.463, + "step": 2288 + }, + { + "epoch": 0.451301261829653, + "grad_norm": 0.5692154145280981, + "learning_rate": 1.938788121290187e-05, + "loss": 0.4282, + "step": 2289 + }, + { + "epoch": 0.4514984227129338, + "grad_norm": 0.602494745609716, + "learning_rate": 1.938734712379762e-05, + "loss": 0.4488, + "step": 2290 + }, + { + "epoch": 0.45169558359621453, + "grad_norm": 0.587913158355413, + "learning_rate": 1.93868128091548e-05, + "loss": 0.4679, + "step": 2291 + }, + { + "epoch": 0.4518927444794953, + "grad_norm": 0.5718028537813307, + "learning_rate": 1.9386278268986243e-05, + "loss": 0.4573, + "step": 2292 + }, + { + "epoch": 0.45208990536277605, + "grad_norm": 0.6226913772837778, + "learning_rate": 1.938574350330479e-05, + "loss": 0.4422, + "step": 2293 + }, + { + "epoch": 0.4522870662460568, + "grad_norm": 0.6822157964341871, + "learning_rate": 1.9385208512123293e-05, + "loss": 0.4359, + "step": 2294 + }, + { + "epoch": 0.45248422712933756, + "grad_norm": 0.631137044281597, + "learning_rate": 1.9384673295454603e-05, + "loss": 0.446, + "step": 2295 + }, + { + "epoch": 0.4526813880126183, + "grad_norm": 0.6264948809573254, + "learning_rate": 1.9384137853311576e-05, + "loss": 0.4355, + "step": 2296 + }, + { + "epoch": 0.45287854889589907, + "grad_norm": 0.566650305099432, + "learning_rate": 1.9383602185707082e-05, + "loss": 0.4404, + "step": 2297 + }, + { + "epoch": 0.4530757097791798, + "grad_norm": 0.6365696893318133, + "learning_rate": 1.938306629265399e-05, + "loss": 0.4407, + "step": 2298 + }, + { + "epoch": 0.4532728706624606, + "grad_norm": 0.6109514828404591, + "learning_rate": 1.9382530174165175e-05, + "loss": 0.4282, + "step": 2299 + }, + { + "epoch": 0.45347003154574134, + "grad_norm": 0.5636586001477206, + "learning_rate": 1.9381993830253515e-05, + "loss": 0.4267, + "step": 2300 + }, + { + "epoch": 0.4536671924290221, + "grad_norm": 0.705135133601523, + "learning_rate": 1.93814572609319e-05, + "loss": 0.4815, + "step": 2301 + }, + { + "epoch": 0.45386435331230285, + "grad_norm": 0.6238025886975589, + "learning_rate": 1.9380920466213217e-05, + "loss": 0.4439, + "step": 2302 + }, + { + "epoch": 0.4540615141955836, + "grad_norm": 0.7081859082809666, + "learning_rate": 1.9380383446110368e-05, + "loss": 0.4483, + "step": 2303 + }, + { + "epoch": 0.45425867507886436, + "grad_norm": 0.6531999149924236, + "learning_rate": 1.937984620063625e-05, + "loss": 0.4331, + "step": 2304 + }, + { + "epoch": 0.4544558359621451, + "grad_norm": 0.578550340908979, + "learning_rate": 1.9379308729803775e-05, + "loss": 0.3904, + "step": 2305 + }, + { + "epoch": 0.45465299684542587, + "grad_norm": 0.638465715516517, + "learning_rate": 1.9378771033625855e-05, + "loss": 0.4311, + "step": 2306 + }, + { + "epoch": 0.45485015772870663, + "grad_norm": 1.0881560912923343, + "learning_rate": 1.9378233112115406e-05, + "loss": 0.4634, + "step": 2307 + }, + { + "epoch": 0.4550473186119874, + "grad_norm": 0.6713906883533458, + "learning_rate": 1.9377694965285356e-05, + "loss": 0.4347, + "step": 2308 + }, + { + "epoch": 0.45524447949526814, + "grad_norm": 0.6120327107112462, + "learning_rate": 1.9377156593148632e-05, + "loss": 0.4135, + "step": 2309 + }, + { + "epoch": 0.4554416403785489, + "grad_norm": 0.6652885718648691, + "learning_rate": 1.937661799571817e-05, + "loss": 0.4182, + "step": 2310 + }, + { + "epoch": 0.45563880126182965, + "grad_norm": 0.8090629623799119, + "learning_rate": 1.937607917300691e-05, + "loss": 0.4534, + "step": 2311 + }, + { + "epoch": 0.4558359621451104, + "grad_norm": 1.1834526081440842, + "learning_rate": 1.9375540125027796e-05, + "loss": 0.4866, + "step": 2312 + }, + { + "epoch": 0.45603312302839116, + "grad_norm": 0.6151631247816741, + "learning_rate": 1.937500085179378e-05, + "loss": 0.4517, + "step": 2313 + }, + { + "epoch": 0.4562302839116719, + "grad_norm": 1.0542335058904364, + "learning_rate": 1.937446135331782e-05, + "loss": 0.4423, + "step": 2314 + }, + { + "epoch": 0.4564274447949527, + "grad_norm": 0.6255363893355489, + "learning_rate": 1.9373921629612876e-05, + "loss": 0.46, + "step": 2315 + }, + { + "epoch": 0.45662460567823343, + "grad_norm": 0.5392418471066178, + "learning_rate": 1.937338168069192e-05, + "loss": 0.3992, + "step": 2316 + }, + { + "epoch": 0.4568217665615142, + "grad_norm": 0.7140904386654281, + "learning_rate": 1.9372841506567916e-05, + "loss": 0.4168, + "step": 2317 + }, + { + "epoch": 0.45701892744479494, + "grad_norm": 0.8124956257464673, + "learning_rate": 1.937230110725385e-05, + "loss": 0.4444, + "step": 2318 + }, + { + "epoch": 0.4572160883280757, + "grad_norm": 0.5825397168265147, + "learning_rate": 1.93717604827627e-05, + "loss": 0.4125, + "step": 2319 + }, + { + "epoch": 0.45741324921135645, + "grad_norm": 0.5926863567356023, + "learning_rate": 1.937121963310746e-05, + "loss": 0.4414, + "step": 2320 + }, + { + "epoch": 0.4576104100946372, + "grad_norm": 0.649677750333782, + "learning_rate": 1.9370678558301117e-05, + "loss": 0.4315, + "step": 2321 + }, + { + "epoch": 0.45780757097791797, + "grad_norm": 1.1050583010405093, + "learning_rate": 1.937013725835668e-05, + "loss": 0.4948, + "step": 2322 + }, + { + "epoch": 0.4580047318611987, + "grad_norm": 0.6313289986933639, + "learning_rate": 1.9369595733287147e-05, + "loss": 0.4265, + "step": 2323 + }, + { + "epoch": 0.4582018927444795, + "grad_norm": 0.6683692439342528, + "learning_rate": 1.9369053983105533e-05, + "loss": 0.4648, + "step": 2324 + }, + { + "epoch": 0.45839905362776023, + "grad_norm": 0.7637595774268595, + "learning_rate": 1.9368512007824852e-05, + "loss": 0.4634, + "step": 2325 + }, + { + "epoch": 0.458596214511041, + "grad_norm": 0.634965367240232, + "learning_rate": 1.9367969807458125e-05, + "loss": 0.4925, + "step": 2326 + }, + { + "epoch": 0.45879337539432175, + "grad_norm": 0.6164240403541263, + "learning_rate": 1.936742738201838e-05, + "loss": 0.4655, + "step": 2327 + }, + { + "epoch": 0.4589905362776025, + "grad_norm": 0.6481028520735399, + "learning_rate": 1.9366884731518648e-05, + "loss": 0.4842, + "step": 2328 + }, + { + "epoch": 0.45918769716088326, + "grad_norm": 0.6211874377041051, + "learning_rate": 1.9366341855971967e-05, + "loss": 0.468, + "step": 2329 + }, + { + "epoch": 0.459384858044164, + "grad_norm": 0.7567018049090279, + "learning_rate": 1.936579875539138e-05, + "loss": 0.4688, + "step": 2330 + }, + { + "epoch": 0.45958201892744477, + "grad_norm": 0.6196167687850394, + "learning_rate": 1.9365255429789934e-05, + "loss": 0.4041, + "step": 2331 + }, + { + "epoch": 0.4597791798107255, + "grad_norm": 0.5842148695471889, + "learning_rate": 1.9364711879180688e-05, + "loss": 0.4289, + "step": 2332 + }, + { + "epoch": 0.45997634069400634, + "grad_norm": 0.6604436365980313, + "learning_rate": 1.9364168103576696e-05, + "loss": 0.4489, + "step": 2333 + }, + { + "epoch": 0.4601735015772871, + "grad_norm": 1.3308649937601518, + "learning_rate": 1.9363624102991022e-05, + "loss": 0.451, + "step": 2334 + }, + { + "epoch": 0.46037066246056785, + "grad_norm": 0.6093584018980908, + "learning_rate": 1.9363079877436744e-05, + "loss": 0.4565, + "step": 2335 + }, + { + "epoch": 0.4605678233438486, + "grad_norm": 0.7358912383290898, + "learning_rate": 1.936253542692693e-05, + "loss": 0.4456, + "step": 2336 + }, + { + "epoch": 0.46076498422712936, + "grad_norm": 0.5582505407627746, + "learning_rate": 1.936199075147466e-05, + "loss": 0.3969, + "step": 2337 + }, + { + "epoch": 0.4609621451104101, + "grad_norm": 0.6033721253583829, + "learning_rate": 1.936144585109302e-05, + "loss": 0.4284, + "step": 2338 + }, + { + "epoch": 0.46115930599369087, + "grad_norm": 0.6175274199097704, + "learning_rate": 1.9360900725795112e-05, + "loss": 0.4613, + "step": 2339 + }, + { + "epoch": 0.4613564668769716, + "grad_norm": 0.5765237098211964, + "learning_rate": 1.9360355375594025e-05, + "loss": 0.4307, + "step": 2340 + }, + { + "epoch": 0.4615536277602524, + "grad_norm": 0.7141205911743096, + "learning_rate": 1.9359809800502858e-05, + "loss": 0.4762, + "step": 2341 + }, + { + "epoch": 0.46175078864353314, + "grad_norm": 0.8234992518594635, + "learning_rate": 1.9359264000534726e-05, + "loss": 0.4912, + "step": 2342 + }, + { + "epoch": 0.4619479495268139, + "grad_norm": 0.5994202989618423, + "learning_rate": 1.9358717975702735e-05, + "loss": 0.4382, + "step": 2343 + }, + { + "epoch": 0.46214511041009465, + "grad_norm": 0.6761671021504989, + "learning_rate": 1.9358171726020014e-05, + "loss": 0.447, + "step": 2344 + }, + { + "epoch": 0.4623422712933754, + "grad_norm": 0.6365526106317112, + "learning_rate": 1.9357625251499682e-05, + "loss": 0.4319, + "step": 2345 + }, + { + "epoch": 0.46253943217665616, + "grad_norm": 0.7520248964390419, + "learning_rate": 1.9357078552154864e-05, + "loss": 0.4967, + "step": 2346 + }, + { + "epoch": 0.4627365930599369, + "grad_norm": 0.6558895684983119, + "learning_rate": 1.9356531627998696e-05, + "loss": 0.4489, + "step": 2347 + }, + { + "epoch": 0.4629337539432177, + "grad_norm": 0.6828662741868297, + "learning_rate": 1.9355984479044324e-05, + "loss": 0.4477, + "step": 2348 + }, + { + "epoch": 0.46313091482649843, + "grad_norm": 0.6163218102500873, + "learning_rate": 1.9355437105304893e-05, + "loss": 0.4666, + "step": 2349 + }, + { + "epoch": 0.4633280757097792, + "grad_norm": 0.629723944986669, + "learning_rate": 1.9354889506793548e-05, + "loss": 0.4734, + "step": 2350 + }, + { + "epoch": 0.46352523659305994, + "grad_norm": 0.5904354659114018, + "learning_rate": 1.935434168352345e-05, + "loss": 0.4522, + "step": 2351 + }, + { + "epoch": 0.4637223974763407, + "grad_norm": 0.6172193362795846, + "learning_rate": 1.935379363550776e-05, + "loss": 0.4478, + "step": 2352 + }, + { + "epoch": 0.46391955835962145, + "grad_norm": 0.6097296169978487, + "learning_rate": 1.9353245362759647e-05, + "loss": 0.4407, + "step": 2353 + }, + { + "epoch": 0.4641167192429022, + "grad_norm": 0.6073643122176055, + "learning_rate": 1.9352696865292278e-05, + "loss": 0.4116, + "step": 2354 + }, + { + "epoch": 0.46431388012618297, + "grad_norm": 0.6723779390449313, + "learning_rate": 1.935214814311884e-05, + "loss": 0.4183, + "step": 2355 + }, + { + "epoch": 0.4645110410094637, + "grad_norm": 0.5416360235400502, + "learning_rate": 1.935159919625251e-05, + "loss": 0.4128, + "step": 2356 + }, + { + "epoch": 0.4647082018927445, + "grad_norm": 0.6434910428180349, + "learning_rate": 1.9351050024706476e-05, + "loss": 0.3999, + "step": 2357 + }, + { + "epoch": 0.46490536277602523, + "grad_norm": 0.5545962685011621, + "learning_rate": 1.9350500628493938e-05, + "loss": 0.4059, + "step": 2358 + }, + { + "epoch": 0.465102523659306, + "grad_norm": 0.6598887221595904, + "learning_rate": 1.9349951007628093e-05, + "loss": 0.449, + "step": 2359 + }, + { + "epoch": 0.46529968454258674, + "grad_norm": 0.6254882672594428, + "learning_rate": 1.934940116212214e-05, + "loss": 0.4422, + "step": 2360 + }, + { + "epoch": 0.4654968454258675, + "grad_norm": 0.6643297973480146, + "learning_rate": 1.93488510919893e-05, + "loss": 0.4693, + "step": 2361 + }, + { + "epoch": 0.46569400630914826, + "grad_norm": 0.6671960843874735, + "learning_rate": 1.9348300797242784e-05, + "loss": 0.4674, + "step": 2362 + }, + { + "epoch": 0.465891167192429, + "grad_norm": 0.6583457703373533, + "learning_rate": 1.934775027789581e-05, + "loss": 0.4454, + "step": 2363 + }, + { + "epoch": 0.46608832807570977, + "grad_norm": 0.6019895419204817, + "learning_rate": 1.934719953396161e-05, + "loss": 0.4585, + "step": 2364 + }, + { + "epoch": 0.4662854889589905, + "grad_norm": 0.5539573629879283, + "learning_rate": 1.9346648565453412e-05, + "loss": 0.3792, + "step": 2365 + }, + { + "epoch": 0.4664826498422713, + "grad_norm": 0.5851973307214345, + "learning_rate": 1.934609737238446e-05, + "loss": 0.4116, + "step": 2366 + }, + { + "epoch": 0.46667981072555204, + "grad_norm": 0.5995849117001575, + "learning_rate": 1.9345545954767985e-05, + "loss": 0.4468, + "step": 2367 + }, + { + "epoch": 0.4668769716088328, + "grad_norm": 0.713328117155134, + "learning_rate": 1.934499431261725e-05, + "loss": 0.4863, + "step": 2368 + }, + { + "epoch": 0.46707413249211355, + "grad_norm": 0.6211541889706627, + "learning_rate": 1.93444424459455e-05, + "loss": 0.4355, + "step": 2369 + }, + { + "epoch": 0.4672712933753943, + "grad_norm": 0.5811288166838467, + "learning_rate": 1.934389035476599e-05, + "loss": 0.4487, + "step": 2370 + }, + { + "epoch": 0.46746845425867506, + "grad_norm": 0.7101811484585888, + "learning_rate": 1.9343338039091992e-05, + "loss": 0.4968, + "step": 2371 + }, + { + "epoch": 0.4676656151419558, + "grad_norm": 0.6800579406115673, + "learning_rate": 1.9342785498936775e-05, + "loss": 0.4606, + "step": 2372 + }, + { + "epoch": 0.46786277602523657, + "grad_norm": 0.6222849178571329, + "learning_rate": 1.934223273431361e-05, + "loss": 0.4408, + "step": 2373 + }, + { + "epoch": 0.4680599369085173, + "grad_norm": 0.6463617017560804, + "learning_rate": 1.9341679745235783e-05, + "loss": 0.3811, + "step": 2374 + }, + { + "epoch": 0.4682570977917981, + "grad_norm": 0.6104669769368634, + "learning_rate": 1.9341126531716575e-05, + "loss": 0.4963, + "step": 2375 + }, + { + "epoch": 0.46845425867507884, + "grad_norm": 0.6755247672185276, + "learning_rate": 1.934057309376928e-05, + "loss": 0.4448, + "step": 2376 + }, + { + "epoch": 0.4686514195583596, + "grad_norm": 0.5842414289182047, + "learning_rate": 1.93400194314072e-05, + "loss": 0.4339, + "step": 2377 + }, + { + "epoch": 0.4688485804416404, + "grad_norm": 0.6069017142653811, + "learning_rate": 1.9339465544643623e-05, + "loss": 0.4667, + "step": 2378 + }, + { + "epoch": 0.46904574132492116, + "grad_norm": 0.7521849096494124, + "learning_rate": 1.9338911433491868e-05, + "loss": 0.4821, + "step": 2379 + }, + { + "epoch": 0.4692429022082019, + "grad_norm": 0.6450195680499727, + "learning_rate": 1.933835709796525e-05, + "loss": 0.451, + "step": 2380 + }, + { + "epoch": 0.4694400630914827, + "grad_norm": 0.6039688998164772, + "learning_rate": 1.933780253807708e-05, + "loss": 0.4661, + "step": 2381 + }, + { + "epoch": 0.46963722397476343, + "grad_norm": 0.5834276251343435, + "learning_rate": 1.933724775384068e-05, + "loss": 0.4327, + "step": 2382 + }, + { + "epoch": 0.4698343848580442, + "grad_norm": 0.6142679272929852, + "learning_rate": 1.9336692745269388e-05, + "loss": 0.4644, + "step": 2383 + }, + { + "epoch": 0.47003154574132494, + "grad_norm": 0.6124895653388481, + "learning_rate": 1.9336137512376532e-05, + "loss": 0.4099, + "step": 2384 + }, + { + "epoch": 0.4702287066246057, + "grad_norm": 0.6384760441175886, + "learning_rate": 1.9335582055175454e-05, + "loss": 0.4585, + "step": 2385 + }, + { + "epoch": 0.47042586750788645, + "grad_norm": 0.6605064367442711, + "learning_rate": 1.9335026373679503e-05, + "loss": 0.4624, + "step": 2386 + }, + { + "epoch": 0.4706230283911672, + "grad_norm": 0.9621836476711323, + "learning_rate": 1.9334470467902024e-05, + "loss": 0.4828, + "step": 2387 + }, + { + "epoch": 0.47082018927444796, + "grad_norm": 0.6671978180371507, + "learning_rate": 1.9333914337856373e-05, + "loss": 0.4651, + "step": 2388 + }, + { + "epoch": 0.4710173501577287, + "grad_norm": 0.6234859549259715, + "learning_rate": 1.933335798355591e-05, + "loss": 0.4176, + "step": 2389 + }, + { + "epoch": 0.4712145110410095, + "grad_norm": 0.7746006898875549, + "learning_rate": 1.9332801405014013e-05, + "loss": 0.4175, + "step": 2390 + }, + { + "epoch": 0.47141167192429023, + "grad_norm": 0.6524991860281778, + "learning_rate": 1.9332244602244042e-05, + "loss": 0.4798, + "step": 2391 + }, + { + "epoch": 0.471608832807571, + "grad_norm": 0.6671756189730206, + "learning_rate": 1.9331687575259378e-05, + "loss": 0.4623, + "step": 2392 + }, + { + "epoch": 0.47180599369085174, + "grad_norm": 0.5980869066369106, + "learning_rate": 1.933113032407341e-05, + "loss": 0.3908, + "step": 2393 + }, + { + "epoch": 0.4720031545741325, + "grad_norm": 0.6523513564819309, + "learning_rate": 1.933057284869952e-05, + "loss": 0.4516, + "step": 2394 + }, + { + "epoch": 0.47220031545741326, + "grad_norm": 0.6761092970782154, + "learning_rate": 1.93300151491511e-05, + "loss": 0.4672, + "step": 2395 + }, + { + "epoch": 0.472397476340694, + "grad_norm": 0.6145122564628309, + "learning_rate": 1.9329457225441554e-05, + "loss": 0.4365, + "step": 2396 + }, + { + "epoch": 0.47259463722397477, + "grad_norm": 0.9246573499880514, + "learning_rate": 1.932889907758429e-05, + "loss": 0.4831, + "step": 2397 + }, + { + "epoch": 0.4727917981072555, + "grad_norm": 3.0455979525914487, + "learning_rate": 1.9328340705592708e-05, + "loss": 0.4655, + "step": 2398 + }, + { + "epoch": 0.4729889589905363, + "grad_norm": 3.2802273681352085, + "learning_rate": 1.932778210948023e-05, + "loss": 0.4443, + "step": 2399 + }, + { + "epoch": 0.47318611987381703, + "grad_norm": 0.715855291735479, + "learning_rate": 1.9327223289260274e-05, + "loss": 0.4375, + "step": 2400 + }, + { + "epoch": 0.4733832807570978, + "grad_norm": 0.7740268211710657, + "learning_rate": 1.932666424494627e-05, + "loss": 0.4653, + "step": 2401 + }, + { + "epoch": 0.47358044164037855, + "grad_norm": 0.6204311682591072, + "learning_rate": 1.9326104976551643e-05, + "loss": 0.4134, + "step": 2402 + }, + { + "epoch": 0.4737776025236593, + "grad_norm": 0.7091391867960442, + "learning_rate": 1.932554548408984e-05, + "loss": 0.449, + "step": 2403 + }, + { + "epoch": 0.47397476340694006, + "grad_norm": 0.7803886372499086, + "learning_rate": 1.932498576757429e-05, + "loss": 0.4777, + "step": 2404 + }, + { + "epoch": 0.4741719242902208, + "grad_norm": 0.716866961037204, + "learning_rate": 1.9324425827018452e-05, + "loss": 0.4242, + "step": 2405 + }, + { + "epoch": 0.47436908517350157, + "grad_norm": 0.6489682185779229, + "learning_rate": 1.932386566243577e-05, + "loss": 0.4261, + "step": 2406 + }, + { + "epoch": 0.4745662460567823, + "grad_norm": 0.6254026998694395, + "learning_rate": 1.9323305273839713e-05, + "loss": 0.4012, + "step": 2407 + }, + { + "epoch": 0.4747634069400631, + "grad_norm": 0.664068211708124, + "learning_rate": 1.9322744661243732e-05, + "loss": 0.4401, + "step": 2408 + }, + { + "epoch": 0.47496056782334384, + "grad_norm": 0.859745025842503, + "learning_rate": 1.9322183824661306e-05, + "loss": 0.4634, + "step": 2409 + }, + { + "epoch": 0.4751577287066246, + "grad_norm": 0.7510084681382883, + "learning_rate": 1.932162276410591e-05, + "loss": 0.4403, + "step": 2410 + }, + { + "epoch": 0.47535488958990535, + "grad_norm": 0.62038420709768, + "learning_rate": 1.9321061479591017e-05, + "loss": 0.4503, + "step": 2411 + }, + { + "epoch": 0.4755520504731861, + "grad_norm": 0.6669051292027413, + "learning_rate": 1.9320499971130114e-05, + "loss": 0.4232, + "step": 2412 + }, + { + "epoch": 0.47574921135646686, + "grad_norm": 0.6402332048996934, + "learning_rate": 1.93199382387367e-05, + "loss": 0.4647, + "step": 2413 + }, + { + "epoch": 0.4759463722397476, + "grad_norm": 0.6407606660148513, + "learning_rate": 1.9319376282424255e-05, + "loss": 0.415, + "step": 2414 + }, + { + "epoch": 0.4761435331230284, + "grad_norm": 0.6662928458835895, + "learning_rate": 1.9318814102206296e-05, + "loss": 0.4525, + "step": 2415 + }, + { + "epoch": 0.47634069400630913, + "grad_norm": 0.66510230765238, + "learning_rate": 1.9318251698096322e-05, + "loss": 0.4325, + "step": 2416 + }, + { + "epoch": 0.4765378548895899, + "grad_norm": 0.8178454623617458, + "learning_rate": 1.931768907010785e-05, + "loss": 0.448, + "step": 2417 + }, + { + "epoch": 0.47673501577287064, + "grad_norm": 0.6008989268966388, + "learning_rate": 1.931712621825439e-05, + "loss": 0.4561, + "step": 2418 + }, + { + "epoch": 0.4769321766561514, + "grad_norm": 0.6058146707232347, + "learning_rate": 1.9316563142549475e-05, + "loss": 0.4263, + "step": 2419 + }, + { + "epoch": 0.47712933753943215, + "grad_norm": 0.6620728520314264, + "learning_rate": 1.9315999843006624e-05, + "loss": 0.4697, + "step": 2420 + }, + { + "epoch": 0.4773264984227129, + "grad_norm": 0.6180305422847591, + "learning_rate": 1.9315436319639375e-05, + "loss": 0.4495, + "step": 2421 + }, + { + "epoch": 0.47752365930599366, + "grad_norm": 0.7048061474007149, + "learning_rate": 1.9314872572461265e-05, + "loss": 0.4593, + "step": 2422 + }, + { + "epoch": 0.4777208201892745, + "grad_norm": 1.0945355318730723, + "learning_rate": 1.9314308601485842e-05, + "loss": 0.4426, + "step": 2423 + }, + { + "epoch": 0.47791798107255523, + "grad_norm": 0.5490282310707831, + "learning_rate": 1.9313744406726656e-05, + "loss": 0.4061, + "step": 2424 + }, + { + "epoch": 0.478115141955836, + "grad_norm": 0.6400659060403348, + "learning_rate": 1.931317998819726e-05, + "loss": 0.4378, + "step": 2425 + }, + { + "epoch": 0.47831230283911674, + "grad_norm": 0.6364254959448777, + "learning_rate": 1.931261534591121e-05, + "loss": 0.4796, + "step": 2426 + }, + { + "epoch": 0.4785094637223975, + "grad_norm": 0.5901543416183237, + "learning_rate": 1.9312050479882082e-05, + "loss": 0.4326, + "step": 2427 + }, + { + "epoch": 0.47870662460567825, + "grad_norm": 0.7359527860806958, + "learning_rate": 1.9311485390123442e-05, + "loss": 0.5009, + "step": 2428 + }, + { + "epoch": 0.478903785488959, + "grad_norm": 0.9323925635649081, + "learning_rate": 1.931092007664886e-05, + "loss": 0.4826, + "step": 2429 + }, + { + "epoch": 0.47910094637223977, + "grad_norm": 0.5787415894460493, + "learning_rate": 1.9310354539471935e-05, + "loss": 0.4639, + "step": 2430 + }, + { + "epoch": 0.4792981072555205, + "grad_norm": 0.6479424198694343, + "learning_rate": 1.930978877860624e-05, + "loss": 0.4823, + "step": 2431 + }, + { + "epoch": 0.4794952681388013, + "grad_norm": 0.5894770732012385, + "learning_rate": 1.9309222794065373e-05, + "loss": 0.4604, + "step": 2432 + }, + { + "epoch": 0.47969242902208203, + "grad_norm": 0.639884540685977, + "learning_rate": 1.930865658586293e-05, + "loss": 0.4675, + "step": 2433 + }, + { + "epoch": 0.4798895899053628, + "grad_norm": 0.6684635586126257, + "learning_rate": 1.930809015401252e-05, + "loss": 0.4585, + "step": 2434 + }, + { + "epoch": 0.48008675078864355, + "grad_norm": 0.6336667692984291, + "learning_rate": 1.9307523498527744e-05, + "loss": 0.471, + "step": 2435 + }, + { + "epoch": 0.4802839116719243, + "grad_norm": 0.6876919208132971, + "learning_rate": 1.930695661942222e-05, + "loss": 0.4582, + "step": 2436 + }, + { + "epoch": 0.48048107255520506, + "grad_norm": 0.5960087219303766, + "learning_rate": 1.9306389516709575e-05, + "loss": 0.4476, + "step": 2437 + }, + { + "epoch": 0.4806782334384858, + "grad_norm": 0.6171728207106327, + "learning_rate": 1.9305822190403422e-05, + "loss": 0.4362, + "step": 2438 + }, + { + "epoch": 0.48087539432176657, + "grad_norm": 0.6355089188560651, + "learning_rate": 1.9305254640517398e-05, + "loss": 0.4392, + "step": 2439 + }, + { + "epoch": 0.4810725552050473, + "grad_norm": 1.1033461138258283, + "learning_rate": 1.9304686867065138e-05, + "loss": 0.4564, + "step": 2440 + }, + { + "epoch": 0.4812697160883281, + "grad_norm": 0.6067878466977591, + "learning_rate": 1.9304118870060283e-05, + "loss": 0.4234, + "step": 2441 + }, + { + "epoch": 0.48146687697160884, + "grad_norm": 0.6684348483494865, + "learning_rate": 1.930355064951648e-05, + "loss": 0.4558, + "step": 2442 + }, + { + "epoch": 0.4816640378548896, + "grad_norm": 0.62445972264211, + "learning_rate": 1.930298220544738e-05, + "loss": 0.4464, + "step": 2443 + }, + { + "epoch": 0.48186119873817035, + "grad_norm": 0.6151015537788406, + "learning_rate": 1.9302413537866642e-05, + "loss": 0.4514, + "step": 2444 + }, + { + "epoch": 0.4820583596214511, + "grad_norm": 0.6241079129743627, + "learning_rate": 1.9301844646787927e-05, + "loss": 0.4433, + "step": 2445 + }, + { + "epoch": 0.48225552050473186, + "grad_norm": 0.7891240381146803, + "learning_rate": 1.93012755322249e-05, + "loss": 0.4893, + "step": 2446 + }, + { + "epoch": 0.4824526813880126, + "grad_norm": 0.6404678342701038, + "learning_rate": 1.9300706194191244e-05, + "loss": 0.4532, + "step": 2447 + }, + { + "epoch": 0.48264984227129337, + "grad_norm": 0.6493424413450147, + "learning_rate": 1.930013663270063e-05, + "loss": 0.4498, + "step": 2448 + }, + { + "epoch": 0.48284700315457413, + "grad_norm": 0.6508616343699691, + "learning_rate": 1.929956684776674e-05, + "loss": 0.487, + "step": 2449 + }, + { + "epoch": 0.4830441640378549, + "grad_norm": 0.6434094284010808, + "learning_rate": 1.929899683940327e-05, + "loss": 0.4437, + "step": 2450 + }, + { + "epoch": 0.48324132492113564, + "grad_norm": 0.6735775372022499, + "learning_rate": 1.9298426607623915e-05, + "loss": 0.4301, + "step": 2451 + }, + { + "epoch": 0.4834384858044164, + "grad_norm": 0.6116434104938371, + "learning_rate": 1.929785615244237e-05, + "loss": 0.433, + "step": 2452 + }, + { + "epoch": 0.48363564668769715, + "grad_norm": 0.6272151580269819, + "learning_rate": 1.9297285473872343e-05, + "loss": 0.4447, + "step": 2453 + }, + { + "epoch": 0.4838328075709779, + "grad_norm": 0.6438884702800719, + "learning_rate": 1.929671457192755e-05, + "loss": 0.475, + "step": 2454 + }, + { + "epoch": 0.48402996845425866, + "grad_norm": 0.7535169667410828, + "learning_rate": 1.9296143446621697e-05, + "loss": 0.4784, + "step": 2455 + }, + { + "epoch": 0.4842271293375394, + "grad_norm": 44.84808687603809, + "learning_rate": 1.9295572097968514e-05, + "loss": 0.7183, + "step": 2456 + }, + { + "epoch": 0.4844242902208202, + "grad_norm": 0.6854267912996111, + "learning_rate": 1.9295000525981725e-05, + "loss": 0.4581, + "step": 2457 + }, + { + "epoch": 0.48462145110410093, + "grad_norm": 0.7188232334712162, + "learning_rate": 1.929442873067506e-05, + "loss": 0.4354, + "step": 2458 + }, + { + "epoch": 0.4848186119873817, + "grad_norm": 9.85290231143863, + "learning_rate": 1.9293856712062267e-05, + "loss": 0.5323, + "step": 2459 + }, + { + "epoch": 0.48501577287066244, + "grad_norm": 0.6382321261076438, + "learning_rate": 1.9293284470157082e-05, + "loss": 0.4332, + "step": 2460 + }, + { + "epoch": 0.4852129337539432, + "grad_norm": 0.8581690728691596, + "learning_rate": 1.9292712004973248e-05, + "loss": 0.4314, + "step": 2461 + }, + { + "epoch": 0.48541009463722395, + "grad_norm": 0.6494937725982591, + "learning_rate": 1.9292139316524528e-05, + "loss": 0.4891, + "step": 2462 + }, + { + "epoch": 0.4856072555205047, + "grad_norm": 0.6193320454689136, + "learning_rate": 1.9291566404824676e-05, + "loss": 0.4077, + "step": 2463 + }, + { + "epoch": 0.48580441640378547, + "grad_norm": 0.6374169775943455, + "learning_rate": 1.9290993269887458e-05, + "loss": 0.4282, + "step": 2464 + }, + { + "epoch": 0.4860015772870662, + "grad_norm": 0.6787679196471768, + "learning_rate": 1.9290419911726647e-05, + "loss": 0.4902, + "step": 2465 + }, + { + "epoch": 0.486198738170347, + "grad_norm": 0.6598399921444862, + "learning_rate": 1.9289846330356018e-05, + "loss": 0.4991, + "step": 2466 + }, + { + "epoch": 0.48639589905362773, + "grad_norm": 0.6206359700753303, + "learning_rate": 1.9289272525789348e-05, + "loss": 0.4457, + "step": 2467 + }, + { + "epoch": 0.4865930599369085, + "grad_norm": 0.5930790685776189, + "learning_rate": 1.9288698498040423e-05, + "loss": 0.4066, + "step": 2468 + }, + { + "epoch": 0.4867902208201893, + "grad_norm": 0.651999060628564, + "learning_rate": 1.928812424712304e-05, + "loss": 0.4191, + "step": 2469 + }, + { + "epoch": 0.48698738170347006, + "grad_norm": 0.615287126559333, + "learning_rate": 1.9287549773050988e-05, + "loss": 0.4301, + "step": 2470 + }, + { + "epoch": 0.4871845425867508, + "grad_norm": 0.7055975170195661, + "learning_rate": 1.9286975075838077e-05, + "loss": 0.4719, + "step": 2471 + }, + { + "epoch": 0.48738170347003157, + "grad_norm": 0.5856890230925806, + "learning_rate": 1.9286400155498107e-05, + "loss": 0.3913, + "step": 2472 + }, + { + "epoch": 0.4875788643533123, + "grad_norm": 0.8762126171240702, + "learning_rate": 1.92858250120449e-05, + "loss": 0.4059, + "step": 2473 + }, + { + "epoch": 0.4877760252365931, + "grad_norm": 0.6128580611588724, + "learning_rate": 1.9285249645492266e-05, + "loss": 0.4451, + "step": 2474 + }, + { + "epoch": 0.48797318611987384, + "grad_norm": 0.803850920215572, + "learning_rate": 1.928467405585403e-05, + "loss": 0.4162, + "step": 2475 + }, + { + "epoch": 0.4881703470031546, + "grad_norm": 0.684760361276952, + "learning_rate": 1.9284098243144028e-05, + "loss": 0.4731, + "step": 2476 + }, + { + "epoch": 0.48836750788643535, + "grad_norm": 0.6490585074155004, + "learning_rate": 1.9283522207376088e-05, + "loss": 0.4238, + "step": 2477 + }, + { + "epoch": 0.4885646687697161, + "grad_norm": 0.6065001605718633, + "learning_rate": 1.9282945948564047e-05, + "loss": 0.435, + "step": 2478 + }, + { + "epoch": 0.48876182965299686, + "grad_norm": 0.6557206451679547, + "learning_rate": 1.9282369466721756e-05, + "loss": 0.4611, + "step": 2479 + }, + { + "epoch": 0.4889589905362776, + "grad_norm": 0.5885897744446494, + "learning_rate": 1.9281792761863067e-05, + "loss": 0.475, + "step": 2480 + }, + { + "epoch": 0.48915615141955837, + "grad_norm": 0.6645075192181713, + "learning_rate": 1.928121583400183e-05, + "loss": 0.4432, + "step": 2481 + }, + { + "epoch": 0.4893533123028391, + "grad_norm": 0.854625778831215, + "learning_rate": 1.9280638683151903e-05, + "loss": 0.4287, + "step": 2482 + }, + { + "epoch": 0.4895504731861199, + "grad_norm": 10.207500341633537, + "learning_rate": 1.9280061309327164e-05, + "loss": 0.4656, + "step": 2483 + }, + { + "epoch": 0.48974763406940064, + "grad_norm": 0.8188261610562382, + "learning_rate": 1.9279483712541477e-05, + "loss": 0.4747, + "step": 2484 + }, + { + "epoch": 0.4899447949526814, + "grad_norm": 0.6708338710083666, + "learning_rate": 1.9278905892808725e-05, + "loss": 0.466, + "step": 2485 + }, + { + "epoch": 0.49014195583596215, + "grad_norm": 0.827156096180289, + "learning_rate": 1.9278327850142783e-05, + "loss": 0.4859, + "step": 2486 + }, + { + "epoch": 0.4903391167192429, + "grad_norm": 0.7039638642215147, + "learning_rate": 1.9277749584557543e-05, + "loss": 0.457, + "step": 2487 + }, + { + "epoch": 0.49053627760252366, + "grad_norm": 0.6238491942723505, + "learning_rate": 1.9277171096066895e-05, + "loss": 0.4593, + "step": 2488 + }, + { + "epoch": 0.4907334384858044, + "grad_norm": 0.6466067939563221, + "learning_rate": 1.9276592384684745e-05, + "loss": 0.4694, + "step": 2489 + }, + { + "epoch": 0.4909305993690852, + "grad_norm": 0.6199822833188464, + "learning_rate": 1.9276013450424995e-05, + "loss": 0.4572, + "step": 2490 + }, + { + "epoch": 0.49112776025236593, + "grad_norm": 1.053077823726099, + "learning_rate": 1.9275434293301544e-05, + "loss": 0.4559, + "step": 2491 + }, + { + "epoch": 0.4913249211356467, + "grad_norm": 0.6349781493349106, + "learning_rate": 1.9274854913328317e-05, + "loss": 0.5007, + "step": 2492 + }, + { + "epoch": 0.49152208201892744, + "grad_norm": 0.616100989000789, + "learning_rate": 1.9274275310519234e-05, + "loss": 0.417, + "step": 2493 + }, + { + "epoch": 0.4917192429022082, + "grad_norm": 0.6027522466238368, + "learning_rate": 1.9273695484888216e-05, + "loss": 0.454, + "step": 2494 + }, + { + "epoch": 0.49191640378548895, + "grad_norm": 0.6388698511914507, + "learning_rate": 1.9273115436449198e-05, + "loss": 0.4398, + "step": 2495 + }, + { + "epoch": 0.4921135646687697, + "grad_norm": 1.088727821676831, + "learning_rate": 1.9272535165216112e-05, + "loss": 0.4679, + "step": 2496 + }, + { + "epoch": 0.49231072555205047, + "grad_norm": 0.6152883070859394, + "learning_rate": 1.9271954671202902e-05, + "loss": 0.4152, + "step": 2497 + }, + { + "epoch": 0.4925078864353312, + "grad_norm": 0.5214691152255505, + "learning_rate": 1.9271373954423517e-05, + "loss": 0.4056, + "step": 2498 + }, + { + "epoch": 0.492705047318612, + "grad_norm": 0.641068964137141, + "learning_rate": 1.9270793014891906e-05, + "loss": 0.4716, + "step": 2499 + }, + { + "epoch": 0.49290220820189273, + "grad_norm": 0.5667713763054852, + "learning_rate": 1.9270211852622024e-05, + "loss": 0.4235, + "step": 2500 + }, + { + "epoch": 0.4930993690851735, + "grad_norm": 0.7075480774876152, + "learning_rate": 1.926963046762784e-05, + "loss": 0.4775, + "step": 2501 + }, + { + "epoch": 0.49329652996845424, + "grad_norm": 0.5892770191488615, + "learning_rate": 1.9269048859923318e-05, + "loss": 0.4885, + "step": 2502 + }, + { + "epoch": 0.493493690851735, + "grad_norm": 0.6792628358438927, + "learning_rate": 1.9268467029522432e-05, + "loss": 0.4683, + "step": 2503 + }, + { + "epoch": 0.49369085173501576, + "grad_norm": 0.637559207810441, + "learning_rate": 1.9267884976439163e-05, + "loss": 0.4298, + "step": 2504 + }, + { + "epoch": 0.4938880126182965, + "grad_norm": 0.5698260135323511, + "learning_rate": 1.9267302700687494e-05, + "loss": 0.4026, + "step": 2505 + }, + { + "epoch": 0.49408517350157727, + "grad_norm": 0.5949689949186419, + "learning_rate": 1.9266720202281413e-05, + "loss": 0.4076, + "step": 2506 + }, + { + "epoch": 0.494282334384858, + "grad_norm": 0.6042856982676498, + "learning_rate": 1.9266137481234918e-05, + "loss": 0.4727, + "step": 2507 + }, + { + "epoch": 0.4944794952681388, + "grad_norm": 0.5738882939815504, + "learning_rate": 1.9265554537562008e-05, + "loss": 0.4585, + "step": 2508 + }, + { + "epoch": 0.49467665615141954, + "grad_norm": 1.6832821361365304, + "learning_rate": 1.926497137127669e-05, + "loss": 0.435, + "step": 2509 + }, + { + "epoch": 0.4948738170347003, + "grad_norm": 0.7344488196925597, + "learning_rate": 1.9264387982392972e-05, + "loss": 0.4663, + "step": 2510 + }, + { + "epoch": 0.49507097791798105, + "grad_norm": 0.8273300571453689, + "learning_rate": 1.926380437092487e-05, + "loss": 0.4682, + "step": 2511 + }, + { + "epoch": 0.4952681388012618, + "grad_norm": 0.6195232818493738, + "learning_rate": 1.9263220536886413e-05, + "loss": 0.4296, + "step": 2512 + }, + { + "epoch": 0.49546529968454256, + "grad_norm": 0.6274418197133689, + "learning_rate": 1.9262636480291618e-05, + "loss": 0.4445, + "step": 2513 + }, + { + "epoch": 0.49566246056782337, + "grad_norm": 0.6220175805229026, + "learning_rate": 1.9262052201154525e-05, + "loss": 0.4491, + "step": 2514 + }, + { + "epoch": 0.4958596214511041, + "grad_norm": 1.2792193280662592, + "learning_rate": 1.926146769948917e-05, + "loss": 0.441, + "step": 2515 + }, + { + "epoch": 0.4960567823343849, + "grad_norm": 0.6309907361680024, + "learning_rate": 1.926088297530959e-05, + "loss": 0.4213, + "step": 2516 + }, + { + "epoch": 0.49625394321766564, + "grad_norm": 0.6327839276931332, + "learning_rate": 1.9260298028629846e-05, + "loss": 0.4576, + "step": 2517 + }, + { + "epoch": 0.4964511041009464, + "grad_norm": 0.6462083871492283, + "learning_rate": 1.925971285946398e-05, + "loss": 0.4825, + "step": 2518 + }, + { + "epoch": 0.49664826498422715, + "grad_norm": 0.659095376571992, + "learning_rate": 1.9259127467826055e-05, + "loss": 0.4919, + "step": 2519 + }, + { + "epoch": 0.4968454258675079, + "grad_norm": 0.7047541361963703, + "learning_rate": 1.925854185373014e-05, + "loss": 0.4622, + "step": 2520 + }, + { + "epoch": 0.49704258675078866, + "grad_norm": 0.6556353060348127, + "learning_rate": 1.9257956017190297e-05, + "loss": 0.4269, + "step": 2521 + }, + { + "epoch": 0.4972397476340694, + "grad_norm": 1.249097804465773, + "learning_rate": 1.9257369958220612e-05, + "loss": 0.4644, + "step": 2522 + }, + { + "epoch": 0.4974369085173502, + "grad_norm": 0.5862685793054593, + "learning_rate": 1.9256783676835153e-05, + "loss": 0.4157, + "step": 2523 + }, + { + "epoch": 0.49763406940063093, + "grad_norm": 0.5843281300414458, + "learning_rate": 1.9256197173048013e-05, + "loss": 0.4691, + "step": 2524 + }, + { + "epoch": 0.4978312302839117, + "grad_norm": 0.642053300811986, + "learning_rate": 1.925561044687328e-05, + "loss": 0.4671, + "step": 2525 + }, + { + "epoch": 0.49802839116719244, + "grad_norm": 2.457307700484372, + "learning_rate": 1.9255023498325055e-05, + "loss": 0.4732, + "step": 2526 + }, + { + "epoch": 0.4982255520504732, + "grad_norm": 1.0179133708857775, + "learning_rate": 1.9254436327417436e-05, + "loss": 0.4766, + "step": 2527 + }, + { + "epoch": 0.49842271293375395, + "grad_norm": 0.5737583222016323, + "learning_rate": 1.9253848934164533e-05, + "loss": 0.4076, + "step": 2528 + }, + { + "epoch": 0.4986198738170347, + "grad_norm": 1.222029502495448, + "learning_rate": 1.9253261318580456e-05, + "loss": 0.482, + "step": 2529 + }, + { + "epoch": 0.49881703470031546, + "grad_norm": 0.7053057865997824, + "learning_rate": 1.9252673480679328e-05, + "loss": 0.4837, + "step": 2530 + }, + { + "epoch": 0.4990141955835962, + "grad_norm": 0.6259696508511272, + "learning_rate": 1.9252085420475263e-05, + "loss": 0.4232, + "step": 2531 + }, + { + "epoch": 0.499211356466877, + "grad_norm": 0.6853061232406765, + "learning_rate": 1.92514971379824e-05, + "loss": 0.443, + "step": 2532 + }, + { + "epoch": 0.49940851735015773, + "grad_norm": 0.8584950321451298, + "learning_rate": 1.9250908633214863e-05, + "loss": 0.4483, + "step": 2533 + }, + { + "epoch": 0.4996056782334385, + "grad_norm": 0.6391676971123401, + "learning_rate": 1.92503199061868e-05, + "loss": 0.4797, + "step": 2534 + }, + { + "epoch": 0.49980283911671924, + "grad_norm": 0.638850981357959, + "learning_rate": 1.924973095691235e-05, + "loss": 0.4311, + "step": 2535 + }, + { + "epoch": 0.5, + "grad_norm": 0.6527695472648593, + "learning_rate": 1.9249141785405666e-05, + "loss": 0.4149, + "step": 2536 + }, + { + "epoch": 0.5, + "eval_loss": 0.45128634572029114, + "eval_runtime": 344.9468, + "eval_samples_per_second": 23.569, + "eval_steps_per_second": 1.476, + "step": 2536 + }, + { + "epoch": 0.5001971608832808, + "grad_norm": 0.6134815871569128, + "learning_rate": 1.9248552391680902e-05, + "loss": 0.4394, + "step": 2537 + }, + { + "epoch": 0.5003943217665615, + "grad_norm": 0.7190198346074168, + "learning_rate": 1.924796277575222e-05, + "loss": 0.4758, + "step": 2538 + }, + { + "epoch": 0.5005914826498423, + "grad_norm": 0.6176497275780272, + "learning_rate": 1.9247372937633785e-05, + "loss": 0.4417, + "step": 2539 + }, + { + "epoch": 0.500788643533123, + "grad_norm": 0.6368942778751133, + "learning_rate": 1.9246782877339767e-05, + "loss": 0.4418, + "step": 2540 + }, + { + "epoch": 0.5009858044164038, + "grad_norm": 0.6136453962550936, + "learning_rate": 1.9246192594884344e-05, + "loss": 0.4454, + "step": 2541 + }, + { + "epoch": 0.5011829652996845, + "grad_norm": 0.6118109954623908, + "learning_rate": 1.9245602090281698e-05, + "loss": 0.4704, + "step": 2542 + }, + { + "epoch": 0.5013801261829653, + "grad_norm": 0.5608720572816418, + "learning_rate": 1.924501136354602e-05, + "loss": 0.4331, + "step": 2543 + }, + { + "epoch": 0.501577287066246, + "grad_norm": 0.818907256932753, + "learning_rate": 1.924442041469149e-05, + "loss": 0.4582, + "step": 2544 + }, + { + "epoch": 0.5017744479495269, + "grad_norm": 0.5786413688970435, + "learning_rate": 1.9243829243732324e-05, + "loss": 0.4307, + "step": 2545 + }, + { + "epoch": 0.5019716088328076, + "grad_norm": 0.7567124449798754, + "learning_rate": 1.924323785068271e-05, + "loss": 0.4212, + "step": 2546 + }, + { + "epoch": 0.5021687697160884, + "grad_norm": 0.7087458808657718, + "learning_rate": 1.9242646235556868e-05, + "loss": 0.4734, + "step": 2547 + }, + { + "epoch": 0.5023659305993691, + "grad_norm": 0.8128756129153467, + "learning_rate": 1.9242054398369005e-05, + "loss": 0.4908, + "step": 2548 + }, + { + "epoch": 0.5025630914826499, + "grad_norm": 0.7036392962632404, + "learning_rate": 1.9241462339133342e-05, + "loss": 0.429, + "step": 2549 + }, + { + "epoch": 0.5027602523659306, + "grad_norm": 2.7528973749927608, + "learning_rate": 1.9240870057864106e-05, + "loss": 0.4387, + "step": 2550 + }, + { + "epoch": 0.5029574132492114, + "grad_norm": 0.6836200522610701, + "learning_rate": 1.9240277554575523e-05, + "loss": 0.434, + "step": 2551 + }, + { + "epoch": 0.5031545741324921, + "grad_norm": 0.6838558197223942, + "learning_rate": 1.923968482928183e-05, + "loss": 0.4878, + "step": 2552 + }, + { + "epoch": 0.5033517350157729, + "grad_norm": 0.6771965904434417, + "learning_rate": 1.9239091881997274e-05, + "loss": 0.4582, + "step": 2553 + }, + { + "epoch": 0.5035488958990536, + "grad_norm": 0.9137036161581693, + "learning_rate": 1.923849871273609e-05, + "loss": 0.4471, + "step": 2554 + }, + { + "epoch": 0.5037460567823344, + "grad_norm": 0.5838279107533122, + "learning_rate": 1.923790532151254e-05, + "loss": 0.4392, + "step": 2555 + }, + { + "epoch": 0.5039432176656151, + "grad_norm": 0.751287528093079, + "learning_rate": 1.9237311708340867e-05, + "loss": 0.4608, + "step": 2556 + }, + { + "epoch": 0.5041403785488959, + "grad_norm": 0.6438474116366795, + "learning_rate": 1.9236717873235347e-05, + "loss": 0.4483, + "step": 2557 + }, + { + "epoch": 0.5043375394321766, + "grad_norm": 0.6691961023553488, + "learning_rate": 1.923612381621024e-05, + "loss": 0.4729, + "step": 2558 + }, + { + "epoch": 0.5045347003154574, + "grad_norm": 0.8212757423894174, + "learning_rate": 1.923552953727982e-05, + "loss": 0.3985, + "step": 2559 + }, + { + "epoch": 0.5047318611987381, + "grad_norm": 0.8451322507658569, + "learning_rate": 1.923493503645837e-05, + "loss": 0.4875, + "step": 2560 + }, + { + "epoch": 0.504929022082019, + "grad_norm": 1.3003193055502538, + "learning_rate": 1.9234340313760163e-05, + "loss": 0.4569, + "step": 2561 + }, + { + "epoch": 0.5051261829652997, + "grad_norm": 0.6906807412142325, + "learning_rate": 1.9233745369199495e-05, + "loss": 0.4659, + "step": 2562 + }, + { + "epoch": 0.5053233438485805, + "grad_norm": 0.7693812343976348, + "learning_rate": 1.923315020279066e-05, + "loss": 0.4422, + "step": 2563 + }, + { + "epoch": 0.5055205047318612, + "grad_norm": 1.128657931598075, + "learning_rate": 1.9232554814547953e-05, + "loss": 0.4615, + "step": 2564 + }, + { + "epoch": 0.505717665615142, + "grad_norm": 1.1572958591539255, + "learning_rate": 1.923195920448569e-05, + "loss": 0.4783, + "step": 2565 + }, + { + "epoch": 0.5059148264984227, + "grad_norm": 0.698446780450944, + "learning_rate": 1.9231363372618165e-05, + "loss": 0.4343, + "step": 2566 + }, + { + "epoch": 0.5061119873817035, + "grad_norm": 0.5921597967019708, + "learning_rate": 1.92307673189597e-05, + "loss": 0.3959, + "step": 2567 + }, + { + "epoch": 0.5063091482649842, + "grad_norm": 0.7489688974414478, + "learning_rate": 1.923017104352462e-05, + "loss": 0.4237, + "step": 2568 + }, + { + "epoch": 0.506506309148265, + "grad_norm": 0.6464100318912962, + "learning_rate": 1.9229574546327247e-05, + "loss": 0.4587, + "step": 2569 + }, + { + "epoch": 0.5067034700315457, + "grad_norm": 0.5894037282136724, + "learning_rate": 1.9228977827381914e-05, + "loss": 0.412, + "step": 2570 + }, + { + "epoch": 0.5069006309148265, + "grad_norm": 0.5896415150715301, + "learning_rate": 1.922838088670296e-05, + "loss": 0.4415, + "step": 2571 + }, + { + "epoch": 0.5070977917981072, + "grad_norm": 0.6093444816516489, + "learning_rate": 1.9227783724304716e-05, + "loss": 0.4246, + "step": 2572 + }, + { + "epoch": 0.507294952681388, + "grad_norm": 0.6133472401577025, + "learning_rate": 1.922718634020154e-05, + "loss": 0.4265, + "step": 2573 + }, + { + "epoch": 0.5074921135646687, + "grad_norm": 0.6834888429313623, + "learning_rate": 1.922658873440778e-05, + "loss": 0.4595, + "step": 2574 + }, + { + "epoch": 0.5076892744479495, + "grad_norm": 0.6948580579520518, + "learning_rate": 1.92259909069378e-05, + "loss": 0.448, + "step": 2575 + }, + { + "epoch": 0.5078864353312302, + "grad_norm": 0.6374166697259309, + "learning_rate": 1.9225392857805955e-05, + "loss": 0.436, + "step": 2576 + }, + { + "epoch": 0.508083596214511, + "grad_norm": 0.6666234831625328, + "learning_rate": 1.922479458702662e-05, + "loss": 0.4592, + "step": 2577 + }, + { + "epoch": 0.5082807570977917, + "grad_norm": 0.6537577476123761, + "learning_rate": 1.9224196094614163e-05, + "loss": 0.4444, + "step": 2578 + }, + { + "epoch": 0.5084779179810726, + "grad_norm": 0.608893272413261, + "learning_rate": 1.9223597380582967e-05, + "loss": 0.4611, + "step": 2579 + }, + { + "epoch": 0.5086750788643533, + "grad_norm": 0.6235476792381212, + "learning_rate": 1.9222998444947417e-05, + "loss": 0.4138, + "step": 2580 + }, + { + "epoch": 0.5088722397476341, + "grad_norm": 0.6352774634847786, + "learning_rate": 1.92223992877219e-05, + "loss": 0.4596, + "step": 2581 + }, + { + "epoch": 0.5090694006309149, + "grad_norm": 0.7096357346244977, + "learning_rate": 1.922179990892082e-05, + "loss": 0.4706, + "step": 2582 + }, + { + "epoch": 0.5092665615141956, + "grad_norm": 0.9207989672506494, + "learning_rate": 1.9221200308558566e-05, + "loss": 0.4623, + "step": 2583 + }, + { + "epoch": 0.5094637223974764, + "grad_norm": 0.6470580276117848, + "learning_rate": 1.922060048664955e-05, + "loss": 0.4691, + "step": 2584 + }, + { + "epoch": 0.5096608832807571, + "grad_norm": 0.6435644649688526, + "learning_rate": 1.9220000443208183e-05, + "loss": 0.4633, + "step": 2585 + }, + { + "epoch": 0.5098580441640379, + "grad_norm": 0.7356132379703066, + "learning_rate": 1.9219400178248876e-05, + "loss": 0.4805, + "step": 2586 + }, + { + "epoch": 0.5100552050473186, + "grad_norm": 0.7125001237201177, + "learning_rate": 1.9218799691786062e-05, + "loss": 0.4589, + "step": 2587 + }, + { + "epoch": 0.5102523659305994, + "grad_norm": 0.5882472375305573, + "learning_rate": 1.9218198983834155e-05, + "loss": 0.4088, + "step": 2588 + }, + { + "epoch": 0.5104495268138801, + "grad_norm": 0.6522458410768266, + "learning_rate": 1.9217598054407598e-05, + "loss": 0.4515, + "step": 2589 + }, + { + "epoch": 0.5106466876971609, + "grad_norm": 0.6181026041148553, + "learning_rate": 1.9216996903520827e-05, + "loss": 0.4571, + "step": 2590 + }, + { + "epoch": 0.5108438485804416, + "grad_norm": 0.5899550767444578, + "learning_rate": 1.9216395531188277e-05, + "loss": 0.4303, + "step": 2591 + }, + { + "epoch": 0.5110410094637224, + "grad_norm": 0.6129864766903566, + "learning_rate": 1.9215793937424404e-05, + "loss": 0.4711, + "step": 2592 + }, + { + "epoch": 0.5112381703470031, + "grad_norm": 0.687929702723702, + "learning_rate": 1.9215192122243663e-05, + "loss": 0.4661, + "step": 2593 + }, + { + "epoch": 0.511435331230284, + "grad_norm": 0.5550118700233378, + "learning_rate": 1.921459008566051e-05, + "loss": 0.4122, + "step": 2594 + }, + { + "epoch": 0.5116324921135647, + "grad_norm": 0.6517123976025492, + "learning_rate": 1.921398782768941e-05, + "loss": 0.4537, + "step": 2595 + }, + { + "epoch": 0.5118296529968455, + "grad_norm": 0.5998912924516978, + "learning_rate": 1.9213385348344827e-05, + "loss": 0.4539, + "step": 2596 + }, + { + "epoch": 0.5120268138801262, + "grad_norm": 0.6170261164147766, + "learning_rate": 1.9212782647641247e-05, + "loss": 0.4556, + "step": 2597 + }, + { + "epoch": 0.512223974763407, + "grad_norm": 0.6459407251957247, + "learning_rate": 1.9212179725593144e-05, + "loss": 0.4729, + "step": 2598 + }, + { + "epoch": 0.5124211356466877, + "grad_norm": 0.6260843018867657, + "learning_rate": 1.9211576582215e-05, + "loss": 0.4851, + "step": 2599 + }, + { + "epoch": 0.5126182965299685, + "grad_norm": 0.5489077348828413, + "learning_rate": 1.921097321752132e-05, + "loss": 0.4099, + "step": 2600 + }, + { + "epoch": 0.5128154574132492, + "grad_norm": 0.903385996056016, + "learning_rate": 1.9210369631526583e-05, + "loss": 0.429, + "step": 2601 + }, + { + "epoch": 0.51301261829653, + "grad_norm": 0.608980064766385, + "learning_rate": 1.9209765824245302e-05, + "loss": 0.4643, + "step": 2602 + }, + { + "epoch": 0.5132097791798107, + "grad_norm": 0.5824584984075388, + "learning_rate": 1.9209161795691975e-05, + "loss": 0.4359, + "step": 2603 + }, + { + "epoch": 0.5134069400630915, + "grad_norm": 62.5920871913074, + "learning_rate": 1.9208557545881127e-05, + "loss": 0.7084, + "step": 2604 + }, + { + "epoch": 0.5136041009463722, + "grad_norm": 0.7237096419484113, + "learning_rate": 1.9207953074827264e-05, + "loss": 0.4803, + "step": 2605 + }, + { + "epoch": 0.513801261829653, + "grad_norm": 0.6244628305928418, + "learning_rate": 1.9207348382544914e-05, + "loss": 0.4455, + "step": 2606 + }, + { + "epoch": 0.5139984227129337, + "grad_norm": 0.6740117956920548, + "learning_rate": 1.9206743469048606e-05, + "loss": 0.4243, + "step": 2607 + }, + { + "epoch": 0.5141955835962145, + "grad_norm": 0.64353274531661, + "learning_rate": 1.920613833435287e-05, + "loss": 0.4422, + "step": 2608 + }, + { + "epoch": 0.5143927444794952, + "grad_norm": 0.6873646678800827, + "learning_rate": 1.920553297847225e-05, + "loss": 0.4708, + "step": 2609 + }, + { + "epoch": 0.514589905362776, + "grad_norm": 0.8576563607865519, + "learning_rate": 1.9204927401421284e-05, + "loss": 0.4518, + "step": 2610 + }, + { + "epoch": 0.5147870662460567, + "grad_norm": 0.6288653111671121, + "learning_rate": 1.9204321603214523e-05, + "loss": 0.4579, + "step": 2611 + }, + { + "epoch": 0.5149842271293376, + "grad_norm": 0.631342041680086, + "learning_rate": 1.9203715583866527e-05, + "loss": 0.4919, + "step": 2612 + }, + { + "epoch": 0.5151813880126183, + "grad_norm": 0.7478480009750041, + "learning_rate": 1.920310934339185e-05, + "loss": 0.4642, + "step": 2613 + }, + { + "epoch": 0.5153785488958991, + "grad_norm": 0.6757289333762163, + "learning_rate": 1.920250288180506e-05, + "loss": 0.4699, + "step": 2614 + }, + { + "epoch": 0.5155757097791798, + "grad_norm": 0.569219529443845, + "learning_rate": 1.9201896199120728e-05, + "loss": 0.4038, + "step": 2615 + }, + { + "epoch": 0.5157728706624606, + "grad_norm": 0.6266639043108642, + "learning_rate": 1.920128929535343e-05, + "loss": 0.4584, + "step": 2616 + }, + { + "epoch": 0.5159700315457413, + "grad_norm": 0.6019770065246319, + "learning_rate": 1.9200682170517746e-05, + "loss": 0.4545, + "step": 2617 + }, + { + "epoch": 0.5161671924290221, + "grad_norm": 0.7396548559691547, + "learning_rate": 1.9200074824628267e-05, + "loss": 0.4819, + "step": 2618 + }, + { + "epoch": 0.5163643533123028, + "grad_norm": 0.6032049311937472, + "learning_rate": 1.9199467257699577e-05, + "loss": 0.4672, + "step": 2619 + }, + { + "epoch": 0.5165615141955836, + "grad_norm": 0.5534977232280806, + "learning_rate": 1.919885946974628e-05, + "loss": 0.433, + "step": 2620 + }, + { + "epoch": 0.5167586750788643, + "grad_norm": 0.6139976333469656, + "learning_rate": 1.9198251460782974e-05, + "loss": 0.4354, + "step": 2621 + }, + { + "epoch": 0.5169558359621451, + "grad_norm": 0.6302918986448092, + "learning_rate": 1.9197643230824272e-05, + "loss": 0.4114, + "step": 2622 + }, + { + "epoch": 0.5171529968454258, + "grad_norm": 0.6836953420990394, + "learning_rate": 1.9197034779884785e-05, + "loss": 0.4611, + "step": 2623 + }, + { + "epoch": 0.5173501577287066, + "grad_norm": 0.5979385220267467, + "learning_rate": 1.919642610797913e-05, + "loss": 0.4061, + "step": 2624 + }, + { + "epoch": 0.5175473186119873, + "grad_norm": 0.657979498979339, + "learning_rate": 1.9195817215121933e-05, + "loss": 0.4931, + "step": 2625 + }, + { + "epoch": 0.5177444794952681, + "grad_norm": 0.9799814092912918, + "learning_rate": 1.9195208101327818e-05, + "loss": 0.4581, + "step": 2626 + }, + { + "epoch": 0.517941640378549, + "grad_norm": 0.6066811057663211, + "learning_rate": 1.9194598766611426e-05, + "loss": 0.4601, + "step": 2627 + }, + { + "epoch": 0.5181388012618297, + "grad_norm": 0.6509679416434392, + "learning_rate": 1.9193989210987396e-05, + "loss": 0.503, + "step": 2628 + }, + { + "epoch": 0.5183359621451105, + "grad_norm": 0.5884583173326658, + "learning_rate": 1.919337943447037e-05, + "loss": 0.4744, + "step": 2629 + }, + { + "epoch": 0.5185331230283912, + "grad_norm": 0.6369404434163963, + "learning_rate": 1.9192769437075e-05, + "loss": 0.4354, + "step": 2630 + }, + { + "epoch": 0.518730283911672, + "grad_norm": 0.6280177351268637, + "learning_rate": 1.919215921881594e-05, + "loss": 0.4608, + "step": 2631 + }, + { + "epoch": 0.5189274447949527, + "grad_norm": 0.6573633987043811, + "learning_rate": 1.9191548779707854e-05, + "loss": 0.4649, + "step": 2632 + }, + { + "epoch": 0.5191246056782335, + "grad_norm": 0.5972245759129801, + "learning_rate": 1.9190938119765404e-05, + "loss": 0.4536, + "step": 2633 + }, + { + "epoch": 0.5193217665615142, + "grad_norm": 0.625177544249115, + "learning_rate": 1.9190327239003267e-05, + "loss": 0.4509, + "step": 2634 + }, + { + "epoch": 0.519518927444795, + "grad_norm": 0.5920183204667294, + "learning_rate": 1.9189716137436118e-05, + "loss": 0.466, + "step": 2635 + }, + { + "epoch": 0.5197160883280757, + "grad_norm": 0.5840691953513331, + "learning_rate": 1.9189104815078633e-05, + "loss": 0.4253, + "step": 2636 + }, + { + "epoch": 0.5199132492113565, + "grad_norm": 0.5749628137522924, + "learning_rate": 1.918849327194551e-05, + "loss": 0.4263, + "step": 2637 + }, + { + "epoch": 0.5201104100946372, + "grad_norm": 0.645892075328813, + "learning_rate": 1.9187881508051433e-05, + "loss": 0.4376, + "step": 2638 + }, + { + "epoch": 0.520307570977918, + "grad_norm": 0.5985813732201994, + "learning_rate": 1.9187269523411108e-05, + "loss": 0.469, + "step": 2639 + }, + { + "epoch": 0.5205047318611987, + "grad_norm": 0.5662446538186162, + "learning_rate": 1.918665731803923e-05, + "loss": 0.4278, + "step": 2640 + }, + { + "epoch": 0.5207018927444795, + "grad_norm": 0.5774151154266736, + "learning_rate": 1.9186044891950514e-05, + "loss": 0.4541, + "step": 2641 + }, + { + "epoch": 0.5208990536277602, + "grad_norm": 0.5628515665275783, + "learning_rate": 1.9185432245159675e-05, + "loss": 0.4533, + "step": 2642 + }, + { + "epoch": 0.521096214511041, + "grad_norm": 0.602812437259254, + "learning_rate": 1.9184819377681425e-05, + "loss": 0.4411, + "step": 2643 + }, + { + "epoch": 0.5212933753943217, + "grad_norm": 0.6047746007443482, + "learning_rate": 1.9184206289530496e-05, + "loss": 0.4664, + "step": 2644 + }, + { + "epoch": 0.5214905362776026, + "grad_norm": 0.8256532576919925, + "learning_rate": 1.918359298072161e-05, + "loss": 0.4513, + "step": 2645 + }, + { + "epoch": 0.5216876971608833, + "grad_norm": 0.5378632559684782, + "learning_rate": 1.9182979451269513e-05, + "loss": 0.4264, + "step": 2646 + }, + { + "epoch": 0.5218848580441641, + "grad_norm": 0.6701591614996074, + "learning_rate": 1.9182365701188933e-05, + "loss": 0.4815, + "step": 2647 + }, + { + "epoch": 0.5220820189274448, + "grad_norm": 0.6319925276235775, + "learning_rate": 1.918175173049463e-05, + "loss": 0.4799, + "step": 2648 + }, + { + "epoch": 0.5222791798107256, + "grad_norm": 0.6247215611056411, + "learning_rate": 1.9181137539201343e-05, + "loss": 0.4698, + "step": 2649 + }, + { + "epoch": 0.5224763406940063, + "grad_norm": 0.5842030623500385, + "learning_rate": 1.9180523127323834e-05, + "loss": 0.4394, + "step": 2650 + }, + { + "epoch": 0.5226735015772871, + "grad_norm": 0.5932577854627189, + "learning_rate": 1.9179908494876863e-05, + "loss": 0.451, + "step": 2651 + }, + { + "epoch": 0.5228706624605678, + "grad_norm": 0.6071689058545614, + "learning_rate": 1.91792936418752e-05, + "loss": 0.4458, + "step": 2652 + }, + { + "epoch": 0.5230678233438486, + "grad_norm": 0.6417327540656751, + "learning_rate": 1.917867856833361e-05, + "loss": 0.4421, + "step": 2653 + }, + { + "epoch": 0.5232649842271293, + "grad_norm": 8.906897332645489, + "learning_rate": 1.9178063274266884e-05, + "loss": 0.4716, + "step": 2654 + }, + { + "epoch": 0.5234621451104101, + "grad_norm": 0.76731746274502, + "learning_rate": 1.9177447759689792e-05, + "loss": 0.4537, + "step": 2655 + }, + { + "epoch": 0.5236593059936908, + "grad_norm": 0.6082849024874425, + "learning_rate": 1.9176832024617125e-05, + "loss": 0.4746, + "step": 2656 + }, + { + "epoch": 0.5238564668769716, + "grad_norm": 0.6698492507500728, + "learning_rate": 1.9176216069063683e-05, + "loss": 0.4377, + "step": 2657 + }, + { + "epoch": 0.5240536277602523, + "grad_norm": 0.6683170097007389, + "learning_rate": 1.917559989304426e-05, + "loss": 0.4536, + "step": 2658 + }, + { + "epoch": 0.5242507886435331, + "grad_norm": 0.6479408068752674, + "learning_rate": 1.9174983496573657e-05, + "loss": 0.4526, + "step": 2659 + }, + { + "epoch": 0.5244479495268138, + "grad_norm": 0.6865722121413244, + "learning_rate": 1.917436687966669e-05, + "loss": 0.4687, + "step": 2660 + }, + { + "epoch": 0.5246451104100947, + "grad_norm": 0.6689601140224801, + "learning_rate": 1.917375004233817e-05, + "loss": 0.4841, + "step": 2661 + }, + { + "epoch": 0.5248422712933754, + "grad_norm": 0.6723818049998349, + "learning_rate": 1.9173132984602914e-05, + "loss": 0.4405, + "step": 2662 + }, + { + "epoch": 0.5250394321766562, + "grad_norm": 0.6763870303856786, + "learning_rate": 1.9172515706475755e-05, + "loss": 0.4439, + "step": 2663 + }, + { + "epoch": 0.5252365930599369, + "grad_norm": 0.7423211243997336, + "learning_rate": 1.9171898207971518e-05, + "loss": 0.466, + "step": 2664 + }, + { + "epoch": 0.5254337539432177, + "grad_norm": 0.5578326316635425, + "learning_rate": 1.9171280489105043e-05, + "loss": 0.4387, + "step": 2665 + }, + { + "epoch": 0.5256309148264984, + "grad_norm": 0.6327176422438838, + "learning_rate": 1.9170662549891162e-05, + "loss": 0.4419, + "step": 2666 + }, + { + "epoch": 0.5258280757097792, + "grad_norm": 0.6097938287122503, + "learning_rate": 1.9170044390344737e-05, + "loss": 0.4684, + "step": 2667 + }, + { + "epoch": 0.5260252365930599, + "grad_norm": 0.6652570785788708, + "learning_rate": 1.9169426010480604e-05, + "loss": 0.4439, + "step": 2668 + }, + { + "epoch": 0.5262223974763407, + "grad_norm": 0.5705839336000622, + "learning_rate": 1.916880741031363e-05, + "loss": 0.4416, + "step": 2669 + }, + { + "epoch": 0.5264195583596214, + "grad_norm": 0.5966638281252562, + "learning_rate": 1.9168188589858675e-05, + "loss": 0.4328, + "step": 2670 + }, + { + "epoch": 0.5266167192429022, + "grad_norm": 0.5661798360166143, + "learning_rate": 1.9167569549130604e-05, + "loss": 0.4222, + "step": 2671 + }, + { + "epoch": 0.526813880126183, + "grad_norm": 0.6409908246303587, + "learning_rate": 1.9166950288144296e-05, + "loss": 0.4262, + "step": 2672 + }, + { + "epoch": 0.5270110410094637, + "grad_norm": 0.5846011872700821, + "learning_rate": 1.916633080691462e-05, + "loss": 0.4606, + "step": 2673 + }, + { + "epoch": 0.5272082018927445, + "grad_norm": 0.6281868356100663, + "learning_rate": 1.9165711105456468e-05, + "loss": 0.4683, + "step": 2674 + }, + { + "epoch": 0.5274053627760252, + "grad_norm": 0.6643878477972568, + "learning_rate": 1.9165091183784722e-05, + "loss": 0.4666, + "step": 2675 + }, + { + "epoch": 0.527602523659306, + "grad_norm": 0.877520567898525, + "learning_rate": 1.9164471041914283e-05, + "loss": 0.4992, + "step": 2676 + }, + { + "epoch": 0.5277996845425867, + "grad_norm": 0.8982575542215294, + "learning_rate": 1.9163850679860046e-05, + "loss": 0.4169, + "step": 2677 + }, + { + "epoch": 0.5279968454258676, + "grad_norm": 0.9377399621895196, + "learning_rate": 1.9163230097636917e-05, + "loss": 0.4429, + "step": 2678 + }, + { + "epoch": 0.5281940063091483, + "grad_norm": 0.5944294971560273, + "learning_rate": 1.9162609295259805e-05, + "loss": 0.4963, + "step": 2679 + }, + { + "epoch": 0.5283911671924291, + "grad_norm": 0.6623678780030434, + "learning_rate": 1.9161988272743627e-05, + "loss": 0.4514, + "step": 2680 + }, + { + "epoch": 0.5285883280757098, + "grad_norm": 0.7305199400460632, + "learning_rate": 1.9161367030103303e-05, + "loss": 0.4526, + "step": 2681 + }, + { + "epoch": 0.5287854889589906, + "grad_norm": 0.6299026603110343, + "learning_rate": 1.9160745567353758e-05, + "loss": 0.4495, + "step": 2682 + }, + { + "epoch": 0.5289826498422713, + "grad_norm": 0.606070194816596, + "learning_rate": 1.9160123884509923e-05, + "loss": 0.4555, + "step": 2683 + }, + { + "epoch": 0.5291798107255521, + "grad_norm": 0.5864541808065882, + "learning_rate": 1.9159501981586738e-05, + "loss": 0.4447, + "step": 2684 + }, + { + "epoch": 0.5293769716088328, + "grad_norm": 0.5378633706513408, + "learning_rate": 1.9158879858599138e-05, + "loss": 0.4205, + "step": 2685 + }, + { + "epoch": 0.5295741324921136, + "grad_norm": 0.5692796059391859, + "learning_rate": 1.9158257515562075e-05, + "loss": 0.446, + "step": 2686 + }, + { + "epoch": 0.5297712933753943, + "grad_norm": 0.5718248315346496, + "learning_rate": 1.91576349524905e-05, + "loss": 0.4417, + "step": 2687 + }, + { + "epoch": 0.5299684542586751, + "grad_norm": 0.5934922303137548, + "learning_rate": 1.9157012169399372e-05, + "loss": 0.4271, + "step": 2688 + }, + { + "epoch": 0.5301656151419558, + "grad_norm": 0.7026701042054814, + "learning_rate": 1.9156389166303652e-05, + "loss": 0.4772, + "step": 2689 + }, + { + "epoch": 0.5303627760252366, + "grad_norm": 0.5525996284668334, + "learning_rate": 1.9155765943218304e-05, + "loss": 0.3835, + "step": 2690 + }, + { + "epoch": 0.5305599369085173, + "grad_norm": 0.6691635308990771, + "learning_rate": 1.9155142500158312e-05, + "loss": 0.4962, + "step": 2691 + }, + { + "epoch": 0.5307570977917981, + "grad_norm": 0.6586289639914643, + "learning_rate": 1.9154518837138644e-05, + "loss": 0.4623, + "step": 2692 + }, + { + "epoch": 0.5309542586750788, + "grad_norm": 0.5627097183725107, + "learning_rate": 1.9153894954174294e-05, + "loss": 0.4543, + "step": 2693 + }, + { + "epoch": 0.5311514195583596, + "grad_norm": 0.5790687836358622, + "learning_rate": 1.9153270851280245e-05, + "loss": 0.4632, + "step": 2694 + }, + { + "epoch": 0.5313485804416404, + "grad_norm": 0.6605341566789122, + "learning_rate": 1.915264652847149e-05, + "loss": 0.4662, + "step": 2695 + }, + { + "epoch": 0.5315457413249212, + "grad_norm": 0.5991069128744081, + "learning_rate": 1.9152021985763035e-05, + "loss": 0.4425, + "step": 2696 + }, + { + "epoch": 0.5317429022082019, + "grad_norm": 0.6215310692072236, + "learning_rate": 1.9151397223169877e-05, + "loss": 0.4386, + "step": 2697 + }, + { + "epoch": 0.5319400630914827, + "grad_norm": 0.5638306576019859, + "learning_rate": 1.9150772240707038e-05, + "loss": 0.4245, + "step": 2698 + }, + { + "epoch": 0.5321372239747634, + "grad_norm": 0.6748512477251645, + "learning_rate": 1.915014703838952e-05, + "loss": 0.4187, + "step": 2699 + }, + { + "epoch": 0.5323343848580442, + "grad_norm": 0.6108680696319301, + "learning_rate": 1.9149521616232354e-05, + "loss": 0.4595, + "step": 2700 + }, + { + "epoch": 0.5325315457413249, + "grad_norm": 0.596636107232528, + "learning_rate": 1.9148895974250562e-05, + "loss": 0.4858, + "step": 2701 + }, + { + "epoch": 0.5327287066246057, + "grad_norm": 0.5630137990858612, + "learning_rate": 1.9148270112459178e-05, + "loss": 0.4191, + "step": 2702 + }, + { + "epoch": 0.5329258675078864, + "grad_norm": 0.6635599997569216, + "learning_rate": 1.9147644030873236e-05, + "loss": 0.4665, + "step": 2703 + }, + { + "epoch": 0.5331230283911672, + "grad_norm": 0.792415273908661, + "learning_rate": 1.914701772950778e-05, + "loss": 0.4355, + "step": 2704 + }, + { + "epoch": 0.5333201892744479, + "grad_norm": 0.6016005658016478, + "learning_rate": 1.9146391208377856e-05, + "loss": 0.4361, + "step": 2705 + }, + { + "epoch": 0.5335173501577287, + "grad_norm": 0.58541615627604, + "learning_rate": 1.914576446749852e-05, + "loss": 0.4288, + "step": 2706 + }, + { + "epoch": 0.5337145110410094, + "grad_norm": 0.6092999107646474, + "learning_rate": 1.9145137506884826e-05, + "loss": 0.4174, + "step": 2707 + }, + { + "epoch": 0.5339116719242902, + "grad_norm": 0.7227982055069082, + "learning_rate": 1.914451032655184e-05, + "loss": 0.4822, + "step": 2708 + }, + { + "epoch": 0.5341088328075709, + "grad_norm": 0.6246387176004011, + "learning_rate": 1.914388292651463e-05, + "loss": 0.436, + "step": 2709 + }, + { + "epoch": 0.5343059936908517, + "grad_norm": 0.573293012232519, + "learning_rate": 1.9143255306788266e-05, + "loss": 0.4834, + "step": 2710 + }, + { + "epoch": 0.5345031545741324, + "grad_norm": 0.5767714766984778, + "learning_rate": 1.9142627467387833e-05, + "loss": 0.4374, + "step": 2711 + }, + { + "epoch": 0.5347003154574133, + "grad_norm": 0.5672517687751784, + "learning_rate": 1.9141999408328412e-05, + "loss": 0.4461, + "step": 2712 + }, + { + "epoch": 0.534897476340694, + "grad_norm": 0.5888251273732579, + "learning_rate": 1.914137112962509e-05, + "loss": 0.4775, + "step": 2713 + }, + { + "epoch": 0.5350946372239748, + "grad_norm": 0.633920406740737, + "learning_rate": 1.914074263129297e-05, + "loss": 0.4626, + "step": 2714 + }, + { + "epoch": 0.5352917981072555, + "grad_norm": 0.6205409845878437, + "learning_rate": 1.9140113913347145e-05, + "loss": 0.4504, + "step": 2715 + }, + { + "epoch": 0.5354889589905363, + "grad_norm": 1.6126724762687141, + "learning_rate": 1.9139484975802723e-05, + "loss": 0.4977, + "step": 2716 + }, + { + "epoch": 0.535686119873817, + "grad_norm": 0.6349889203833219, + "learning_rate": 1.9138855818674814e-05, + "loss": 0.4596, + "step": 2717 + }, + { + "epoch": 0.5358832807570978, + "grad_norm": 0.5920992239234482, + "learning_rate": 1.9138226441978533e-05, + "loss": 0.4302, + "step": 2718 + }, + { + "epoch": 0.5360804416403786, + "grad_norm": 0.5585845890938809, + "learning_rate": 1.9137596845729005e-05, + "loss": 0.4447, + "step": 2719 + }, + { + "epoch": 0.5362776025236593, + "grad_norm": 0.6075692038210041, + "learning_rate": 1.9136967029941354e-05, + "loss": 0.4849, + "step": 2720 + }, + { + "epoch": 0.5364747634069401, + "grad_norm": 0.6678891128497162, + "learning_rate": 1.9136336994630712e-05, + "loss": 0.4498, + "step": 2721 + }, + { + "epoch": 0.5366719242902208, + "grad_norm": 0.6172599424463148, + "learning_rate": 1.9135706739812217e-05, + "loss": 0.4605, + "step": 2722 + }, + { + "epoch": 0.5368690851735016, + "grad_norm": 0.5878777248662985, + "learning_rate": 1.913507626550101e-05, + "loss": 0.4588, + "step": 2723 + }, + { + "epoch": 0.5370662460567823, + "grad_norm": 0.6514199512488131, + "learning_rate": 1.9134445571712237e-05, + "loss": 0.3952, + "step": 2724 + }, + { + "epoch": 0.5372634069400631, + "grad_norm": 0.5691982494960713, + "learning_rate": 1.9133814658461056e-05, + "loss": 0.419, + "step": 2725 + }, + { + "epoch": 0.5374605678233438, + "grad_norm": 0.6165356310225942, + "learning_rate": 1.9133183525762622e-05, + "loss": 0.4368, + "step": 2726 + }, + { + "epoch": 0.5376577287066246, + "grad_norm": 0.6483155000970097, + "learning_rate": 1.9132552173632097e-05, + "loss": 0.4526, + "step": 2727 + }, + { + "epoch": 0.5378548895899053, + "grad_norm": 0.6668794939528021, + "learning_rate": 1.9131920602084656e-05, + "loss": 0.4663, + "step": 2728 + }, + { + "epoch": 0.5380520504731862, + "grad_norm": 0.7674580396358098, + "learning_rate": 1.9131288811135465e-05, + "loss": 0.4869, + "step": 2729 + }, + { + "epoch": 0.5382492113564669, + "grad_norm": 0.7119703322980993, + "learning_rate": 1.9130656800799706e-05, + "loss": 0.4495, + "step": 2730 + }, + { + "epoch": 0.5384463722397477, + "grad_norm": 0.5810364329746894, + "learning_rate": 1.9130024571092565e-05, + "loss": 0.4188, + "step": 2731 + }, + { + "epoch": 0.5386435331230284, + "grad_norm": 0.74091178804136, + "learning_rate": 1.9129392122029233e-05, + "loss": 0.4444, + "step": 2732 + }, + { + "epoch": 0.5388406940063092, + "grad_norm": 0.5752138416355487, + "learning_rate": 1.9128759453624904e-05, + "loss": 0.4434, + "step": 2733 + }, + { + "epoch": 0.5390378548895899, + "grad_norm": 0.7080034594600536, + "learning_rate": 1.9128126565894776e-05, + "loss": 0.4436, + "step": 2734 + }, + { + "epoch": 0.5392350157728707, + "grad_norm": 0.5927536597698786, + "learning_rate": 1.9127493458854055e-05, + "loss": 0.4587, + "step": 2735 + }, + { + "epoch": 0.5394321766561514, + "grad_norm": 0.7051671166398329, + "learning_rate": 1.9126860132517958e-05, + "loss": 0.4431, + "step": 2736 + }, + { + "epoch": 0.5396293375394322, + "grad_norm": 0.6126458882890922, + "learning_rate": 1.9126226586901693e-05, + "loss": 0.4926, + "step": 2737 + }, + { + "epoch": 0.5398264984227129, + "grad_norm": 0.6919952144838898, + "learning_rate": 1.9125592822020485e-05, + "loss": 0.4734, + "step": 2738 + }, + { + "epoch": 0.5400236593059937, + "grad_norm": 0.6350862584452707, + "learning_rate": 1.912495883788956e-05, + "loss": 0.4682, + "step": 2739 + }, + { + "epoch": 0.5402208201892744, + "grad_norm": 0.653383083517542, + "learning_rate": 1.9124324634524153e-05, + "loss": 0.4427, + "step": 2740 + }, + { + "epoch": 0.5404179810725552, + "grad_norm": 0.6323326038671259, + "learning_rate": 1.91236902119395e-05, + "loss": 0.4433, + "step": 2741 + }, + { + "epoch": 0.5406151419558359, + "grad_norm": 0.7015096284263576, + "learning_rate": 1.912305557015084e-05, + "loss": 0.4368, + "step": 2742 + }, + { + "epoch": 0.5408123028391167, + "grad_norm": 0.6210161210833985, + "learning_rate": 1.9122420709173422e-05, + "loss": 0.409, + "step": 2743 + }, + { + "epoch": 0.5410094637223974, + "grad_norm": 0.7037911524098324, + "learning_rate": 1.9121785629022502e-05, + "loss": 0.4446, + "step": 2744 + }, + { + "epoch": 0.5412066246056783, + "grad_norm": 0.6143364835057887, + "learning_rate": 1.9121150329713334e-05, + "loss": 0.4485, + "step": 2745 + }, + { + "epoch": 0.541403785488959, + "grad_norm": 0.6298390892240697, + "learning_rate": 1.9120514811261187e-05, + "loss": 0.4014, + "step": 2746 + }, + { + "epoch": 0.5416009463722398, + "grad_norm": 0.6010553806758819, + "learning_rate": 1.9119879073681328e-05, + "loss": 0.4275, + "step": 2747 + }, + { + "epoch": 0.5417981072555205, + "grad_norm": 0.6014151169068335, + "learning_rate": 1.9119243116989022e-05, + "loss": 0.438, + "step": 2748 + }, + { + "epoch": 0.5419952681388013, + "grad_norm": 0.743894988167943, + "learning_rate": 1.9118606941199565e-05, + "loss": 0.4777, + "step": 2749 + }, + { + "epoch": 0.542192429022082, + "grad_norm": 0.612605392695115, + "learning_rate": 1.911797054632823e-05, + "loss": 0.4606, + "step": 2750 + }, + { + "epoch": 0.5423895899053628, + "grad_norm": 0.6393149368458824, + "learning_rate": 1.911733393239031e-05, + "loss": 0.4622, + "step": 2751 + }, + { + "epoch": 0.5425867507886435, + "grad_norm": 0.6406348967467802, + "learning_rate": 1.9116697099401103e-05, + "loss": 0.4575, + "step": 2752 + }, + { + "epoch": 0.5427839116719243, + "grad_norm": 0.6210504950614151, + "learning_rate": 1.9116060047375903e-05, + "loss": 0.4361, + "step": 2753 + }, + { + "epoch": 0.542981072555205, + "grad_norm": 0.5936925930435107, + "learning_rate": 1.911542277633002e-05, + "loss": 0.4278, + "step": 2754 + }, + { + "epoch": 0.5431782334384858, + "grad_norm": 0.6521721561800861, + "learning_rate": 1.9114785286278767e-05, + "loss": 0.4693, + "step": 2755 + }, + { + "epoch": 0.5433753943217665, + "grad_norm": 0.5816573139740299, + "learning_rate": 1.9114147577237452e-05, + "loss": 0.4478, + "step": 2756 + }, + { + "epoch": 0.5435725552050473, + "grad_norm": 0.5927002843696301, + "learning_rate": 1.9113509649221403e-05, + "loss": 0.4289, + "step": 2757 + }, + { + "epoch": 0.543769716088328, + "grad_norm": 0.6312074883108042, + "learning_rate": 1.911287150224595e-05, + "loss": 0.4581, + "step": 2758 + }, + { + "epoch": 0.5439668769716088, + "grad_norm": 0.63059238176542, + "learning_rate": 1.9112233136326416e-05, + "loss": 0.4336, + "step": 2759 + }, + { + "epoch": 0.5441640378548895, + "grad_norm": 0.722821516571449, + "learning_rate": 1.9111594551478146e-05, + "loss": 0.4523, + "step": 2760 + }, + { + "epoch": 0.5443611987381703, + "grad_norm": 0.5865645060807954, + "learning_rate": 1.9110955747716478e-05, + "loss": 0.4407, + "step": 2761 + }, + { + "epoch": 0.544558359621451, + "grad_norm": 0.6247985614675826, + "learning_rate": 1.911031672505676e-05, + "loss": 0.4229, + "step": 2762 + }, + { + "epoch": 0.5447555205047319, + "grad_norm": 0.6542180266531745, + "learning_rate": 1.9109677483514346e-05, + "loss": 0.5063, + "step": 2763 + }, + { + "epoch": 0.5449526813880127, + "grad_norm": 0.5911009525801598, + "learning_rate": 1.91090380231046e-05, + "loss": 0.4443, + "step": 2764 + }, + { + "epoch": 0.5451498422712934, + "grad_norm": 0.8124320978143662, + "learning_rate": 1.9108398343842873e-05, + "loss": 0.4413, + "step": 2765 + }, + { + "epoch": 0.5453470031545742, + "grad_norm": 0.5550449237988123, + "learning_rate": 1.910775844574454e-05, + "loss": 0.4284, + "step": 2766 + }, + { + "epoch": 0.5455441640378549, + "grad_norm": 0.597109232711348, + "learning_rate": 1.910711832882498e-05, + "loss": 0.4775, + "step": 2767 + }, + { + "epoch": 0.5457413249211357, + "grad_norm": 0.7408895073247752, + "learning_rate": 1.910647799309957e-05, + "loss": 0.4716, + "step": 2768 + }, + { + "epoch": 0.5459384858044164, + "grad_norm": 0.5592810141688882, + "learning_rate": 1.9105837438583693e-05, + "loss": 0.4494, + "step": 2769 + }, + { + "epoch": 0.5461356466876972, + "grad_norm": 0.6407004174091825, + "learning_rate": 1.9105196665292735e-05, + "loss": 0.4471, + "step": 2770 + }, + { + "epoch": 0.5463328075709779, + "grad_norm": 0.5920197717658957, + "learning_rate": 1.9104555673242092e-05, + "loss": 0.4632, + "step": 2771 + }, + { + "epoch": 0.5465299684542587, + "grad_norm": 0.5534555677305497, + "learning_rate": 1.9103914462447172e-05, + "loss": 0.4683, + "step": 2772 + }, + { + "epoch": 0.5467271293375394, + "grad_norm": 0.6333249803496639, + "learning_rate": 1.9103273032923378e-05, + "loss": 0.4167, + "step": 2773 + }, + { + "epoch": 0.5469242902208202, + "grad_norm": 0.5735766563287512, + "learning_rate": 1.9102631384686116e-05, + "loss": 0.4556, + "step": 2774 + }, + { + "epoch": 0.5471214511041009, + "grad_norm": 0.6225909847389804, + "learning_rate": 1.91019895177508e-05, + "loss": 0.4532, + "step": 2775 + }, + { + "epoch": 0.5473186119873817, + "grad_norm": 0.6265493049371124, + "learning_rate": 1.910134743213286e-05, + "loss": 0.4352, + "step": 2776 + }, + { + "epoch": 0.5475157728706624, + "grad_norm": 0.7433505225427158, + "learning_rate": 1.910070512784772e-05, + "loss": 0.4435, + "step": 2777 + }, + { + "epoch": 0.5477129337539433, + "grad_norm": 0.6861455293800391, + "learning_rate": 1.910006260491081e-05, + "loss": 0.4573, + "step": 2778 + }, + { + "epoch": 0.547910094637224, + "grad_norm": 0.6005840206295373, + "learning_rate": 1.9099419863337567e-05, + "loss": 0.4565, + "step": 2779 + }, + { + "epoch": 0.5481072555205048, + "grad_norm": 0.5980496434263233, + "learning_rate": 1.909877690314343e-05, + "loss": 0.4452, + "step": 2780 + }, + { + "epoch": 0.5483044164037855, + "grad_norm": 0.6336383497213709, + "learning_rate": 1.9098133724343853e-05, + "loss": 0.4745, + "step": 2781 + }, + { + "epoch": 0.5485015772870663, + "grad_norm": 0.600357744618508, + "learning_rate": 1.9097490326954288e-05, + "loss": 0.4485, + "step": 2782 + }, + { + "epoch": 0.548698738170347, + "grad_norm": 0.6216658860334479, + "learning_rate": 1.9096846710990192e-05, + "loss": 0.4393, + "step": 2783 + }, + { + "epoch": 0.5488958990536278, + "grad_norm": 0.5921780263629273, + "learning_rate": 1.9096202876467028e-05, + "loss": 0.4365, + "step": 2784 + }, + { + "epoch": 0.5490930599369085, + "grad_norm": 0.5793518128764239, + "learning_rate": 1.9095558823400266e-05, + "loss": 0.458, + "step": 2785 + }, + { + "epoch": 0.5492902208201893, + "grad_norm": 0.5941381807403091, + "learning_rate": 1.9094914551805377e-05, + "loss": 0.4269, + "step": 2786 + }, + { + "epoch": 0.54948738170347, + "grad_norm": 0.5243117956195389, + "learning_rate": 1.909427006169784e-05, + "loss": 0.4263, + "step": 2787 + }, + { + "epoch": 0.5496845425867508, + "grad_norm": 0.5990209951331408, + "learning_rate": 1.9093625353093146e-05, + "loss": 0.4689, + "step": 2788 + }, + { + "epoch": 0.5498817034700315, + "grad_norm": 0.6974111022304779, + "learning_rate": 1.9092980426006774e-05, + "loss": 0.4537, + "step": 2789 + }, + { + "epoch": 0.5500788643533123, + "grad_norm": 0.5355199493063655, + "learning_rate": 1.909233528045423e-05, + "loss": 0.4156, + "step": 2790 + }, + { + "epoch": 0.550276025236593, + "grad_norm": 0.6415663325485629, + "learning_rate": 1.9091689916451006e-05, + "loss": 0.4639, + "step": 2791 + }, + { + "epoch": 0.5504731861198738, + "grad_norm": 0.5466690904751329, + "learning_rate": 1.909104433401261e-05, + "loss": 0.4197, + "step": 2792 + }, + { + "epoch": 0.5506703470031545, + "grad_norm": 0.5535455971550053, + "learning_rate": 1.9090398533154552e-05, + "loss": 0.4221, + "step": 2793 + }, + { + "epoch": 0.5508675078864353, + "grad_norm": 0.6772223151490671, + "learning_rate": 1.908975251389235e-05, + "loss": 0.4584, + "step": 2794 + }, + { + "epoch": 0.551064668769716, + "grad_norm": 0.639977189301138, + "learning_rate": 1.9089106276241523e-05, + "loss": 0.479, + "step": 2795 + }, + { + "epoch": 0.5512618296529969, + "grad_norm": 0.5718559924246178, + "learning_rate": 1.9088459820217602e-05, + "loss": 0.428, + "step": 2796 + }, + { + "epoch": 0.5514589905362776, + "grad_norm": 0.5777292639541047, + "learning_rate": 1.908781314583611e-05, + "loss": 0.4451, + "step": 2797 + }, + { + "epoch": 0.5516561514195584, + "grad_norm": 0.5900189358667072, + "learning_rate": 1.9087166253112594e-05, + "loss": 0.4692, + "step": 2798 + }, + { + "epoch": 0.5518533123028391, + "grad_norm": 0.6961573118708897, + "learning_rate": 1.9086519142062587e-05, + "loss": 0.4743, + "step": 2799 + }, + { + "epoch": 0.5520504731861199, + "grad_norm": 0.5509038753437092, + "learning_rate": 1.9085871812701642e-05, + "loss": 0.4182, + "step": 2800 + }, + { + "epoch": 0.5522476340694006, + "grad_norm": 0.5723685332377061, + "learning_rate": 1.908522426504531e-05, + "loss": 0.4357, + "step": 2801 + }, + { + "epoch": 0.5524447949526814, + "grad_norm": 0.6424941228887284, + "learning_rate": 1.9084576499109148e-05, + "loss": 0.4767, + "step": 2802 + }, + { + "epoch": 0.5526419558359621, + "grad_norm": 0.6223021214852441, + "learning_rate": 1.908392851490872e-05, + "loss": 0.459, + "step": 2803 + }, + { + "epoch": 0.5528391167192429, + "grad_norm": 0.5876602293739934, + "learning_rate": 1.9083280312459595e-05, + "loss": 0.4468, + "step": 2804 + }, + { + "epoch": 0.5530362776025236, + "grad_norm": 0.6188653567310646, + "learning_rate": 1.9082631891777345e-05, + "loss": 0.42, + "step": 2805 + }, + { + "epoch": 0.5532334384858044, + "grad_norm": 0.6259526543976492, + "learning_rate": 1.9081983252877548e-05, + "loss": 0.4388, + "step": 2806 + }, + { + "epoch": 0.5534305993690851, + "grad_norm": 0.5477244518993755, + "learning_rate": 1.9081334395775788e-05, + "loss": 0.4735, + "step": 2807 + }, + { + "epoch": 0.5536277602523659, + "grad_norm": 0.6135819658127445, + "learning_rate": 1.908068532048766e-05, + "loss": 0.4716, + "step": 2808 + }, + { + "epoch": 0.5538249211356467, + "grad_norm": 0.5562204082368883, + "learning_rate": 1.9080036027028752e-05, + "loss": 0.455, + "step": 2809 + }, + { + "epoch": 0.5540220820189274, + "grad_norm": 0.5792847495400975, + "learning_rate": 1.9079386515414667e-05, + "loss": 0.421, + "step": 2810 + }, + { + "epoch": 0.5542192429022083, + "grad_norm": 0.6003218566784827, + "learning_rate": 1.9078736785661012e-05, + "loss": 0.427, + "step": 2811 + }, + { + "epoch": 0.554416403785489, + "grad_norm": 0.5977583508502151, + "learning_rate": 1.9078086837783393e-05, + "loss": 0.4535, + "step": 2812 + }, + { + "epoch": 0.5546135646687698, + "grad_norm": 0.5699405638630519, + "learning_rate": 1.9077436671797426e-05, + "loss": 0.4305, + "step": 2813 + }, + { + "epoch": 0.5548107255520505, + "grad_norm": 0.5800436763185806, + "learning_rate": 1.9076786287718734e-05, + "loss": 0.4306, + "step": 2814 + }, + { + "epoch": 0.5550078864353313, + "grad_norm": 0.5825228330765003, + "learning_rate": 1.9076135685562942e-05, + "loss": 0.4059, + "step": 2815 + }, + { + "epoch": 0.555205047318612, + "grad_norm": 0.5807662460549189, + "learning_rate": 1.9075484865345678e-05, + "loss": 0.4399, + "step": 2816 + }, + { + "epoch": 0.5554022082018928, + "grad_norm": 0.6218425966437778, + "learning_rate": 1.9074833827082586e-05, + "loss": 0.4857, + "step": 2817 + }, + { + "epoch": 0.5555993690851735, + "grad_norm": 0.5785450171816167, + "learning_rate": 1.90741825707893e-05, + "loss": 0.4473, + "step": 2818 + }, + { + "epoch": 0.5557965299684543, + "grad_norm": 0.5632479711426183, + "learning_rate": 1.9073531096481475e-05, + "loss": 0.4494, + "step": 2819 + }, + { + "epoch": 0.555993690851735, + "grad_norm": 0.6791829687064139, + "learning_rate": 1.9072879404174755e-05, + "loss": 0.4438, + "step": 2820 + }, + { + "epoch": 0.5561908517350158, + "grad_norm": 0.6216151824829282, + "learning_rate": 1.90722274938848e-05, + "loss": 0.4486, + "step": 2821 + }, + { + "epoch": 0.5563880126182965, + "grad_norm": 0.6115679004172528, + "learning_rate": 1.9071575365627274e-05, + "loss": 0.4846, + "step": 2822 + }, + { + "epoch": 0.5565851735015773, + "grad_norm": 0.6026873566499071, + "learning_rate": 1.9070923019417848e-05, + "loss": 0.4395, + "step": 2823 + }, + { + "epoch": 0.556782334384858, + "grad_norm": 0.6123086454851155, + "learning_rate": 1.907027045527219e-05, + "loss": 0.406, + "step": 2824 + }, + { + "epoch": 0.5569794952681388, + "grad_norm": 0.5882773742606572, + "learning_rate": 1.906961767320598e-05, + "loss": 0.4679, + "step": 2825 + }, + { + "epoch": 0.5571766561514195, + "grad_norm": 0.5795368478426368, + "learning_rate": 1.90689646732349e-05, + "loss": 0.4508, + "step": 2826 + }, + { + "epoch": 0.5573738170347003, + "grad_norm": 0.6306886418135007, + "learning_rate": 1.9068311455374638e-05, + "loss": 0.4591, + "step": 2827 + }, + { + "epoch": 0.557570977917981, + "grad_norm": 0.6081030199830474, + "learning_rate": 1.9067658019640897e-05, + "loss": 0.4718, + "step": 2828 + }, + { + "epoch": 0.5577681388012619, + "grad_norm": 0.6178637399147585, + "learning_rate": 1.9067004366049367e-05, + "loss": 0.4833, + "step": 2829 + }, + { + "epoch": 0.5579652996845426, + "grad_norm": 0.6385458957420991, + "learning_rate": 1.9066350494615756e-05, + "loss": 0.4273, + "step": 2830 + }, + { + "epoch": 0.5581624605678234, + "grad_norm": 0.645226093197602, + "learning_rate": 1.9065696405355774e-05, + "loss": 0.4959, + "step": 2831 + }, + { + "epoch": 0.5583596214511041, + "grad_norm": 0.6114367401980242, + "learning_rate": 1.9065042098285132e-05, + "loss": 0.48, + "step": 2832 + }, + { + "epoch": 0.5585567823343849, + "grad_norm": 0.60055106272059, + "learning_rate": 1.9064387573419555e-05, + "loss": 0.4782, + "step": 2833 + }, + { + "epoch": 0.5587539432176656, + "grad_norm": 0.6195745515122923, + "learning_rate": 1.906373283077477e-05, + "loss": 0.4494, + "step": 2834 + }, + { + "epoch": 0.5589511041009464, + "grad_norm": 0.7119201520791544, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.4394, + "step": 2835 + }, + { + "epoch": 0.5591482649842271, + "grad_norm": 0.5747295277509834, + "learning_rate": 1.906242269221049e-05, + "loss": 0.4137, + "step": 2836 + }, + { + "epoch": 0.5593454258675079, + "grad_norm": 0.5453408326530682, + "learning_rate": 1.9061767296322477e-05, + "loss": 0.415, + "step": 2837 + }, + { + "epoch": 0.5595425867507886, + "grad_norm": 1.321620620951851, + "learning_rate": 1.9061111682718204e-05, + "loss": 0.4296, + "step": 2838 + }, + { + "epoch": 0.5597397476340694, + "grad_norm": 0.562269189961482, + "learning_rate": 1.9060455851413424e-05, + "loss": 0.4354, + "step": 2839 + }, + { + "epoch": 0.5599369085173501, + "grad_norm": 0.550600831319121, + "learning_rate": 1.90597998024239e-05, + "loss": 0.4179, + "step": 2840 + }, + { + "epoch": 0.5601340694006309, + "grad_norm": 0.5594944562329867, + "learning_rate": 1.905914353576539e-05, + "loss": 0.4536, + "step": 2841 + }, + { + "epoch": 0.5603312302839116, + "grad_norm": 0.5764617009479925, + "learning_rate": 1.9058487051453662e-05, + "loss": 0.4629, + "step": 2842 + }, + { + "epoch": 0.5605283911671924, + "grad_norm": 11.946607473336949, + "learning_rate": 1.9057830349504484e-05, + "loss": 0.4617, + "step": 2843 + }, + { + "epoch": 0.5607255520504731, + "grad_norm": 0.6562336888788557, + "learning_rate": 1.9057173429933636e-05, + "loss": 0.4496, + "step": 2844 + }, + { + "epoch": 0.560922712933754, + "grad_norm": 0.5696593285659094, + "learning_rate": 1.905651629275691e-05, + "loss": 0.4374, + "step": 2845 + }, + { + "epoch": 0.5611198738170347, + "grad_norm": 0.6107715466947922, + "learning_rate": 1.9055858937990083e-05, + "loss": 0.4478, + "step": 2846 + }, + { + "epoch": 0.5613170347003155, + "grad_norm": 0.6442549288365613, + "learning_rate": 1.905520136564895e-05, + "loss": 0.4537, + "step": 2847 + }, + { + "epoch": 0.5615141955835962, + "grad_norm": 0.5746797728643914, + "learning_rate": 1.9054543575749317e-05, + "loss": 0.4498, + "step": 2848 + }, + { + "epoch": 0.561711356466877, + "grad_norm": 0.9157971817240291, + "learning_rate": 1.905388556830698e-05, + "loss": 0.4024, + "step": 2849 + }, + { + "epoch": 0.5619085173501577, + "grad_norm": 0.5954555864900205, + "learning_rate": 1.905322734333775e-05, + "loss": 0.4712, + "step": 2850 + }, + { + "epoch": 0.5621056782334385, + "grad_norm": 0.68954636579152, + "learning_rate": 1.9052568900857443e-05, + "loss": 0.4205, + "step": 2851 + }, + { + "epoch": 0.5623028391167192, + "grad_norm": 0.7460974392901148, + "learning_rate": 1.9051910240881883e-05, + "loss": 0.4483, + "step": 2852 + }, + { + "epoch": 0.5625, + "grad_norm": 0.5355564846309835, + "learning_rate": 1.9051251363426883e-05, + "loss": 0.4232, + "step": 2853 + }, + { + "epoch": 0.5626971608832808, + "grad_norm": 0.7155811623930997, + "learning_rate": 1.9050592268508284e-05, + "loss": 0.4553, + "step": 2854 + }, + { + "epoch": 0.5628943217665615, + "grad_norm": 0.6320781522351271, + "learning_rate": 1.9049932956141917e-05, + "loss": 0.4436, + "step": 2855 + }, + { + "epoch": 0.5630914826498423, + "grad_norm": 0.6801664356997779, + "learning_rate": 1.9049273426343622e-05, + "loss": 0.4637, + "step": 2856 + }, + { + "epoch": 0.563288643533123, + "grad_norm": 0.5803175479602963, + "learning_rate": 1.9048613679129246e-05, + "loss": 0.466, + "step": 2857 + }, + { + "epoch": 0.5634858044164038, + "grad_norm": 0.6126039116402423, + "learning_rate": 1.904795371451464e-05, + "loss": 0.4587, + "step": 2858 + }, + { + "epoch": 0.5636829652996845, + "grad_norm": 0.5827030340943455, + "learning_rate": 1.904729353251566e-05, + "loss": 0.4518, + "step": 2859 + }, + { + "epoch": 0.5638801261829653, + "grad_norm": 0.601328312382053, + "learning_rate": 1.9046633133148164e-05, + "loss": 0.4675, + "step": 2860 + }, + { + "epoch": 0.564077287066246, + "grad_norm": 0.5717955796305233, + "learning_rate": 1.9045972516428026e-05, + "loss": 0.4549, + "step": 2861 + }, + { + "epoch": 0.5642744479495269, + "grad_norm": 0.5613732974457137, + "learning_rate": 1.904531168237111e-05, + "loss": 0.443, + "step": 2862 + }, + { + "epoch": 0.5644716088328076, + "grad_norm": 0.5744128359613806, + "learning_rate": 1.90446506309933e-05, + "loss": 0.4439, + "step": 2863 + }, + { + "epoch": 0.5646687697160884, + "grad_norm": 0.6119814238497652, + "learning_rate": 1.9043989362310472e-05, + "loss": 0.4707, + "step": 2864 + }, + { + "epoch": 0.5648659305993691, + "grad_norm": 0.5567314391501524, + "learning_rate": 1.9043327876338517e-05, + "loss": 0.4271, + "step": 2865 + }, + { + "epoch": 0.5650630914826499, + "grad_norm": 0.6354333087115706, + "learning_rate": 1.904266617309333e-05, + "loss": 0.4968, + "step": 2866 + }, + { + "epoch": 0.5652602523659306, + "grad_norm": 0.5773958814672833, + "learning_rate": 1.9042004252590804e-05, + "loss": 0.407, + "step": 2867 + }, + { + "epoch": 0.5654574132492114, + "grad_norm": 0.5615283708079536, + "learning_rate": 1.9041342114846844e-05, + "loss": 0.4448, + "step": 2868 + }, + { + "epoch": 0.5656545741324921, + "grad_norm": 0.634498935968448, + "learning_rate": 1.9040679759877358e-05, + "loss": 0.4633, + "step": 2869 + }, + { + "epoch": 0.5658517350157729, + "grad_norm": 0.6007126242862416, + "learning_rate": 1.904001718769826e-05, + "loss": 0.477, + "step": 2870 + }, + { + "epoch": 0.5660488958990536, + "grad_norm": 0.5903936190376452, + "learning_rate": 1.903935439832547e-05, + "loss": 0.4441, + "step": 2871 + }, + { + "epoch": 0.5662460567823344, + "grad_norm": 0.6404835660073024, + "learning_rate": 1.9038691391774913e-05, + "loss": 0.4424, + "step": 2872 + }, + { + "epoch": 0.5664432176656151, + "grad_norm": 0.524427319947031, + "learning_rate": 1.9038028168062517e-05, + "loss": 0.4137, + "step": 2873 + }, + { + "epoch": 0.5666403785488959, + "grad_norm": 0.7149877450692915, + "learning_rate": 1.9037364727204216e-05, + "loss": 0.4914, + "step": 2874 + }, + { + "epoch": 0.5668375394321766, + "grad_norm": 0.5728331748504152, + "learning_rate": 1.9036701069215947e-05, + "loss": 0.4609, + "step": 2875 + }, + { + "epoch": 0.5670347003154574, + "grad_norm": 0.6344827370627714, + "learning_rate": 1.9036037194113656e-05, + "loss": 0.4456, + "step": 2876 + }, + { + "epoch": 0.5672318611987381, + "grad_norm": 0.606955120653267, + "learning_rate": 1.90353731019133e-05, + "loss": 0.4341, + "step": 2877 + }, + { + "epoch": 0.567429022082019, + "grad_norm": 0.7183503039613219, + "learning_rate": 1.9034708792630824e-05, + "loss": 0.4661, + "step": 2878 + }, + { + "epoch": 0.5676261829652997, + "grad_norm": 0.6264186805484021, + "learning_rate": 1.9034044266282196e-05, + "loss": 0.4745, + "step": 2879 + }, + { + "epoch": 0.5678233438485805, + "grad_norm": 0.686093957072564, + "learning_rate": 1.903337952288338e-05, + "loss": 0.4532, + "step": 2880 + }, + { + "epoch": 0.5680205047318612, + "grad_norm": 0.5783475117989667, + "learning_rate": 1.9032714562450345e-05, + "loss": 0.449, + "step": 2881 + }, + { + "epoch": 0.568217665615142, + "grad_norm": 0.6403276197455571, + "learning_rate": 1.903204938499907e-05, + "loss": 0.4453, + "step": 2882 + }, + { + "epoch": 0.5684148264984227, + "grad_norm": 0.561414469598965, + "learning_rate": 1.9031383990545532e-05, + "loss": 0.4499, + "step": 2883 + }, + { + "epoch": 0.5686119873817035, + "grad_norm": 0.6500041432778779, + "learning_rate": 1.9030718379105726e-05, + "loss": 0.4347, + "step": 2884 + }, + { + "epoch": 0.5688091482649842, + "grad_norm": 0.504940933975182, + "learning_rate": 1.9030052550695636e-05, + "loss": 0.3846, + "step": 2885 + }, + { + "epoch": 0.569006309148265, + "grad_norm": 0.6281846779360019, + "learning_rate": 1.902938650533126e-05, + "loss": 0.4713, + "step": 2886 + }, + { + "epoch": 0.5692034700315457, + "grad_norm": 0.5974696177502193, + "learning_rate": 1.9028720243028604e-05, + "loss": 0.4417, + "step": 2887 + }, + { + "epoch": 0.5694006309148265, + "grad_norm": 0.5826680180672237, + "learning_rate": 1.9028053763803673e-05, + "loss": 0.4566, + "step": 2888 + }, + { + "epoch": 0.5695977917981072, + "grad_norm": 0.5875996261062907, + "learning_rate": 1.902738706767248e-05, + "loss": 0.4656, + "step": 2889 + }, + { + "epoch": 0.569794952681388, + "grad_norm": 0.5602438327843606, + "learning_rate": 1.902672015465104e-05, + "loss": 0.4268, + "step": 2890 + }, + { + "epoch": 0.5699921135646687, + "grad_norm": 0.6034100519824507, + "learning_rate": 1.9026053024755384e-05, + "loss": 0.499, + "step": 2891 + }, + { + "epoch": 0.5701892744479495, + "grad_norm": 0.5961260542439667, + "learning_rate": 1.902538567800153e-05, + "loss": 0.4556, + "step": 2892 + }, + { + "epoch": 0.5703864353312302, + "grad_norm": 0.5847619792244266, + "learning_rate": 1.902471811440552e-05, + "loss": 0.444, + "step": 2893 + }, + { + "epoch": 0.570583596214511, + "grad_norm": 0.5717255423270757, + "learning_rate": 1.902405033398339e-05, + "loss": 0.4604, + "step": 2894 + }, + { + "epoch": 0.5707807570977917, + "grad_norm": 0.5775461408981852, + "learning_rate": 1.9023382336751185e-05, + "loss": 0.4372, + "step": 2895 + }, + { + "epoch": 0.5709779179810726, + "grad_norm": 0.5830553771363007, + "learning_rate": 1.902271412272495e-05, + "loss": 0.4547, + "step": 2896 + }, + { + "epoch": 0.5711750788643533, + "grad_norm": 0.6305168162558938, + "learning_rate": 1.9022045691920742e-05, + "loss": 0.4703, + "step": 2897 + }, + { + "epoch": 0.5713722397476341, + "grad_norm": 0.5848064263309912, + "learning_rate": 1.9021377044354624e-05, + "loss": 0.4542, + "step": 2898 + }, + { + "epoch": 0.5715694006309149, + "grad_norm": 0.6638321170730301, + "learning_rate": 1.9020708180042654e-05, + "loss": 0.4936, + "step": 2899 + }, + { + "epoch": 0.5717665615141956, + "grad_norm": 0.637624069032922, + "learning_rate": 1.902003909900091e-05, + "loss": 0.4551, + "step": 2900 + }, + { + "epoch": 0.5719637223974764, + "grad_norm": 0.623095105439941, + "learning_rate": 1.9019369801245458e-05, + "loss": 0.4491, + "step": 2901 + }, + { + "epoch": 0.5721608832807571, + "grad_norm": 0.548146806710325, + "learning_rate": 1.9018700286792388e-05, + "loss": 0.4262, + "step": 2902 + }, + { + "epoch": 0.5723580441640379, + "grad_norm": 0.6045037208451278, + "learning_rate": 1.9018030555657776e-05, + "loss": 0.4825, + "step": 2903 + }, + { + "epoch": 0.5725552050473186, + "grad_norm": 0.5859988705690671, + "learning_rate": 1.9017360607857724e-05, + "loss": 0.4446, + "step": 2904 + }, + { + "epoch": 0.5727523659305994, + "grad_norm": 0.565267471448854, + "learning_rate": 1.9016690443408314e-05, + "loss": 0.4103, + "step": 2905 + }, + { + "epoch": 0.5729495268138801, + "grad_norm": 0.6135424852694339, + "learning_rate": 1.901602006232566e-05, + "loss": 0.4822, + "step": 2906 + }, + { + "epoch": 0.5731466876971609, + "grad_norm": 0.5918245843382935, + "learning_rate": 1.901534946462586e-05, + "loss": 0.4231, + "step": 2907 + }, + { + "epoch": 0.5733438485804416, + "grad_norm": 0.5728688344893811, + "learning_rate": 1.9014678650325035e-05, + "loss": 0.4464, + "step": 2908 + }, + { + "epoch": 0.5735410094637224, + "grad_norm": 0.6157367306965482, + "learning_rate": 1.901400761943929e-05, + "loss": 0.4399, + "step": 2909 + }, + { + "epoch": 0.5737381703470031, + "grad_norm": 0.8263123254378486, + "learning_rate": 1.9013336371984756e-05, + "loss": 0.4274, + "step": 2910 + }, + { + "epoch": 0.573935331230284, + "grad_norm": 0.685238179627279, + "learning_rate": 1.9012664907977557e-05, + "loss": 0.4683, + "step": 2911 + }, + { + "epoch": 0.5741324921135647, + "grad_norm": 0.5358383697418044, + "learning_rate": 1.9011993227433826e-05, + "loss": 0.3689, + "step": 2912 + }, + { + "epoch": 0.5743296529968455, + "grad_norm": 0.6077623131242404, + "learning_rate": 1.9011321330369696e-05, + "loss": 0.4245, + "step": 2913 + }, + { + "epoch": 0.5745268138801262, + "grad_norm": 0.5994256634418688, + "learning_rate": 1.9010649216801316e-05, + "loss": 0.4768, + "step": 2914 + }, + { + "epoch": 0.574723974763407, + "grad_norm": 0.6159531989573963, + "learning_rate": 1.9009976886744837e-05, + "loss": 0.4501, + "step": 2915 + }, + { + "epoch": 0.5749211356466877, + "grad_norm": 0.6096049034892379, + "learning_rate": 1.9009304340216403e-05, + "loss": 0.4464, + "step": 2916 + }, + { + "epoch": 0.5751182965299685, + "grad_norm": 0.5603268106696802, + "learning_rate": 1.900863157723218e-05, + "loss": 0.458, + "step": 2917 + }, + { + "epoch": 0.5753154574132492, + "grad_norm": 0.5558645790716764, + "learning_rate": 1.9007958597808326e-05, + "loss": 0.3959, + "step": 2918 + }, + { + "epoch": 0.57551261829653, + "grad_norm": 0.6195875338599605, + "learning_rate": 1.9007285401961016e-05, + "loss": 0.5038, + "step": 2919 + }, + { + "epoch": 0.5757097791798107, + "grad_norm": 0.5783958420849634, + "learning_rate": 1.9006611989706417e-05, + "loss": 0.4647, + "step": 2920 + }, + { + "epoch": 0.5759069400630915, + "grad_norm": 0.5986709701715736, + "learning_rate": 1.9005938361060714e-05, + "loss": 0.457, + "step": 2921 + }, + { + "epoch": 0.5761041009463722, + "grad_norm": 0.5992935813507028, + "learning_rate": 1.900526451604009e-05, + "loss": 0.4146, + "step": 2922 + }, + { + "epoch": 0.576301261829653, + "grad_norm": 0.6114620175570423, + "learning_rate": 1.900459045466073e-05, + "loss": 0.4699, + "step": 2923 + }, + { + "epoch": 0.5764984227129337, + "grad_norm": 0.5778870682581325, + "learning_rate": 1.9003916176938837e-05, + "loss": 0.4132, + "step": 2924 + }, + { + "epoch": 0.5766955835962145, + "grad_norm": 0.556437544810432, + "learning_rate": 1.9003241682890607e-05, + "loss": 0.4196, + "step": 2925 + }, + { + "epoch": 0.5768927444794952, + "grad_norm": 0.5654030288060881, + "learning_rate": 1.9002566972532242e-05, + "loss": 0.4016, + "step": 2926 + }, + { + "epoch": 0.577089905362776, + "grad_norm": 0.5874171442829121, + "learning_rate": 1.9001892045879963e-05, + "loss": 0.4617, + "step": 2927 + }, + { + "epoch": 0.5772870662460567, + "grad_norm": 0.5875515040090509, + "learning_rate": 1.9001216902949974e-05, + "loss": 0.4511, + "step": 2928 + }, + { + "epoch": 0.5774842271293376, + "grad_norm": 0.5568685656549751, + "learning_rate": 1.9000541543758497e-05, + "loss": 0.4347, + "step": 2929 + }, + { + "epoch": 0.5776813880126183, + "grad_norm": 0.5746594931865293, + "learning_rate": 1.8999865968321765e-05, + "loss": 0.4523, + "step": 2930 + }, + { + "epoch": 0.5778785488958991, + "grad_norm": 0.5500934493880131, + "learning_rate": 1.8999190176656004e-05, + "loss": 0.401, + "step": 2931 + }, + { + "epoch": 0.5780757097791798, + "grad_norm": 0.5988645511870025, + "learning_rate": 1.8998514168777453e-05, + "loss": 0.4418, + "step": 2932 + }, + { + "epoch": 0.5782728706624606, + "grad_norm": 0.5787533040554713, + "learning_rate": 1.8997837944702352e-05, + "loss": 0.4552, + "step": 2933 + }, + { + "epoch": 0.5784700315457413, + "grad_norm": 0.5924302662548973, + "learning_rate": 1.899716150444695e-05, + "loss": 0.423, + "step": 2934 + }, + { + "epoch": 0.5786671924290221, + "grad_norm": 2.4787691128805935, + "learning_rate": 1.8996484848027496e-05, + "loss": 0.4051, + "step": 2935 + }, + { + "epoch": 0.5788643533123028, + "grad_norm": 0.722111103146418, + "learning_rate": 1.8995807975460246e-05, + "loss": 0.4383, + "step": 2936 + }, + { + "epoch": 0.5790615141955836, + "grad_norm": 0.7452661199567964, + "learning_rate": 1.8995130886761468e-05, + "loss": 0.4328, + "step": 2937 + }, + { + "epoch": 0.5792586750788643, + "grad_norm": 0.6272537937748907, + "learning_rate": 1.8994453581947428e-05, + "loss": 0.4501, + "step": 2938 + }, + { + "epoch": 0.5794558359621451, + "grad_norm": 0.5871428411077138, + "learning_rate": 1.8993776061034394e-05, + "loss": 0.4197, + "step": 2939 + }, + { + "epoch": 0.5796529968454258, + "grad_norm": 0.6082870109603066, + "learning_rate": 1.899309832403865e-05, + "loss": 0.4397, + "step": 2940 + }, + { + "epoch": 0.5798501577287066, + "grad_norm": 0.5698173415968192, + "learning_rate": 1.8992420370976476e-05, + "loss": 0.4297, + "step": 2941 + }, + { + "epoch": 0.5800473186119873, + "grad_norm": 0.659454422363326, + "learning_rate": 1.899174220186416e-05, + "loss": 0.4565, + "step": 2942 + }, + { + "epoch": 0.5802444794952681, + "grad_norm": 0.5921121311398183, + "learning_rate": 1.8991063816717998e-05, + "loss": 0.4357, + "step": 2943 + }, + { + "epoch": 0.580441640378549, + "grad_norm": 0.6950468872106844, + "learning_rate": 1.899038521555429e-05, + "loss": 0.4087, + "step": 2944 + }, + { + "epoch": 0.5806388012618297, + "grad_norm": 0.5936144532941147, + "learning_rate": 1.8989706398389335e-05, + "loss": 0.4237, + "step": 2945 + }, + { + "epoch": 0.5808359621451105, + "grad_norm": 0.5868915401669754, + "learning_rate": 1.8989027365239443e-05, + "loss": 0.4006, + "step": 2946 + }, + { + "epoch": 0.5810331230283912, + "grad_norm": 0.6069291322319756, + "learning_rate": 1.8988348116120926e-05, + "loss": 0.4487, + "step": 2947 + }, + { + "epoch": 0.581230283911672, + "grad_norm": 0.5598237023370913, + "learning_rate": 1.8987668651050117e-05, + "loss": 0.4363, + "step": 2948 + }, + { + "epoch": 0.5814274447949527, + "grad_norm": 0.6162522565496759, + "learning_rate": 1.8986988970043324e-05, + "loss": 0.4778, + "step": 2949 + }, + { + "epoch": 0.5816246056782335, + "grad_norm": 0.5922390245389011, + "learning_rate": 1.8986309073116883e-05, + "loss": 0.4492, + "step": 2950 + }, + { + "epoch": 0.5818217665615142, + "grad_norm": 0.5993508377014622, + "learning_rate": 1.8985628960287134e-05, + "loss": 0.4637, + "step": 2951 + }, + { + "epoch": 0.582018927444795, + "grad_norm": 0.549571504491819, + "learning_rate": 1.898494863157041e-05, + "loss": 0.425, + "step": 2952 + }, + { + "epoch": 0.5822160883280757, + "grad_norm": 0.5759972011389546, + "learning_rate": 1.898426808698306e-05, + "loss": 0.4408, + "step": 2953 + }, + { + "epoch": 0.5824132492113565, + "grad_norm": 0.5482466547534278, + "learning_rate": 1.8983587326541437e-05, + "loss": 0.4289, + "step": 2954 + }, + { + "epoch": 0.5826104100946372, + "grad_norm": 0.5913250211398492, + "learning_rate": 1.898290635026189e-05, + "loss": 0.4629, + "step": 2955 + }, + { + "epoch": 0.582807570977918, + "grad_norm": 0.570705758804096, + "learning_rate": 1.8982225158160788e-05, + "loss": 0.4442, + "step": 2956 + }, + { + "epoch": 0.5830047318611987, + "grad_norm": 0.5878782769052985, + "learning_rate": 1.898154375025449e-05, + "loss": 0.3961, + "step": 2957 + }, + { + "epoch": 0.5832018927444795, + "grad_norm": 0.5735902991837972, + "learning_rate": 1.8980862126559373e-05, + "loss": 0.4702, + "step": 2958 + }, + { + "epoch": 0.5833990536277602, + "grad_norm": 0.5657885791348485, + "learning_rate": 1.898018028709181e-05, + "loss": 0.455, + "step": 2959 + }, + { + "epoch": 0.583596214511041, + "grad_norm": 1.3679534419258494, + "learning_rate": 1.8979498231868183e-05, + "loss": 0.4948, + "step": 2960 + }, + { + "epoch": 0.5837933753943217, + "grad_norm": 0.6017442927840219, + "learning_rate": 1.897881596090488e-05, + "loss": 0.4728, + "step": 2961 + }, + { + "epoch": 0.5839905362776026, + "grad_norm": 0.5555623988000693, + "learning_rate": 1.8978133474218294e-05, + "loss": 0.4455, + "step": 2962 + }, + { + "epoch": 0.5841876971608833, + "grad_norm": 0.6041207462728285, + "learning_rate": 1.897745077182482e-05, + "loss": 0.4544, + "step": 2963 + }, + { + "epoch": 0.5843848580441641, + "grad_norm": 0.6250067192179154, + "learning_rate": 1.8976767853740866e-05, + "loss": 0.4752, + "step": 2964 + }, + { + "epoch": 0.5845820189274448, + "grad_norm": 0.571207168389721, + "learning_rate": 1.897608471998283e-05, + "loss": 0.4342, + "step": 2965 + }, + { + "epoch": 0.5847791798107256, + "grad_norm": 2.8095095602359463, + "learning_rate": 1.897540137056713e-05, + "loss": 0.4852, + "step": 2966 + }, + { + "epoch": 0.5849763406940063, + "grad_norm": 0.6427772218920856, + "learning_rate": 1.897471780551019e-05, + "loss": 0.4679, + "step": 2967 + }, + { + "epoch": 0.5851735015772871, + "grad_norm": 0.5594426060918991, + "learning_rate": 1.897403402482842e-05, + "loss": 0.4083, + "step": 2968 + }, + { + "epoch": 0.5853706624605678, + "grad_norm": 0.5801095527319007, + "learning_rate": 1.897335002853826e-05, + "loss": 0.423, + "step": 2969 + }, + { + "epoch": 0.5855678233438486, + "grad_norm": 0.7313301838049326, + "learning_rate": 1.897266581665614e-05, + "loss": 0.4418, + "step": 2970 + }, + { + "epoch": 0.5857649842271293, + "grad_norm": 0.615578548987388, + "learning_rate": 1.8971981389198495e-05, + "loss": 0.4555, + "step": 2971 + }, + { + "epoch": 0.5859621451104101, + "grad_norm": 0.5588797678763053, + "learning_rate": 1.8971296746181774e-05, + "loss": 0.423, + "step": 2972 + }, + { + "epoch": 0.5861593059936908, + "grad_norm": 0.6472108314860092, + "learning_rate": 1.8970611887622425e-05, + "loss": 0.4733, + "step": 2973 + }, + { + "epoch": 0.5863564668769716, + "grad_norm": 0.6277578303497903, + "learning_rate": 1.89699268135369e-05, + "loss": 0.4319, + "step": 2974 + }, + { + "epoch": 0.5865536277602523, + "grad_norm": 8.45953869638077, + "learning_rate": 1.8969241523941662e-05, + "loss": 0.4668, + "step": 2975 + }, + { + "epoch": 0.5867507886435331, + "grad_norm": 0.751366864143633, + "learning_rate": 1.896855601885317e-05, + "loss": 0.4836, + "step": 2976 + }, + { + "epoch": 0.5869479495268138, + "grad_norm": 0.5909034853303601, + "learning_rate": 1.89678702982879e-05, + "loss": 0.5005, + "step": 2977 + }, + { + "epoch": 0.5871451104100947, + "grad_norm": 0.7792417477818725, + "learning_rate": 1.8967184362262324e-05, + "loss": 0.4447, + "step": 2978 + }, + { + "epoch": 0.5873422712933754, + "grad_norm": 1.0145547450801644, + "learning_rate": 1.896649821079292e-05, + "loss": 0.4624, + "step": 2979 + }, + { + "epoch": 0.5875394321766562, + "grad_norm": 0.5792099704275391, + "learning_rate": 1.8965811843896178e-05, + "loss": 0.4701, + "step": 2980 + }, + { + "epoch": 0.5877365930599369, + "grad_norm": 0.6324701038532613, + "learning_rate": 1.8965125261588586e-05, + "loss": 0.4543, + "step": 2981 + }, + { + "epoch": 0.5879337539432177, + "grad_norm": 0.6298268546935512, + "learning_rate": 1.8964438463886638e-05, + "loss": 0.4484, + "step": 2982 + }, + { + "epoch": 0.5881309148264984, + "grad_norm": 0.6842407738734424, + "learning_rate": 1.896375145080684e-05, + "loss": 0.4355, + "step": 2983 + }, + { + "epoch": 0.5883280757097792, + "grad_norm": 0.6522884562722095, + "learning_rate": 1.8963064222365694e-05, + "loss": 0.469, + "step": 2984 + }, + { + "epoch": 0.5885252365930599, + "grad_norm": 0.6206326292297457, + "learning_rate": 1.896237677857971e-05, + "loss": 0.471, + "step": 2985 + }, + { + "epoch": 0.5887223974763407, + "grad_norm": 0.5650099483761517, + "learning_rate": 1.896168911946541e-05, + "loss": 0.4313, + "step": 2986 + }, + { + "epoch": 0.5889195583596214, + "grad_norm": 0.653001742089255, + "learning_rate": 1.896100124503931e-05, + "loss": 0.4548, + "step": 2987 + }, + { + "epoch": 0.5891167192429022, + "grad_norm": 0.5751129122223437, + "learning_rate": 1.896031315531794e-05, + "loss": 0.4307, + "step": 2988 + }, + { + "epoch": 0.589313880126183, + "grad_norm": 0.5731286729374794, + "learning_rate": 1.895962485031783e-05, + "loss": 0.429, + "step": 2989 + }, + { + "epoch": 0.5895110410094637, + "grad_norm": 0.5812391295799345, + "learning_rate": 1.8958936330055516e-05, + "loss": 0.4326, + "step": 2990 + }, + { + "epoch": 0.5897082018927445, + "grad_norm": 0.6341740958943612, + "learning_rate": 1.8958247594547543e-05, + "loss": 0.4421, + "step": 2991 + }, + { + "epoch": 0.5899053627760252, + "grad_norm": 0.5855292136590585, + "learning_rate": 1.895755864381046e-05, + "loss": 0.4476, + "step": 2992 + }, + { + "epoch": 0.590102523659306, + "grad_norm": 0.5897754099618571, + "learning_rate": 1.8956869477860813e-05, + "loss": 0.4347, + "step": 2993 + }, + { + "epoch": 0.5902996845425867, + "grad_norm": 0.5651589224007416, + "learning_rate": 1.8956180096715166e-05, + "loss": 0.47, + "step": 2994 + }, + { + "epoch": 0.5904968454258676, + "grad_norm": 0.5901739340372232, + "learning_rate": 1.895549050039008e-05, + "loss": 0.4463, + "step": 2995 + }, + { + "epoch": 0.5906940063091483, + "grad_norm": 0.5918462673605263, + "learning_rate": 1.8954800688902125e-05, + "loss": 0.4616, + "step": 2996 + }, + { + "epoch": 0.5908911671924291, + "grad_norm": 0.5857619648713869, + "learning_rate": 1.8954110662267868e-05, + "loss": 0.4655, + "step": 2997 + }, + { + "epoch": 0.5910883280757098, + "grad_norm": 0.5749020201058925, + "learning_rate": 1.8953420420503894e-05, + "loss": 0.4538, + "step": 2998 + }, + { + "epoch": 0.5912854889589906, + "grad_norm": 0.6364951608672293, + "learning_rate": 1.8952729963626783e-05, + "loss": 0.4516, + "step": 2999 + }, + { + "epoch": 0.5914826498422713, + "grad_norm": 0.5455105233117238, + "learning_rate": 1.8952039291653126e-05, + "loss": 0.4129, + "step": 3000 + }, + { + "epoch": 0.5916798107255521, + "grad_norm": 0.5688101335852396, + "learning_rate": 1.8951348404599518e-05, + "loss": 0.436, + "step": 3001 + }, + { + "epoch": 0.5918769716088328, + "grad_norm": 0.5869599963349247, + "learning_rate": 1.895065730248255e-05, + "loss": 0.4625, + "step": 3002 + }, + { + "epoch": 0.5920741324921136, + "grad_norm": 0.583433314102157, + "learning_rate": 1.8949965985318835e-05, + "loss": 0.4619, + "step": 3003 + }, + { + "epoch": 0.5922712933753943, + "grad_norm": 1.8179141375028536, + "learning_rate": 1.8949274453124985e-05, + "loss": 0.4457, + "step": 3004 + }, + { + "epoch": 0.5924684542586751, + "grad_norm": 1.0052816097153328, + "learning_rate": 1.8948582705917605e-05, + "loss": 0.3993, + "step": 3005 + }, + { + "epoch": 0.5926656151419558, + "grad_norm": 0.6655550567863493, + "learning_rate": 1.8947890743713316e-05, + "loss": 0.4633, + "step": 3006 + }, + { + "epoch": 0.5928627760252366, + "grad_norm": 0.6343745778626908, + "learning_rate": 1.8947198566528752e-05, + "loss": 0.4534, + "step": 3007 + }, + { + "epoch": 0.5930599369085173, + "grad_norm": 0.8146895268571629, + "learning_rate": 1.894650617438054e-05, + "loss": 0.4622, + "step": 3008 + }, + { + "epoch": 0.5932570977917981, + "grad_norm": 0.5919980337355466, + "learning_rate": 1.8945813567285303e-05, + "loss": 0.4444, + "step": 3009 + }, + { + "epoch": 0.5934542586750788, + "grad_norm": 0.6233642830617198, + "learning_rate": 1.8945120745259696e-05, + "loss": 0.4585, + "step": 3010 + }, + { + "epoch": 0.5936514195583596, + "grad_norm": 0.6560319675501647, + "learning_rate": 1.894442770832036e-05, + "loss": 0.4733, + "step": 3011 + }, + { + "epoch": 0.5938485804416404, + "grad_norm": 0.6361924491098656, + "learning_rate": 1.8943734456483944e-05, + "loss": 0.463, + "step": 3012 + }, + { + "epoch": 0.5940457413249212, + "grad_norm": 0.5872105410259241, + "learning_rate": 1.8943040989767104e-05, + "loss": 0.4613, + "step": 3013 + }, + { + "epoch": 0.5942429022082019, + "grad_norm": 0.9249850168680832, + "learning_rate": 1.8942347308186506e-05, + "loss": 0.4223, + "step": 3014 + }, + { + "epoch": 0.5944400630914827, + "grad_norm": 0.5669241268386221, + "learning_rate": 1.8941653411758813e-05, + "loss": 0.4231, + "step": 3015 + }, + { + "epoch": 0.5946372239747634, + "grad_norm": 0.6196619172043841, + "learning_rate": 1.894095930050069e-05, + "loss": 0.4403, + "step": 3016 + }, + { + "epoch": 0.5948343848580442, + "grad_norm": 0.563726313430164, + "learning_rate": 1.8940264974428827e-05, + "loss": 0.4407, + "step": 3017 + }, + { + "epoch": 0.5950315457413249, + "grad_norm": 0.6200267950921632, + "learning_rate": 1.8939570433559894e-05, + "loss": 0.4129, + "step": 3018 + }, + { + "epoch": 0.5952287066246057, + "grad_norm": 0.6093402086204679, + "learning_rate": 1.8938875677910586e-05, + "loss": 0.464, + "step": 3019 + }, + { + "epoch": 0.5954258675078864, + "grad_norm": 0.5932927503972287, + "learning_rate": 1.8938180707497588e-05, + "loss": 0.4361, + "step": 3020 + }, + { + "epoch": 0.5956230283911672, + "grad_norm": 0.57662299638576, + "learning_rate": 1.89374855223376e-05, + "loss": 0.4501, + "step": 3021 + }, + { + "epoch": 0.5958201892744479, + "grad_norm": 0.5619926542960811, + "learning_rate": 1.8936790122447327e-05, + "loss": 0.4682, + "step": 3022 + }, + { + "epoch": 0.5960173501577287, + "grad_norm": 0.7525793185982179, + "learning_rate": 1.8936094507843476e-05, + "loss": 0.4779, + "step": 3023 + }, + { + "epoch": 0.5962145110410094, + "grad_norm": 0.5548877208195396, + "learning_rate": 1.8935398678542752e-05, + "loss": 0.4321, + "step": 3024 + }, + { + "epoch": 0.5964116719242902, + "grad_norm": 0.5988617075988154, + "learning_rate": 1.8934702634561887e-05, + "loss": 0.4293, + "step": 3025 + }, + { + "epoch": 0.5966088328075709, + "grad_norm": 0.6782737257196965, + "learning_rate": 1.893400637591759e-05, + "loss": 0.4032, + "step": 3026 + }, + { + "epoch": 0.5968059936908517, + "grad_norm": 0.5513926310853049, + "learning_rate": 1.8933309902626598e-05, + "loss": 0.4134, + "step": 3027 + }, + { + "epoch": 0.5970031545741324, + "grad_norm": 0.5598624884027122, + "learning_rate": 1.893261321470564e-05, + "loss": 0.4409, + "step": 3028 + }, + { + "epoch": 0.5972003154574133, + "grad_norm": 0.5745829727781053, + "learning_rate": 1.893191631217146e-05, + "loss": 0.4699, + "step": 3029 + }, + { + "epoch": 0.597397476340694, + "grad_norm": 0.6008234547988593, + "learning_rate": 1.8931219195040796e-05, + "loss": 0.4526, + "step": 3030 + }, + { + "epoch": 0.5975946372239748, + "grad_norm": 0.9177487857817422, + "learning_rate": 1.8930521863330395e-05, + "loss": 0.4306, + "step": 3031 + }, + { + "epoch": 0.5977917981072555, + "grad_norm": 0.5675597820246167, + "learning_rate": 1.892982431705702e-05, + "loss": 0.4458, + "step": 3032 + }, + { + "epoch": 0.5979889589905363, + "grad_norm": 18.36759892361865, + "learning_rate": 1.892912655623742e-05, + "loss": 0.4854, + "step": 3033 + }, + { + "epoch": 0.598186119873817, + "grad_norm": 0.9555573861343462, + "learning_rate": 1.8928428580888365e-05, + "loss": 0.4249, + "step": 3034 + }, + { + "epoch": 0.5983832807570978, + "grad_norm": 0.6045541668448458, + "learning_rate": 1.8927730391026625e-05, + "loss": 0.4369, + "step": 3035 + }, + { + "epoch": 0.5985804416403786, + "grad_norm": 0.5905804898359281, + "learning_rate": 1.8927031986668973e-05, + "loss": 0.4367, + "step": 3036 + }, + { + "epoch": 0.5987776025236593, + "grad_norm": 0.6171855342108601, + "learning_rate": 1.8926333367832188e-05, + "loss": 0.4311, + "step": 3037 + }, + { + "epoch": 0.5989747634069401, + "grad_norm": 0.561383493809823, + "learning_rate": 1.8925634534533054e-05, + "loss": 0.4225, + "step": 3038 + }, + { + "epoch": 0.5991719242902208, + "grad_norm": 0.6623749634079279, + "learning_rate": 1.8924935486788362e-05, + "loss": 0.4703, + "step": 3039 + }, + { + "epoch": 0.5993690851735016, + "grad_norm": 0.6116882687261561, + "learning_rate": 1.892423622461491e-05, + "loss": 0.4566, + "step": 3040 + }, + { + "epoch": 0.5995662460567823, + "grad_norm": 0.5713571661306532, + "learning_rate": 1.8923536748029495e-05, + "loss": 0.4275, + "step": 3041 + }, + { + "epoch": 0.5997634069400631, + "grad_norm": 0.6069279944169308, + "learning_rate": 1.8922837057048925e-05, + "loss": 0.4318, + "step": 3042 + }, + { + "epoch": 0.5999605678233438, + "grad_norm": 0.5706152548682657, + "learning_rate": 1.8922137151690005e-05, + "loss": 0.4448, + "step": 3043 + }, + { + "epoch": 0.6001577287066246, + "grad_norm": 0.6191149624010278, + "learning_rate": 1.8921437031969557e-05, + "loss": 0.4668, + "step": 3044 + }, + { + "epoch": 0.6003548895899053, + "grad_norm": 0.537186705194169, + "learning_rate": 1.8920736697904406e-05, + "loss": 0.3969, + "step": 3045 + }, + { + "epoch": 0.6005520504731862, + "grad_norm": 0.5823679438434386, + "learning_rate": 1.8920036149511365e-05, + "loss": 0.4349, + "step": 3046 + }, + { + "epoch": 0.6007492113564669, + "grad_norm": 0.5764684381732735, + "learning_rate": 1.8919335386807275e-05, + "loss": 0.449, + "step": 3047 + }, + { + "epoch": 0.6009463722397477, + "grad_norm": 0.5972458224831989, + "learning_rate": 1.891863440980897e-05, + "loss": 0.4386, + "step": 3048 + }, + { + "epoch": 0.6011435331230284, + "grad_norm": 0.6843819029628029, + "learning_rate": 1.8917933218533285e-05, + "loss": 0.4148, + "step": 3049 + }, + { + "epoch": 0.6013406940063092, + "grad_norm": 23.053542751701265, + "learning_rate": 1.891723181299708e-05, + "loss": 0.4442, + "step": 3050 + }, + { + "epoch": 0.6015378548895899, + "grad_norm": 0.6757852367698026, + "learning_rate": 1.8916530193217197e-05, + "loss": 0.411, + "step": 3051 + }, + { + "epoch": 0.6017350157728707, + "grad_norm": 0.6378151249084146, + "learning_rate": 1.8915828359210494e-05, + "loss": 0.4503, + "step": 3052 + }, + { + "epoch": 0.6019321766561514, + "grad_norm": 0.8860348502775296, + "learning_rate": 1.8915126310993838e-05, + "loss": 0.4358, + "step": 3053 + }, + { + "epoch": 0.6021293375394322, + "grad_norm": 0.6733989234081417, + "learning_rate": 1.891442404858409e-05, + "loss": 0.4989, + "step": 3054 + }, + { + "epoch": 0.6023264984227129, + "grad_norm": 0.6616823955387168, + "learning_rate": 1.891372157199813e-05, + "loss": 0.4833, + "step": 3055 + }, + { + "epoch": 0.6025236593059937, + "grad_norm": 0.613822177088368, + "learning_rate": 1.8913018881252827e-05, + "loss": 0.4309, + "step": 3056 + }, + { + "epoch": 0.6027208201892744, + "grad_norm": 0.7450747392214685, + "learning_rate": 1.891231597636507e-05, + "loss": 0.4752, + "step": 3057 + }, + { + "epoch": 0.6029179810725552, + "grad_norm": 0.5822085751848699, + "learning_rate": 1.8911612857351743e-05, + "loss": 0.4617, + "step": 3058 + }, + { + "epoch": 0.6031151419558359, + "grad_norm": 0.5976169928739026, + "learning_rate": 1.891090952422974e-05, + "loss": 0.4175, + "step": 3059 + }, + { + "epoch": 0.6033123028391167, + "grad_norm": 0.6141756747523544, + "learning_rate": 1.891020597701596e-05, + "loss": 0.4467, + "step": 3060 + }, + { + "epoch": 0.6035094637223974, + "grad_norm": 0.7375579702582761, + "learning_rate": 1.890950221572731e-05, + "loss": 0.4993, + "step": 3061 + }, + { + "epoch": 0.6037066246056783, + "grad_norm": 0.6447777519058864, + "learning_rate": 1.8908798240380692e-05, + "loss": 0.4402, + "step": 3062 + }, + { + "epoch": 0.603903785488959, + "grad_norm": 0.572469473088046, + "learning_rate": 1.890809405099302e-05, + "loss": 0.4245, + "step": 3063 + }, + { + "epoch": 0.6041009463722398, + "grad_norm": 6.873905360016103, + "learning_rate": 1.8907389647581216e-05, + "loss": 0.5418, + "step": 3064 + }, + { + "epoch": 0.6042981072555205, + "grad_norm": 0.7088428326506333, + "learning_rate": 1.89066850301622e-05, + "loss": 0.4588, + "step": 3065 + }, + { + "epoch": 0.6044952681388013, + "grad_norm": 0.6487251096155348, + "learning_rate": 1.890598019875291e-05, + "loss": 0.47, + "step": 3066 + }, + { + "epoch": 0.604692429022082, + "grad_norm": 0.7195687916217166, + "learning_rate": 1.8905275153370272e-05, + "loss": 0.4444, + "step": 3067 + }, + { + "epoch": 0.6048895899053628, + "grad_norm": 0.6405425829888592, + "learning_rate": 1.890456989403122e-05, + "loss": 0.4242, + "step": 3068 + }, + { + "epoch": 0.6050867507886435, + "grad_norm": 1.1334558627993632, + "learning_rate": 1.8903864420752712e-05, + "loss": 0.497, + "step": 3069 + }, + { + "epoch": 0.6052839116719243, + "grad_norm": 0.612356344424891, + "learning_rate": 1.890315873355169e-05, + "loss": 0.3887, + "step": 3070 + }, + { + "epoch": 0.605481072555205, + "grad_norm": 0.666337306996091, + "learning_rate": 1.890245283244511e-05, + "loss": 0.4491, + "step": 3071 + }, + { + "epoch": 0.6056782334384858, + "grad_norm": 0.6409164321823476, + "learning_rate": 1.8901746717449932e-05, + "loss": 0.4323, + "step": 3072 + }, + { + "epoch": 0.6058753943217665, + "grad_norm": 0.6288119419802719, + "learning_rate": 1.8901040388583117e-05, + "loss": 0.4444, + "step": 3073 + }, + { + "epoch": 0.6060725552050473, + "grad_norm": 0.6764142603017349, + "learning_rate": 1.8900333845861643e-05, + "loss": 0.4598, + "step": 3074 + }, + { + "epoch": 0.606269716088328, + "grad_norm": 0.6209145176055464, + "learning_rate": 1.889962708930248e-05, + "loss": 0.4582, + "step": 3075 + }, + { + "epoch": 0.6064668769716088, + "grad_norm": 0.6837083145298952, + "learning_rate": 1.8898920118922607e-05, + "loss": 0.4073, + "step": 3076 + }, + { + "epoch": 0.6066640378548895, + "grad_norm": 0.6516951362240437, + "learning_rate": 1.8898212934739012e-05, + "loss": 0.4418, + "step": 3077 + }, + { + "epoch": 0.6068611987381703, + "grad_norm": 0.6228422659793167, + "learning_rate": 1.889750553676869e-05, + "loss": 0.4025, + "step": 3078 + }, + { + "epoch": 0.607058359621451, + "grad_norm": 0.9492283175824956, + "learning_rate": 1.8896797925028626e-05, + "loss": 0.4269, + "step": 3079 + }, + { + "epoch": 0.6072555205047319, + "grad_norm": 0.6988466626741697, + "learning_rate": 1.8896090099535834e-05, + "loss": 0.4966, + "step": 3080 + }, + { + "epoch": 0.6074526813880127, + "grad_norm": 1.6653405313238836, + "learning_rate": 1.889538206030731e-05, + "loss": 0.4327, + "step": 3081 + }, + { + "epoch": 0.6076498422712934, + "grad_norm": 0.6950411466750152, + "learning_rate": 1.8894673807360065e-05, + "loss": 0.4513, + "step": 3082 + }, + { + "epoch": 0.6078470031545742, + "grad_norm": 0.5702620043948126, + "learning_rate": 1.8893965340711126e-05, + "loss": 0.4481, + "step": 3083 + }, + { + "epoch": 0.6080441640378549, + "grad_norm": 0.6716600497276086, + "learning_rate": 1.8893256660377505e-05, + "loss": 0.4765, + "step": 3084 + }, + { + "epoch": 0.6082413249211357, + "grad_norm": 0.602345996688224, + "learning_rate": 1.8892547766376228e-05, + "loss": 0.4651, + "step": 3085 + }, + { + "epoch": 0.6084384858044164, + "grad_norm": 0.5853869541989889, + "learning_rate": 1.889183865872433e-05, + "loss": 0.4199, + "step": 3086 + }, + { + "epoch": 0.6086356466876972, + "grad_norm": 0.6334963552684939, + "learning_rate": 1.889112933743885e-05, + "loss": 0.4836, + "step": 3087 + }, + { + "epoch": 0.6088328075709779, + "grad_norm": 0.6800088424135952, + "learning_rate": 1.8890419802536826e-05, + "loss": 0.4875, + "step": 3088 + }, + { + "epoch": 0.6090299684542587, + "grad_norm": 0.6475039848768125, + "learning_rate": 1.888971005403531e-05, + "loss": 0.4679, + "step": 3089 + }, + { + "epoch": 0.6092271293375394, + "grad_norm": 0.6133780661367402, + "learning_rate": 1.8889000091951347e-05, + "loss": 0.4805, + "step": 3090 + }, + { + "epoch": 0.6094242902208202, + "grad_norm": 0.6087154224375381, + "learning_rate": 1.8888289916302e-05, + "loss": 0.4193, + "step": 3091 + }, + { + "epoch": 0.6096214511041009, + "grad_norm": 0.6392859491176853, + "learning_rate": 1.8887579527104332e-05, + "loss": 0.4499, + "step": 3092 + }, + { + "epoch": 0.6098186119873817, + "grad_norm": 0.561076270997986, + "learning_rate": 1.8886868924375407e-05, + "loss": 0.4607, + "step": 3093 + }, + { + "epoch": 0.6100157728706624, + "grad_norm": 0.6104019667478391, + "learning_rate": 1.8886158108132298e-05, + "loss": 0.448, + "step": 3094 + }, + { + "epoch": 0.6102129337539433, + "grad_norm": 0.5694020567275695, + "learning_rate": 1.888544707839209e-05, + "loss": 0.4318, + "step": 3095 + }, + { + "epoch": 0.610410094637224, + "grad_norm": 0.6155013174910009, + "learning_rate": 1.888473583517185e-05, + "loss": 0.4129, + "step": 3096 + }, + { + "epoch": 0.6106072555205048, + "grad_norm": 0.8830383227880512, + "learning_rate": 1.8884024378488686e-05, + "loss": 0.4668, + "step": 3097 + }, + { + "epoch": 0.6108044164037855, + "grad_norm": 0.5411137975815842, + "learning_rate": 1.888331270835968e-05, + "loss": 0.4269, + "step": 3098 + }, + { + "epoch": 0.6110015772870663, + "grad_norm": 0.8549193019773949, + "learning_rate": 1.8882600824801932e-05, + "loss": 0.4349, + "step": 3099 + }, + { + "epoch": 0.611198738170347, + "grad_norm": 5.778893003349931, + "learning_rate": 1.888188872783255e-05, + "loss": 0.4817, + "step": 3100 + }, + { + "epoch": 0.6113958990536278, + "grad_norm": 1.1949763025723064, + "learning_rate": 1.888117641746863e-05, + "loss": 0.4478, + "step": 3101 + }, + { + "epoch": 0.6115930599369085, + "grad_norm": 0.6510700265765879, + "learning_rate": 1.8880463893727297e-05, + "loss": 0.4496, + "step": 3102 + }, + { + "epoch": 0.6117902208201893, + "grad_norm": 0.5839708585126449, + "learning_rate": 1.8879751156625673e-05, + "loss": 0.4457, + "step": 3103 + }, + { + "epoch": 0.61198738170347, + "grad_norm": 17.48414582449077, + "learning_rate": 1.887903820618087e-05, + "loss": 0.4768, + "step": 3104 + }, + { + "epoch": 0.6121845425867508, + "grad_norm": 1.437092612882128, + "learning_rate": 1.887832504241003e-05, + "loss": 0.4374, + "step": 3105 + }, + { + "epoch": 0.6123817034700315, + "grad_norm": 4.569861399699513, + "learning_rate": 1.887761166533028e-05, + "loss": 0.4938, + "step": 3106 + }, + { + "epoch": 0.6125788643533123, + "grad_norm": 0.8415642887494247, + "learning_rate": 1.8876898074958757e-05, + "loss": 0.4235, + "step": 3107 + }, + { + "epoch": 0.612776025236593, + "grad_norm": 0.7258259413681535, + "learning_rate": 1.887618427131261e-05, + "loss": 0.4572, + "step": 3108 + }, + { + "epoch": 0.6129731861198738, + "grad_norm": 0.6290488996802963, + "learning_rate": 1.887547025440899e-05, + "loss": 0.425, + "step": 3109 + }, + { + "epoch": 0.6131703470031545, + "grad_norm": 0.7318494963979986, + "learning_rate": 1.8874756024265045e-05, + "loss": 0.4631, + "step": 3110 + }, + { + "epoch": 0.6133675078864353, + "grad_norm": 0.6443190758237272, + "learning_rate": 1.8874041580897944e-05, + "loss": 0.4751, + "step": 3111 + }, + { + "epoch": 0.613564668769716, + "grad_norm": 0.6538478656310595, + "learning_rate": 1.887332692432485e-05, + "loss": 0.4124, + "step": 3112 + }, + { + "epoch": 0.6137618296529969, + "grad_norm": 0.6822067929384322, + "learning_rate": 1.8872612054562927e-05, + "loss": 0.4744, + "step": 3113 + }, + { + "epoch": 0.6139589905362776, + "grad_norm": 0.5861492452369412, + "learning_rate": 1.8871896971629356e-05, + "loss": 0.4494, + "step": 3114 + }, + { + "epoch": 0.6141561514195584, + "grad_norm": 0.6402603357742715, + "learning_rate": 1.8871181675541316e-05, + "loss": 0.4203, + "step": 3115 + }, + { + "epoch": 0.6143533123028391, + "grad_norm": 0.5974596546772335, + "learning_rate": 1.8870466166315992e-05, + "loss": 0.4682, + "step": 3116 + }, + { + "epoch": 0.6145504731861199, + "grad_norm": 0.6645668240438647, + "learning_rate": 1.8869750443970574e-05, + "loss": 0.461, + "step": 3117 + }, + { + "epoch": 0.6147476340694006, + "grad_norm": 0.5555905216873552, + "learning_rate": 1.8869034508522255e-05, + "loss": 0.4412, + "step": 3118 + }, + { + "epoch": 0.6149447949526814, + "grad_norm": 0.6339546381509664, + "learning_rate": 1.8868318359988247e-05, + "loss": 0.4321, + "step": 3119 + }, + { + "epoch": 0.6151419558359621, + "grad_norm": 0.5848309865797923, + "learning_rate": 1.8867601998385746e-05, + "loss": 0.4476, + "step": 3120 + }, + { + "epoch": 0.6153391167192429, + "grad_norm": 0.6339063416959785, + "learning_rate": 1.8866885423731965e-05, + "loss": 0.4726, + "step": 3121 + }, + { + "epoch": 0.6155362776025236, + "grad_norm": 0.6250487989240934, + "learning_rate": 1.8866168636044123e-05, + "loss": 0.4438, + "step": 3122 + }, + { + "epoch": 0.6157334384858044, + "grad_norm": 0.5912498590576298, + "learning_rate": 1.8865451635339437e-05, + "loss": 0.4249, + "step": 3123 + }, + { + "epoch": 0.6159305993690851, + "grad_norm": 0.9964033422451408, + "learning_rate": 1.8864734421635138e-05, + "loss": 0.4533, + "step": 3124 + }, + { + "epoch": 0.6161277602523659, + "grad_norm": 0.6783125045850023, + "learning_rate": 1.8864016994948456e-05, + "loss": 0.4399, + "step": 3125 + }, + { + "epoch": 0.6163249211356467, + "grad_norm": 0.5640318304653738, + "learning_rate": 1.8863299355296626e-05, + "loss": 0.417, + "step": 3126 + }, + { + "epoch": 0.6165220820189274, + "grad_norm": 0.7270371763719108, + "learning_rate": 1.8862581502696893e-05, + "loss": 0.4581, + "step": 3127 + }, + { + "epoch": 0.6167192429022083, + "grad_norm": 0.5953459293919341, + "learning_rate": 1.8861863437166503e-05, + "loss": 0.4629, + "step": 3128 + }, + { + "epoch": 0.616916403785489, + "grad_norm": 0.7333601695695047, + "learning_rate": 1.8861145158722703e-05, + "loss": 0.4449, + "step": 3129 + }, + { + "epoch": 0.6171135646687698, + "grad_norm": 0.6093823968435338, + "learning_rate": 1.886042666738276e-05, + "loss": 0.4538, + "step": 3130 + }, + { + "epoch": 0.6173107255520505, + "grad_norm": 0.7379382803953564, + "learning_rate": 1.885970796316393e-05, + "loss": 0.422, + "step": 3131 + }, + { + "epoch": 0.6175078864353313, + "grad_norm": 0.5910536949790164, + "learning_rate": 1.885898904608348e-05, + "loss": 0.4416, + "step": 3132 + }, + { + "epoch": 0.617705047318612, + "grad_norm": 0.7181285290226457, + "learning_rate": 1.8858269916158683e-05, + "loss": 0.4505, + "step": 3133 + }, + { + "epoch": 0.6179022082018928, + "grad_norm": 0.6153838686175938, + "learning_rate": 1.885755057340682e-05, + "loss": 0.462, + "step": 3134 + }, + { + "epoch": 0.6180993690851735, + "grad_norm": 0.6682026146168444, + "learning_rate": 1.8856831017845172e-05, + "loss": 0.4394, + "step": 3135 + }, + { + "epoch": 0.6182965299684543, + "grad_norm": 0.6870178773294343, + "learning_rate": 1.885611124949102e-05, + "loss": 0.4573, + "step": 3136 + }, + { + "epoch": 0.618493690851735, + "grad_norm": 0.734375693308938, + "learning_rate": 1.8855391268361672e-05, + "loss": 0.4893, + "step": 3137 + }, + { + "epoch": 0.6186908517350158, + "grad_norm": 0.6503117986853993, + "learning_rate": 1.8854671074474415e-05, + "loss": 0.4491, + "step": 3138 + }, + { + "epoch": 0.6188880126182965, + "grad_norm": 0.6902398712205959, + "learning_rate": 1.8853950667846552e-05, + "loss": 0.4619, + "step": 3139 + }, + { + "epoch": 0.6190851735015773, + "grad_norm": 0.5584656484231819, + "learning_rate": 1.8853230048495397e-05, + "loss": 0.4073, + "step": 3140 + }, + { + "epoch": 0.619282334384858, + "grad_norm": 0.6705513309518663, + "learning_rate": 1.885250921643826e-05, + "loss": 0.4545, + "step": 3141 + }, + { + "epoch": 0.6194794952681388, + "grad_norm": 0.5384726667184218, + "learning_rate": 1.885178817169246e-05, + "loss": 0.3684, + "step": 3142 + }, + { + "epoch": 0.6196766561514195, + "grad_norm": 0.706927829405689, + "learning_rate": 1.885106691427532e-05, + "loss": 0.4447, + "step": 3143 + }, + { + "epoch": 0.6198738170347003, + "grad_norm": 0.553276508635115, + "learning_rate": 1.885034544420417e-05, + "loss": 0.4144, + "step": 3144 + }, + { + "epoch": 0.620070977917981, + "grad_norm": 0.6314672976429081, + "learning_rate": 1.8849623761496344e-05, + "loss": 0.4518, + "step": 3145 + }, + { + "epoch": 0.6202681388012619, + "grad_norm": 0.5953908729469908, + "learning_rate": 1.884890186616918e-05, + "loss": 0.4872, + "step": 3146 + }, + { + "epoch": 0.6204652996845426, + "grad_norm": 0.6073305974604772, + "learning_rate": 1.884817975824002e-05, + "loss": 0.441, + "step": 3147 + }, + { + "epoch": 0.6206624605678234, + "grad_norm": 0.6526389975490265, + "learning_rate": 1.884745743772622e-05, + "loss": 0.435, + "step": 3148 + }, + { + "epoch": 0.6208596214511041, + "grad_norm": 0.6677687240737739, + "learning_rate": 1.884673490464513e-05, + "loss": 0.4322, + "step": 3149 + }, + { + "epoch": 0.6210567823343849, + "grad_norm": 0.5838277331167949, + "learning_rate": 1.884601215901411e-05, + "loss": 0.4541, + "step": 3150 + }, + { + "epoch": 0.6212539432176656, + "grad_norm": 0.5916911486373814, + "learning_rate": 1.8845289200850523e-05, + "loss": 0.4284, + "step": 3151 + }, + { + "epoch": 0.6214511041009464, + "grad_norm": 0.5790112173160624, + "learning_rate": 1.8844566030171737e-05, + "loss": 0.4481, + "step": 3152 + }, + { + "epoch": 0.6216482649842271, + "grad_norm": 1.1931766299083246, + "learning_rate": 1.8843842646995135e-05, + "loss": 0.4262, + "step": 3153 + }, + { + "epoch": 0.6218454258675079, + "grad_norm": 0.6719849832897626, + "learning_rate": 1.884311905133809e-05, + "loss": 0.4364, + "step": 3154 + }, + { + "epoch": 0.6220425867507886, + "grad_norm": 0.6366996603140915, + "learning_rate": 1.8842395243217986e-05, + "loss": 0.452, + "step": 3155 + }, + { + "epoch": 0.6222397476340694, + "grad_norm": 0.6564134246631125, + "learning_rate": 1.884167122265222e-05, + "loss": 0.4725, + "step": 3156 + }, + { + "epoch": 0.6224369085173501, + "grad_norm": 0.6718054415711968, + "learning_rate": 1.8840946989658175e-05, + "loss": 0.4179, + "step": 3157 + }, + { + "epoch": 0.6226340694006309, + "grad_norm": 0.6323002874688755, + "learning_rate": 1.8840222544253265e-05, + "loss": 0.4829, + "step": 3158 + }, + { + "epoch": 0.6228312302839116, + "grad_norm": 0.5594006708403959, + "learning_rate": 1.883949788645489e-05, + "loss": 0.4349, + "step": 3159 + }, + { + "epoch": 0.6230283911671924, + "grad_norm": 0.6793405116890823, + "learning_rate": 1.8838773016280457e-05, + "loss": 0.4814, + "step": 3160 + }, + { + "epoch": 0.6232255520504731, + "grad_norm": 0.5889819240809804, + "learning_rate": 1.8838047933747386e-05, + "loss": 0.477, + "step": 3161 + }, + { + "epoch": 0.623422712933754, + "grad_norm": 0.6015700310464448, + "learning_rate": 1.8837322638873093e-05, + "loss": 0.4701, + "step": 3162 + }, + { + "epoch": 0.6236198738170347, + "grad_norm": 0.537156319132712, + "learning_rate": 1.883659713167501e-05, + "loss": 0.3994, + "step": 3163 + }, + { + "epoch": 0.6238170347003155, + "grad_norm": 0.6309490765989089, + "learning_rate": 1.8835871412170563e-05, + "loss": 0.4657, + "step": 3164 + }, + { + "epoch": 0.6240141955835962, + "grad_norm": 0.5932465191692712, + "learning_rate": 1.8835145480377194e-05, + "loss": 0.4406, + "step": 3165 + }, + { + "epoch": 0.624211356466877, + "grad_norm": 0.5843771388751763, + "learning_rate": 1.8834419336312334e-05, + "loss": 0.4644, + "step": 3166 + }, + { + "epoch": 0.6244085173501577, + "grad_norm": 0.5350744617276989, + "learning_rate": 1.8833692979993437e-05, + "loss": 0.4334, + "step": 3167 + }, + { + "epoch": 0.6246056782334385, + "grad_norm": 0.6032969293393299, + "learning_rate": 1.8832966411437958e-05, + "loss": 0.4516, + "step": 3168 + }, + { + "epoch": 0.6248028391167192, + "grad_norm": 0.5402329623043484, + "learning_rate": 1.883223963066334e-05, + "loss": 0.4669, + "step": 3169 + }, + { + "epoch": 0.625, + "grad_norm": 0.6420727536456771, + "learning_rate": 1.8831512637687054e-05, + "loss": 0.4933, + "step": 3170 + }, + { + "epoch": 0.6251971608832808, + "grad_norm": 0.6837145058399116, + "learning_rate": 1.8830785432526568e-05, + "loss": 0.4316, + "step": 3171 + }, + { + "epoch": 0.6253943217665615, + "grad_norm": 0.5669487608780952, + "learning_rate": 1.883005801519935e-05, + "loss": 0.4435, + "step": 3172 + }, + { + "epoch": 0.6255914826498423, + "grad_norm": 0.5500128986494645, + "learning_rate": 1.8829330385722875e-05, + "loss": 0.4352, + "step": 3173 + }, + { + "epoch": 0.625788643533123, + "grad_norm": 0.659708848221679, + "learning_rate": 1.882860254411463e-05, + "loss": 0.4302, + "step": 3174 + }, + { + "epoch": 0.6259858044164038, + "grad_norm": 0.5634015211579477, + "learning_rate": 1.8827874490392095e-05, + "loss": 0.4344, + "step": 3175 + }, + { + "epoch": 0.6261829652996845, + "grad_norm": 0.5511225422280275, + "learning_rate": 1.882714622457277e-05, + "loss": 0.4493, + "step": 3176 + }, + { + "epoch": 0.6263801261829653, + "grad_norm": 0.5535762556867952, + "learning_rate": 1.882641774667415e-05, + "loss": 0.4786, + "step": 3177 + }, + { + "epoch": 0.626577287066246, + "grad_norm": 0.7296756084321033, + "learning_rate": 1.8825689056713733e-05, + "loss": 0.4107, + "step": 3178 + }, + { + "epoch": 0.6267744479495269, + "grad_norm": 0.5691759534943049, + "learning_rate": 1.8824960154709027e-05, + "loss": 0.4937, + "step": 3179 + }, + { + "epoch": 0.6269716088328076, + "grad_norm": 0.5643398502645974, + "learning_rate": 1.882423104067755e-05, + "loss": 0.4371, + "step": 3180 + }, + { + "epoch": 0.6271687697160884, + "grad_norm": 0.5414110934753018, + "learning_rate": 1.8823501714636815e-05, + "loss": 0.4372, + "step": 3181 + }, + { + "epoch": 0.6273659305993691, + "grad_norm": 0.5716798244654584, + "learning_rate": 1.8822772176604346e-05, + "loss": 0.4386, + "step": 3182 + }, + { + "epoch": 0.6275630914826499, + "grad_norm": 0.5594732072778538, + "learning_rate": 1.882204242659767e-05, + "loss": 0.4415, + "step": 3183 + }, + { + "epoch": 0.6277602523659306, + "grad_norm": 0.5968240035493283, + "learning_rate": 1.8821312464634318e-05, + "loss": 0.4651, + "step": 3184 + }, + { + "epoch": 0.6279574132492114, + "grad_norm": 0.5050689025421782, + "learning_rate": 1.8820582290731836e-05, + "loss": 0.3813, + "step": 3185 + }, + { + "epoch": 0.6281545741324921, + "grad_norm": 0.571720125704504, + "learning_rate": 1.8819851904907756e-05, + "loss": 0.4117, + "step": 3186 + }, + { + "epoch": 0.6283517350157729, + "grad_norm": 0.6099795491237106, + "learning_rate": 1.8819121307179634e-05, + "loss": 0.4739, + "step": 3187 + }, + { + "epoch": 0.6285488958990536, + "grad_norm": 0.5516740130300875, + "learning_rate": 1.881839049756502e-05, + "loss": 0.4132, + "step": 3188 + }, + { + "epoch": 0.6287460567823344, + "grad_norm": 0.6078983072011943, + "learning_rate": 1.8817659476081474e-05, + "loss": 0.4612, + "step": 3189 + }, + { + "epoch": 0.6289432176656151, + "grad_norm": 0.5561987130798062, + "learning_rate": 1.8816928242746554e-05, + "loss": 0.4411, + "step": 3190 + }, + { + "epoch": 0.6291403785488959, + "grad_norm": 0.5499939289696452, + "learning_rate": 1.8816196797577838e-05, + "loss": 0.4462, + "step": 3191 + }, + { + "epoch": 0.6293375394321766, + "grad_norm": 0.565448729109912, + "learning_rate": 1.881546514059289e-05, + "loss": 0.4426, + "step": 3192 + }, + { + "epoch": 0.6295347003154574, + "grad_norm": 0.5466556054488702, + "learning_rate": 1.8814733271809296e-05, + "loss": 0.4633, + "step": 3193 + }, + { + "epoch": 0.6297318611987381, + "grad_norm": 0.6054836321256308, + "learning_rate": 1.8814001191244636e-05, + "loss": 0.4592, + "step": 3194 + }, + { + "epoch": 0.629929022082019, + "grad_norm": 0.6252405670685508, + "learning_rate": 1.8813268898916498e-05, + "loss": 0.4588, + "step": 3195 + }, + { + "epoch": 0.6301261829652997, + "grad_norm": 0.5210693946541807, + "learning_rate": 1.881253639484248e-05, + "loss": 0.4461, + "step": 3196 + }, + { + "epoch": 0.6303233438485805, + "grad_norm": 0.5953514887490388, + "learning_rate": 1.8811803679040178e-05, + "loss": 0.4862, + "step": 3197 + }, + { + "epoch": 0.6305205047318612, + "grad_norm": 0.6102071586071421, + "learning_rate": 1.8811070751527196e-05, + "loss": 0.4214, + "step": 3198 + }, + { + "epoch": 0.630717665615142, + "grad_norm": 0.6100495554396542, + "learning_rate": 1.8810337612321144e-05, + "loss": 0.4648, + "step": 3199 + }, + { + "epoch": 0.6309148264984227, + "grad_norm": 0.5197551849910016, + "learning_rate": 1.8809604261439634e-05, + "loss": 0.4382, + "step": 3200 + }, + { + "epoch": 0.6311119873817035, + "grad_norm": 0.5490144964409054, + "learning_rate": 1.8808870698900288e-05, + "loss": 0.4477, + "step": 3201 + }, + { + "epoch": 0.6313091482649842, + "grad_norm": 0.5628554485158465, + "learning_rate": 1.880813692472073e-05, + "loss": 0.4605, + "step": 3202 + }, + { + "epoch": 0.631506309148265, + "grad_norm": 0.8650405899982267, + "learning_rate": 1.8807402938918588e-05, + "loss": 0.4288, + "step": 3203 + }, + { + "epoch": 0.6317034700315457, + "grad_norm": 0.5546723666140645, + "learning_rate": 1.88066687415115e-05, + "loss": 0.4412, + "step": 3204 + }, + { + "epoch": 0.6319006309148265, + "grad_norm": 0.5808634529336613, + "learning_rate": 1.8805934332517104e-05, + "loss": 0.4733, + "step": 3205 + }, + { + "epoch": 0.6320977917981072, + "grad_norm": 0.6305165897108991, + "learning_rate": 1.880519971195304e-05, + "loss": 0.4469, + "step": 3206 + }, + { + "epoch": 0.632294952681388, + "grad_norm": 0.5672855153253277, + "learning_rate": 1.880446487983697e-05, + "loss": 0.4699, + "step": 3207 + }, + { + "epoch": 0.6324921135646687, + "grad_norm": 0.5959371940317938, + "learning_rate": 1.880372983618653e-05, + "loss": 0.481, + "step": 3208 + }, + { + "epoch": 0.6326892744479495, + "grad_norm": 0.5264066379704362, + "learning_rate": 1.88029945810194e-05, + "loss": 0.4093, + "step": 3209 + }, + { + "epoch": 0.6328864353312302, + "grad_norm": 0.5705739208816702, + "learning_rate": 1.880225911435323e-05, + "loss": 0.4517, + "step": 3210 + }, + { + "epoch": 0.633083596214511, + "grad_norm": 0.6114799226361008, + "learning_rate": 1.88015234362057e-05, + "loss": 0.4527, + "step": 3211 + }, + { + "epoch": 0.6332807570977917, + "grad_norm": 0.5837271501297863, + "learning_rate": 1.8800787546594487e-05, + "loss": 0.4596, + "step": 3212 + }, + { + "epoch": 0.6334779179810726, + "grad_norm": 0.5577201101577096, + "learning_rate": 1.8800051445537256e-05, + "loss": 0.4577, + "step": 3213 + }, + { + "epoch": 0.6336750788643533, + "grad_norm": 0.5736984526638703, + "learning_rate": 1.8799315133051707e-05, + "loss": 0.4451, + "step": 3214 + }, + { + "epoch": 0.6338722397476341, + "grad_norm": 0.7258407481830798, + "learning_rate": 1.8798578609155528e-05, + "loss": 0.4459, + "step": 3215 + }, + { + "epoch": 0.6340694006309149, + "grad_norm": 0.5742874979476474, + "learning_rate": 1.8797841873866406e-05, + "loss": 0.4688, + "step": 3216 + }, + { + "epoch": 0.6342665615141956, + "grad_norm": 0.5334223008485237, + "learning_rate": 1.8797104927202055e-05, + "loss": 0.4496, + "step": 3217 + }, + { + "epoch": 0.6344637223974764, + "grad_norm": 0.5706148443360702, + "learning_rate": 1.879636776918017e-05, + "loss": 0.4617, + "step": 3218 + }, + { + "epoch": 0.6346608832807571, + "grad_norm": 0.8529042316450798, + "learning_rate": 1.8795630399818466e-05, + "loss": 0.4704, + "step": 3219 + }, + { + "epoch": 0.6348580441640379, + "grad_norm": 0.5624250580860771, + "learning_rate": 1.8794892819134657e-05, + "loss": 0.4725, + "step": 3220 + }, + { + "epoch": 0.6350552050473186, + "grad_norm": 0.5838011246869812, + "learning_rate": 1.8794155027146468e-05, + "loss": 0.4641, + "step": 3221 + }, + { + "epoch": 0.6352523659305994, + "grad_norm": 0.564379408051252, + "learning_rate": 1.879341702387162e-05, + "loss": 0.4536, + "step": 3222 + }, + { + "epoch": 0.6354495268138801, + "grad_norm": 0.5730447220643163, + "learning_rate": 1.8792678809327852e-05, + "loss": 0.4228, + "step": 3223 + }, + { + "epoch": 0.6356466876971609, + "grad_norm": 0.9816181582279039, + "learning_rate": 1.879194038353289e-05, + "loss": 0.4869, + "step": 3224 + }, + { + "epoch": 0.6358438485804416, + "grad_norm": 0.5470063459656487, + "learning_rate": 1.8791201746504485e-05, + "loss": 0.4252, + "step": 3225 + }, + { + "epoch": 0.6360410094637224, + "grad_norm": 0.5839122977250125, + "learning_rate": 1.8790462898260373e-05, + "loss": 0.3985, + "step": 3226 + }, + { + "epoch": 0.6362381703470031, + "grad_norm": 0.601019249469374, + "learning_rate": 1.8789723838818314e-05, + "loss": 0.4639, + "step": 3227 + }, + { + "epoch": 0.636435331230284, + "grad_norm": 0.6426956793821633, + "learning_rate": 1.878898456819606e-05, + "loss": 0.4722, + "step": 3228 + }, + { + "epoch": 0.6366324921135647, + "grad_norm": 0.5652057471720214, + "learning_rate": 1.878824508641137e-05, + "loss": 0.4385, + "step": 3229 + }, + { + "epoch": 0.6368296529968455, + "grad_norm": 0.634391516253938, + "learning_rate": 1.8787505393482023e-05, + "loss": 0.4861, + "step": 3230 + }, + { + "epoch": 0.6370268138801262, + "grad_norm": 0.5276142848529989, + "learning_rate": 1.8786765489425776e-05, + "loss": 0.4167, + "step": 3231 + }, + { + "epoch": 0.637223974763407, + "grad_norm": 0.642092919755005, + "learning_rate": 1.8786025374260418e-05, + "loss": 0.4447, + "step": 3232 + }, + { + "epoch": 0.6374211356466877, + "grad_norm": 0.6622984389500397, + "learning_rate": 1.8785285048003722e-05, + "loss": 0.4509, + "step": 3233 + }, + { + "epoch": 0.6376182965299685, + "grad_norm": 0.6059786233435729, + "learning_rate": 1.8784544510673477e-05, + "loss": 0.4328, + "step": 3234 + }, + { + "epoch": 0.6378154574132492, + "grad_norm": 0.5608277860083667, + "learning_rate": 1.8783803762287477e-05, + "loss": 0.4403, + "step": 3235 + }, + { + "epoch": 0.63801261829653, + "grad_norm": 0.5705812031919998, + "learning_rate": 1.8783062802863516e-05, + "loss": 0.447, + "step": 3236 + }, + { + "epoch": 0.6382097791798107, + "grad_norm": 0.6099064166899052, + "learning_rate": 1.8782321632419402e-05, + "loss": 0.4635, + "step": 3237 + }, + { + "epoch": 0.6384069400630915, + "grad_norm": 0.5789456791219298, + "learning_rate": 1.8781580250972933e-05, + "loss": 0.453, + "step": 3238 + }, + { + "epoch": 0.6386041009463722, + "grad_norm": 0.5435966426313446, + "learning_rate": 1.8780838658541932e-05, + "loss": 0.4088, + "step": 3239 + }, + { + "epoch": 0.638801261829653, + "grad_norm": 0.5637574474073005, + "learning_rate": 1.878009685514421e-05, + "loss": 0.4516, + "step": 3240 + }, + { + "epoch": 0.6389984227129337, + "grad_norm": 0.6583384784739884, + "learning_rate": 1.8779354840797588e-05, + "loss": 0.454, + "step": 3241 + }, + { + "epoch": 0.6391955835962145, + "grad_norm": 0.5554247237363399, + "learning_rate": 1.87786126155199e-05, + "loss": 0.4567, + "step": 3242 + }, + { + "epoch": 0.6393927444794952, + "grad_norm": 0.5657140987582041, + "learning_rate": 1.877787017932897e-05, + "loss": 0.4716, + "step": 3243 + }, + { + "epoch": 0.639589905362776, + "grad_norm": 0.5498999624708573, + "learning_rate": 1.8777127532242643e-05, + "loss": 0.4527, + "step": 3244 + }, + { + "epoch": 0.6397870662460567, + "grad_norm": 0.597593319418814, + "learning_rate": 1.8776384674278756e-05, + "loss": 0.4409, + "step": 3245 + }, + { + "epoch": 0.6399842271293376, + "grad_norm": 0.5970410292124881, + "learning_rate": 1.8775641605455162e-05, + "loss": 0.4716, + "step": 3246 + }, + { + "epoch": 0.6401813880126183, + "grad_norm": 0.5712017944124336, + "learning_rate": 1.877489832578971e-05, + "loss": 0.4408, + "step": 3247 + }, + { + "epoch": 0.6403785488958991, + "grad_norm": 0.5264381336103525, + "learning_rate": 1.877415483530026e-05, + "loss": 0.4419, + "step": 3248 + }, + { + "epoch": 0.6405757097791798, + "grad_norm": 0.6131589523064833, + "learning_rate": 1.8773411134004677e-05, + "loss": 0.4544, + "step": 3249 + }, + { + "epoch": 0.6407728706624606, + "grad_norm": 0.5718419218962923, + "learning_rate": 1.8772667221920823e-05, + "loss": 0.4296, + "step": 3250 + }, + { + "epoch": 0.6409700315457413, + "grad_norm": 0.578624341262867, + "learning_rate": 1.8771923099066573e-05, + "loss": 0.4605, + "step": 3251 + }, + { + "epoch": 0.6411671924290221, + "grad_norm": 0.5787190935133748, + "learning_rate": 1.877117876545981e-05, + "loss": 0.4434, + "step": 3252 + }, + { + "epoch": 0.6413643533123028, + "grad_norm": 0.5634817233410009, + "learning_rate": 1.877043422111841e-05, + "loss": 0.4614, + "step": 3253 + }, + { + "epoch": 0.6415615141955836, + "grad_norm": 0.5510386384849804, + "learning_rate": 1.876968946606027e-05, + "loss": 0.4477, + "step": 3254 + }, + { + "epoch": 0.6417586750788643, + "grad_norm": 0.6031650127924183, + "learning_rate": 1.8768944500303276e-05, + "loss": 0.4528, + "step": 3255 + }, + { + "epoch": 0.6419558359621451, + "grad_norm": 0.563947528442472, + "learning_rate": 1.876819932386533e-05, + "loss": 0.4464, + "step": 3256 + }, + { + "epoch": 0.6421529968454258, + "grad_norm": 0.562104581809266, + "learning_rate": 1.8767453936764332e-05, + "loss": 0.4448, + "step": 3257 + }, + { + "epoch": 0.6423501577287066, + "grad_norm": 0.5711382767124764, + "learning_rate": 1.876670833901819e-05, + "loss": 0.432, + "step": 3258 + }, + { + "epoch": 0.6425473186119873, + "grad_norm": 0.5697395824924175, + "learning_rate": 1.8765962530644826e-05, + "loss": 0.446, + "step": 3259 + }, + { + "epoch": 0.6427444794952681, + "grad_norm": 0.5940770780237805, + "learning_rate": 1.8765216511662153e-05, + "loss": 0.4551, + "step": 3260 + }, + { + "epoch": 0.642941640378549, + "grad_norm": 0.9903145042557886, + "learning_rate": 1.876447028208809e-05, + "loss": 0.4633, + "step": 3261 + }, + { + "epoch": 0.6431388012618297, + "grad_norm": 0.5566812407384747, + "learning_rate": 1.8763723841940576e-05, + "loss": 0.4058, + "step": 3262 + }, + { + "epoch": 0.6433359621451105, + "grad_norm": 0.6250449584024177, + "learning_rate": 1.8762977191237536e-05, + "loss": 0.4706, + "step": 3263 + }, + { + "epoch": 0.6435331230283912, + "grad_norm": 2.210243090857098, + "learning_rate": 1.876223032999691e-05, + "loss": 0.4406, + "step": 3264 + }, + { + "epoch": 0.643730283911672, + "grad_norm": 0.6481804636618524, + "learning_rate": 1.876148325823665e-05, + "loss": 0.432, + "step": 3265 + }, + { + "epoch": 0.6439274447949527, + "grad_norm": 0.6303860129263393, + "learning_rate": 1.8760735975974693e-05, + "loss": 0.4137, + "step": 3266 + }, + { + "epoch": 0.6441246056782335, + "grad_norm": 0.6383795817729836, + "learning_rate": 1.8759988483229e-05, + "loss": 0.4554, + "step": 3267 + }, + { + "epoch": 0.6443217665615142, + "grad_norm": 0.6875260817255804, + "learning_rate": 1.8759240780017534e-05, + "loss": 0.443, + "step": 3268 + }, + { + "epoch": 0.644518927444795, + "grad_norm": 0.6662104585984299, + "learning_rate": 1.875849286635825e-05, + "loss": 0.4445, + "step": 3269 + }, + { + "epoch": 0.6447160883280757, + "grad_norm": 0.6171930410732172, + "learning_rate": 1.8757744742269123e-05, + "loss": 0.4374, + "step": 3270 + }, + { + "epoch": 0.6449132492113565, + "grad_norm": 0.7933975678175971, + "learning_rate": 1.8756996407768128e-05, + "loss": 0.4766, + "step": 3271 + }, + { + "epoch": 0.6451104100946372, + "grad_norm": 1.0409394957016507, + "learning_rate": 1.875624786287324e-05, + "loss": 0.4418, + "step": 3272 + }, + { + "epoch": 0.645307570977918, + "grad_norm": 0.5947622881166746, + "learning_rate": 1.875549910760245e-05, + "loss": 0.4575, + "step": 3273 + }, + { + "epoch": 0.6455047318611987, + "grad_norm": 0.5670316521357006, + "learning_rate": 1.875475014197374e-05, + "loss": 0.4488, + "step": 3274 + }, + { + "epoch": 0.6457018927444795, + "grad_norm": 0.6158749237920658, + "learning_rate": 1.8754000966005105e-05, + "loss": 0.4173, + "step": 3275 + }, + { + "epoch": 0.6458990536277602, + "grad_norm": 0.6765411423930585, + "learning_rate": 1.8753251579714548e-05, + "loss": 0.4131, + "step": 3276 + }, + { + "epoch": 0.646096214511041, + "grad_norm": 0.5338915194924335, + "learning_rate": 1.8752501983120076e-05, + "loss": 0.4262, + "step": 3277 + }, + { + "epoch": 0.6462933753943217, + "grad_norm": 0.566003677186254, + "learning_rate": 1.8751752176239693e-05, + "loss": 0.4208, + "step": 3278 + }, + { + "epoch": 0.6464905362776026, + "grad_norm": 0.6018668670309205, + "learning_rate": 1.8751002159091415e-05, + "loss": 0.4591, + "step": 3279 + }, + { + "epoch": 0.6466876971608833, + "grad_norm": 0.6534805827884597, + "learning_rate": 1.8750251931693265e-05, + "loss": 0.4538, + "step": 3280 + }, + { + "epoch": 0.6468848580441641, + "grad_norm": 0.5692551760672482, + "learning_rate": 1.8749501494063266e-05, + "loss": 0.4171, + "step": 3281 + }, + { + "epoch": 0.6470820189274448, + "grad_norm": 0.6447690381178922, + "learning_rate": 1.874875084621945e-05, + "loss": 0.4436, + "step": 3282 + }, + { + "epoch": 0.6472791798107256, + "grad_norm": 0.531474927660677, + "learning_rate": 1.8747999988179846e-05, + "loss": 0.4287, + "step": 3283 + }, + { + "epoch": 0.6474763406940063, + "grad_norm": 0.5470438621167268, + "learning_rate": 1.8747248919962498e-05, + "loss": 0.4417, + "step": 3284 + }, + { + "epoch": 0.6476735015772871, + "grad_norm": 0.5741916924212119, + "learning_rate": 1.874649764158545e-05, + "loss": 0.425, + "step": 3285 + }, + { + "epoch": 0.6478706624605678, + "grad_norm": 0.5348263622053581, + "learning_rate": 1.8745746153066756e-05, + "loss": 0.4169, + "step": 3286 + }, + { + "epoch": 0.6480678233438486, + "grad_norm": 0.5607951226714928, + "learning_rate": 1.8744994454424463e-05, + "loss": 0.4375, + "step": 3287 + }, + { + "epoch": 0.6482649842271293, + "grad_norm": 0.6004093976459436, + "learning_rate": 1.874424254567664e-05, + "loss": 0.4482, + "step": 3288 + }, + { + "epoch": 0.6484621451104101, + "grad_norm": 0.5668829924890922, + "learning_rate": 1.8743490426841346e-05, + "loss": 0.4462, + "step": 3289 + }, + { + "epoch": 0.6486593059936908, + "grad_norm": 0.7152077392581754, + "learning_rate": 1.8742738097936653e-05, + "loss": 0.4899, + "step": 3290 + }, + { + "epoch": 0.6488564668769716, + "grad_norm": 0.5722777962307597, + "learning_rate": 1.874198555898064e-05, + "loss": 0.4618, + "step": 3291 + }, + { + "epoch": 0.6490536277602523, + "grad_norm": 0.6284802948397332, + "learning_rate": 1.874123280999138e-05, + "loss": 0.415, + "step": 3292 + }, + { + "epoch": 0.6492507886435331, + "grad_norm": 0.6162363187667032, + "learning_rate": 1.8740479850986962e-05, + "loss": 0.4557, + "step": 3293 + }, + { + "epoch": 0.6494479495268138, + "grad_norm": 0.616624529665098, + "learning_rate": 1.8739726681985478e-05, + "loss": 0.4604, + "step": 3294 + }, + { + "epoch": 0.6496451104100947, + "grad_norm": 0.5855562263616807, + "learning_rate": 1.8738973303005024e-05, + "loss": 0.4637, + "step": 3295 + }, + { + "epoch": 0.6498422712933754, + "grad_norm": 0.6041841740545615, + "learning_rate": 1.87382197140637e-05, + "loss": 0.4623, + "step": 3296 + }, + { + "epoch": 0.6500394321766562, + "grad_norm": 4.533496170121018, + "learning_rate": 1.873746591517961e-05, + "loss": 0.4471, + "step": 3297 + }, + { + "epoch": 0.6502365930599369, + "grad_norm": 0.6624890927859284, + "learning_rate": 1.873671190637086e-05, + "loss": 0.4267, + "step": 3298 + }, + { + "epoch": 0.6504337539432177, + "grad_norm": 0.5905974277110145, + "learning_rate": 1.8735957687655577e-05, + "loss": 0.4125, + "step": 3299 + }, + { + "epoch": 0.6506309148264984, + "grad_norm": 0.6041600898310406, + "learning_rate": 1.8735203259051872e-05, + "loss": 0.454, + "step": 3300 + }, + { + "epoch": 0.6508280757097792, + "grad_norm": 0.6390231671362254, + "learning_rate": 1.8734448620577875e-05, + "loss": 0.4887, + "step": 3301 + }, + { + "epoch": 0.6510252365930599, + "grad_norm": 0.6117066782843804, + "learning_rate": 1.8733693772251716e-05, + "loss": 0.4556, + "step": 3302 + }, + { + "epoch": 0.6512223974763407, + "grad_norm": 0.5711747386901173, + "learning_rate": 1.873293871409153e-05, + "loss": 0.4204, + "step": 3303 + }, + { + "epoch": 0.6514195583596214, + "grad_norm": 0.6143464512727876, + "learning_rate": 1.8732183446115462e-05, + "loss": 0.4434, + "step": 3304 + }, + { + "epoch": 0.6516167192429022, + "grad_norm": 0.5417403420673808, + "learning_rate": 1.8731427968341654e-05, + "loss": 0.4246, + "step": 3305 + }, + { + "epoch": 0.651813880126183, + "grad_norm": 0.6890669641904964, + "learning_rate": 1.8730672280788254e-05, + "loss": 0.4989, + "step": 3306 + }, + { + "epoch": 0.6520110410094637, + "grad_norm": 0.5499879128399455, + "learning_rate": 1.8729916383473427e-05, + "loss": 0.4529, + "step": 3307 + }, + { + "epoch": 0.6522082018927445, + "grad_norm": 0.5830578121646517, + "learning_rate": 1.8729160276415325e-05, + "loss": 0.4023, + "step": 3308 + }, + { + "epoch": 0.6524053627760252, + "grad_norm": 0.5667181804512541, + "learning_rate": 1.872840395963212e-05, + "loss": 0.4491, + "step": 3309 + }, + { + "epoch": 0.652602523659306, + "grad_norm": 0.5766356519567429, + "learning_rate": 1.872764743314198e-05, + "loss": 0.4666, + "step": 3310 + }, + { + "epoch": 0.6527996845425867, + "grad_norm": 0.5432561051881504, + "learning_rate": 1.872689069696308e-05, + "loss": 0.4311, + "step": 3311 + }, + { + "epoch": 0.6529968454258676, + "grad_norm": 0.5811877061331849, + "learning_rate": 1.8726133751113605e-05, + "loss": 0.4351, + "step": 3312 + }, + { + "epoch": 0.6531940063091483, + "grad_norm": 0.5756974097066245, + "learning_rate": 1.872537659561174e-05, + "loss": 0.4561, + "step": 3313 + }, + { + "epoch": 0.6533911671924291, + "grad_norm": 0.5313182062056155, + "learning_rate": 1.8724619230475675e-05, + "loss": 0.3902, + "step": 3314 + }, + { + "epoch": 0.6535883280757098, + "grad_norm": 0.5497309033214044, + "learning_rate": 1.872386165572361e-05, + "loss": 0.4263, + "step": 3315 + }, + { + "epoch": 0.6537854889589906, + "grad_norm": 0.5466469389230997, + "learning_rate": 1.872310387137374e-05, + "loss": 0.4224, + "step": 3316 + }, + { + "epoch": 0.6539826498422713, + "grad_norm": 2.202702375918148, + "learning_rate": 1.872234587744427e-05, + "loss": 0.4472, + "step": 3317 + }, + { + "epoch": 0.6541798107255521, + "grad_norm": 0.5869148492580374, + "learning_rate": 1.8721587673953425e-05, + "loss": 0.4671, + "step": 3318 + }, + { + "epoch": 0.6543769716088328, + "grad_norm": 0.5427996690221996, + "learning_rate": 1.8720829260919407e-05, + "loss": 0.4289, + "step": 3319 + }, + { + "epoch": 0.6545741324921136, + "grad_norm": 0.5656017555414672, + "learning_rate": 1.8720070638360447e-05, + "loss": 0.4294, + "step": 3320 + }, + { + "epoch": 0.6547712933753943, + "grad_norm": 4.358651338020947, + "learning_rate": 1.8719311806294768e-05, + "loss": 0.5382, + "step": 3321 + }, + { + "epoch": 0.6549684542586751, + "grad_norm": 0.6865845679707644, + "learning_rate": 1.87185527647406e-05, + "loss": 0.4885, + "step": 3322 + }, + { + "epoch": 0.6551656151419558, + "grad_norm": 1.9321821824405176, + "learning_rate": 1.871779351371618e-05, + "loss": 0.4279, + "step": 3323 + }, + { + "epoch": 0.6553627760252366, + "grad_norm": 0.6207436028024012, + "learning_rate": 1.8717034053239748e-05, + "loss": 0.4415, + "step": 3324 + }, + { + "epoch": 0.6555599369085173, + "grad_norm": 0.7726441299810708, + "learning_rate": 1.8716274383329556e-05, + "loss": 0.4166, + "step": 3325 + }, + { + "epoch": 0.6557570977917981, + "grad_norm": 1.3763107640662533, + "learning_rate": 1.8715514504003854e-05, + "loss": 0.4449, + "step": 3326 + }, + { + "epoch": 0.6559542586750788, + "grad_norm": 0.6496821405637063, + "learning_rate": 1.8714754415280894e-05, + "loss": 0.451, + "step": 3327 + }, + { + "epoch": 0.6561514195583596, + "grad_norm": 0.6130036203951224, + "learning_rate": 1.8713994117178945e-05, + "loss": 0.4593, + "step": 3328 + }, + { + "epoch": 0.6563485804416404, + "grad_norm": 0.5725117299905743, + "learning_rate": 1.8713233609716266e-05, + "loss": 0.4278, + "step": 3329 + }, + { + "epoch": 0.6565457413249212, + "grad_norm": 0.5445827811729219, + "learning_rate": 1.8712472892911132e-05, + "loss": 0.4101, + "step": 3330 + }, + { + "epoch": 0.6567429022082019, + "grad_norm": 0.7500084982788475, + "learning_rate": 1.8711711966781826e-05, + "loss": 0.4248, + "step": 3331 + }, + { + "epoch": 0.6569400630914827, + "grad_norm": 0.6621959429007946, + "learning_rate": 1.8710950831346623e-05, + "loss": 0.4908, + "step": 3332 + }, + { + "epoch": 0.6571372239747634, + "grad_norm": 2.2676905068146054, + "learning_rate": 1.871018948662381e-05, + "loss": 0.498, + "step": 3333 + }, + { + "epoch": 0.6573343848580442, + "grad_norm": 0.7294312287786445, + "learning_rate": 1.870942793263168e-05, + "loss": 0.4693, + "step": 3334 + }, + { + "epoch": 0.6575315457413249, + "grad_norm": 0.5923749861108738, + "learning_rate": 1.870866616938853e-05, + "loss": 0.4634, + "step": 3335 + }, + { + "epoch": 0.6577287066246057, + "grad_norm": 0.6623239066356625, + "learning_rate": 1.870790419691266e-05, + "loss": 0.4434, + "step": 3336 + }, + { + "epoch": 0.6579258675078864, + "grad_norm": 0.6507535977225328, + "learning_rate": 1.8707142015222386e-05, + "loss": 0.4619, + "step": 3337 + }, + { + "epoch": 0.6581230283911672, + "grad_norm": 3.387004003722433, + "learning_rate": 1.870637962433601e-05, + "loss": 0.5185, + "step": 3338 + }, + { + "epoch": 0.6583201892744479, + "grad_norm": 0.7986315924274782, + "learning_rate": 1.870561702427185e-05, + "loss": 0.441, + "step": 3339 + }, + { + "epoch": 0.6585173501577287, + "grad_norm": 0.5803415335213525, + "learning_rate": 1.870485421504823e-05, + "loss": 0.4293, + "step": 3340 + }, + { + "epoch": 0.6587145110410094, + "grad_norm": 0.7703993364474427, + "learning_rate": 1.870409119668348e-05, + "loss": 0.4929, + "step": 3341 + }, + { + "epoch": 0.6589116719242902, + "grad_norm": 0.6513075944835972, + "learning_rate": 1.870332796919593e-05, + "loss": 0.4607, + "step": 3342 + }, + { + "epoch": 0.6591088328075709, + "grad_norm": 0.6750849955698458, + "learning_rate": 1.8702564532603917e-05, + "loss": 0.439, + "step": 3343 + }, + { + "epoch": 0.6593059936908517, + "grad_norm": 0.6375890019239625, + "learning_rate": 1.8701800886925784e-05, + "loss": 0.4726, + "step": 3344 + }, + { + "epoch": 0.6595031545741324, + "grad_norm": 0.65991248253205, + "learning_rate": 1.8701037032179873e-05, + "loss": 0.4631, + "step": 3345 + }, + { + "epoch": 0.6597003154574133, + "grad_norm": 0.8742592115103047, + "learning_rate": 1.870027296838454e-05, + "loss": 0.4502, + "step": 3346 + }, + { + "epoch": 0.659897476340694, + "grad_norm": 0.6581694810065215, + "learning_rate": 1.8699508695558145e-05, + "loss": 0.4371, + "step": 3347 + }, + { + "epoch": 0.6600946372239748, + "grad_norm": 0.7348305014231767, + "learning_rate": 1.869874421371905e-05, + "loss": 0.4287, + "step": 3348 + }, + { + "epoch": 0.6602917981072555, + "grad_norm": 0.6216191228474292, + "learning_rate": 1.8697979522885617e-05, + "loss": 0.441, + "step": 3349 + }, + { + "epoch": 0.6604889589905363, + "grad_norm": 0.7451526119076238, + "learning_rate": 1.8697214623076222e-05, + "loss": 0.4419, + "step": 3350 + }, + { + "epoch": 0.660686119873817, + "grad_norm": 0.5518756188697328, + "learning_rate": 1.8696449514309244e-05, + "loss": 0.4176, + "step": 3351 + }, + { + "epoch": 0.6608832807570978, + "grad_norm": 0.6734594739824642, + "learning_rate": 1.869568419660306e-05, + "loss": 0.4669, + "step": 3352 + }, + { + "epoch": 0.6610804416403786, + "grad_norm": 0.679456190018415, + "learning_rate": 1.8694918669976063e-05, + "loss": 0.456, + "step": 3353 + }, + { + "epoch": 0.6612776025236593, + "grad_norm": 0.5841633102935858, + "learning_rate": 1.8694152934446642e-05, + "loss": 0.4252, + "step": 3354 + }, + { + "epoch": 0.6614747634069401, + "grad_norm": 0.5737135644711173, + "learning_rate": 1.8693386990033194e-05, + "loss": 0.4152, + "step": 3355 + }, + { + "epoch": 0.6616719242902208, + "grad_norm": 0.6231408225081773, + "learning_rate": 1.8692620836754124e-05, + "loss": 0.4393, + "step": 3356 + }, + { + "epoch": 0.6618690851735016, + "grad_norm": 0.7428753795745229, + "learning_rate": 1.8691854474627838e-05, + "loss": 0.4596, + "step": 3357 + }, + { + "epoch": 0.6620662460567823, + "grad_norm": 0.6536356718789794, + "learning_rate": 1.8691087903672752e-05, + "loss": 0.4548, + "step": 3358 + }, + { + "epoch": 0.6622634069400631, + "grad_norm": 0.6121171634606317, + "learning_rate": 1.8690321123907277e-05, + "loss": 0.4633, + "step": 3359 + }, + { + "epoch": 0.6624605678233438, + "grad_norm": 0.5626255419013269, + "learning_rate": 1.868955413534984e-05, + "loss": 0.443, + "step": 3360 + }, + { + "epoch": 0.6626577287066246, + "grad_norm": 0.5728350774998016, + "learning_rate": 1.8688786938018866e-05, + "loss": 0.4273, + "step": 3361 + }, + { + "epoch": 0.6628548895899053, + "grad_norm": 0.5765941930596808, + "learning_rate": 1.8688019531932788e-05, + "loss": 0.4511, + "step": 3362 + }, + { + "epoch": 0.6630520504731862, + "grad_norm": 0.5622738400789903, + "learning_rate": 1.8687251917110045e-05, + "loss": 0.4643, + "step": 3363 + }, + { + "epoch": 0.6632492113564669, + "grad_norm": 0.5415771750284268, + "learning_rate": 1.8686484093569078e-05, + "loss": 0.4228, + "step": 3364 + }, + { + "epoch": 0.6634463722397477, + "grad_norm": 0.5508756543268856, + "learning_rate": 1.868571606132834e-05, + "loss": 0.452, + "step": 3365 + }, + { + "epoch": 0.6636435331230284, + "grad_norm": 0.6135272218599022, + "learning_rate": 1.8684947820406273e-05, + "loss": 0.4667, + "step": 3366 + }, + { + "epoch": 0.6638406940063092, + "grad_norm": 0.5960455949438687, + "learning_rate": 1.8684179370821343e-05, + "loss": 0.4686, + "step": 3367 + }, + { + "epoch": 0.6640378548895899, + "grad_norm": 0.5480090387207861, + "learning_rate": 1.8683410712592015e-05, + "loss": 0.448, + "step": 3368 + }, + { + "epoch": 0.6642350157728707, + "grad_norm": 0.6217390232273009, + "learning_rate": 1.8682641845736748e-05, + "loss": 0.4106, + "step": 3369 + }, + { + "epoch": 0.6644321766561514, + "grad_norm": 0.5922247859816271, + "learning_rate": 1.8681872770274013e-05, + "loss": 0.4577, + "step": 3370 + }, + { + "epoch": 0.6646293375394322, + "grad_norm": 0.556590095986522, + "learning_rate": 1.86811034862223e-05, + "loss": 0.4541, + "step": 3371 + }, + { + "epoch": 0.6648264984227129, + "grad_norm": 0.5422277567079224, + "learning_rate": 1.8680333993600084e-05, + "loss": 0.4413, + "step": 3372 + }, + { + "epoch": 0.6650236593059937, + "grad_norm": 0.609846737333361, + "learning_rate": 1.867956429242585e-05, + "loss": 0.4439, + "step": 3373 + }, + { + "epoch": 0.6652208201892744, + "grad_norm": 0.5366562298747772, + "learning_rate": 1.86787943827181e-05, + "loss": 0.4227, + "step": 3374 + }, + { + "epoch": 0.6654179810725552, + "grad_norm": 0.59012017930504, + "learning_rate": 1.8678024264495323e-05, + "loss": 0.4351, + "step": 3375 + }, + { + "epoch": 0.6656151419558359, + "grad_norm": 0.5771194135297009, + "learning_rate": 1.8677253937776024e-05, + "loss": 0.4206, + "step": 3376 + }, + { + "epoch": 0.6658123028391167, + "grad_norm": 0.5678453290335862, + "learning_rate": 1.8676483402578714e-05, + "loss": 0.458, + "step": 3377 + }, + { + "epoch": 0.6660094637223974, + "grad_norm": 0.5848984447281752, + "learning_rate": 1.86757126589219e-05, + "loss": 0.4294, + "step": 3378 + }, + { + "epoch": 0.6662066246056783, + "grad_norm": 0.5612739049068476, + "learning_rate": 1.8674941706824104e-05, + "loss": 0.4335, + "step": 3379 + }, + { + "epoch": 0.666403785488959, + "grad_norm": 0.5554127683707469, + "learning_rate": 1.8674170546303846e-05, + "loss": 0.4216, + "step": 3380 + }, + { + "epoch": 0.6666009463722398, + "grad_norm": 0.5356096683041639, + "learning_rate": 1.8673399177379657e-05, + "loss": 0.417, + "step": 3381 + }, + { + "epoch": 0.6667981072555205, + "grad_norm": 0.5413485188215648, + "learning_rate": 1.8672627600070068e-05, + "loss": 0.4244, + "step": 3382 + }, + { + "epoch": 0.6669952681388013, + "grad_norm": 0.5305208256677184, + "learning_rate": 1.8671855814393617e-05, + "loss": 0.4153, + "step": 3383 + }, + { + "epoch": 0.667192429022082, + "grad_norm": 0.5903750497070852, + "learning_rate": 1.8671083820368846e-05, + "loss": 0.4633, + "step": 3384 + }, + { + "epoch": 0.6673895899053628, + "grad_norm": 1.389773255933898, + "learning_rate": 1.8670311618014307e-05, + "loss": 0.4518, + "step": 3385 + }, + { + "epoch": 0.6675867507886435, + "grad_norm": 0.6734664022072558, + "learning_rate": 1.8669539207348544e-05, + "loss": 0.4303, + "step": 3386 + }, + { + "epoch": 0.6677839116719243, + "grad_norm": 0.5925325374467875, + "learning_rate": 1.8668766588390122e-05, + "loss": 0.4222, + "step": 3387 + }, + { + "epoch": 0.667981072555205, + "grad_norm": 0.5505241807065062, + "learning_rate": 1.8667993761157602e-05, + "loss": 0.4407, + "step": 3388 + }, + { + "epoch": 0.6681782334384858, + "grad_norm": 0.7188824261126605, + "learning_rate": 1.866722072566955e-05, + "loss": 0.4378, + "step": 3389 + }, + { + "epoch": 0.6683753943217665, + "grad_norm": 1.6862636115429694, + "learning_rate": 1.8666447481944542e-05, + "loss": 0.4472, + "step": 3390 + }, + { + "epoch": 0.6685725552050473, + "grad_norm": 0.535071580188235, + "learning_rate": 1.8665674030001154e-05, + "loss": 0.4324, + "step": 3391 + }, + { + "epoch": 0.668769716088328, + "grad_norm": 0.5720087515109528, + "learning_rate": 1.866490036985797e-05, + "loss": 0.4285, + "step": 3392 + }, + { + "epoch": 0.6689668769716088, + "grad_norm": 0.5753023247784349, + "learning_rate": 1.8664126501533576e-05, + "loss": 0.471, + "step": 3393 + }, + { + "epoch": 0.6691640378548895, + "grad_norm": 0.5438723970201407, + "learning_rate": 1.8663352425046564e-05, + "loss": 0.4349, + "step": 3394 + }, + { + "epoch": 0.6693611987381703, + "grad_norm": 0.565118624393037, + "learning_rate": 1.8662578140415535e-05, + "loss": 0.4534, + "step": 3395 + }, + { + "epoch": 0.669558359621451, + "grad_norm": 0.6635843691593936, + "learning_rate": 1.866180364765909e-05, + "loss": 0.4541, + "step": 3396 + }, + { + "epoch": 0.6697555205047319, + "grad_norm": 0.5553454488274417, + "learning_rate": 1.8661028946795837e-05, + "loss": 0.4164, + "step": 3397 + }, + { + "epoch": 0.6699526813880127, + "grad_norm": 2.036398178901347, + "learning_rate": 1.866025403784439e-05, + "loss": 0.4708, + "step": 3398 + }, + { + "epoch": 0.6701498422712934, + "grad_norm": 0.5825487984269258, + "learning_rate": 1.8659478920823364e-05, + "loss": 0.4196, + "step": 3399 + }, + { + "epoch": 0.6703470031545742, + "grad_norm": 0.5882347623158389, + "learning_rate": 1.865870359575138e-05, + "loss": 0.4376, + "step": 3400 + }, + { + "epoch": 0.6705441640378549, + "grad_norm": 0.5537130860684163, + "learning_rate": 1.8657928062647075e-05, + "loss": 0.4292, + "step": 3401 + }, + { + "epoch": 0.6707413249211357, + "grad_norm": 0.6278956340355503, + "learning_rate": 1.8657152321529075e-05, + "loss": 0.4216, + "step": 3402 + }, + { + "epoch": 0.6709384858044164, + "grad_norm": 0.5849954721222042, + "learning_rate": 1.8656376372416017e-05, + "loss": 0.472, + "step": 3403 + }, + { + "epoch": 0.6711356466876972, + "grad_norm": 0.6501311313277848, + "learning_rate": 1.8655600215326547e-05, + "loss": 0.4157, + "step": 3404 + }, + { + "epoch": 0.6713328075709779, + "grad_norm": 0.60533862716353, + "learning_rate": 1.8654823850279312e-05, + "loss": 0.4249, + "step": 3405 + }, + { + "epoch": 0.6715299684542587, + "grad_norm": 0.7045177259010618, + "learning_rate": 1.8654047277292962e-05, + "loss": 0.5035, + "step": 3406 + }, + { + "epoch": 0.6717271293375394, + "grad_norm": 0.5709556862264039, + "learning_rate": 1.8653270496386163e-05, + "loss": 0.4189, + "step": 3407 + }, + { + "epoch": 0.6719242902208202, + "grad_norm": 0.657277026760384, + "learning_rate": 1.8652493507577564e-05, + "loss": 0.4936, + "step": 3408 + }, + { + "epoch": 0.6721214511041009, + "grad_norm": 0.8211869369723545, + "learning_rate": 1.8651716310885845e-05, + "loss": 0.4697, + "step": 3409 + }, + { + "epoch": 0.6723186119873817, + "grad_norm": 0.5634640318066427, + "learning_rate": 1.8650938906329674e-05, + "loss": 0.4266, + "step": 3410 + }, + { + "epoch": 0.6725157728706624, + "grad_norm": 0.6140117782031614, + "learning_rate": 1.865016129392773e-05, + "loss": 0.4469, + "step": 3411 + }, + { + "epoch": 0.6727129337539433, + "grad_norm": 0.6385762621009046, + "learning_rate": 1.864938347369869e-05, + "loss": 0.471, + "step": 3412 + }, + { + "epoch": 0.672910094637224, + "grad_norm": 0.5758795206307361, + "learning_rate": 1.8648605445661256e-05, + "loss": 0.4321, + "step": 3413 + }, + { + "epoch": 0.6731072555205048, + "grad_norm": 0.633725193056609, + "learning_rate": 1.8647827209834105e-05, + "loss": 0.469, + "step": 3414 + }, + { + "epoch": 0.6733044164037855, + "grad_norm": 0.5649636256512869, + "learning_rate": 1.864704876623594e-05, + "loss": 0.3962, + "step": 3415 + }, + { + "epoch": 0.6735015772870663, + "grad_norm": 0.5541617307127727, + "learning_rate": 1.8646270114885467e-05, + "loss": 0.4245, + "step": 3416 + }, + { + "epoch": 0.673698738170347, + "grad_norm": 0.5576784865143702, + "learning_rate": 1.864549125580139e-05, + "loss": 0.4479, + "step": 3417 + }, + { + "epoch": 0.6738958990536278, + "grad_norm": 0.6606800054432356, + "learning_rate": 1.8644712189002426e-05, + "loss": 0.4209, + "step": 3418 + }, + { + "epoch": 0.6740930599369085, + "grad_norm": 0.5996093302287164, + "learning_rate": 1.864393291450729e-05, + "loss": 0.4806, + "step": 3419 + }, + { + "epoch": 0.6742902208201893, + "grad_norm": 0.5796582195346354, + "learning_rate": 1.8643153432334703e-05, + "loss": 0.4231, + "step": 3420 + }, + { + "epoch": 0.67448738170347, + "grad_norm": 0.6804211689949103, + "learning_rate": 1.8642373742503395e-05, + "loss": 0.469, + "step": 3421 + }, + { + "epoch": 0.6746845425867508, + "grad_norm": 0.8018195345405356, + "learning_rate": 1.8641593845032098e-05, + "loss": 0.4612, + "step": 3422 + }, + { + "epoch": 0.6748817034700315, + "grad_norm": 0.5430532048956593, + "learning_rate": 1.864081373993955e-05, + "loss": 0.447, + "step": 3423 + }, + { + "epoch": 0.6750788643533123, + "grad_norm": 0.5081290249970105, + "learning_rate": 1.8640033427244497e-05, + "loss": 0.3789, + "step": 3424 + }, + { + "epoch": 0.675276025236593, + "grad_norm": 0.6128416516260621, + "learning_rate": 1.863925290696568e-05, + "loss": 0.4795, + "step": 3425 + }, + { + "epoch": 0.6754731861198738, + "grad_norm": 0.5433092528759821, + "learning_rate": 1.8638472179121855e-05, + "loss": 0.4399, + "step": 3426 + }, + { + "epoch": 0.6756703470031545, + "grad_norm": 6.721875316952158, + "learning_rate": 1.863769124373178e-05, + "loss": 0.4689, + "step": 3427 + }, + { + "epoch": 0.6758675078864353, + "grad_norm": 0.7065414307044805, + "learning_rate": 1.8636910100814216e-05, + "loss": 0.457, + "step": 3428 + }, + { + "epoch": 0.676064668769716, + "grad_norm": 0.5525113473136398, + "learning_rate": 1.863612875038793e-05, + "loss": 0.4101, + "step": 3429 + }, + { + "epoch": 0.6762618296529969, + "grad_norm": 0.6783813336542089, + "learning_rate": 1.86353471924717e-05, + "loss": 0.4869, + "step": 3430 + }, + { + "epoch": 0.6764589905362776, + "grad_norm": 0.7824903261317586, + "learning_rate": 1.8634565427084295e-05, + "loss": 0.4584, + "step": 3431 + }, + { + "epoch": 0.6766561514195584, + "grad_norm": 0.6900514518883072, + "learning_rate": 1.8633783454244506e-05, + "loss": 0.4408, + "step": 3432 + }, + { + "epoch": 0.6768533123028391, + "grad_norm": 0.6615900688611679, + "learning_rate": 1.8633001273971115e-05, + "loss": 0.4558, + "step": 3433 + }, + { + "epoch": 0.6770504731861199, + "grad_norm": 0.5567854903968781, + "learning_rate": 1.863221888628292e-05, + "loss": 0.4031, + "step": 3434 + }, + { + "epoch": 0.6772476340694006, + "grad_norm": 0.6417523757757003, + "learning_rate": 1.8631436291198707e-05, + "loss": 0.4672, + "step": 3435 + }, + { + "epoch": 0.6774447949526814, + "grad_norm": 0.6024057489717937, + "learning_rate": 1.863065348873729e-05, + "loss": 0.4381, + "step": 3436 + }, + { + "epoch": 0.6776419558359621, + "grad_norm": 0.6645203432793743, + "learning_rate": 1.8629870478917477e-05, + "loss": 0.4424, + "step": 3437 + }, + { + "epoch": 0.6778391167192429, + "grad_norm": 0.5526525607402512, + "learning_rate": 1.8629087261758072e-05, + "loss": 0.4154, + "step": 3438 + }, + { + "epoch": 0.6780362776025236, + "grad_norm": 0.5995871293018885, + "learning_rate": 1.8628303837277893e-05, + "loss": 0.4304, + "step": 3439 + }, + { + "epoch": 0.6782334384858044, + "grad_norm": 0.5743129816438846, + "learning_rate": 1.8627520205495772e-05, + "loss": 0.448, + "step": 3440 + }, + { + "epoch": 0.6784305993690851, + "grad_norm": 0.5829067346816689, + "learning_rate": 1.862673636643053e-05, + "loss": 0.4211, + "step": 3441 + }, + { + "epoch": 0.6786277602523659, + "grad_norm": 0.6220895944597309, + "learning_rate": 1.8625952320100998e-05, + "loss": 0.4241, + "step": 3442 + }, + { + "epoch": 0.6788249211356467, + "grad_norm": 0.8700470116975232, + "learning_rate": 1.8625168066526017e-05, + "loss": 0.4635, + "step": 3443 + }, + { + "epoch": 0.6790220820189274, + "grad_norm": 0.572249467365147, + "learning_rate": 1.8624383605724422e-05, + "loss": 0.4684, + "step": 3444 + }, + { + "epoch": 0.6792192429022083, + "grad_norm": 0.5329766253780658, + "learning_rate": 1.8623598937715072e-05, + "loss": 0.4132, + "step": 3445 + }, + { + "epoch": 0.679416403785489, + "grad_norm": 0.5378353899707224, + "learning_rate": 1.8622814062516807e-05, + "loss": 0.4514, + "step": 3446 + }, + { + "epoch": 0.6796135646687698, + "grad_norm": 0.6043076943443687, + "learning_rate": 1.8622028980148494e-05, + "loss": 0.4195, + "step": 3447 + }, + { + "epoch": 0.6798107255520505, + "grad_norm": 0.5415001575820032, + "learning_rate": 1.8621243690628993e-05, + "loss": 0.4388, + "step": 3448 + }, + { + "epoch": 0.6800078864353313, + "grad_norm": 0.5715084310965, + "learning_rate": 1.8620458193977166e-05, + "loss": 0.4377, + "step": 3449 + }, + { + "epoch": 0.680205047318612, + "grad_norm": 0.5477132137010473, + "learning_rate": 1.861967249021189e-05, + "loss": 0.4166, + "step": 3450 + }, + { + "epoch": 0.6804022082018928, + "grad_norm": 0.513438305978723, + "learning_rate": 1.861888657935204e-05, + "loss": 0.4098, + "step": 3451 + }, + { + "epoch": 0.6805993690851735, + "grad_norm": 0.7909891794930992, + "learning_rate": 1.8618100461416503e-05, + "loss": 0.4311, + "step": 3452 + }, + { + "epoch": 0.6807965299684543, + "grad_norm": 0.5263405808562206, + "learning_rate": 1.8617314136424157e-05, + "loss": 0.4381, + "step": 3453 + }, + { + "epoch": 0.680993690851735, + "grad_norm": 0.5858050604116924, + "learning_rate": 1.8616527604393903e-05, + "loss": 0.4383, + "step": 3454 + }, + { + "epoch": 0.6811908517350158, + "grad_norm": 0.56256951114871, + "learning_rate": 1.8615740865344632e-05, + "loss": 0.4517, + "step": 3455 + }, + { + "epoch": 0.6813880126182965, + "grad_norm": 1.956983000612891, + "learning_rate": 1.861495391929525e-05, + "loss": 0.4863, + "step": 3456 + }, + { + "epoch": 0.6815851735015773, + "grad_norm": 224.166458708411, + "learning_rate": 1.8614166766264662e-05, + "loss": 0.8088, + "step": 3457 + }, + { + "epoch": 0.681782334384858, + "grad_norm": 0.7066319163990039, + "learning_rate": 1.8613379406271784e-05, + "loss": 0.4568, + "step": 3458 + }, + { + "epoch": 0.6819794952681388, + "grad_norm": 0.530318401808118, + "learning_rate": 1.8612591839335526e-05, + "loss": 0.4104, + "step": 3459 + }, + { + "epoch": 0.6821766561514195, + "grad_norm": 0.6416231249123863, + "learning_rate": 1.861180406547481e-05, + "loss": 0.4504, + "step": 3460 + }, + { + "epoch": 0.6823738170347003, + "grad_norm": 0.6016468271692968, + "learning_rate": 1.8611016084708572e-05, + "loss": 0.4866, + "step": 3461 + }, + { + "epoch": 0.682570977917981, + "grad_norm": 0.5665263512082256, + "learning_rate": 1.8610227897055736e-05, + "loss": 0.4272, + "step": 3462 + }, + { + "epoch": 0.6827681388012619, + "grad_norm": 0.5951978340725875, + "learning_rate": 1.8609439502535244e-05, + "loss": 0.47, + "step": 3463 + }, + { + "epoch": 0.6829652996845426, + "grad_norm": 0.5145073667769414, + "learning_rate": 1.8608650901166034e-05, + "loss": 0.3968, + "step": 3464 + }, + { + "epoch": 0.6831624605678234, + "grad_norm": 0.6596358236905893, + "learning_rate": 1.8607862092967048e-05, + "loss": 0.4825, + "step": 3465 + }, + { + "epoch": 0.6833596214511041, + "grad_norm": 0.5633284833079664, + "learning_rate": 1.8607073077957246e-05, + "loss": 0.4295, + "step": 3466 + }, + { + "epoch": 0.6835567823343849, + "grad_norm": 0.5986090013281019, + "learning_rate": 1.8606283856155585e-05, + "loss": 0.4758, + "step": 3467 + }, + { + "epoch": 0.6837539432176656, + "grad_norm": 0.6051379255945182, + "learning_rate": 1.8605494427581022e-05, + "loss": 0.4411, + "step": 3468 + }, + { + "epoch": 0.6839511041009464, + "grad_norm": 0.5119522262994246, + "learning_rate": 1.8604704792252524e-05, + "loss": 0.4063, + "step": 3469 + }, + { + "epoch": 0.6841482649842271, + "grad_norm": 0.5838237384793697, + "learning_rate": 1.8603914950189063e-05, + "loss": 0.4083, + "step": 3470 + }, + { + "epoch": 0.6843454258675079, + "grad_norm": 0.5495244130738517, + "learning_rate": 1.860312490140962e-05, + "loss": 0.4398, + "step": 3471 + }, + { + "epoch": 0.6845425867507886, + "grad_norm": 0.8373973638550604, + "learning_rate": 1.860233464593317e-05, + "loss": 0.4274, + "step": 3472 + }, + { + "epoch": 0.6847397476340694, + "grad_norm": 0.6095069405419896, + "learning_rate": 1.8601544183778707e-05, + "loss": 0.4282, + "step": 3473 + }, + { + "epoch": 0.6849369085173501, + "grad_norm": 0.6313696871900962, + "learning_rate": 1.8600753514965215e-05, + "loss": 0.4644, + "step": 3474 + }, + { + "epoch": 0.6851340694006309, + "grad_norm": 0.6331745807987341, + "learning_rate": 1.8599962639511692e-05, + "loss": 0.4227, + "step": 3475 + }, + { + "epoch": 0.6853312302839116, + "grad_norm": 0.6271712528145608, + "learning_rate": 1.8599171557437147e-05, + "loss": 0.4182, + "step": 3476 + }, + { + "epoch": 0.6855283911671924, + "grad_norm": 0.6822672026358032, + "learning_rate": 1.8598380268760573e-05, + "loss": 0.4609, + "step": 3477 + }, + { + "epoch": 0.6857255520504731, + "grad_norm": 0.6394977318604866, + "learning_rate": 1.8597588773500997e-05, + "loss": 0.4657, + "step": 3478 + }, + { + "epoch": 0.685922712933754, + "grad_norm": 0.5473583517133394, + "learning_rate": 1.8596797071677422e-05, + "loss": 0.4207, + "step": 3479 + }, + { + "epoch": 0.6861198738170347, + "grad_norm": 0.5332831738588625, + "learning_rate": 1.8596005163308874e-05, + "loss": 0.393, + "step": 3480 + }, + { + "epoch": 0.6863170347003155, + "grad_norm": 0.5850491469644991, + "learning_rate": 1.859521304841438e-05, + "loss": 0.4544, + "step": 3481 + }, + { + "epoch": 0.6865141955835962, + "grad_norm": 0.5298743941736833, + "learning_rate": 1.859442072701297e-05, + "loss": 0.4558, + "step": 3482 + }, + { + "epoch": 0.686711356466877, + "grad_norm": 0.5914438587794221, + "learning_rate": 1.8593628199123684e-05, + "loss": 0.4696, + "step": 3483 + }, + { + "epoch": 0.6869085173501577, + "grad_norm": 0.5849436915983998, + "learning_rate": 1.8592835464765557e-05, + "loss": 0.468, + "step": 3484 + }, + { + "epoch": 0.6871056782334385, + "grad_norm": 0.5517879662711394, + "learning_rate": 1.859204252395764e-05, + "loss": 0.4404, + "step": 3485 + }, + { + "epoch": 0.6873028391167192, + "grad_norm": 0.5432311116041397, + "learning_rate": 1.8591249376718984e-05, + "loss": 0.4182, + "step": 3486 + }, + { + "epoch": 0.6875, + "grad_norm": 0.5778253507921007, + "learning_rate": 1.859045602306864e-05, + "loss": 0.4714, + "step": 3487 + }, + { + "epoch": 0.6876971608832808, + "grad_norm": 0.5626709550212079, + "learning_rate": 1.8589662463025674e-05, + "loss": 0.4214, + "step": 3488 + }, + { + "epoch": 0.6878943217665615, + "grad_norm": 0.5828536367056488, + "learning_rate": 1.858886869660915e-05, + "loss": 0.4687, + "step": 3489 + }, + { + "epoch": 0.6880914826498423, + "grad_norm": 0.5546642607606428, + "learning_rate": 1.8588074723838136e-05, + "loss": 0.4452, + "step": 3490 + }, + { + "epoch": 0.688288643533123, + "grad_norm": 0.5748243298511659, + "learning_rate": 1.8587280544731712e-05, + "loss": 0.4597, + "step": 3491 + }, + { + "epoch": 0.6884858044164038, + "grad_norm": 0.5614432197404846, + "learning_rate": 1.858648615930896e-05, + "loss": 0.4094, + "step": 3492 + }, + { + "epoch": 0.6886829652996845, + "grad_norm": 0.5310867099166175, + "learning_rate": 1.8585691567588964e-05, + "loss": 0.4, + "step": 3493 + }, + { + "epoch": 0.6888801261829653, + "grad_norm": 0.5487952682416996, + "learning_rate": 1.858489676959081e-05, + "loss": 0.4263, + "step": 3494 + }, + { + "epoch": 0.689077287066246, + "grad_norm": 0.6769572158367133, + "learning_rate": 1.85841017653336e-05, + "loss": 0.4654, + "step": 3495 + }, + { + "epoch": 0.6892744479495269, + "grad_norm": 0.6393020125226531, + "learning_rate": 1.8583306554836432e-05, + "loss": 0.4583, + "step": 3496 + }, + { + "epoch": 0.6894716088328076, + "grad_norm": 0.559712814410196, + "learning_rate": 1.8582511138118413e-05, + "loss": 0.4298, + "step": 3497 + }, + { + "epoch": 0.6896687697160884, + "grad_norm": 0.602123934295947, + "learning_rate": 1.8581715515198652e-05, + "loss": 0.4675, + "step": 3498 + }, + { + "epoch": 0.6898659305993691, + "grad_norm": 0.5733778203272168, + "learning_rate": 1.8580919686096263e-05, + "loss": 0.4297, + "step": 3499 + }, + { + "epoch": 0.6900630914826499, + "grad_norm": 0.6829776698050027, + "learning_rate": 1.858012365083037e-05, + "loss": 0.4478, + "step": 3500 + }, + { + "epoch": 0.6902602523659306, + "grad_norm": 0.9792198683444031, + "learning_rate": 1.8579327409420094e-05, + "loss": 0.4772, + "step": 3501 + }, + { + "epoch": 0.6904574132492114, + "grad_norm": 0.594469172261072, + "learning_rate": 1.8578530961884574e-05, + "loss": 0.4306, + "step": 3502 + }, + { + "epoch": 0.6906545741324921, + "grad_norm": 0.6517540399351949, + "learning_rate": 1.8577734308242936e-05, + "loss": 0.4324, + "step": 3503 + }, + { + "epoch": 0.6908517350157729, + "grad_norm": 0.5804034407531462, + "learning_rate": 1.8576937448514323e-05, + "loss": 0.4582, + "step": 3504 + }, + { + "epoch": 0.6910488958990536, + "grad_norm": 0.5754381054150248, + "learning_rate": 1.857614038271788e-05, + "loss": 0.4442, + "step": 3505 + }, + { + "epoch": 0.6912460567823344, + "grad_norm": 0.5394282365890705, + "learning_rate": 1.857534311087276e-05, + "loss": 0.3904, + "step": 3506 + }, + { + "epoch": 0.6914432176656151, + "grad_norm": 0.5714584536292786, + "learning_rate": 1.8574545632998116e-05, + "loss": 0.438, + "step": 3507 + }, + { + "epoch": 0.6916403785488959, + "grad_norm": 0.6005244403686105, + "learning_rate": 1.857374794911311e-05, + "loss": 0.4698, + "step": 3508 + }, + { + "epoch": 0.6918375394321766, + "grad_norm": 0.5475318354156559, + "learning_rate": 1.85729500592369e-05, + "loss": 0.4425, + "step": 3509 + }, + { + "epoch": 0.6920347003154574, + "grad_norm": 0.5867289944866695, + "learning_rate": 1.857215196338866e-05, + "loss": 0.4601, + "step": 3510 + }, + { + "epoch": 0.6922318611987381, + "grad_norm": 0.5465094788446564, + "learning_rate": 1.8571353661587573e-05, + "loss": 0.4496, + "step": 3511 + }, + { + "epoch": 0.692429022082019, + "grad_norm": 0.574286072227518, + "learning_rate": 1.8570555153852806e-05, + "loss": 0.4449, + "step": 3512 + }, + { + "epoch": 0.6926261829652997, + "grad_norm": 0.6073127421651645, + "learning_rate": 1.8569756440203554e-05, + "loss": 0.449, + "step": 3513 + }, + { + "epoch": 0.6928233438485805, + "grad_norm": 0.5771548485295239, + "learning_rate": 1.8568957520659e-05, + "loss": 0.4555, + "step": 3514 + }, + { + "epoch": 0.6930205047318612, + "grad_norm": 0.5445589778122638, + "learning_rate": 1.856815839523834e-05, + "loss": 0.4319, + "step": 3515 + }, + { + "epoch": 0.693217665615142, + "grad_norm": 0.5629986928595075, + "learning_rate": 1.8567359063960778e-05, + "loss": 0.439, + "step": 3516 + }, + { + "epoch": 0.6934148264984227, + "grad_norm": 0.5596461637975891, + "learning_rate": 1.8566559526845512e-05, + "loss": 0.4341, + "step": 3517 + }, + { + "epoch": 0.6936119873817035, + "grad_norm": 0.5375000041425667, + "learning_rate": 1.8565759783911756e-05, + "loss": 0.3897, + "step": 3518 + }, + { + "epoch": 0.6938091482649842, + "grad_norm": 0.503604882436781, + "learning_rate": 1.8564959835178725e-05, + "loss": 0.3891, + "step": 3519 + }, + { + "epoch": 0.694006309148265, + "grad_norm": 0.6060651930247586, + "learning_rate": 1.8564159680665633e-05, + "loss": 0.4572, + "step": 3520 + }, + { + "epoch": 0.6942034700315457, + "grad_norm": 0.5897040744548951, + "learning_rate": 1.856335932039171e-05, + "loss": 0.4341, + "step": 3521 + }, + { + "epoch": 0.6944006309148265, + "grad_norm": 0.5726098247448242, + "learning_rate": 1.8562558754376182e-05, + "loss": 0.4337, + "step": 3522 + }, + { + "epoch": 0.6945977917981072, + "grad_norm": 0.5241962440051866, + "learning_rate": 1.8561757982638285e-05, + "loss": 0.4222, + "step": 3523 + }, + { + "epoch": 0.694794952681388, + "grad_norm": 0.5767874135195625, + "learning_rate": 1.856095700519726e-05, + "loss": 0.4669, + "step": 3524 + }, + { + "epoch": 0.6949921135646687, + "grad_norm": 0.605586730484475, + "learning_rate": 1.856015582207235e-05, + "loss": 0.4965, + "step": 3525 + }, + { + "epoch": 0.6951892744479495, + "grad_norm": 0.5293811874847645, + "learning_rate": 1.8559354433282795e-05, + "loss": 0.4187, + "step": 3526 + }, + { + "epoch": 0.6953864353312302, + "grad_norm": 0.557624349231964, + "learning_rate": 1.8558552838847862e-05, + "loss": 0.4413, + "step": 3527 + }, + { + "epoch": 0.695583596214511, + "grad_norm": 0.5724753571082727, + "learning_rate": 1.8557751038786807e-05, + "loss": 0.4325, + "step": 3528 + }, + { + "epoch": 0.6957807570977917, + "grad_norm": 0.5642589878108021, + "learning_rate": 1.8556949033118886e-05, + "loss": 0.4346, + "step": 3529 + }, + { + "epoch": 0.6959779179810726, + "grad_norm": 0.587492971862397, + "learning_rate": 1.855614682186338e-05, + "loss": 0.4709, + "step": 3530 + }, + { + "epoch": 0.6961750788643533, + "grad_norm": 0.5465280182977807, + "learning_rate": 1.8555344405039553e-05, + "loss": 0.4184, + "step": 3531 + }, + { + "epoch": 0.6963722397476341, + "grad_norm": 0.6480278193209652, + "learning_rate": 1.8554541782666685e-05, + "loss": 0.4649, + "step": 3532 + }, + { + "epoch": 0.6965694006309149, + "grad_norm": 0.5135839915061162, + "learning_rate": 1.8553738954764068e-05, + "loss": 0.4147, + "step": 3533 + }, + { + "epoch": 0.6967665615141956, + "grad_norm": 0.5495932553673605, + "learning_rate": 1.855293592135098e-05, + "loss": 0.3947, + "step": 3534 + }, + { + "epoch": 0.6969637223974764, + "grad_norm": 0.5759718073274976, + "learning_rate": 1.8552132682446716e-05, + "loss": 0.4613, + "step": 3535 + }, + { + "epoch": 0.6971608832807571, + "grad_norm": 0.5693917333392535, + "learning_rate": 1.8551329238070583e-05, + "loss": 0.4661, + "step": 3536 + }, + { + "epoch": 0.6973580441640379, + "grad_norm": 0.633091177062187, + "learning_rate": 1.8550525588241878e-05, + "loss": 0.4114, + "step": 3537 + }, + { + "epoch": 0.6975552050473186, + "grad_norm": 0.5719956129908553, + "learning_rate": 1.8549721732979904e-05, + "loss": 0.4638, + "step": 3538 + }, + { + "epoch": 0.6977523659305994, + "grad_norm": 0.5734766418241823, + "learning_rate": 1.8548917672303987e-05, + "loss": 0.4279, + "step": 3539 + }, + { + "epoch": 0.6979495268138801, + "grad_norm": 0.5817240921924031, + "learning_rate": 1.8548113406233436e-05, + "loss": 0.4595, + "step": 3540 + }, + { + "epoch": 0.6981466876971609, + "grad_norm": 0.5498165766780686, + "learning_rate": 1.8547308934787576e-05, + "loss": 0.4183, + "step": 3541 + }, + { + "epoch": 0.6983438485804416, + "grad_norm": 0.5771901083506239, + "learning_rate": 1.8546504257985738e-05, + "loss": 0.4399, + "step": 3542 + }, + { + "epoch": 0.6985410094637224, + "grad_norm": 0.8880467198374812, + "learning_rate": 1.8545699375847247e-05, + "loss": 0.4673, + "step": 3543 + }, + { + "epoch": 0.6987381703470031, + "grad_norm": 0.8215695607693159, + "learning_rate": 1.8544894288391452e-05, + "loss": 0.3944, + "step": 3544 + }, + { + "epoch": 0.698935331230284, + "grad_norm": 2.190816475198652, + "learning_rate": 1.8544088995637693e-05, + "loss": 0.43, + "step": 3545 + }, + { + "epoch": 0.6991324921135647, + "grad_norm": 0.6619407960935211, + "learning_rate": 1.854328349760531e-05, + "loss": 0.474, + "step": 3546 + }, + { + "epoch": 0.6993296529968455, + "grad_norm": 0.7707630061873859, + "learning_rate": 1.8542477794313662e-05, + "loss": 0.4369, + "step": 3547 + }, + { + "epoch": 0.6995268138801262, + "grad_norm": 0.6668874413715213, + "learning_rate": 1.8541671885782106e-05, + "loss": 0.4226, + "step": 3548 + }, + { + "epoch": 0.699723974763407, + "grad_norm": 0.5933363551194153, + "learning_rate": 1.8540865772030004e-05, + "loss": 0.44, + "step": 3549 + }, + { + "epoch": 0.6999211356466877, + "grad_norm": 0.5411012362808466, + "learning_rate": 1.8540059453076728e-05, + "loss": 0.4505, + "step": 3550 + }, + { + "epoch": 0.7001182965299685, + "grad_norm": 2.092070387748023, + "learning_rate": 1.853925292894164e-05, + "loss": 0.4816, + "step": 3551 + }, + { + "epoch": 0.7003154574132492, + "grad_norm": 0.6015879767455612, + "learning_rate": 1.853844619964413e-05, + "loss": 0.4578, + "step": 3552 + }, + { + "epoch": 0.70051261829653, + "grad_norm": 0.6227707549366479, + "learning_rate": 1.853763926520357e-05, + "loss": 0.4621, + "step": 3553 + }, + { + "epoch": 0.7007097791798107, + "grad_norm": 0.6158286256739309, + "learning_rate": 1.8536832125639353e-05, + "loss": 0.4615, + "step": 3554 + }, + { + "epoch": 0.7009069400630915, + "grad_norm": 4.099666789727112, + "learning_rate": 1.8536024780970868e-05, + "loss": 0.4471, + "step": 3555 + }, + { + "epoch": 0.7011041009463722, + "grad_norm": 1.3865107372029537, + "learning_rate": 1.8535217231217512e-05, + "loss": 0.4835, + "step": 3556 + }, + { + "epoch": 0.701301261829653, + "grad_norm": 0.6437956304730564, + "learning_rate": 1.8534409476398693e-05, + "loss": 0.4976, + "step": 3557 + }, + { + "epoch": 0.7014984227129337, + "grad_norm": 0.809900272846465, + "learning_rate": 1.853360151653381e-05, + "loss": 0.4337, + "step": 3558 + }, + { + "epoch": 0.7016955835962145, + "grad_norm": 0.5732397543697593, + "learning_rate": 1.8532793351642283e-05, + "loss": 0.447, + "step": 3559 + }, + { + "epoch": 0.7018927444794952, + "grad_norm": 0.6180992287814596, + "learning_rate": 1.853198498174352e-05, + "loss": 0.4483, + "step": 3560 + }, + { + "epoch": 0.702089905362776, + "grad_norm": 0.5836355816115922, + "learning_rate": 1.853117640685695e-05, + "loss": 0.4698, + "step": 3561 + }, + { + "epoch": 0.7022870662460567, + "grad_norm": 0.6266470149737501, + "learning_rate": 1.853036762700199e-05, + "loss": 0.4526, + "step": 3562 + }, + { + "epoch": 0.7024842271293376, + "grad_norm": 0.5545308522500857, + "learning_rate": 1.8529558642198085e-05, + "loss": 0.4275, + "step": 3563 + }, + { + "epoch": 0.7026813880126183, + "grad_norm": 0.5997846025265139, + "learning_rate": 1.8528749452464667e-05, + "loss": 0.4495, + "step": 3564 + }, + { + "epoch": 0.7028785488958991, + "grad_norm": 0.5932966298805183, + "learning_rate": 1.8527940057821168e-05, + "loss": 0.4404, + "step": 3565 + }, + { + "epoch": 0.7030757097791798, + "grad_norm": 0.6264233832414875, + "learning_rate": 1.8527130458287047e-05, + "loss": 0.4668, + "step": 3566 + }, + { + "epoch": 0.7032728706624606, + "grad_norm": 0.5422421411631603, + "learning_rate": 1.8526320653881745e-05, + "loss": 0.4352, + "step": 3567 + }, + { + "epoch": 0.7034700315457413, + "grad_norm": 0.5335967174546794, + "learning_rate": 1.8525510644624726e-05, + "loss": 0.4091, + "step": 3568 + }, + { + "epoch": 0.7036671924290221, + "grad_norm": 0.8352069980523148, + "learning_rate": 1.852470043053545e-05, + "loss": 0.4723, + "step": 3569 + }, + { + "epoch": 0.7038643533123028, + "grad_norm": 2.2486627770555585, + "learning_rate": 1.8523890011633377e-05, + "loss": 0.4209, + "step": 3570 + }, + { + "epoch": 0.7040615141955836, + "grad_norm": 0.5577369897883039, + "learning_rate": 1.8523079387937984e-05, + "loss": 0.4209, + "step": 3571 + }, + { + "epoch": 0.7042586750788643, + "grad_norm": 0.7637425020315376, + "learning_rate": 1.8522268559468744e-05, + "loss": 0.44, + "step": 3572 + }, + { + "epoch": 0.7044558359621451, + "grad_norm": 0.6449349203733895, + "learning_rate": 1.8521457526245142e-05, + "loss": 0.4603, + "step": 3573 + }, + { + "epoch": 0.7046529968454258, + "grad_norm": 0.5791189282772926, + "learning_rate": 1.852064628828666e-05, + "loss": 0.4327, + "step": 3574 + }, + { + "epoch": 0.7048501577287066, + "grad_norm": 0.7439087018417354, + "learning_rate": 1.851983484561279e-05, + "loss": 0.4688, + "step": 3575 + }, + { + "epoch": 0.7050473186119873, + "grad_norm": 0.5678987372658572, + "learning_rate": 1.8519023198243023e-05, + "loss": 0.4532, + "step": 3576 + }, + { + "epoch": 0.7052444794952681, + "grad_norm": 0.6893907579878671, + "learning_rate": 1.8518211346196865e-05, + "loss": 0.4482, + "step": 3577 + }, + { + "epoch": 0.705441640378549, + "grad_norm": 0.6063321929451956, + "learning_rate": 1.851739928949382e-05, + "loss": 0.452, + "step": 3578 + }, + { + "epoch": 0.7056388012618297, + "grad_norm": 0.5937895590659021, + "learning_rate": 1.85165870281534e-05, + "loss": 0.4067, + "step": 3579 + }, + { + "epoch": 0.7058359621451105, + "grad_norm": 0.571875582830355, + "learning_rate": 1.8515774562195115e-05, + "loss": 0.4296, + "step": 3580 + }, + { + "epoch": 0.7060331230283912, + "grad_norm": 0.6350546066367247, + "learning_rate": 1.851496189163849e-05, + "loss": 0.4802, + "step": 3581 + }, + { + "epoch": 0.706230283911672, + "grad_norm": 0.5728556146008228, + "learning_rate": 1.8514149016503048e-05, + "loss": 0.396, + "step": 3582 + }, + { + "epoch": 0.7064274447949527, + "grad_norm": 0.5607106332513473, + "learning_rate": 1.851333593680832e-05, + "loss": 0.395, + "step": 3583 + }, + { + "epoch": 0.7066246056782335, + "grad_norm": 0.6303565279440619, + "learning_rate": 1.851252265257384e-05, + "loss": 0.4606, + "step": 3584 + }, + { + "epoch": 0.7068217665615142, + "grad_norm": 0.5656202726110657, + "learning_rate": 1.8511709163819146e-05, + "loss": 0.4347, + "step": 3585 + }, + { + "epoch": 0.707018927444795, + "grad_norm": 0.5691199982028473, + "learning_rate": 1.851089547056379e-05, + "loss": 0.4799, + "step": 3586 + }, + { + "epoch": 0.7072160883280757, + "grad_norm": 0.5132663071610697, + "learning_rate": 1.851008157282731e-05, + "loss": 0.4254, + "step": 3587 + }, + { + "epoch": 0.7074132492113565, + "grad_norm": 0.599385661554487, + "learning_rate": 1.8509267470629275e-05, + "loss": 0.4807, + "step": 3588 + }, + { + "epoch": 0.7076104100946372, + "grad_norm": 0.5953814395962257, + "learning_rate": 1.850845316398923e-05, + "loss": 0.4638, + "step": 3589 + }, + { + "epoch": 0.707807570977918, + "grad_norm": 0.5296035745748813, + "learning_rate": 1.8507638652926748e-05, + "loss": 0.4341, + "step": 3590 + }, + { + "epoch": 0.7080047318611987, + "grad_norm": 0.5836957939032046, + "learning_rate": 1.85068239374614e-05, + "loss": 0.4664, + "step": 3591 + }, + { + "epoch": 0.7082018927444795, + "grad_norm": 0.5383659896543327, + "learning_rate": 1.8506009017612752e-05, + "loss": 0.4342, + "step": 3592 + }, + { + "epoch": 0.7083990536277602, + "grad_norm": 0.5802394458133452, + "learning_rate": 1.850519389340039e-05, + "loss": 0.4459, + "step": 3593 + }, + { + "epoch": 0.708596214511041, + "grad_norm": 0.6351661320355233, + "learning_rate": 1.850437856484389e-05, + "loss": 0.4401, + "step": 3594 + }, + { + "epoch": 0.7087933753943217, + "grad_norm": 0.5835902309373062, + "learning_rate": 1.850356303196285e-05, + "loss": 0.4412, + "step": 3595 + }, + { + "epoch": 0.7089905362776026, + "grad_norm": 0.5574546489985274, + "learning_rate": 1.850274729477686e-05, + "loss": 0.4558, + "step": 3596 + }, + { + "epoch": 0.7091876971608833, + "grad_norm": 0.6989042029450458, + "learning_rate": 1.850193135330552e-05, + "loss": 0.4467, + "step": 3597 + }, + { + "epoch": 0.7093848580441641, + "grad_norm": 0.5679323561115319, + "learning_rate": 1.850111520756843e-05, + "loss": 0.4235, + "step": 3598 + }, + { + "epoch": 0.7095820189274448, + "grad_norm": 2.385763474041582, + "learning_rate": 1.8500298857585207e-05, + "loss": 0.442, + "step": 3599 + }, + { + "epoch": 0.7097791798107256, + "grad_norm": 0.6566291998053769, + "learning_rate": 1.8499482303375454e-05, + "loss": 0.4537, + "step": 3600 + }, + { + "epoch": 0.7099763406940063, + "grad_norm": 0.5737937851393634, + "learning_rate": 1.8498665544958793e-05, + "loss": 0.4083, + "step": 3601 + }, + { + "epoch": 0.7101735015772871, + "grad_norm": 0.5528769295761875, + "learning_rate": 1.8497848582354852e-05, + "loss": 0.4165, + "step": 3602 + }, + { + "epoch": 0.7103706624605678, + "grad_norm": 0.5798728107229193, + "learning_rate": 1.8497031415583252e-05, + "loss": 0.4289, + "step": 3603 + }, + { + "epoch": 0.7105678233438486, + "grad_norm": 0.5389476339142518, + "learning_rate": 1.8496214044663633e-05, + "loss": 0.4062, + "step": 3604 + }, + { + "epoch": 0.7107649842271293, + "grad_norm": 0.6195250954116948, + "learning_rate": 1.8495396469615627e-05, + "loss": 0.4685, + "step": 3605 + }, + { + "epoch": 0.7109621451104101, + "grad_norm": 0.6193545738894738, + "learning_rate": 1.849457869045888e-05, + "loss": 0.4704, + "step": 3606 + }, + { + "epoch": 0.7111593059936908, + "grad_norm": 0.5963823991477826, + "learning_rate": 1.849376070721304e-05, + "loss": 0.4376, + "step": 3607 + }, + { + "epoch": 0.7113564668769716, + "grad_norm": 0.5649286189482374, + "learning_rate": 1.849294251989776e-05, + "loss": 0.442, + "step": 3608 + }, + { + "epoch": 0.7115536277602523, + "grad_norm": 0.5655868715091625, + "learning_rate": 1.8492124128532697e-05, + "loss": 0.4472, + "step": 3609 + }, + { + "epoch": 0.7117507886435331, + "grad_norm": 0.5624574778497905, + "learning_rate": 1.849130553313751e-05, + "loss": 0.4198, + "step": 3610 + }, + { + "epoch": 0.7119479495268138, + "grad_norm": 0.5918835988067689, + "learning_rate": 1.849048673373187e-05, + "loss": 0.4553, + "step": 3611 + }, + { + "epoch": 0.7121451104100947, + "grad_norm": 0.5450609283163641, + "learning_rate": 1.848966773033545e-05, + "loss": 0.408, + "step": 3612 + }, + { + "epoch": 0.7123422712933754, + "grad_norm": 0.6211662601865566, + "learning_rate": 1.8488848522967926e-05, + "loss": 0.4925, + "step": 3613 + }, + { + "epoch": 0.7125394321766562, + "grad_norm": 0.9407135144432643, + "learning_rate": 1.848802911164898e-05, + "loss": 0.4563, + "step": 3614 + }, + { + "epoch": 0.7127365930599369, + "grad_norm": 0.6364553089373497, + "learning_rate": 1.8487209496398298e-05, + "loss": 0.4556, + "step": 3615 + }, + { + "epoch": 0.7129337539432177, + "grad_norm": 0.6193764318415926, + "learning_rate": 1.8486389677235577e-05, + "loss": 0.5091, + "step": 3616 + }, + { + "epoch": 0.7131309148264984, + "grad_norm": 0.5810125836188512, + "learning_rate": 1.8485569654180506e-05, + "loss": 0.4331, + "step": 3617 + }, + { + "epoch": 0.7133280757097792, + "grad_norm": 0.5647496664846747, + "learning_rate": 1.8484749427252794e-05, + "loss": 0.4669, + "step": 3618 + }, + { + "epoch": 0.7135252365930599, + "grad_norm": 0.5966913216409047, + "learning_rate": 1.848392899647214e-05, + "loss": 0.4058, + "step": 3619 + }, + { + "epoch": 0.7137223974763407, + "grad_norm": 0.5710136188088236, + "learning_rate": 1.8483108361858263e-05, + "loss": 0.4354, + "step": 3620 + }, + { + "epoch": 0.7139195583596214, + "grad_norm": 0.5532446856241241, + "learning_rate": 1.8482287523430876e-05, + "loss": 0.4198, + "step": 3621 + }, + { + "epoch": 0.7141167192429022, + "grad_norm": 0.6722276257279677, + "learning_rate": 1.8481466481209696e-05, + "loss": 0.46, + "step": 3622 + }, + { + "epoch": 0.714313880126183, + "grad_norm": 0.5639143923716924, + "learning_rate": 1.848064523521446e-05, + "loss": 0.47, + "step": 3623 + }, + { + "epoch": 0.7145110410094637, + "grad_norm": 0.651640797548847, + "learning_rate": 1.847982378546489e-05, + "loss": 0.4575, + "step": 3624 + }, + { + "epoch": 0.7147082018927445, + "grad_norm": 0.6678536066940324, + "learning_rate": 1.8479002131980726e-05, + "loss": 0.4547, + "step": 3625 + }, + { + "epoch": 0.7149053627760252, + "grad_norm": 0.5640299919371896, + "learning_rate": 1.8478180274781707e-05, + "loss": 0.4302, + "step": 3626 + }, + { + "epoch": 0.715102523659306, + "grad_norm": 0.5625943114415008, + "learning_rate": 1.8477358213887578e-05, + "loss": 0.4295, + "step": 3627 + }, + { + "epoch": 0.7152996845425867, + "grad_norm": 0.8717418579189087, + "learning_rate": 1.8476535949318092e-05, + "loss": 0.4288, + "step": 3628 + }, + { + "epoch": 0.7154968454258676, + "grad_norm": 0.7111204636633626, + "learning_rate": 1.8475713481093005e-05, + "loss": 0.4779, + "step": 3629 + }, + { + "epoch": 0.7156940063091483, + "grad_norm": 0.5934289467421391, + "learning_rate": 1.8474890809232073e-05, + "loss": 0.4132, + "step": 3630 + }, + { + "epoch": 0.7158911671924291, + "grad_norm": 0.5316183763624316, + "learning_rate": 1.8474067933755067e-05, + "loss": 0.4343, + "step": 3631 + }, + { + "epoch": 0.7160883280757098, + "grad_norm": 0.6347964274352421, + "learning_rate": 1.8473244854681755e-05, + "loss": 0.4702, + "step": 3632 + }, + { + "epoch": 0.7162854889589906, + "grad_norm": 0.5307412228255478, + "learning_rate": 1.847242157203191e-05, + "loss": 0.4272, + "step": 3633 + }, + { + "epoch": 0.7164826498422713, + "grad_norm": 0.6030911118878556, + "learning_rate": 1.8471598085825318e-05, + "loss": 0.4644, + "step": 3634 + }, + { + "epoch": 0.7166798107255521, + "grad_norm": 0.5631771711931902, + "learning_rate": 1.8470774396081756e-05, + "loss": 0.445, + "step": 3635 + }, + { + "epoch": 0.7168769716088328, + "grad_norm": 0.528315324776306, + "learning_rate": 1.846995050282102e-05, + "loss": 0.413, + "step": 3636 + }, + { + "epoch": 0.7170741324921136, + "grad_norm": 0.542371055858913, + "learning_rate": 1.84691264060629e-05, + "loss": 0.4628, + "step": 3637 + }, + { + "epoch": 0.7172712933753943, + "grad_norm": 0.5752445019945468, + "learning_rate": 1.8468302105827195e-05, + "loss": 0.4715, + "step": 3638 + }, + { + "epoch": 0.7174684542586751, + "grad_norm": 0.5556213004935148, + "learning_rate": 1.8467477602133716e-05, + "loss": 0.451, + "step": 3639 + }, + { + "epoch": 0.7176656151419558, + "grad_norm": 0.5529878854385556, + "learning_rate": 1.8466652895002272e-05, + "loss": 0.4484, + "step": 3640 + }, + { + "epoch": 0.7178627760252366, + "grad_norm": 0.5491695139904219, + "learning_rate": 1.846582798445267e-05, + "loss": 0.4218, + "step": 3641 + }, + { + "epoch": 0.7180599369085173, + "grad_norm": 0.5467401669297904, + "learning_rate": 1.8465002870504734e-05, + "loss": 0.4229, + "step": 3642 + }, + { + "epoch": 0.7182570977917981, + "grad_norm": 0.557372492282451, + "learning_rate": 1.8464177553178287e-05, + "loss": 0.426, + "step": 3643 + }, + { + "epoch": 0.7184542586750788, + "grad_norm": 0.5362074017531132, + "learning_rate": 1.8463352032493162e-05, + "loss": 0.41, + "step": 3644 + }, + { + "epoch": 0.7186514195583596, + "grad_norm": 0.5685836955448099, + "learning_rate": 1.8462526308469182e-05, + "loss": 0.4272, + "step": 3645 + }, + { + "epoch": 0.7188485804416404, + "grad_norm": 0.5370351970866275, + "learning_rate": 1.8461700381126198e-05, + "loss": 0.4278, + "step": 3646 + }, + { + "epoch": 0.7190457413249212, + "grad_norm": 0.5227153310031122, + "learning_rate": 1.8460874250484045e-05, + "loss": 0.4478, + "step": 3647 + }, + { + "epoch": 0.7192429022082019, + "grad_norm": 0.5582355226303339, + "learning_rate": 1.8460047916562573e-05, + "loss": 0.412, + "step": 3648 + }, + { + "epoch": 0.7194400630914827, + "grad_norm": 0.9636937931489509, + "learning_rate": 1.845922137938164e-05, + "loss": 0.4217, + "step": 3649 + }, + { + "epoch": 0.7196372239747634, + "grad_norm": 0.5507780948989435, + "learning_rate": 1.8458394638961102e-05, + "loss": 0.46, + "step": 3650 + }, + { + "epoch": 0.7198343848580442, + "grad_norm": 0.5726098792524265, + "learning_rate": 1.845756769532082e-05, + "loss": 0.4523, + "step": 3651 + }, + { + "epoch": 0.7200315457413249, + "grad_norm": 0.5717699113943913, + "learning_rate": 1.8456740548480666e-05, + "loss": 0.4571, + "step": 3652 + }, + { + "epoch": 0.7202287066246057, + "grad_norm": 0.6191561110356281, + "learning_rate": 1.8455913198460503e-05, + "loss": 0.4511, + "step": 3653 + }, + { + "epoch": 0.7204258675078864, + "grad_norm": 0.707869839857266, + "learning_rate": 1.845508564528022e-05, + "loss": 0.4854, + "step": 3654 + }, + { + "epoch": 0.7206230283911672, + "grad_norm": 0.5271413420379327, + "learning_rate": 1.8454257888959695e-05, + "loss": 0.4362, + "step": 3655 + }, + { + "epoch": 0.7208201892744479, + "grad_norm": 0.6354322809953766, + "learning_rate": 1.845342992951882e-05, + "loss": 0.4481, + "step": 3656 + }, + { + "epoch": 0.7210173501577287, + "grad_norm": 0.5688074081822942, + "learning_rate": 1.845260176697748e-05, + "loss": 0.4423, + "step": 3657 + }, + { + "epoch": 0.7212145110410094, + "grad_norm": 0.5351090443779083, + "learning_rate": 1.8451773401355576e-05, + "loss": 0.4057, + "step": 3658 + }, + { + "epoch": 0.7214116719242902, + "grad_norm": 0.550968760823806, + "learning_rate": 1.845094483267301e-05, + "loss": 0.4556, + "step": 3659 + }, + { + "epoch": 0.7216088328075709, + "grad_norm": 0.5487359171002028, + "learning_rate": 1.845011606094969e-05, + "loss": 0.4499, + "step": 3660 + }, + { + "epoch": 0.7218059936908517, + "grad_norm": 0.544581400232813, + "learning_rate": 1.8449287086205525e-05, + "loss": 0.4242, + "step": 3661 + }, + { + "epoch": 0.7220031545741324, + "grad_norm": 0.5414669158249051, + "learning_rate": 1.8448457908460434e-05, + "loss": 0.4179, + "step": 3662 + }, + { + "epoch": 0.7222003154574133, + "grad_norm": 0.5796223289254395, + "learning_rate": 1.844762852773434e-05, + "loss": 0.4388, + "step": 3663 + }, + { + "epoch": 0.722397476340694, + "grad_norm": 0.5501910206556003, + "learning_rate": 1.8446798944047163e-05, + "loss": 0.4614, + "step": 3664 + }, + { + "epoch": 0.7225946372239748, + "grad_norm": 0.7117890177693054, + "learning_rate": 1.8445969157418845e-05, + "loss": 0.4557, + "step": 3665 + }, + { + "epoch": 0.7227917981072555, + "grad_norm": 0.5801289426969807, + "learning_rate": 1.844513916786931e-05, + "loss": 0.4469, + "step": 3666 + }, + { + "epoch": 0.7229889589905363, + "grad_norm": 0.565154672928485, + "learning_rate": 1.844430897541851e-05, + "loss": 0.411, + "step": 3667 + }, + { + "epoch": 0.723186119873817, + "grad_norm": 0.5512031287037732, + "learning_rate": 1.8443478580086388e-05, + "loss": 0.4382, + "step": 3668 + }, + { + "epoch": 0.7233832807570978, + "grad_norm": 0.5610735735424272, + "learning_rate": 1.844264798189289e-05, + "loss": 0.4713, + "step": 3669 + }, + { + "epoch": 0.7235804416403786, + "grad_norm": 0.5706453195319626, + "learning_rate": 1.8441817180857977e-05, + "loss": 0.4797, + "step": 3670 + }, + { + "epoch": 0.7237776025236593, + "grad_norm": 0.5321887393174666, + "learning_rate": 1.844098617700161e-05, + "loss": 0.4554, + "step": 3671 + }, + { + "epoch": 0.7239747634069401, + "grad_norm": 0.54415284289807, + "learning_rate": 1.8440154970343747e-05, + "loss": 0.4344, + "step": 3672 + }, + { + "epoch": 0.7241719242902208, + "grad_norm": 0.5250957449330941, + "learning_rate": 1.843932356090437e-05, + "loss": 0.4252, + "step": 3673 + }, + { + "epoch": 0.7243690851735016, + "grad_norm": 0.5266852693718858, + "learning_rate": 1.8438491948703445e-05, + "loss": 0.4257, + "step": 3674 + }, + { + "epoch": 0.7245662460567823, + "grad_norm": 0.5325304829293485, + "learning_rate": 1.8437660133760955e-05, + "loss": 0.4373, + "step": 3675 + }, + { + "epoch": 0.7247634069400631, + "grad_norm": 0.5793560541922426, + "learning_rate": 1.8436828116096886e-05, + "loss": 0.4349, + "step": 3676 + }, + { + "epoch": 0.7249605678233438, + "grad_norm": 0.560121961862576, + "learning_rate": 1.843599589573123e-05, + "loss": 0.4494, + "step": 3677 + }, + { + "epoch": 0.7251577287066246, + "grad_norm": 0.5249351610445621, + "learning_rate": 1.843516347268398e-05, + "loss": 0.4246, + "step": 3678 + }, + { + "epoch": 0.7253548895899053, + "grad_norm": 0.5412985163854938, + "learning_rate": 1.8434330846975128e-05, + "loss": 0.4273, + "step": 3679 + }, + { + "epoch": 0.7255520504731862, + "grad_norm": 0.5315287597533882, + "learning_rate": 1.843349801862469e-05, + "loss": 0.4564, + "step": 3680 + }, + { + "epoch": 0.7257492113564669, + "grad_norm": 0.5682525884568123, + "learning_rate": 1.843266498765267e-05, + "loss": 0.4446, + "step": 3681 + }, + { + "epoch": 0.7259463722397477, + "grad_norm": 0.5374471401172799, + "learning_rate": 1.8431831754079084e-05, + "loss": 0.4586, + "step": 3682 + }, + { + "epoch": 0.7261435331230284, + "grad_norm": 0.5687422399150629, + "learning_rate": 1.843099831792395e-05, + "loss": 0.4518, + "step": 3683 + }, + { + "epoch": 0.7263406940063092, + "grad_norm": 0.5448483856895682, + "learning_rate": 1.843016467920729e-05, + "loss": 0.4522, + "step": 3684 + }, + { + "epoch": 0.7265378548895899, + "grad_norm": 0.5303752136858124, + "learning_rate": 1.8429330837949134e-05, + "loss": 0.4303, + "step": 3685 + }, + { + "epoch": 0.7267350157728707, + "grad_norm": 0.5441603901459253, + "learning_rate": 1.842849679416952e-05, + "loss": 0.4265, + "step": 3686 + }, + { + "epoch": 0.7269321766561514, + "grad_norm": 0.5565608021825121, + "learning_rate": 1.842766254788848e-05, + "loss": 0.4704, + "step": 3687 + }, + { + "epoch": 0.7271293375394322, + "grad_norm": 0.5392512013397398, + "learning_rate": 1.8426828099126058e-05, + "loss": 0.4282, + "step": 3688 + }, + { + "epoch": 0.7273264984227129, + "grad_norm": 0.5228171340025033, + "learning_rate": 1.8425993447902312e-05, + "loss": 0.4262, + "step": 3689 + }, + { + "epoch": 0.7275236593059937, + "grad_norm": 0.5057395283789563, + "learning_rate": 1.8425158594237285e-05, + "loss": 0.3781, + "step": 3690 + }, + { + "epoch": 0.7277208201892744, + "grad_norm": 0.5637585116791193, + "learning_rate": 1.8424323538151038e-05, + "loss": 0.4578, + "step": 3691 + }, + { + "epoch": 0.7279179810725552, + "grad_norm": 0.5770703957710751, + "learning_rate": 1.842348827966363e-05, + "loss": 0.4474, + "step": 3692 + }, + { + "epoch": 0.7281151419558359, + "grad_norm": 0.5757553217171197, + "learning_rate": 1.8422652818795136e-05, + "loss": 0.4266, + "step": 3693 + }, + { + "epoch": 0.7283123028391167, + "grad_norm": 0.5212848130468406, + "learning_rate": 1.8421817155565627e-05, + "loss": 0.4297, + "step": 3694 + }, + { + "epoch": 0.7285094637223974, + "grad_norm": 0.6230547006563527, + "learning_rate": 1.8420981289995174e-05, + "loss": 0.5156, + "step": 3695 + }, + { + "epoch": 0.7287066246056783, + "grad_norm": 0.5232291475507522, + "learning_rate": 1.842014522210387e-05, + "loss": 0.4114, + "step": 3696 + }, + { + "epoch": 0.728903785488959, + "grad_norm": 0.5347786514736478, + "learning_rate": 1.841930895191179e-05, + "loss": 0.4235, + "step": 3697 + }, + { + "epoch": 0.7291009463722398, + "grad_norm": 0.5462129983268524, + "learning_rate": 1.841847247943904e-05, + "loss": 0.4421, + "step": 3698 + }, + { + "epoch": 0.7292981072555205, + "grad_norm": 0.49073103119405975, + "learning_rate": 1.84176358047057e-05, + "loss": 0.3842, + "step": 3699 + }, + { + "epoch": 0.7294952681388013, + "grad_norm": 0.6150333675068365, + "learning_rate": 1.8416798927731888e-05, + "loss": 0.4886, + "step": 3700 + }, + { + "epoch": 0.729692429022082, + "grad_norm": 0.5361700966964398, + "learning_rate": 1.8415961848537702e-05, + "loss": 0.4386, + "step": 3701 + }, + { + "epoch": 0.7298895899053628, + "grad_norm": 0.5347576067338814, + "learning_rate": 1.8415124567143258e-05, + "loss": 0.4184, + "step": 3702 + }, + { + "epoch": 0.7300867507886435, + "grad_norm": 0.5621215334987752, + "learning_rate": 1.8414287083568666e-05, + "loss": 0.4261, + "step": 3703 + }, + { + "epoch": 0.7302839116719243, + "grad_norm": 0.5700646061060279, + "learning_rate": 1.841344939783405e-05, + "loss": 0.4179, + "step": 3704 + }, + { + "epoch": 0.730481072555205, + "grad_norm": 0.518131991780773, + "learning_rate": 1.841261150995954e-05, + "loss": 0.4339, + "step": 3705 + }, + { + "epoch": 0.7306782334384858, + "grad_norm": 0.5459761294030725, + "learning_rate": 1.8411773419965263e-05, + "loss": 0.397, + "step": 3706 + }, + { + "epoch": 0.7308753943217665, + "grad_norm": 0.9314288519022254, + "learning_rate": 1.8410935127871356e-05, + "loss": 0.442, + "step": 3707 + }, + { + "epoch": 0.7310725552050473, + "grad_norm": 0.5636871116155396, + "learning_rate": 1.8410096633697956e-05, + "loss": 0.4173, + "step": 3708 + }, + { + "epoch": 0.731269716088328, + "grad_norm": 0.5193160515915649, + "learning_rate": 1.8409257937465216e-05, + "loss": 0.3913, + "step": 3709 + }, + { + "epoch": 0.7314668769716088, + "grad_norm": 0.712727581510582, + "learning_rate": 1.840841903919328e-05, + "loss": 0.4502, + "step": 3710 + }, + { + "epoch": 0.7316640378548895, + "grad_norm": 0.5769818802867137, + "learning_rate": 1.8407579938902302e-05, + "loss": 0.4527, + "step": 3711 + }, + { + "epoch": 0.7318611987381703, + "grad_norm": 0.5392705119979897, + "learning_rate": 1.8406740636612447e-05, + "loss": 0.4538, + "step": 3712 + }, + { + "epoch": 0.732058359621451, + "grad_norm": 0.6098765553285055, + "learning_rate": 1.8405901132343882e-05, + "loss": 0.4413, + "step": 3713 + }, + { + "epoch": 0.7322555205047319, + "grad_norm": 0.5771216137328318, + "learning_rate": 1.840506142611677e-05, + "loss": 0.4454, + "step": 3714 + }, + { + "epoch": 0.7324526813880127, + "grad_norm": 0.62152619615878, + "learning_rate": 1.840422151795129e-05, + "loss": 0.4713, + "step": 3715 + }, + { + "epoch": 0.7326498422712934, + "grad_norm": 0.5895477541474831, + "learning_rate": 1.840338140786762e-05, + "loss": 0.4415, + "step": 3716 + }, + { + "epoch": 0.7328470031545742, + "grad_norm": 0.7408210800785281, + "learning_rate": 1.8402541095885943e-05, + "loss": 0.4733, + "step": 3717 + }, + { + "epoch": 0.7330441640378549, + "grad_norm": 0.56864244318536, + "learning_rate": 1.8401700582026452e-05, + "loss": 0.4047, + "step": 3718 + }, + { + "epoch": 0.7332413249211357, + "grad_norm": 0.6695180859601412, + "learning_rate": 1.8400859866309337e-05, + "loss": 0.4215, + "step": 3719 + }, + { + "epoch": 0.7334384858044164, + "grad_norm": 0.6556762484776179, + "learning_rate": 1.84000189487548e-05, + "loss": 0.5069, + "step": 3720 + }, + { + "epoch": 0.7336356466876972, + "grad_norm": 0.5610750056727031, + "learning_rate": 1.8399177829383043e-05, + "loss": 0.4505, + "step": 3721 + }, + { + "epoch": 0.7338328075709779, + "grad_norm": 0.5779111361169296, + "learning_rate": 1.839833650821427e-05, + "loss": 0.4109, + "step": 3722 + }, + { + "epoch": 0.7340299684542587, + "grad_norm": 0.5512782925585601, + "learning_rate": 1.8397494985268705e-05, + "loss": 0.4559, + "step": 3723 + }, + { + "epoch": 0.7342271293375394, + "grad_norm": 0.5691768441468529, + "learning_rate": 1.839665326056656e-05, + "loss": 0.4112, + "step": 3724 + }, + { + "epoch": 0.7344242902208202, + "grad_norm": 0.5624881189319345, + "learning_rate": 1.8395811334128058e-05, + "loss": 0.4293, + "step": 3725 + }, + { + "epoch": 0.7346214511041009, + "grad_norm": 0.7923653826550998, + "learning_rate": 1.8394969205973426e-05, + "loss": 0.418, + "step": 3726 + }, + { + "epoch": 0.7348186119873817, + "grad_norm": 0.5815459567626416, + "learning_rate": 1.8394126876122896e-05, + "loss": 0.4526, + "step": 3727 + }, + { + "epoch": 0.7350157728706624, + "grad_norm": 0.6090000924351008, + "learning_rate": 1.8393284344596715e-05, + "loss": 0.4544, + "step": 3728 + }, + { + "epoch": 0.7352129337539433, + "grad_norm": 0.5232417470669563, + "learning_rate": 1.8392441611415113e-05, + "loss": 0.4496, + "step": 3729 + }, + { + "epoch": 0.735410094637224, + "grad_norm": 0.7399663560330513, + "learning_rate": 1.8391598676598344e-05, + "loss": 0.4242, + "step": 3730 + }, + { + "epoch": 0.7356072555205048, + "grad_norm": 0.5615281033101944, + "learning_rate": 1.8390755540166663e-05, + "loss": 0.4376, + "step": 3731 + }, + { + "epoch": 0.7358044164037855, + "grad_norm": 0.599463559470186, + "learning_rate": 1.8389912202140318e-05, + "loss": 0.446, + "step": 3732 + }, + { + "epoch": 0.7360015772870663, + "grad_norm": 0.5377177978774068, + "learning_rate": 1.838906866253958e-05, + "loss": 0.4516, + "step": 3733 + }, + { + "epoch": 0.736198738170347, + "grad_norm": 0.5681460967594567, + "learning_rate": 1.8388224921384707e-05, + "loss": 0.4066, + "step": 3734 + }, + { + "epoch": 0.7363958990536278, + "grad_norm": 0.5141694762603598, + "learning_rate": 1.8387380978695977e-05, + "loss": 0.4322, + "step": 3735 + }, + { + "epoch": 0.7365930599369085, + "grad_norm": 0.8238886062154376, + "learning_rate": 1.8386536834493667e-05, + "loss": 0.4507, + "step": 3736 + }, + { + "epoch": 0.7367902208201893, + "grad_norm": 0.535199442346653, + "learning_rate": 1.8385692488798056e-05, + "loss": 0.4553, + "step": 3737 + }, + { + "epoch": 0.73698738170347, + "grad_norm": 0.6483426681293745, + "learning_rate": 1.8384847941629423e-05, + "loss": 0.4634, + "step": 3738 + }, + { + "epoch": 0.7371845425867508, + "grad_norm": 0.5764953582261776, + "learning_rate": 1.8384003193008072e-05, + "loss": 0.442, + "step": 3739 + }, + { + "epoch": 0.7373817034700315, + "grad_norm": 0.5572119301548107, + "learning_rate": 1.8383158242954296e-05, + "loss": 0.4508, + "step": 3740 + }, + { + "epoch": 0.7375788643533123, + "grad_norm": 0.5681528660063264, + "learning_rate": 1.8382313091488385e-05, + "loss": 0.4532, + "step": 3741 + }, + { + "epoch": 0.737776025236593, + "grad_norm": 0.5891890950746687, + "learning_rate": 1.8381467738630656e-05, + "loss": 0.4669, + "step": 3742 + }, + { + "epoch": 0.7379731861198738, + "grad_norm": 0.5049888382066734, + "learning_rate": 1.8380622184401416e-05, + "loss": 0.3829, + "step": 3743 + }, + { + "epoch": 0.7381703470031545, + "grad_norm": 0.563208762050558, + "learning_rate": 1.8379776428820974e-05, + "loss": 0.4047, + "step": 3744 + }, + { + "epoch": 0.7383675078864353, + "grad_norm": 0.5779457902661825, + "learning_rate": 1.8378930471909658e-05, + "loss": 0.4206, + "step": 3745 + }, + { + "epoch": 0.738564668769716, + "grad_norm": 0.7774905699447151, + "learning_rate": 1.837808431368779e-05, + "loss": 0.4779, + "step": 3746 + }, + { + "epoch": 0.7387618296529969, + "grad_norm": 0.5629107990200406, + "learning_rate": 1.83772379541757e-05, + "loss": 0.3925, + "step": 3747 + }, + { + "epoch": 0.7389589905362776, + "grad_norm": 0.5970620392228986, + "learning_rate": 1.837639139339372e-05, + "loss": 0.4488, + "step": 3748 + }, + { + "epoch": 0.7391561514195584, + "grad_norm": 0.5557125511475749, + "learning_rate": 1.8375544631362195e-05, + "loss": 0.452, + "step": 3749 + }, + { + "epoch": 0.7393533123028391, + "grad_norm": 0.5872831543728793, + "learning_rate": 1.8374697668101463e-05, + "loss": 0.4594, + "step": 3750 + }, + { + "epoch": 0.7395504731861199, + "grad_norm": 0.5286161720159008, + "learning_rate": 1.8373850503631872e-05, + "loss": 0.4322, + "step": 3751 + }, + { + "epoch": 0.7397476340694006, + "grad_norm": 0.560014306046355, + "learning_rate": 1.8373003137973783e-05, + "loss": 0.4316, + "step": 3752 + }, + { + "epoch": 0.7399447949526814, + "grad_norm": 0.5456631340877649, + "learning_rate": 1.8372155571147554e-05, + "loss": 0.4331, + "step": 3753 + }, + { + "epoch": 0.7401419558359621, + "grad_norm": 0.5985522047951393, + "learning_rate": 1.837130780317354e-05, + "loss": 0.4628, + "step": 3754 + }, + { + "epoch": 0.7403391167192429, + "grad_norm": 0.5672181573442395, + "learning_rate": 1.8370459834072118e-05, + "loss": 0.4454, + "step": 3755 + }, + { + "epoch": 0.7405362776025236, + "grad_norm": 0.5635085742932182, + "learning_rate": 1.8369611663863656e-05, + "loss": 0.4223, + "step": 3756 + }, + { + "epoch": 0.7407334384858044, + "grad_norm": 0.5873022605290665, + "learning_rate": 1.8368763292568532e-05, + "loss": 0.4478, + "step": 3757 + }, + { + "epoch": 0.7409305993690851, + "grad_norm": 0.5346924744494808, + "learning_rate": 1.8367914720207137e-05, + "loss": 0.3994, + "step": 3758 + }, + { + "epoch": 0.7411277602523659, + "grad_norm": 0.5791612680965824, + "learning_rate": 1.8367065946799845e-05, + "loss": 0.417, + "step": 3759 + }, + { + "epoch": 0.7413249211356467, + "grad_norm": 0.5887526324679428, + "learning_rate": 1.8366216972367058e-05, + "loss": 0.4324, + "step": 3760 + }, + { + "epoch": 0.7415220820189274, + "grad_norm": 0.631982631731162, + "learning_rate": 1.836536779692917e-05, + "loss": 0.4398, + "step": 3761 + }, + { + "epoch": 0.7417192429022083, + "grad_norm": 0.5498266504156066, + "learning_rate": 1.836451842050659e-05, + "loss": 0.444, + "step": 3762 + }, + { + "epoch": 0.741916403785489, + "grad_norm": 0.5248298760030489, + "learning_rate": 1.8363668843119713e-05, + "loss": 0.4397, + "step": 3763 + }, + { + "epoch": 0.7421135646687698, + "grad_norm": 0.5574383621046671, + "learning_rate": 1.8362819064788956e-05, + "loss": 0.4321, + "step": 3764 + }, + { + "epoch": 0.7423107255520505, + "grad_norm": 0.5509414305533834, + "learning_rate": 1.8361969085534742e-05, + "loss": 0.4449, + "step": 3765 + }, + { + "epoch": 0.7425078864353313, + "grad_norm": 0.6218658695377531, + "learning_rate": 1.8361118905377483e-05, + "loss": 0.4698, + "step": 3766 + }, + { + "epoch": 0.742705047318612, + "grad_norm": 0.6025705112487485, + "learning_rate": 1.8360268524337606e-05, + "loss": 0.4425, + "step": 3767 + }, + { + "epoch": 0.7429022082018928, + "grad_norm": 0.5354499032310642, + "learning_rate": 1.835941794243555e-05, + "loss": 0.4212, + "step": 3768 + }, + { + "epoch": 0.7430993690851735, + "grad_norm": 0.6198349365571671, + "learning_rate": 1.8358567159691745e-05, + "loss": 0.4173, + "step": 3769 + }, + { + "epoch": 0.7432965299684543, + "grad_norm": 0.5633007693206912, + "learning_rate": 1.8357716176126633e-05, + "loss": 0.4407, + "step": 3770 + }, + { + "epoch": 0.743493690851735, + "grad_norm": 0.6033616935263962, + "learning_rate": 1.8356864991760658e-05, + "loss": 0.4832, + "step": 3771 + }, + { + "epoch": 0.7436908517350158, + "grad_norm": 0.5318673355760555, + "learning_rate": 1.8356013606614277e-05, + "loss": 0.4512, + "step": 3772 + }, + { + "epoch": 0.7438880126182965, + "grad_norm": 0.9499333131372167, + "learning_rate": 1.8355162020707932e-05, + "loss": 0.4394, + "step": 3773 + }, + { + "epoch": 0.7440851735015773, + "grad_norm": 0.5559579298686984, + "learning_rate": 1.8354310234062097e-05, + "loss": 0.4233, + "step": 3774 + }, + { + "epoch": 0.744282334384858, + "grad_norm": 0.6229971755454574, + "learning_rate": 1.835345824669723e-05, + "loss": 0.44, + "step": 3775 + }, + { + "epoch": 0.7444794952681388, + "grad_norm": 0.5923143947918056, + "learning_rate": 1.83526060586338e-05, + "loss": 0.4606, + "step": 3776 + }, + { + "epoch": 0.7446766561514195, + "grad_norm": 0.5501204352607113, + "learning_rate": 1.8351753669892284e-05, + "loss": 0.4291, + "step": 3777 + }, + { + "epoch": 0.7448738170347003, + "grad_norm": 0.6932256286950385, + "learning_rate": 1.8350901080493158e-05, + "loss": 0.461, + "step": 3778 + }, + { + "epoch": 0.745070977917981, + "grad_norm": 0.6073398760332815, + "learning_rate": 1.8350048290456912e-05, + "loss": 0.4141, + "step": 3779 + }, + { + "epoch": 0.7452681388012619, + "grad_norm": 0.5251923915616548, + "learning_rate": 1.834919529980403e-05, + "loss": 0.4481, + "step": 3780 + }, + { + "epoch": 0.7454652996845426, + "grad_norm": 0.5442185285627984, + "learning_rate": 1.8348342108555007e-05, + "loss": 0.4026, + "step": 3781 + }, + { + "epoch": 0.7456624605678234, + "grad_norm": 0.5693867434023069, + "learning_rate": 1.8347488716730343e-05, + "loss": 0.4311, + "step": 3782 + }, + { + "epoch": 0.7458596214511041, + "grad_norm": 0.5938661250513164, + "learning_rate": 1.834663512435054e-05, + "loss": 0.4613, + "step": 3783 + }, + { + "epoch": 0.7460567823343849, + "grad_norm": 0.5455676983214097, + "learning_rate": 1.8345781331436106e-05, + "loss": 0.4195, + "step": 3784 + }, + { + "epoch": 0.7462539432176656, + "grad_norm": 0.5860282571566524, + "learning_rate": 1.8344927338007554e-05, + "loss": 0.4211, + "step": 3785 + }, + { + "epoch": 0.7464511041009464, + "grad_norm": 0.5333294020645926, + "learning_rate": 1.8344073144085406e-05, + "loss": 0.4371, + "step": 3786 + }, + { + "epoch": 0.7466482649842271, + "grad_norm": 0.5706728165649482, + "learning_rate": 1.834321874969018e-05, + "loss": 0.4389, + "step": 3787 + }, + { + "epoch": 0.7468454258675079, + "grad_norm": 0.7074581360169672, + "learning_rate": 1.8342364154842404e-05, + "loss": 0.4937, + "step": 3788 + }, + { + "epoch": 0.7470425867507886, + "grad_norm": 0.5778783270484186, + "learning_rate": 1.8341509359562608e-05, + "loss": 0.4377, + "step": 3789 + }, + { + "epoch": 0.7472397476340694, + "grad_norm": 0.5270159539912247, + "learning_rate": 1.8340654363871334e-05, + "loss": 0.4117, + "step": 3790 + }, + { + "epoch": 0.7474369085173501, + "grad_norm": 0.5223441256989405, + "learning_rate": 1.8339799167789127e-05, + "loss": 0.4471, + "step": 3791 + }, + { + "epoch": 0.7476340694006309, + "grad_norm": 0.5781995373444279, + "learning_rate": 1.8338943771336522e-05, + "loss": 0.4378, + "step": 3792 + }, + { + "epoch": 0.7478312302839116, + "grad_norm": 0.5634947765385621, + "learning_rate": 1.8338088174534083e-05, + "loss": 0.4454, + "step": 3793 + }, + { + "epoch": 0.7480283911671924, + "grad_norm": 0.5059725150862037, + "learning_rate": 1.833723237740236e-05, + "loss": 0.4245, + "step": 3794 + }, + { + "epoch": 0.7482255520504731, + "grad_norm": 0.5326325739466916, + "learning_rate": 1.833637637996191e-05, + "loss": 0.4425, + "step": 3795 + }, + { + "epoch": 0.748422712933754, + "grad_norm": 0.5574154238044264, + "learning_rate": 1.833552018223331e-05, + "loss": 0.4492, + "step": 3796 + }, + { + "epoch": 0.7486198738170347, + "grad_norm": 0.5402149077761322, + "learning_rate": 1.8334663784237124e-05, + "loss": 0.4006, + "step": 3797 + }, + { + "epoch": 0.7488170347003155, + "grad_norm": 0.5791969213862481, + "learning_rate": 1.8333807185993927e-05, + "loss": 0.4434, + "step": 3798 + }, + { + "epoch": 0.7490141955835962, + "grad_norm": 0.6018305196834619, + "learning_rate": 1.8332950387524304e-05, + "loss": 0.4588, + "step": 3799 + }, + { + "epoch": 0.749211356466877, + "grad_norm": 0.6572227006369701, + "learning_rate": 1.8332093388848836e-05, + "loss": 0.446, + "step": 3800 + }, + { + "epoch": 0.7494085173501577, + "grad_norm": 0.5684378740198406, + "learning_rate": 1.8331236189988115e-05, + "loss": 0.4071, + "step": 3801 + }, + { + "epoch": 0.7496056782334385, + "grad_norm": 0.5338624607390893, + "learning_rate": 1.8330378790962734e-05, + "loss": 0.4635, + "step": 3802 + }, + { + "epoch": 0.7498028391167192, + "grad_norm": 0.5649660572252639, + "learning_rate": 1.8329521191793293e-05, + "loss": 0.4277, + "step": 3803 + }, + { + "epoch": 0.75, + "grad_norm": 0.5526706058734381, + "learning_rate": 1.83286633925004e-05, + "loss": 0.4404, + "step": 3804 + }, + { + "epoch": 0.75, + "eval_loss": 0.44013890624046326, + "eval_runtime": 343.9735, + "eval_samples_per_second": 23.636, + "eval_steps_per_second": 1.48, + "step": 3804 + }, + { + "epoch": 0.7501971608832808, + "grad_norm": 0.6372546717506914, + "learning_rate": 1.8327805393104658e-05, + "loss": 0.4548, + "step": 3805 + }, + { + "epoch": 0.7503943217665615, + "grad_norm": 0.5913842512169649, + "learning_rate": 1.832694719362669e-05, + "loss": 0.4518, + "step": 3806 + }, + { + "epoch": 0.7505914826498423, + "grad_norm": 0.5826300542247199, + "learning_rate": 1.8326088794087108e-05, + "loss": 0.4818, + "step": 3807 + }, + { + "epoch": 0.750788643533123, + "grad_norm": 0.5907815365065618, + "learning_rate": 1.8325230194506538e-05, + "loss": 0.4365, + "step": 3808 + }, + { + "epoch": 0.7509858044164038, + "grad_norm": 0.5496356683694023, + "learning_rate": 1.8324371394905606e-05, + "loss": 0.4574, + "step": 3809 + }, + { + "epoch": 0.7511829652996845, + "grad_norm": 0.5711613351725243, + "learning_rate": 1.832351239530495e-05, + "loss": 0.4584, + "step": 3810 + }, + { + "epoch": 0.7513801261829653, + "grad_norm": 0.9200633659180221, + "learning_rate": 1.8322653195725206e-05, + "loss": 0.4781, + "step": 3811 + }, + { + "epoch": 0.751577287066246, + "grad_norm": 0.6033375447437546, + "learning_rate": 1.832179379618702e-05, + "loss": 0.4412, + "step": 3812 + }, + { + "epoch": 0.7517744479495269, + "grad_norm": 0.5171494431027918, + "learning_rate": 1.832093419671103e-05, + "loss": 0.4153, + "step": 3813 + }, + { + "epoch": 0.7519716088328076, + "grad_norm": 0.7257919932365838, + "learning_rate": 1.83200743973179e-05, + "loss": 0.4529, + "step": 3814 + }, + { + "epoch": 0.7521687697160884, + "grad_norm": 0.6164155494736133, + "learning_rate": 1.831921439802828e-05, + "loss": 0.4239, + "step": 3815 + }, + { + "epoch": 0.7523659305993691, + "grad_norm": 0.5742726759356669, + "learning_rate": 1.8318354198862836e-05, + "loss": 0.4595, + "step": 3816 + }, + { + "epoch": 0.7525630914826499, + "grad_norm": 0.5364358313603149, + "learning_rate": 1.831749379984223e-05, + "loss": 0.3946, + "step": 3817 + }, + { + "epoch": 0.7527602523659306, + "grad_norm": 0.5931144322460239, + "learning_rate": 1.8316633200987143e-05, + "loss": 0.4723, + "step": 3818 + }, + { + "epoch": 0.7529574132492114, + "grad_norm": 0.6631211961273565, + "learning_rate": 1.8315772402318243e-05, + "loss": 0.5013, + "step": 3819 + }, + { + "epoch": 0.7531545741324921, + "grad_norm": 0.555817939093465, + "learning_rate": 1.8314911403856212e-05, + "loss": 0.4234, + "step": 3820 + }, + { + "epoch": 0.7533517350157729, + "grad_norm": 0.5699005102970771, + "learning_rate": 1.8314050205621742e-05, + "loss": 0.4629, + "step": 3821 + }, + { + "epoch": 0.7535488958990536, + "grad_norm": 0.5456220237338646, + "learning_rate": 1.831318880763552e-05, + "loss": 0.4227, + "step": 3822 + }, + { + "epoch": 0.7537460567823344, + "grad_norm": 0.564506446004603, + "learning_rate": 1.8312327209918242e-05, + "loss": 0.4478, + "step": 3823 + }, + { + "epoch": 0.7539432176656151, + "grad_norm": 0.6007442236000919, + "learning_rate": 1.831146541249061e-05, + "loss": 0.4404, + "step": 3824 + }, + { + "epoch": 0.7541403785488959, + "grad_norm": 0.5756483770605955, + "learning_rate": 1.8310603415373328e-05, + "loss": 0.436, + "step": 3825 + }, + { + "epoch": 0.7543375394321766, + "grad_norm": 0.5730763258268717, + "learning_rate": 1.8309741218587102e-05, + "loss": 0.4509, + "step": 3826 + }, + { + "epoch": 0.7545347003154574, + "grad_norm": 0.573198374198784, + "learning_rate": 1.8308878822152655e-05, + "loss": 0.437, + "step": 3827 + }, + { + "epoch": 0.7547318611987381, + "grad_norm": 0.5616109549442615, + "learning_rate": 1.8308016226090704e-05, + "loss": 0.4439, + "step": 3828 + }, + { + "epoch": 0.754929022082019, + "grad_norm": 0.5895791098794968, + "learning_rate": 1.8307153430421972e-05, + "loss": 0.4744, + "step": 3829 + }, + { + "epoch": 0.7551261829652997, + "grad_norm": 0.5565201237486335, + "learning_rate": 1.830629043516719e-05, + "loss": 0.4657, + "step": 3830 + }, + { + "epoch": 0.7553233438485805, + "grad_norm": 0.5061359477774192, + "learning_rate": 1.830542724034709e-05, + "loss": 0.4204, + "step": 3831 + }, + { + "epoch": 0.7555205047318612, + "grad_norm": 0.5660890953859956, + "learning_rate": 1.8304563845982413e-05, + "loss": 0.4396, + "step": 3832 + }, + { + "epoch": 0.755717665615142, + "grad_norm": 0.5503977078210044, + "learning_rate": 1.83037002520939e-05, + "loss": 0.4393, + "step": 3833 + }, + { + "epoch": 0.7559148264984227, + "grad_norm": 0.6550111174917882, + "learning_rate": 1.8302836458702302e-05, + "loss": 0.4523, + "step": 3834 + }, + { + "epoch": 0.7561119873817035, + "grad_norm": 2.619858501191366, + "learning_rate": 1.8301972465828373e-05, + "loss": 0.469, + "step": 3835 + }, + { + "epoch": 0.7563091482649842, + "grad_norm": 0.6235970138061254, + "learning_rate": 1.830110827349287e-05, + "loss": 0.4333, + "step": 3836 + }, + { + "epoch": 0.756506309148265, + "grad_norm": 0.532577589765572, + "learning_rate": 1.8300243881716553e-05, + "loss": 0.4646, + "step": 3837 + }, + { + "epoch": 0.7567034700315457, + "grad_norm": 0.6198914137370217, + "learning_rate": 1.8299379290520197e-05, + "loss": 0.4148, + "step": 3838 + }, + { + "epoch": 0.7569006309148265, + "grad_norm": 0.5916239893137991, + "learning_rate": 1.8298514499924567e-05, + "loss": 0.4566, + "step": 3839 + }, + { + "epoch": 0.7570977917981072, + "grad_norm": 0.5602675170057476, + "learning_rate": 1.8297649509950446e-05, + "loss": 0.4242, + "step": 3840 + }, + { + "epoch": 0.757294952681388, + "grad_norm": 0.493735580049777, + "learning_rate": 1.829678432061861e-05, + "loss": 0.3774, + "step": 3841 + }, + { + "epoch": 0.7574921135646687, + "grad_norm": 0.5400394381153409, + "learning_rate": 1.829591893194985e-05, + "loss": 0.4206, + "step": 3842 + }, + { + "epoch": 0.7576892744479495, + "grad_norm": 0.5643283428425839, + "learning_rate": 1.829505334396496e-05, + "loss": 0.4571, + "step": 3843 + }, + { + "epoch": 0.7578864353312302, + "grad_norm": 0.5377612017847214, + "learning_rate": 1.8294187556684733e-05, + "loss": 0.4133, + "step": 3844 + }, + { + "epoch": 0.758083596214511, + "grad_norm": 0.5497122296347786, + "learning_rate": 1.829332157012997e-05, + "loss": 0.4332, + "step": 3845 + }, + { + "epoch": 0.7582807570977917, + "grad_norm": 0.5849200039580923, + "learning_rate": 1.8292455384321476e-05, + "loss": 0.4284, + "step": 3846 + }, + { + "epoch": 0.7584779179810726, + "grad_norm": 0.5543841134301215, + "learning_rate": 1.8291588999280065e-05, + "loss": 0.4532, + "step": 3847 + }, + { + "epoch": 0.7586750788643533, + "grad_norm": 0.5615983003588527, + "learning_rate": 1.8290722415026548e-05, + "loss": 0.4302, + "step": 3848 + }, + { + "epoch": 0.7588722397476341, + "grad_norm": 0.551746761958415, + "learning_rate": 1.828985563158175e-05, + "loss": 0.3985, + "step": 3849 + }, + { + "epoch": 0.7590694006309149, + "grad_norm": 0.6412908877953788, + "learning_rate": 1.8288988648966498e-05, + "loss": 0.415, + "step": 3850 + }, + { + "epoch": 0.7592665615141956, + "grad_norm": 0.5308071077261949, + "learning_rate": 1.8288121467201615e-05, + "loss": 0.4216, + "step": 3851 + }, + { + "epoch": 0.7594637223974764, + "grad_norm": 0.5623371567129707, + "learning_rate": 1.8287254086307942e-05, + "loss": 0.4205, + "step": 3852 + }, + { + "epoch": 0.7596608832807571, + "grad_norm": 0.6609929780062805, + "learning_rate": 1.8286386506306314e-05, + "loss": 0.4483, + "step": 3853 + }, + { + "epoch": 0.7598580441640379, + "grad_norm": 0.5571621496040055, + "learning_rate": 1.8285518727217578e-05, + "loss": 0.3971, + "step": 3854 + }, + { + "epoch": 0.7600552050473186, + "grad_norm": 0.6151096972545813, + "learning_rate": 1.8284650749062583e-05, + "loss": 0.4393, + "step": 3855 + }, + { + "epoch": 0.7602523659305994, + "grad_norm": 0.565063463574547, + "learning_rate": 1.8283782571862182e-05, + "loss": 0.4521, + "step": 3856 + }, + { + "epoch": 0.7604495268138801, + "grad_norm": 0.5410714930908408, + "learning_rate": 1.828291419563723e-05, + "loss": 0.4147, + "step": 3857 + }, + { + "epoch": 0.7606466876971609, + "grad_norm": 0.5396095207165513, + "learning_rate": 1.8282045620408596e-05, + "loss": 0.4591, + "step": 3858 + }, + { + "epoch": 0.7608438485804416, + "grad_norm": 0.6397377487608418, + "learning_rate": 1.828117684619715e-05, + "loss": 0.4952, + "step": 3859 + }, + { + "epoch": 0.7610410094637224, + "grad_norm": 0.5360037148692628, + "learning_rate": 1.8280307873023758e-05, + "loss": 0.4169, + "step": 3860 + }, + { + "epoch": 0.7612381703470031, + "grad_norm": 0.5634653164334658, + "learning_rate": 1.8279438700909305e-05, + "loss": 0.4209, + "step": 3861 + }, + { + "epoch": 0.761435331230284, + "grad_norm": 0.5642735061715564, + "learning_rate": 1.8278569329874667e-05, + "loss": 0.4156, + "step": 3862 + }, + { + "epoch": 0.7616324921135647, + "grad_norm": 0.5346697867413164, + "learning_rate": 1.8277699759940732e-05, + "loss": 0.4157, + "step": 3863 + }, + { + "epoch": 0.7618296529968455, + "grad_norm": 0.5458496115616259, + "learning_rate": 1.8276829991128397e-05, + "loss": 0.4412, + "step": 3864 + }, + { + "epoch": 0.7620268138801262, + "grad_norm": 0.542780014402022, + "learning_rate": 1.8275960023458554e-05, + "loss": 0.4051, + "step": 3865 + }, + { + "epoch": 0.762223974763407, + "grad_norm": 0.5179693655492266, + "learning_rate": 1.827508985695211e-05, + "loss": 0.4127, + "step": 3866 + }, + { + "epoch": 0.7624211356466877, + "grad_norm": 0.5553660779995085, + "learning_rate": 1.8274219491629965e-05, + "loss": 0.4401, + "step": 3867 + }, + { + "epoch": 0.7626182965299685, + "grad_norm": 0.5712424475051656, + "learning_rate": 1.827334892751304e-05, + "loss": 0.4267, + "step": 3868 + }, + { + "epoch": 0.7628154574132492, + "grad_norm": 0.5521939908028256, + "learning_rate": 1.8272478164622237e-05, + "loss": 0.4525, + "step": 3869 + }, + { + "epoch": 0.76301261829653, + "grad_norm": 0.6059666034435798, + "learning_rate": 1.827160720297849e-05, + "loss": 0.4352, + "step": 3870 + }, + { + "epoch": 0.7632097791798107, + "grad_norm": 0.5568390943861937, + "learning_rate": 1.827073604260271e-05, + "loss": 0.4435, + "step": 3871 + }, + { + "epoch": 0.7634069400630915, + "grad_norm": 0.6154734211328192, + "learning_rate": 1.8269864683515847e-05, + "loss": 0.478, + "step": 3872 + }, + { + "epoch": 0.7636041009463722, + "grad_norm": 0.6336972990527457, + "learning_rate": 1.8268993125738817e-05, + "loss": 0.488, + "step": 3873 + }, + { + "epoch": 0.763801261829653, + "grad_norm": 0.5158784852798877, + "learning_rate": 1.826812136929257e-05, + "loss": 0.3997, + "step": 3874 + }, + { + "epoch": 0.7639984227129337, + "grad_norm": 0.5545942044525611, + "learning_rate": 1.8267249414198055e-05, + "loss": 0.4042, + "step": 3875 + }, + { + "epoch": 0.7641955835962145, + "grad_norm": 0.7691672609377771, + "learning_rate": 1.8266377260476206e-05, + "loss": 0.4804, + "step": 3876 + }, + { + "epoch": 0.7643927444794952, + "grad_norm": 0.607087584955189, + "learning_rate": 1.826550490814799e-05, + "loss": 0.4558, + "step": 3877 + }, + { + "epoch": 0.764589905362776, + "grad_norm": 0.62388863749188, + "learning_rate": 1.8264632357234366e-05, + "loss": 0.4604, + "step": 3878 + }, + { + "epoch": 0.7647870662460567, + "grad_norm": 0.5639061589375046, + "learning_rate": 1.826375960775629e-05, + "loss": 0.4197, + "step": 3879 + }, + { + "epoch": 0.7649842271293376, + "grad_norm": 0.575114173679805, + "learning_rate": 1.8262886659734738e-05, + "loss": 0.466, + "step": 3880 + }, + { + "epoch": 0.7651813880126183, + "grad_norm": 1.0686366669808436, + "learning_rate": 1.8262013513190677e-05, + "loss": 0.4063, + "step": 3881 + }, + { + "epoch": 0.7653785488958991, + "grad_norm": 0.6064570679491103, + "learning_rate": 1.826114016814509e-05, + "loss": 0.3918, + "step": 3882 + }, + { + "epoch": 0.7655757097791798, + "grad_norm": 0.5647896972824821, + "learning_rate": 1.8260266624618957e-05, + "loss": 0.4329, + "step": 3883 + }, + { + "epoch": 0.7657728706624606, + "grad_norm": 0.6946858259139369, + "learning_rate": 1.8259392882633266e-05, + "loss": 0.4704, + "step": 3884 + }, + { + "epoch": 0.7659700315457413, + "grad_norm": 0.5720833541883139, + "learning_rate": 1.825851894220901e-05, + "loss": 0.4309, + "step": 3885 + }, + { + "epoch": 0.7661671924290221, + "grad_norm": 0.5967274277243448, + "learning_rate": 1.8257644803367186e-05, + "loss": 0.4341, + "step": 3886 + }, + { + "epoch": 0.7663643533123028, + "grad_norm": 0.5985333908745908, + "learning_rate": 1.8256770466128793e-05, + "loss": 0.4523, + "step": 3887 + }, + { + "epoch": 0.7665615141955836, + "grad_norm": 0.5560392931956263, + "learning_rate": 1.8255895930514843e-05, + "loss": 0.4228, + "step": 3888 + }, + { + "epoch": 0.7667586750788643, + "grad_norm": 0.5832994995378911, + "learning_rate": 1.8255021196546346e-05, + "loss": 0.4299, + "step": 3889 + }, + { + "epoch": 0.7669558359621451, + "grad_norm": 1.4002961334330222, + "learning_rate": 1.8254146264244316e-05, + "loss": 0.4941, + "step": 3890 + }, + { + "epoch": 0.7671529968454258, + "grad_norm": 0.5534367003995347, + "learning_rate": 1.8253271133629775e-05, + "loss": 0.4185, + "step": 3891 + }, + { + "epoch": 0.7673501577287066, + "grad_norm": 0.5935042775545306, + "learning_rate": 1.8252395804723744e-05, + "loss": 0.4807, + "step": 3892 + }, + { + "epoch": 0.7675473186119873, + "grad_norm": 0.8877026134448658, + "learning_rate": 1.8251520277547267e-05, + "loss": 0.4268, + "step": 3893 + }, + { + "epoch": 0.7677444794952681, + "grad_norm": 1.2517353246924268, + "learning_rate": 1.8250644552121362e-05, + "loss": 0.4519, + "step": 3894 + }, + { + "epoch": 0.767941640378549, + "grad_norm": 0.5929121870930241, + "learning_rate": 1.8249768628467085e-05, + "loss": 0.4413, + "step": 3895 + }, + { + "epoch": 0.7681388012618297, + "grad_norm": 0.5782475207334447, + "learning_rate": 1.8248892506605468e-05, + "loss": 0.4157, + "step": 3896 + }, + { + "epoch": 0.7683359621451105, + "grad_norm": 0.6117468432248259, + "learning_rate": 1.8248016186557566e-05, + "loss": 0.4333, + "step": 3897 + }, + { + "epoch": 0.7685331230283912, + "grad_norm": 0.5927409035058885, + "learning_rate": 1.8247139668344432e-05, + "loss": 0.4869, + "step": 3898 + }, + { + "epoch": 0.768730283911672, + "grad_norm": 1.158455927574009, + "learning_rate": 1.824626295198713e-05, + "loss": 0.4575, + "step": 3899 + }, + { + "epoch": 0.7689274447949527, + "grad_norm": 0.8392945391726799, + "learning_rate": 1.824538603750672e-05, + "loss": 0.4607, + "step": 3900 + }, + { + "epoch": 0.7691246056782335, + "grad_norm": 0.5998059892020197, + "learning_rate": 1.824450892492427e-05, + "loss": 0.4594, + "step": 3901 + }, + { + "epoch": 0.7693217665615142, + "grad_norm": 0.5695316389310182, + "learning_rate": 1.824363161426085e-05, + "loss": 0.4175, + "step": 3902 + }, + { + "epoch": 0.769518927444795, + "grad_norm": 0.6240626330754476, + "learning_rate": 1.8242754105537542e-05, + "loss": 0.454, + "step": 3903 + }, + { + "epoch": 0.7697160883280757, + "grad_norm": 0.5912514018969862, + "learning_rate": 1.8241876398775434e-05, + "loss": 0.4311, + "step": 3904 + }, + { + "epoch": 0.7699132492113565, + "grad_norm": 0.6624090571377835, + "learning_rate": 1.8240998493995607e-05, + "loss": 0.4719, + "step": 3905 + }, + { + "epoch": 0.7701104100946372, + "grad_norm": 0.9203541875889368, + "learning_rate": 1.8240120391219148e-05, + "loss": 0.4803, + "step": 3906 + }, + { + "epoch": 0.770307570977918, + "grad_norm": 0.5914744368657348, + "learning_rate": 1.823924209046717e-05, + "loss": 0.418, + "step": 3907 + }, + { + "epoch": 0.7705047318611987, + "grad_norm": 0.6070617757379179, + "learning_rate": 1.8238363591760758e-05, + "loss": 0.4429, + "step": 3908 + }, + { + "epoch": 0.7707018927444795, + "grad_norm": 0.6325247392458869, + "learning_rate": 1.8237484895121033e-05, + "loss": 0.4688, + "step": 3909 + }, + { + "epoch": 0.7708990536277602, + "grad_norm": 0.573852165719456, + "learning_rate": 1.8236606000569095e-05, + "loss": 0.4126, + "step": 3910 + }, + { + "epoch": 0.771096214511041, + "grad_norm": 0.5567521119492211, + "learning_rate": 1.823572690812607e-05, + "loss": 0.4415, + "step": 3911 + }, + { + "epoch": 0.7712933753943217, + "grad_norm": 0.6224892204399546, + "learning_rate": 1.8234847617813067e-05, + "loss": 0.4473, + "step": 3912 + }, + { + "epoch": 0.7714905362776026, + "grad_norm": 0.6241807022753024, + "learning_rate": 1.823396812965122e-05, + "loss": 0.4632, + "step": 3913 + }, + { + "epoch": 0.7716876971608833, + "grad_norm": 0.564677599383395, + "learning_rate": 1.8233088443661665e-05, + "loss": 0.4242, + "step": 3914 + }, + { + "epoch": 0.7718848580441641, + "grad_norm": 0.5882925220136831, + "learning_rate": 1.8232208559865522e-05, + "loss": 0.431, + "step": 3915 + }, + { + "epoch": 0.7720820189274448, + "grad_norm": 0.574227496801443, + "learning_rate": 1.823132847828394e-05, + "loss": 0.451, + "step": 3916 + }, + { + "epoch": 0.7722791798107256, + "grad_norm": 0.5222564423269557, + "learning_rate": 1.8230448198938067e-05, + "loss": 0.4296, + "step": 3917 + }, + { + "epoch": 0.7724763406940063, + "grad_norm": 0.57083859662196, + "learning_rate": 1.8229567721849046e-05, + "loss": 0.4257, + "step": 3918 + }, + { + "epoch": 0.7726735015772871, + "grad_norm": 1.6592668807628246, + "learning_rate": 1.822868704703803e-05, + "loss": 0.4921, + "step": 3919 + }, + { + "epoch": 0.7728706624605678, + "grad_norm": 0.6157770095064646, + "learning_rate": 1.8227806174526187e-05, + "loss": 0.4314, + "step": 3920 + }, + { + "epoch": 0.7730678233438486, + "grad_norm": 0.5852063956961934, + "learning_rate": 1.822692510433467e-05, + "loss": 0.4654, + "step": 3921 + }, + { + "epoch": 0.7732649842271293, + "grad_norm": 0.5866034997283565, + "learning_rate": 1.8226043836484655e-05, + "loss": 0.4346, + "step": 3922 + }, + { + "epoch": 0.7734621451104101, + "grad_norm": 0.5510812356437924, + "learning_rate": 1.8225162370997313e-05, + "loss": 0.4801, + "step": 3923 + }, + { + "epoch": 0.7736593059936908, + "grad_norm": 0.5472377435499002, + "learning_rate": 1.822428070789382e-05, + "loss": 0.4502, + "step": 3924 + }, + { + "epoch": 0.7738564668769716, + "grad_norm": 0.5620895652186938, + "learning_rate": 1.8223398847195358e-05, + "loss": 0.4428, + "step": 3925 + }, + { + "epoch": 0.7740536277602523, + "grad_norm": 0.5573421453852319, + "learning_rate": 1.822251678892312e-05, + "loss": 0.4397, + "step": 3926 + }, + { + "epoch": 0.7742507886435331, + "grad_norm": 0.5756487306350889, + "learning_rate": 1.822163453309829e-05, + "loss": 0.4255, + "step": 3927 + }, + { + "epoch": 0.7744479495268138, + "grad_norm": 0.5374818660181303, + "learning_rate": 1.8220752079742072e-05, + "loss": 0.4208, + "step": 3928 + }, + { + "epoch": 0.7746451104100947, + "grad_norm": 0.5294345547402046, + "learning_rate": 1.8219869428875668e-05, + "loss": 0.4217, + "step": 3929 + }, + { + "epoch": 0.7748422712933754, + "grad_norm": 0.56730405884325, + "learning_rate": 1.8218986580520276e-05, + "loss": 0.4768, + "step": 3930 + }, + { + "epoch": 0.7750394321766562, + "grad_norm": 0.5604501496661701, + "learning_rate": 1.8218103534697116e-05, + "loss": 0.4596, + "step": 3931 + }, + { + "epoch": 0.7752365930599369, + "grad_norm": 0.5579910747422517, + "learning_rate": 1.8217220291427398e-05, + "loss": 0.4527, + "step": 3932 + }, + { + "epoch": 0.7754337539432177, + "grad_norm": 0.5270550522697447, + "learning_rate": 1.821633685073235e-05, + "loss": 0.4559, + "step": 3933 + }, + { + "epoch": 0.7756309148264984, + "grad_norm": 0.5923687114112185, + "learning_rate": 1.8215453212633188e-05, + "loss": 0.4548, + "step": 3934 + }, + { + "epoch": 0.7758280757097792, + "grad_norm": 0.6303703716242776, + "learning_rate": 1.821456937715115e-05, + "loss": 0.5182, + "step": 3935 + }, + { + "epoch": 0.7760252365930599, + "grad_norm": 0.5753784721230842, + "learning_rate": 1.8213685344307465e-05, + "loss": 0.4475, + "step": 3936 + }, + { + "epoch": 0.7762223974763407, + "grad_norm": 0.7373579959797398, + "learning_rate": 1.8212801114123377e-05, + "loss": 0.4316, + "step": 3937 + }, + { + "epoch": 0.7764195583596214, + "grad_norm": 0.5762319703064297, + "learning_rate": 1.8211916686620128e-05, + "loss": 0.4546, + "step": 3938 + }, + { + "epoch": 0.7766167192429022, + "grad_norm": 0.5920585040131835, + "learning_rate": 1.8211032061818968e-05, + "loss": 0.4471, + "step": 3939 + }, + { + "epoch": 0.776813880126183, + "grad_norm": 0.6803659907474547, + "learning_rate": 1.8210147239741148e-05, + "loss": 0.4755, + "step": 3940 + }, + { + "epoch": 0.7770110410094637, + "grad_norm": 0.5509653813630704, + "learning_rate": 1.8209262220407932e-05, + "loss": 0.4586, + "step": 3941 + }, + { + "epoch": 0.7772082018927445, + "grad_norm": 0.5581868920790423, + "learning_rate": 1.820837700384058e-05, + "loss": 0.4219, + "step": 3942 + }, + { + "epoch": 0.7774053627760252, + "grad_norm": 0.5592867285417933, + "learning_rate": 1.8207491590060356e-05, + "loss": 0.4193, + "step": 3943 + }, + { + "epoch": 0.777602523659306, + "grad_norm": 0.6501883656371863, + "learning_rate": 1.8206605979088545e-05, + "loss": 0.4511, + "step": 3944 + }, + { + "epoch": 0.7777996845425867, + "grad_norm": 0.5572979380305522, + "learning_rate": 1.820572017094641e-05, + "loss": 0.4403, + "step": 3945 + }, + { + "epoch": 0.7779968454258676, + "grad_norm": 0.6146644203648315, + "learning_rate": 1.8204834165655242e-05, + "loss": 0.4852, + "step": 3946 + }, + { + "epoch": 0.7781940063091483, + "grad_norm": 0.5413773415764146, + "learning_rate": 1.8203947963236322e-05, + "loss": 0.4437, + "step": 3947 + }, + { + "epoch": 0.7783911671924291, + "grad_norm": 0.5605314804048697, + "learning_rate": 1.8203061563710952e-05, + "loss": 0.4294, + "step": 3948 + }, + { + "epoch": 0.7785883280757098, + "grad_norm": 0.5316072340317646, + "learning_rate": 1.820217496710042e-05, + "loss": 0.4243, + "step": 3949 + }, + { + "epoch": 0.7787854889589906, + "grad_norm": 0.6492198435447817, + "learning_rate": 1.8201288173426027e-05, + "loss": 0.4659, + "step": 3950 + }, + { + "epoch": 0.7789826498422713, + "grad_norm": 0.5713530844391045, + "learning_rate": 1.820040118270908e-05, + "loss": 0.4579, + "step": 3951 + }, + { + "epoch": 0.7791798107255521, + "grad_norm": 0.5363420111437506, + "learning_rate": 1.8199513994970893e-05, + "loss": 0.425, + "step": 3952 + }, + { + "epoch": 0.7793769716088328, + "grad_norm": 0.5542539770275866, + "learning_rate": 1.819862661023278e-05, + "loss": 0.4234, + "step": 3953 + }, + { + "epoch": 0.7795741324921136, + "grad_norm": 0.5285872082909737, + "learning_rate": 1.8197739028516062e-05, + "loss": 0.4238, + "step": 3954 + }, + { + "epoch": 0.7797712933753943, + "grad_norm": 0.5456326663863709, + "learning_rate": 1.819685124984206e-05, + "loss": 0.4301, + "step": 3955 + }, + { + "epoch": 0.7799684542586751, + "grad_norm": 0.5664354597499723, + "learning_rate": 1.8195963274232106e-05, + "loss": 0.4555, + "step": 3956 + }, + { + "epoch": 0.7801656151419558, + "grad_norm": 0.5315491155568633, + "learning_rate": 1.8195075101707535e-05, + "loss": 0.4272, + "step": 3957 + }, + { + "epoch": 0.7803627760252366, + "grad_norm": 0.8663503651895043, + "learning_rate": 1.819418673228968e-05, + "loss": 0.4816, + "step": 3958 + }, + { + "epoch": 0.7805599369085173, + "grad_norm": 0.5526623269158064, + "learning_rate": 1.8193298165999896e-05, + "loss": 0.462, + "step": 3959 + }, + { + "epoch": 0.7807570977917981, + "grad_norm": 0.5973908735760364, + "learning_rate": 1.8192409402859526e-05, + "loss": 0.4452, + "step": 3960 + }, + { + "epoch": 0.7809542586750788, + "grad_norm": 0.5521161874463199, + "learning_rate": 1.819152044288992e-05, + "loss": 0.4203, + "step": 3961 + }, + { + "epoch": 0.7811514195583596, + "grad_norm": 0.6040478645836748, + "learning_rate": 1.819063128611244e-05, + "loss": 0.4598, + "step": 3962 + }, + { + "epoch": 0.7813485804416404, + "grad_norm": 0.5454361035093315, + "learning_rate": 1.8189741932548447e-05, + "loss": 0.426, + "step": 3963 + }, + { + "epoch": 0.7815457413249212, + "grad_norm": 0.6321003848230068, + "learning_rate": 1.8188852382219308e-05, + "loss": 0.4685, + "step": 3964 + }, + { + "epoch": 0.7817429022082019, + "grad_norm": 0.6137933316061781, + "learning_rate": 1.8187962635146397e-05, + "loss": 0.4376, + "step": 3965 + }, + { + "epoch": 0.7819400630914827, + "grad_norm": 0.5396185891653529, + "learning_rate": 1.8187072691351088e-05, + "loss": 0.421, + "step": 3966 + }, + { + "epoch": 0.7821372239747634, + "grad_norm": 0.6005283575463105, + "learning_rate": 1.8186182550854768e-05, + "loss": 0.4782, + "step": 3967 + }, + { + "epoch": 0.7823343848580442, + "grad_norm": 0.516827483855773, + "learning_rate": 1.8185292213678818e-05, + "loss": 0.4034, + "step": 3968 + }, + { + "epoch": 0.7825315457413249, + "grad_norm": 0.5753697155306382, + "learning_rate": 1.818440167984463e-05, + "loss": 0.4585, + "step": 3969 + }, + { + "epoch": 0.7827287066246057, + "grad_norm": 0.5542615113432319, + "learning_rate": 1.8183510949373603e-05, + "loss": 0.4504, + "step": 3970 + }, + { + "epoch": 0.7829258675078864, + "grad_norm": 0.5333879318693766, + "learning_rate": 1.8182620022287133e-05, + "loss": 0.4213, + "step": 3971 + }, + { + "epoch": 0.7831230283911672, + "grad_norm": 0.6072385415952638, + "learning_rate": 1.8181728898606628e-05, + "loss": 0.4395, + "step": 3972 + }, + { + "epoch": 0.7833201892744479, + "grad_norm": 0.543662857256321, + "learning_rate": 1.81808375783535e-05, + "loss": 0.4385, + "step": 3973 + }, + { + "epoch": 0.7835173501577287, + "grad_norm": 5.542609848424303, + "learning_rate": 1.817994606154916e-05, + "loss": 0.5795, + "step": 3974 + }, + { + "epoch": 0.7837145110410094, + "grad_norm": 0.5948140810991429, + "learning_rate": 1.8179054348215025e-05, + "loss": 0.4474, + "step": 3975 + }, + { + "epoch": 0.7839116719242902, + "grad_norm": 0.519423366632737, + "learning_rate": 1.8178162438372528e-05, + "loss": 0.4145, + "step": 3976 + }, + { + "epoch": 0.7841088328075709, + "grad_norm": 0.5902188567847949, + "learning_rate": 1.817727033204309e-05, + "loss": 0.4315, + "step": 3977 + }, + { + "epoch": 0.7843059936908517, + "grad_norm": 0.5948977039074232, + "learning_rate": 1.8176378029248147e-05, + "loss": 0.458, + "step": 3978 + }, + { + "epoch": 0.7845031545741324, + "grad_norm": 0.5611639183104089, + "learning_rate": 1.8175485530009137e-05, + "loss": 0.4531, + "step": 3979 + }, + { + "epoch": 0.7847003154574133, + "grad_norm": 1.0876807024291277, + "learning_rate": 1.8174592834347503e-05, + "loss": 0.4904, + "step": 3980 + }, + { + "epoch": 0.784897476340694, + "grad_norm": 0.5358410389890251, + "learning_rate": 1.8173699942284695e-05, + "loss": 0.4228, + "step": 3981 + }, + { + "epoch": 0.7850946372239748, + "grad_norm": 0.5943472824942584, + "learning_rate": 1.8172806853842163e-05, + "loss": 0.4777, + "step": 3982 + }, + { + "epoch": 0.7852917981072555, + "grad_norm": 0.5829558673091376, + "learning_rate": 1.8171913569041362e-05, + "loss": 0.4223, + "step": 3983 + }, + { + "epoch": 0.7854889589905363, + "grad_norm": 0.5306668965862121, + "learning_rate": 1.8171020087903762e-05, + "loss": 0.4089, + "step": 3984 + }, + { + "epoch": 0.785686119873817, + "grad_norm": 0.5451241305750418, + "learning_rate": 1.8170126410450823e-05, + "loss": 0.4361, + "step": 3985 + }, + { + "epoch": 0.7858832807570978, + "grad_norm": 0.5469353788262228, + "learning_rate": 1.8169232536704012e-05, + "loss": 0.4297, + "step": 3986 + }, + { + "epoch": 0.7860804416403786, + "grad_norm": 0.7105261028282002, + "learning_rate": 1.8168338466684817e-05, + "loss": 0.4311, + "step": 3987 + }, + { + "epoch": 0.7862776025236593, + "grad_norm": 0.5989580178979008, + "learning_rate": 1.816744420041471e-05, + "loss": 0.468, + "step": 3988 + }, + { + "epoch": 0.7864747634069401, + "grad_norm": 0.545707298003204, + "learning_rate": 1.816654973791518e-05, + "loss": 0.4527, + "step": 3989 + }, + { + "epoch": 0.7866719242902208, + "grad_norm": 0.6273199078420786, + "learning_rate": 1.8165655079207716e-05, + "loss": 0.449, + "step": 3990 + }, + { + "epoch": 0.7868690851735016, + "grad_norm": 0.5611837255166214, + "learning_rate": 1.816476022431381e-05, + "loss": 0.4656, + "step": 3991 + }, + { + "epoch": 0.7870662460567823, + "grad_norm": 0.5518444727001759, + "learning_rate": 1.816386517325497e-05, + "loss": 0.4284, + "step": 3992 + }, + { + "epoch": 0.7872634069400631, + "grad_norm": 0.5892275816171331, + "learning_rate": 1.816296992605269e-05, + "loss": 0.4539, + "step": 3993 + }, + { + "epoch": 0.7874605678233438, + "grad_norm": 0.5254195560267715, + "learning_rate": 1.8162074482728487e-05, + "loss": 0.4565, + "step": 3994 + }, + { + "epoch": 0.7876577287066246, + "grad_norm": 0.5448949680828074, + "learning_rate": 1.816117884330387e-05, + "loss": 0.4178, + "step": 3995 + }, + { + "epoch": 0.7878548895899053, + "grad_norm": 0.5746576844950827, + "learning_rate": 1.816028300780036e-05, + "loss": 0.4735, + "step": 3996 + }, + { + "epoch": 0.7880520504731862, + "grad_norm": 0.537017218341343, + "learning_rate": 1.8159386976239478e-05, + "loss": 0.4047, + "step": 3997 + }, + { + "epoch": 0.7882492113564669, + "grad_norm": 0.6354220762186988, + "learning_rate": 1.815849074864275e-05, + "loss": 0.4155, + "step": 3998 + }, + { + "epoch": 0.7884463722397477, + "grad_norm": 0.578467197061079, + "learning_rate": 1.8157594325031716e-05, + "loss": 0.4183, + "step": 3999 + }, + { + "epoch": 0.7886435331230284, + "grad_norm": 0.6475745820310453, + "learning_rate": 1.8156697705427907e-05, + "loss": 0.4209, + "step": 4000 + }, + { + "epoch": 0.7888406940063092, + "grad_norm": 0.6157428470516835, + "learning_rate": 1.815580088985287e-05, + "loss": 0.4531, + "step": 4001 + }, + { + "epoch": 0.7890378548895899, + "grad_norm": 0.5998799236699729, + "learning_rate": 1.815490387832815e-05, + "loss": 0.4751, + "step": 4002 + }, + { + "epoch": 0.7892350157728707, + "grad_norm": 0.5551797172113444, + "learning_rate": 1.8154006670875294e-05, + "loss": 0.444, + "step": 4003 + }, + { + "epoch": 0.7894321766561514, + "grad_norm": 0.7557929675891345, + "learning_rate": 1.815310926751586e-05, + "loss": 0.433, + "step": 4004 + }, + { + "epoch": 0.7896293375394322, + "grad_norm": 3.8174090731821937, + "learning_rate": 1.8152211668271413e-05, + "loss": 0.428, + "step": 4005 + }, + { + "epoch": 0.7898264984227129, + "grad_norm": 0.7034780817377414, + "learning_rate": 1.8151313873163513e-05, + "loss": 0.417, + "step": 4006 + }, + { + "epoch": 0.7900236593059937, + "grad_norm": 0.5926550848858589, + "learning_rate": 1.8150415882213735e-05, + "loss": 0.4756, + "step": 4007 + }, + { + "epoch": 0.7902208201892744, + "grad_norm": 0.6888172885215963, + "learning_rate": 1.814951769544365e-05, + "loss": 0.4438, + "step": 4008 + }, + { + "epoch": 0.7904179810725552, + "grad_norm": 0.5290234268356881, + "learning_rate": 1.8148619312874844e-05, + "loss": 0.4186, + "step": 4009 + }, + { + "epoch": 0.7906151419558359, + "grad_norm": 0.7662700497693185, + "learning_rate": 1.8147720734528893e-05, + "loss": 0.4871, + "step": 4010 + }, + { + "epoch": 0.7908123028391167, + "grad_norm": 0.5957613132464488, + "learning_rate": 1.814682196042739e-05, + "loss": 0.4572, + "step": 4011 + }, + { + "epoch": 0.7910094637223974, + "grad_norm": 0.6866807817706564, + "learning_rate": 1.8145922990591932e-05, + "loss": 0.4296, + "step": 4012 + }, + { + "epoch": 0.7912066246056783, + "grad_norm": 0.5397278268058946, + "learning_rate": 1.8145023825044114e-05, + "loss": 0.4247, + "step": 4013 + }, + { + "epoch": 0.791403785488959, + "grad_norm": 0.7343980490854036, + "learning_rate": 1.8144124463805535e-05, + "loss": 0.4587, + "step": 4014 + }, + { + "epoch": 0.7916009463722398, + "grad_norm": 0.5376919265100979, + "learning_rate": 1.8143224906897812e-05, + "loss": 0.4111, + "step": 4015 + }, + { + "epoch": 0.7917981072555205, + "grad_norm": 0.633488416178425, + "learning_rate": 1.814232515434255e-05, + "loss": 0.4592, + "step": 4016 + }, + { + "epoch": 0.7919952681388013, + "grad_norm": 0.5283179674486278, + "learning_rate": 1.814142520616137e-05, + "loss": 0.4288, + "step": 4017 + }, + { + "epoch": 0.792192429022082, + "grad_norm": 0.5780943767126454, + "learning_rate": 1.8140525062375894e-05, + "loss": 0.4239, + "step": 4018 + }, + { + "epoch": 0.7923895899053628, + "grad_norm": 0.5489901267012213, + "learning_rate": 1.8139624723007748e-05, + "loss": 0.423, + "step": 4019 + }, + { + "epoch": 0.7925867507886435, + "grad_norm": 0.5878569214560349, + "learning_rate": 1.813872418807856e-05, + "loss": 0.4265, + "step": 4020 + }, + { + "epoch": 0.7927839116719243, + "grad_norm": 0.5737545069149199, + "learning_rate": 1.813782345760997e-05, + "loss": 0.4491, + "step": 4021 + }, + { + "epoch": 0.792981072555205, + "grad_norm": 0.823990029369412, + "learning_rate": 1.813692253162362e-05, + "loss": 0.4082, + "step": 4022 + }, + { + "epoch": 0.7931782334384858, + "grad_norm": 0.5299434682153719, + "learning_rate": 1.8136021410141154e-05, + "loss": 0.3884, + "step": 4023 + }, + { + "epoch": 0.7933753943217665, + "grad_norm": 0.551899978074941, + "learning_rate": 1.813512009318422e-05, + "loss": 0.4369, + "step": 4024 + }, + { + "epoch": 0.7935725552050473, + "grad_norm": 8.176889365950668, + "learning_rate": 1.8134218580774475e-05, + "loss": 0.4485, + "step": 4025 + }, + { + "epoch": 0.793769716088328, + "grad_norm": 0.6125163347743271, + "learning_rate": 1.813331687293358e-05, + "loss": 0.4474, + "step": 4026 + }, + { + "epoch": 0.7939668769716088, + "grad_norm": 0.5484917124775043, + "learning_rate": 1.8132414969683197e-05, + "loss": 0.4205, + "step": 4027 + }, + { + "epoch": 0.7941640378548895, + "grad_norm": 0.557875142995521, + "learning_rate": 1.8131512871044993e-05, + "loss": 0.4225, + "step": 4028 + }, + { + "epoch": 0.7943611987381703, + "grad_norm": 0.637672714031013, + "learning_rate": 1.8130610577040646e-05, + "loss": 0.483, + "step": 4029 + }, + { + "epoch": 0.794558359621451, + "grad_norm": 0.5142583048499065, + "learning_rate": 1.812970808769183e-05, + "loss": 0.42, + "step": 4030 + }, + { + "epoch": 0.7947555205047319, + "grad_norm": 0.5565242219925195, + "learning_rate": 1.812880540302023e-05, + "loss": 0.4521, + "step": 4031 + }, + { + "epoch": 0.7949526813880127, + "grad_norm": 0.5468093785090244, + "learning_rate": 1.812790252304754e-05, + "loss": 0.4285, + "step": 4032 + }, + { + "epoch": 0.7951498422712934, + "grad_norm": 0.5792091705278589, + "learning_rate": 1.8126999447795438e-05, + "loss": 0.4666, + "step": 4033 + }, + { + "epoch": 0.7953470031545742, + "grad_norm": 4.072216890062226, + "learning_rate": 1.8126096177285637e-05, + "loss": 0.5029, + "step": 4034 + }, + { + "epoch": 0.7955441640378549, + "grad_norm": 0.6296907138779285, + "learning_rate": 1.8125192711539828e-05, + "loss": 0.4361, + "step": 4035 + }, + { + "epoch": 0.7957413249211357, + "grad_norm": 0.5788781690916837, + "learning_rate": 1.812428905057972e-05, + "loss": 0.4399, + "step": 4036 + }, + { + "epoch": 0.7959384858044164, + "grad_norm": 0.5884757591246823, + "learning_rate": 1.8123385194427027e-05, + "loss": 0.4888, + "step": 4037 + }, + { + "epoch": 0.7961356466876972, + "grad_norm": 0.6261353284400933, + "learning_rate": 1.8122481143103465e-05, + "loss": 0.4474, + "step": 4038 + }, + { + "epoch": 0.7963328075709779, + "grad_norm": 0.5682139648963489, + "learning_rate": 1.812157689663075e-05, + "loss": 0.4466, + "step": 4039 + }, + { + "epoch": 0.7965299684542587, + "grad_norm": 0.6006306057576911, + "learning_rate": 1.8120672455030606e-05, + "loss": 0.4287, + "step": 4040 + }, + { + "epoch": 0.7967271293375394, + "grad_norm": 0.6096153492041305, + "learning_rate": 1.8119767818324773e-05, + "loss": 0.4854, + "step": 4041 + }, + { + "epoch": 0.7969242902208202, + "grad_norm": 0.5816536621919691, + "learning_rate": 1.8118862986534974e-05, + "loss": 0.4314, + "step": 4042 + }, + { + "epoch": 0.7971214511041009, + "grad_norm": 0.5915477679203885, + "learning_rate": 1.811795795968296e-05, + "loss": 0.4392, + "step": 4043 + }, + { + "epoch": 0.7973186119873817, + "grad_norm": 0.5550175414784126, + "learning_rate": 1.8117052737790463e-05, + "loss": 0.3972, + "step": 4044 + }, + { + "epoch": 0.7975157728706624, + "grad_norm": 0.5778936646533668, + "learning_rate": 1.8116147320879238e-05, + "loss": 0.4382, + "step": 4045 + }, + { + "epoch": 0.7977129337539433, + "grad_norm": 0.6157010473714177, + "learning_rate": 1.811524170897104e-05, + "loss": 0.4333, + "step": 4046 + }, + { + "epoch": 0.797910094637224, + "grad_norm": 0.5637162571951514, + "learning_rate": 1.8114335902087625e-05, + "loss": 0.4542, + "step": 4047 + }, + { + "epoch": 0.7981072555205048, + "grad_norm": 0.584581727444574, + "learning_rate": 1.811342990025075e-05, + "loss": 0.454, + "step": 4048 + }, + { + "epoch": 0.7983044164037855, + "grad_norm": 0.5543640727335405, + "learning_rate": 1.8112523703482194e-05, + "loss": 0.4145, + "step": 4049 + }, + { + "epoch": 0.7985015772870663, + "grad_norm": 0.6099912064912423, + "learning_rate": 1.8111617311803722e-05, + "loss": 0.4478, + "step": 4050 + }, + { + "epoch": 0.798698738170347, + "grad_norm": 0.5700094595349666, + "learning_rate": 1.8110710725237114e-05, + "loss": 0.4459, + "step": 4051 + }, + { + "epoch": 0.7988958990536278, + "grad_norm": 0.5495028696341402, + "learning_rate": 1.8109803943804146e-05, + "loss": 0.424, + "step": 4052 + }, + { + "epoch": 0.7990930599369085, + "grad_norm": 0.5306200168382031, + "learning_rate": 1.8108896967526607e-05, + "loss": 0.4018, + "step": 4053 + }, + { + "epoch": 0.7992902208201893, + "grad_norm": 1.038998012553428, + "learning_rate": 1.810798979642629e-05, + "loss": 0.4464, + "step": 4054 + }, + { + "epoch": 0.79948738170347, + "grad_norm": 1.0234857955785743, + "learning_rate": 1.8107082430524986e-05, + "loss": 0.4581, + "step": 4055 + }, + { + "epoch": 0.7996845425867508, + "grad_norm": 0.5159790767288523, + "learning_rate": 1.81061748698445e-05, + "loss": 0.433, + "step": 4056 + }, + { + "epoch": 0.7998817034700315, + "grad_norm": 1.0841772174572797, + "learning_rate": 1.8105267114406633e-05, + "loss": 0.4212, + "step": 4057 + }, + { + "epoch": 0.8000788643533123, + "grad_norm": 0.5786065173723118, + "learning_rate": 1.81043591642332e-05, + "loss": 0.4607, + "step": 4058 + }, + { + "epoch": 0.800276025236593, + "grad_norm": 0.68262028393148, + "learning_rate": 1.810345101934601e-05, + "loss": 0.41, + "step": 4059 + }, + { + "epoch": 0.8004731861198738, + "grad_norm": 1.292583207003667, + "learning_rate": 1.8102542679766884e-05, + "loss": 0.4715, + "step": 4060 + }, + { + "epoch": 0.8006703470031545, + "grad_norm": 0.5803586064757899, + "learning_rate": 1.8101634145517644e-05, + "loss": 0.4231, + "step": 4061 + }, + { + "epoch": 0.8008675078864353, + "grad_norm": 0.6051334610135585, + "learning_rate": 1.810072541662012e-05, + "loss": 0.5071, + "step": 4062 + }, + { + "epoch": 0.801064668769716, + "grad_norm": 0.6153429571062644, + "learning_rate": 1.8099816493096144e-05, + "loss": 0.4541, + "step": 4063 + }, + { + "epoch": 0.8012618296529969, + "grad_norm": 0.6245215575502755, + "learning_rate": 1.8098907374967557e-05, + "loss": 0.4805, + "step": 4064 + }, + { + "epoch": 0.8014589905362776, + "grad_norm": 1.371389249743891, + "learning_rate": 1.8097998062256193e-05, + "loss": 0.4497, + "step": 4065 + }, + { + "epoch": 0.8016561514195584, + "grad_norm": 0.5974203617721795, + "learning_rate": 1.8097088554983906e-05, + "loss": 0.4546, + "step": 4066 + }, + { + "epoch": 0.8018533123028391, + "grad_norm": 0.8325017489031189, + "learning_rate": 1.8096178853172548e-05, + "loss": 0.4492, + "step": 4067 + }, + { + "epoch": 0.8020504731861199, + "grad_norm": 0.5910351760004731, + "learning_rate": 1.809526895684397e-05, + "loss": 0.4848, + "step": 4068 + }, + { + "epoch": 0.8022476340694006, + "grad_norm": 0.5900803892149564, + "learning_rate": 1.809435886602004e-05, + "loss": 0.4601, + "step": 4069 + }, + { + "epoch": 0.8024447949526814, + "grad_norm": 0.8379969783401379, + "learning_rate": 1.8093448580722617e-05, + "loss": 0.4094, + "step": 4070 + }, + { + "epoch": 0.8026419558359621, + "grad_norm": 0.6961897937500606, + "learning_rate": 1.809253810097358e-05, + "loss": 0.4621, + "step": 4071 + }, + { + "epoch": 0.8028391167192429, + "grad_norm": 0.5613024287580417, + "learning_rate": 1.809162742679479e-05, + "loss": 0.4727, + "step": 4072 + }, + { + "epoch": 0.8030362776025236, + "grad_norm": 0.5597933707348918, + "learning_rate": 1.8090716558208136e-05, + "loss": 0.4608, + "step": 4073 + }, + { + "epoch": 0.8032334384858044, + "grad_norm": 0.5506039857706837, + "learning_rate": 1.8089805495235507e-05, + "loss": 0.4177, + "step": 4074 + }, + { + "epoch": 0.8034305993690851, + "grad_norm": 0.6412143999415043, + "learning_rate": 1.808889423789878e-05, + "loss": 0.45, + "step": 4075 + }, + { + "epoch": 0.8036277602523659, + "grad_norm": 0.5355910563904579, + "learning_rate": 1.808798278621986e-05, + "loss": 0.4033, + "step": 4076 + }, + { + "epoch": 0.8038249211356467, + "grad_norm": 1.393193920661567, + "learning_rate": 1.808707114022064e-05, + "loss": 0.4729, + "step": 4077 + }, + { + "epoch": 0.8040220820189274, + "grad_norm": 0.5347566428548295, + "learning_rate": 1.808615929992302e-05, + "loss": 0.4392, + "step": 4078 + }, + { + "epoch": 0.8042192429022083, + "grad_norm": 0.5701756563883921, + "learning_rate": 1.8085247265348913e-05, + "loss": 0.4447, + "step": 4079 + }, + { + "epoch": 0.804416403785489, + "grad_norm": 0.5336017684518319, + "learning_rate": 1.808433503652023e-05, + "loss": 0.4357, + "step": 4080 + }, + { + "epoch": 0.8046135646687698, + "grad_norm": 0.5128271293806619, + "learning_rate": 1.8083422613458886e-05, + "loss": 0.4149, + "step": 4081 + }, + { + "epoch": 0.8048107255520505, + "grad_norm": 0.5813112643112214, + "learning_rate": 1.8082509996186802e-05, + "loss": 0.475, + "step": 4082 + }, + { + "epoch": 0.8050078864353313, + "grad_norm": 0.5645562044886906, + "learning_rate": 1.808159718472591e-05, + "loss": 0.449, + "step": 4083 + }, + { + "epoch": 0.805205047318612, + "grad_norm": 0.5209108203099843, + "learning_rate": 1.8080684179098135e-05, + "loss": 0.4282, + "step": 4084 + }, + { + "epoch": 0.8054022082018928, + "grad_norm": 0.5442950978703488, + "learning_rate": 1.807977097932542e-05, + "loss": 0.4519, + "step": 4085 + }, + { + "epoch": 0.8055993690851735, + "grad_norm": 0.5397055857931383, + "learning_rate": 1.8078857585429698e-05, + "loss": 0.4182, + "step": 4086 + }, + { + "epoch": 0.8057965299684543, + "grad_norm": 0.5457157266372396, + "learning_rate": 1.8077943997432913e-05, + "loss": 0.4589, + "step": 4087 + }, + { + "epoch": 0.805993690851735, + "grad_norm": 0.5415377375158944, + "learning_rate": 1.8077030215357024e-05, + "loss": 0.4621, + "step": 4088 + }, + { + "epoch": 0.8061908517350158, + "grad_norm": 0.5301693008219718, + "learning_rate": 1.8076116239223976e-05, + "loss": 0.4085, + "step": 4089 + }, + { + "epoch": 0.8063880126182965, + "grad_norm": 0.5719849084421536, + "learning_rate": 1.807520206905573e-05, + "loss": 0.4478, + "step": 4090 + }, + { + "epoch": 0.8065851735015773, + "grad_norm": 0.5068601560398842, + "learning_rate": 1.8074287704874258e-05, + "loss": 0.4119, + "step": 4091 + }, + { + "epoch": 0.806782334384858, + "grad_norm": 0.5667924640136568, + "learning_rate": 1.8073373146701517e-05, + "loss": 0.4064, + "step": 4092 + }, + { + "epoch": 0.8069794952681388, + "grad_norm": 0.5134527641968423, + "learning_rate": 1.8072458394559485e-05, + "loss": 0.3986, + "step": 4093 + }, + { + "epoch": 0.8071766561514195, + "grad_norm": 0.6144715703002955, + "learning_rate": 1.807154344847014e-05, + "loss": 0.4866, + "step": 4094 + }, + { + "epoch": 0.8073738170347003, + "grad_norm": 0.6012333197786712, + "learning_rate": 1.8070628308455463e-05, + "loss": 0.4539, + "step": 4095 + }, + { + "epoch": 0.807570977917981, + "grad_norm": 0.5267747372001423, + "learning_rate": 1.8069712974537444e-05, + "loss": 0.4468, + "step": 4096 + }, + { + "epoch": 0.8077681388012619, + "grad_norm": 0.5566674138070858, + "learning_rate": 1.8068797446738072e-05, + "loss": 0.4231, + "step": 4097 + }, + { + "epoch": 0.8079652996845426, + "grad_norm": 0.5254269426936413, + "learning_rate": 1.806788172507934e-05, + "loss": 0.4015, + "step": 4098 + }, + { + "epoch": 0.8081624605678234, + "grad_norm": 0.5478509815630053, + "learning_rate": 1.8066965809583255e-05, + "loss": 0.4472, + "step": 4099 + }, + { + "epoch": 0.8083596214511041, + "grad_norm": 0.5715957253192107, + "learning_rate": 1.8066049700271818e-05, + "loss": 0.4465, + "step": 4100 + }, + { + "epoch": 0.8085567823343849, + "grad_norm": 0.5584176748983026, + "learning_rate": 1.8065133397167045e-05, + "loss": 0.4425, + "step": 4101 + }, + { + "epoch": 0.8087539432176656, + "grad_norm": 0.5738027949569142, + "learning_rate": 1.8064216900290943e-05, + "loss": 0.4376, + "step": 4102 + }, + { + "epoch": 0.8089511041009464, + "grad_norm": 0.5582915978440195, + "learning_rate": 1.806330020966554e-05, + "loss": 0.4309, + "step": 4103 + }, + { + "epoch": 0.8091482649842271, + "grad_norm": 0.6063188844006441, + "learning_rate": 1.8062383325312855e-05, + "loss": 0.4636, + "step": 4104 + }, + { + "epoch": 0.8093454258675079, + "grad_norm": 0.5657855330946964, + "learning_rate": 1.8061466247254914e-05, + "loss": 0.4632, + "step": 4105 + }, + { + "epoch": 0.8095425867507886, + "grad_norm": 0.5670328332375311, + "learning_rate": 1.806054897551376e-05, + "loss": 0.439, + "step": 4106 + }, + { + "epoch": 0.8097397476340694, + "grad_norm": 0.6265658420538915, + "learning_rate": 1.8059631510111424e-05, + "loss": 0.4424, + "step": 4107 + }, + { + "epoch": 0.8099369085173501, + "grad_norm": 0.7579410291352082, + "learning_rate": 1.805871385106995e-05, + "loss": 0.4335, + "step": 4108 + }, + { + "epoch": 0.8101340694006309, + "grad_norm": 0.5063248042221858, + "learning_rate": 1.8057795998411384e-05, + "loss": 0.4214, + "step": 4109 + }, + { + "epoch": 0.8103312302839116, + "grad_norm": 0.5236497593759417, + "learning_rate": 1.8056877952157786e-05, + "loss": 0.43, + "step": 4110 + }, + { + "epoch": 0.8105283911671924, + "grad_norm": 0.6034930272092489, + "learning_rate": 1.80559597123312e-05, + "loss": 0.4211, + "step": 4111 + }, + { + "epoch": 0.8107255520504731, + "grad_norm": 0.5882438633902384, + "learning_rate": 1.80550412789537e-05, + "loss": 0.443, + "step": 4112 + }, + { + "epoch": 0.810922712933754, + "grad_norm": 0.5366584995675621, + "learning_rate": 1.8054122652047342e-05, + "loss": 0.4112, + "step": 4113 + }, + { + "epoch": 0.8111198738170347, + "grad_norm": 0.6250197267195037, + "learning_rate": 1.8053203831634207e-05, + "loss": 0.4361, + "step": 4114 + }, + { + "epoch": 0.8113170347003155, + "grad_norm": 0.5654169621581985, + "learning_rate": 1.805228481773636e-05, + "loss": 0.4699, + "step": 4115 + }, + { + "epoch": 0.8115141955835962, + "grad_norm": 0.5753974888094099, + "learning_rate": 1.8051365610375884e-05, + "loss": 0.4418, + "step": 4116 + }, + { + "epoch": 0.811711356466877, + "grad_norm": 0.5264404059073737, + "learning_rate": 1.8050446209574872e-05, + "loss": 0.4286, + "step": 4117 + }, + { + "epoch": 0.8119085173501577, + "grad_norm": 0.6190117803945342, + "learning_rate": 1.8049526615355404e-05, + "loss": 0.4704, + "step": 4118 + }, + { + "epoch": 0.8121056782334385, + "grad_norm": 0.5747549365566348, + "learning_rate": 1.8048606827739578e-05, + "loss": 0.4404, + "step": 4119 + }, + { + "epoch": 0.8123028391167192, + "grad_norm": 0.6355096601857466, + "learning_rate": 1.8047686846749488e-05, + "loss": 0.453, + "step": 4120 + }, + { + "epoch": 0.8125, + "grad_norm": 0.5199655678601448, + "learning_rate": 1.8046766672407244e-05, + "loss": 0.4341, + "step": 4121 + }, + { + "epoch": 0.8126971608832808, + "grad_norm": 0.6314812726234823, + "learning_rate": 1.8045846304734948e-05, + "loss": 0.4633, + "step": 4122 + }, + { + "epoch": 0.8128943217665615, + "grad_norm": 0.5432564307359536, + "learning_rate": 1.8044925743754717e-05, + "loss": 0.4788, + "step": 4123 + }, + { + "epoch": 0.8130914826498423, + "grad_norm": 0.7570361059774975, + "learning_rate": 1.8044004989488662e-05, + "loss": 0.4695, + "step": 4124 + }, + { + "epoch": 0.813288643533123, + "grad_norm": 0.5383797237758886, + "learning_rate": 1.8043084041958915e-05, + "loss": 0.4453, + "step": 4125 + }, + { + "epoch": 0.8134858044164038, + "grad_norm": 0.647994374550417, + "learning_rate": 1.8042162901187596e-05, + "loss": 0.4632, + "step": 4126 + }, + { + "epoch": 0.8136829652996845, + "grad_norm": 0.5849265654509385, + "learning_rate": 1.8041241567196834e-05, + "loss": 0.4533, + "step": 4127 + }, + { + "epoch": 0.8138801261829653, + "grad_norm": 0.5450500959334965, + "learning_rate": 1.804032004000877e-05, + "loss": 0.4477, + "step": 4128 + }, + { + "epoch": 0.814077287066246, + "grad_norm": 0.5608733688063781, + "learning_rate": 1.803939831964554e-05, + "loss": 0.4586, + "step": 4129 + }, + { + "epoch": 0.8142744479495269, + "grad_norm": 0.4822206632784062, + "learning_rate": 1.8038476406129294e-05, + "loss": 0.4067, + "step": 4130 + }, + { + "epoch": 0.8144716088328076, + "grad_norm": 0.5171796277326578, + "learning_rate": 1.803755429948218e-05, + "loss": 0.4101, + "step": 4131 + }, + { + "epoch": 0.8146687697160884, + "grad_norm": 0.5373199928293783, + "learning_rate": 1.8036631999726348e-05, + "loss": 0.4328, + "step": 4132 + }, + { + "epoch": 0.8148659305993691, + "grad_norm": 0.5649352218109933, + "learning_rate": 1.8035709506883962e-05, + "loss": 0.4896, + "step": 4133 + }, + { + "epoch": 0.8150630914826499, + "grad_norm": 0.5095327224371953, + "learning_rate": 1.8034786820977184e-05, + "loss": 0.4626, + "step": 4134 + }, + { + "epoch": 0.8152602523659306, + "grad_norm": 0.5296243774919772, + "learning_rate": 1.8033863942028183e-05, + "loss": 0.4478, + "step": 4135 + }, + { + "epoch": 0.8154574132492114, + "grad_norm": 0.5532856082609808, + "learning_rate": 1.803294087005913e-05, + "loss": 0.4353, + "step": 4136 + }, + { + "epoch": 0.8156545741324921, + "grad_norm": 0.5241033299839589, + "learning_rate": 1.8032017605092202e-05, + "loss": 0.4305, + "step": 4137 + }, + { + "epoch": 0.8158517350157729, + "grad_norm": 0.47170306660651057, + "learning_rate": 1.8031094147149587e-05, + "loss": 0.3626, + "step": 4138 + }, + { + "epoch": 0.8160488958990536, + "grad_norm": 0.5677681072099129, + "learning_rate": 1.8030170496253463e-05, + "loss": 0.4935, + "step": 4139 + }, + { + "epoch": 0.8162460567823344, + "grad_norm": 0.5340874190338516, + "learning_rate": 1.802924665242603e-05, + "loss": 0.42, + "step": 4140 + }, + { + "epoch": 0.8164432176656151, + "grad_norm": 0.5247377226187014, + "learning_rate": 1.8028322615689477e-05, + "loss": 0.4289, + "step": 4141 + }, + { + "epoch": 0.8166403785488959, + "grad_norm": 0.5542330994880117, + "learning_rate": 1.802739838606601e-05, + "loss": 0.4464, + "step": 4142 + }, + { + "epoch": 0.8168375394321766, + "grad_norm": 19.89904924544576, + "learning_rate": 1.8026473963577834e-05, + "loss": 0.5853, + "step": 4143 + }, + { + "epoch": 0.8170347003154574, + "grad_norm": 0.6171631795186983, + "learning_rate": 1.8025549348247154e-05, + "loss": 0.4306, + "step": 4144 + }, + { + "epoch": 0.8172318611987381, + "grad_norm": 0.5504405635665522, + "learning_rate": 1.802462454009619e-05, + "loss": 0.4298, + "step": 4145 + }, + { + "epoch": 0.817429022082019, + "grad_norm": 0.5886602077518848, + "learning_rate": 1.802369953914716e-05, + "loss": 0.4274, + "step": 4146 + }, + { + "epoch": 0.8176261829652997, + "grad_norm": 0.5150160069331389, + "learning_rate": 1.8022774345422284e-05, + "loss": 0.4456, + "step": 4147 + }, + { + "epoch": 0.8178233438485805, + "grad_norm": 0.5736174914873833, + "learning_rate": 1.8021848958943796e-05, + "loss": 0.4318, + "step": 4148 + }, + { + "epoch": 0.8180205047318612, + "grad_norm": 0.553943761555841, + "learning_rate": 1.8020923379733925e-05, + "loss": 0.4331, + "step": 4149 + }, + { + "epoch": 0.818217665615142, + "grad_norm": 0.5424039413522739, + "learning_rate": 1.801999760781491e-05, + "loss": 0.4062, + "step": 4150 + }, + { + "epoch": 0.8184148264984227, + "grad_norm": 0.6049951163196499, + "learning_rate": 1.8019071643208996e-05, + "loss": 0.4153, + "step": 4151 + }, + { + "epoch": 0.8186119873817035, + "grad_norm": 0.5680607234809361, + "learning_rate": 1.8018145485938427e-05, + "loss": 0.4338, + "step": 4152 + }, + { + "epoch": 0.8188091482649842, + "grad_norm": 0.5291584704930397, + "learning_rate": 1.8017219136025458e-05, + "loss": 0.4267, + "step": 4153 + }, + { + "epoch": 0.819006309148265, + "grad_norm": 0.5806259018554695, + "learning_rate": 1.801629259349234e-05, + "loss": 0.4584, + "step": 4154 + }, + { + "epoch": 0.8192034700315457, + "grad_norm": 0.5109979743220913, + "learning_rate": 1.801536585836134e-05, + "loss": 0.4448, + "step": 4155 + }, + { + "epoch": 0.8194006309148265, + "grad_norm": 1.0089832585060787, + "learning_rate": 1.801443893065472e-05, + "loss": 0.4481, + "step": 4156 + }, + { + "epoch": 0.8195977917981072, + "grad_norm": 0.5400204179891461, + "learning_rate": 1.8013511810394747e-05, + "loss": 0.4355, + "step": 4157 + }, + { + "epoch": 0.819794952681388, + "grad_norm": 0.5585252526726473, + "learning_rate": 1.80125844976037e-05, + "loss": 0.4572, + "step": 4158 + }, + { + "epoch": 0.8199921135646687, + "grad_norm": 0.5965272393146293, + "learning_rate": 1.8011656992303863e-05, + "loss": 0.4494, + "step": 4159 + }, + { + "epoch": 0.8201892744479495, + "grad_norm": 0.5918746014179956, + "learning_rate": 1.801072929451751e-05, + "loss": 0.4272, + "step": 4160 + }, + { + "epoch": 0.8203864353312302, + "grad_norm": 0.5389833452459519, + "learning_rate": 1.8009801404266936e-05, + "loss": 0.4041, + "step": 4161 + }, + { + "epoch": 0.820583596214511, + "grad_norm": 2.6854862231259338, + "learning_rate": 1.8008873321574435e-05, + "loss": 0.4462, + "step": 4162 + }, + { + "epoch": 0.8207807570977917, + "grad_norm": 12.845011498981133, + "learning_rate": 1.8007945046462302e-05, + "loss": 0.4828, + "step": 4163 + }, + { + "epoch": 0.8209779179810726, + "grad_norm": 0.6285393491640829, + "learning_rate": 1.800701657895284e-05, + "loss": 0.4869, + "step": 4164 + }, + { + "epoch": 0.8211750788643533, + "grad_norm": 0.5463921951908919, + "learning_rate": 1.8006087919068354e-05, + "loss": 0.4517, + "step": 4165 + }, + { + "epoch": 0.8213722397476341, + "grad_norm": 3.3042430055230434, + "learning_rate": 1.800515906683116e-05, + "loss": 0.4767, + "step": 4166 + }, + { + "epoch": 0.8215694006309149, + "grad_norm": 0.608965250290735, + "learning_rate": 1.8004230022263575e-05, + "loss": 0.4708, + "step": 4167 + }, + { + "epoch": 0.8217665615141956, + "grad_norm": 0.5644431847609628, + "learning_rate": 1.800330078538792e-05, + "loss": 0.4087, + "step": 4168 + }, + { + "epoch": 0.8219637223974764, + "grad_norm": 0.6120359597106482, + "learning_rate": 1.8002371356226512e-05, + "loss": 0.4715, + "step": 4169 + }, + { + "epoch": 0.8221608832807571, + "grad_norm": 0.5666338187192854, + "learning_rate": 1.800144173480169e-05, + "loss": 0.463, + "step": 4170 + }, + { + "epoch": 0.8223580441640379, + "grad_norm": 0.5992673957167014, + "learning_rate": 1.800051192113579e-05, + "loss": 0.4724, + "step": 4171 + }, + { + "epoch": 0.8225552050473186, + "grad_norm": 0.5713699259593359, + "learning_rate": 1.799958191525115e-05, + "loss": 0.4568, + "step": 4172 + }, + { + "epoch": 0.8227523659305994, + "grad_norm": 0.5796266634572839, + "learning_rate": 1.7998651717170105e-05, + "loss": 0.4501, + "step": 4173 + }, + { + "epoch": 0.8229495268138801, + "grad_norm": 0.5794145486533863, + "learning_rate": 1.7997721326915015e-05, + "loss": 0.4894, + "step": 4174 + }, + { + "epoch": 0.8231466876971609, + "grad_norm": 0.548917204300831, + "learning_rate": 1.799679074450823e-05, + "loss": 0.4151, + "step": 4175 + }, + { + "epoch": 0.8233438485804416, + "grad_norm": 0.6880509796679153, + "learning_rate": 1.7995859969972108e-05, + "loss": 0.4362, + "step": 4176 + }, + { + "epoch": 0.8235410094637224, + "grad_norm": 0.6333808028462822, + "learning_rate": 1.7994929003329008e-05, + "loss": 0.4802, + "step": 4177 + }, + { + "epoch": 0.8237381703470031, + "grad_norm": 0.5717804452138856, + "learning_rate": 1.7993997844601305e-05, + "loss": 0.4161, + "step": 4178 + }, + { + "epoch": 0.823935331230284, + "grad_norm": 0.5904214094197301, + "learning_rate": 1.799306649381136e-05, + "loss": 0.4527, + "step": 4179 + }, + { + "epoch": 0.8241324921135647, + "grad_norm": 0.5224269790433933, + "learning_rate": 1.7992134950981562e-05, + "loss": 0.3997, + "step": 4180 + }, + { + "epoch": 0.8243296529968455, + "grad_norm": 0.5926917719981647, + "learning_rate": 1.7991203216134283e-05, + "loss": 0.436, + "step": 4181 + }, + { + "epoch": 0.8245268138801262, + "grad_norm": 0.5735078186499827, + "learning_rate": 1.7990271289291913e-05, + "loss": 0.4367, + "step": 4182 + }, + { + "epoch": 0.824723974763407, + "grad_norm": 0.6677743637655965, + "learning_rate": 1.798933917047684e-05, + "loss": 0.4423, + "step": 4183 + }, + { + "epoch": 0.8249211356466877, + "grad_norm": 0.582150975318319, + "learning_rate": 1.7988406859711457e-05, + "loss": 0.4656, + "step": 4184 + }, + { + "epoch": 0.8251182965299685, + "grad_norm": 0.4907564880534393, + "learning_rate": 1.7987474357018172e-05, + "loss": 0.3808, + "step": 4185 + }, + { + "epoch": 0.8253154574132492, + "grad_norm": 0.6380319217129097, + "learning_rate": 1.7986541662419376e-05, + "loss": 0.422, + "step": 4186 + }, + { + "epoch": 0.82551261829653, + "grad_norm": 0.5678615187977981, + "learning_rate": 1.7985608775937492e-05, + "loss": 0.4317, + "step": 4187 + }, + { + "epoch": 0.8257097791798107, + "grad_norm": 0.5265688118305606, + "learning_rate": 1.798467569759492e-05, + "loss": 0.4309, + "step": 4188 + }, + { + "epoch": 0.8259069400630915, + "grad_norm": 0.5925439253755009, + "learning_rate": 1.798374242741409e-05, + "loss": 0.4434, + "step": 4189 + }, + { + "epoch": 0.8261041009463722, + "grad_norm": 0.6569515898788928, + "learning_rate": 1.7982808965417415e-05, + "loss": 0.4008, + "step": 4190 + }, + { + "epoch": 0.826301261829653, + "grad_norm": 0.5541366851451021, + "learning_rate": 1.7981875311627327e-05, + "loss": 0.4168, + "step": 4191 + }, + { + "epoch": 0.8264984227129337, + "grad_norm": 0.5419308496525422, + "learning_rate": 1.7980941466066254e-05, + "loss": 0.4263, + "step": 4192 + }, + { + "epoch": 0.8266955835962145, + "grad_norm": 0.5627041984651608, + "learning_rate": 1.798000742875664e-05, + "loss": 0.4471, + "step": 4193 + }, + { + "epoch": 0.8268927444794952, + "grad_norm": 0.5277704275115969, + "learning_rate": 1.797907319972092e-05, + "loss": 0.4245, + "step": 4194 + }, + { + "epoch": 0.827089905362776, + "grad_norm": 0.5673323456563254, + "learning_rate": 1.797813877898154e-05, + "loss": 0.437, + "step": 4195 + }, + { + "epoch": 0.8272870662460567, + "grad_norm": 0.580531434044333, + "learning_rate": 1.7977204166560954e-05, + "loss": 0.4844, + "step": 4196 + }, + { + "epoch": 0.8274842271293376, + "grad_norm": 0.5301677248107065, + "learning_rate": 1.797626936248161e-05, + "loss": 0.4146, + "step": 4197 + }, + { + "epoch": 0.8276813880126183, + "grad_norm": 0.6170836844395501, + "learning_rate": 1.7975334366765974e-05, + "loss": 0.4555, + "step": 4198 + }, + { + "epoch": 0.8278785488958991, + "grad_norm": 0.5481999082788359, + "learning_rate": 1.7974399179436502e-05, + "loss": 0.467, + "step": 4199 + }, + { + "epoch": 0.8280757097791798, + "grad_norm": 0.5408148308068799, + "learning_rate": 1.7973463800515675e-05, + "loss": 0.4672, + "step": 4200 + }, + { + "epoch": 0.8282728706624606, + "grad_norm": 0.5652366078680668, + "learning_rate": 1.7972528230025954e-05, + "loss": 0.4412, + "step": 4201 + }, + { + "epoch": 0.8284700315457413, + "grad_norm": 0.538008456834178, + "learning_rate": 1.7971592467989824e-05, + "loss": 0.4455, + "step": 4202 + }, + { + "epoch": 0.8286671924290221, + "grad_norm": 0.5497144473137757, + "learning_rate": 1.7970656514429767e-05, + "loss": 0.4495, + "step": 4203 + }, + { + "epoch": 0.8288643533123028, + "grad_norm": 0.5400822082606279, + "learning_rate": 1.7969720369368266e-05, + "loss": 0.4171, + "step": 4204 + }, + { + "epoch": 0.8290615141955836, + "grad_norm": 0.5032870883447115, + "learning_rate": 1.796878403282782e-05, + "loss": 0.4039, + "step": 4205 + }, + { + "epoch": 0.8292586750788643, + "grad_norm": 0.5077806097030966, + "learning_rate": 1.7967847504830914e-05, + "loss": 0.4433, + "step": 4206 + }, + { + "epoch": 0.8294558359621451, + "grad_norm": 0.5809809540553955, + "learning_rate": 1.7966910785400058e-05, + "loss": 0.4481, + "step": 4207 + }, + { + "epoch": 0.8296529968454258, + "grad_norm": 0.5469499961190881, + "learning_rate": 1.7965973874557754e-05, + "loss": 0.4356, + "step": 4208 + }, + { + "epoch": 0.8298501577287066, + "grad_norm": 0.5124543710594913, + "learning_rate": 1.7965036772326515e-05, + "loss": 0.4394, + "step": 4209 + }, + { + "epoch": 0.8300473186119873, + "grad_norm": 0.5569661416418024, + "learning_rate": 1.796409947872885e-05, + "loss": 0.4131, + "step": 4210 + }, + { + "epoch": 0.8302444794952681, + "grad_norm": 0.566607051094716, + "learning_rate": 1.7963161993787285e-05, + "loss": 0.4291, + "step": 4211 + }, + { + "epoch": 0.830441640378549, + "grad_norm": 0.6317395544141171, + "learning_rate": 1.796222431752434e-05, + "loss": 0.4528, + "step": 4212 + }, + { + "epoch": 0.8306388012618297, + "grad_norm": 0.5260509022156253, + "learning_rate": 1.796128644996254e-05, + "loss": 0.4194, + "step": 4213 + }, + { + "epoch": 0.8308359621451105, + "grad_norm": 0.8039173856781239, + "learning_rate": 1.7960348391124422e-05, + "loss": 0.4602, + "step": 4214 + }, + { + "epoch": 0.8310331230283912, + "grad_norm": 0.5673951125473756, + "learning_rate": 1.7959410141032524e-05, + "loss": 0.4874, + "step": 4215 + }, + { + "epoch": 0.831230283911672, + "grad_norm": 0.5472093603546104, + "learning_rate": 1.795847169970939e-05, + "loss": 0.4226, + "step": 4216 + }, + { + "epoch": 0.8314274447949527, + "grad_norm": 0.6600760076559387, + "learning_rate": 1.7957533067177565e-05, + "loss": 0.4507, + "step": 4217 + }, + { + "epoch": 0.8316246056782335, + "grad_norm": 0.5420048851350387, + "learning_rate": 1.7956594243459597e-05, + "loss": 0.4597, + "step": 4218 + }, + { + "epoch": 0.8318217665615142, + "grad_norm": 0.5846319849001685, + "learning_rate": 1.7955655228578046e-05, + "loss": 0.4637, + "step": 4219 + }, + { + "epoch": 0.832018927444795, + "grad_norm": 0.5925117015754054, + "learning_rate": 1.7954716022555474e-05, + "loss": 0.475, + "step": 4220 + }, + { + "epoch": 0.8322160883280757, + "grad_norm": 0.5371836503383141, + "learning_rate": 1.795377662541444e-05, + "loss": 0.4217, + "step": 4221 + }, + { + "epoch": 0.8324132492113565, + "grad_norm": 0.7483433485104296, + "learning_rate": 1.795283703717752e-05, + "loss": 0.4527, + "step": 4222 + }, + { + "epoch": 0.8326104100946372, + "grad_norm": 0.5384653534370835, + "learning_rate": 1.7951897257867284e-05, + "loss": 0.4125, + "step": 4223 + }, + { + "epoch": 0.832807570977918, + "grad_norm": 0.5538901981181154, + "learning_rate": 1.7950957287506313e-05, + "loss": 0.437, + "step": 4224 + }, + { + "epoch": 0.8330047318611987, + "grad_norm": 0.5448553268692006, + "learning_rate": 1.795001712611719e-05, + "loss": 0.432, + "step": 4225 + }, + { + "epoch": 0.8332018927444795, + "grad_norm": 0.5608394906395617, + "learning_rate": 1.7949076773722505e-05, + "loss": 0.4812, + "step": 4226 + }, + { + "epoch": 0.8333990536277602, + "grad_norm": 0.48592649216287015, + "learning_rate": 1.7948136230344847e-05, + "loss": 0.4042, + "step": 4227 + }, + { + "epoch": 0.833596214511041, + "grad_norm": 0.5149456873877976, + "learning_rate": 1.7947195496006817e-05, + "loss": 0.4183, + "step": 4228 + }, + { + "epoch": 0.8337933753943217, + "grad_norm": 0.5308679728526855, + "learning_rate": 1.7946254570731015e-05, + "loss": 0.4564, + "step": 4229 + }, + { + "epoch": 0.8339905362776026, + "grad_norm": 0.5705369452847299, + "learning_rate": 1.7945313454540046e-05, + "loss": 0.4626, + "step": 4230 + }, + { + "epoch": 0.8341876971608833, + "grad_norm": 0.5379448499030216, + "learning_rate": 1.7944372147456527e-05, + "loss": 0.449, + "step": 4231 + }, + { + "epoch": 0.8343848580441641, + "grad_norm": 0.536663991436941, + "learning_rate": 1.7943430649503065e-05, + "loss": 0.4614, + "step": 4232 + }, + { + "epoch": 0.8345820189274448, + "grad_norm": 0.5322133371923756, + "learning_rate": 1.794248896070229e-05, + "loss": 0.4251, + "step": 4233 + }, + { + "epoch": 0.8347791798107256, + "grad_norm": 0.5399982863538748, + "learning_rate": 1.7941547081076818e-05, + "loss": 0.4575, + "step": 4234 + }, + { + "epoch": 0.8349763406940063, + "grad_norm": 0.5375534373449318, + "learning_rate": 1.7940605010649284e-05, + "loss": 0.4295, + "step": 4235 + }, + { + "epoch": 0.8351735015772871, + "grad_norm": 0.5478250406848311, + "learning_rate": 1.7939662749442317e-05, + "loss": 0.4352, + "step": 4236 + }, + { + "epoch": 0.8353706624605678, + "grad_norm": 0.5493522873611169, + "learning_rate": 1.7938720297478564e-05, + "loss": 0.4487, + "step": 4237 + }, + { + "epoch": 0.8355678233438486, + "grad_norm": 0.5461493071646538, + "learning_rate": 1.7937777654780656e-05, + "loss": 0.4621, + "step": 4238 + }, + { + "epoch": 0.8357649842271293, + "grad_norm": 0.5405378451575877, + "learning_rate": 1.793683482137125e-05, + "loss": 0.4167, + "step": 4239 + }, + { + "epoch": 0.8359621451104101, + "grad_norm": 0.5514595660368122, + "learning_rate": 1.7935891797272998e-05, + "loss": 0.4397, + "step": 4240 + }, + { + "epoch": 0.8361593059936908, + "grad_norm": 0.5700483701942513, + "learning_rate": 1.7934948582508554e-05, + "loss": 0.4287, + "step": 4241 + }, + { + "epoch": 0.8363564668769716, + "grad_norm": 34.56133146719008, + "learning_rate": 1.793400517710058e-05, + "loss": 0.4441, + "step": 4242 + }, + { + "epoch": 0.8365536277602523, + "grad_norm": 24.176632613926778, + "learning_rate": 1.7933061581071743e-05, + "loss": 0.4912, + "step": 4243 + }, + { + "epoch": 0.8367507886435331, + "grad_norm": 0.6360329434335535, + "learning_rate": 1.793211779444471e-05, + "loss": 0.4639, + "step": 4244 + }, + { + "epoch": 0.8369479495268138, + "grad_norm": 0.6094525873812249, + "learning_rate": 1.7931173817242163e-05, + "loss": 0.4648, + "step": 4245 + }, + { + "epoch": 0.8371451104100947, + "grad_norm": 0.6835903177747272, + "learning_rate": 1.7930229649486777e-05, + "loss": 0.4328, + "step": 4246 + }, + { + "epoch": 0.8373422712933754, + "grad_norm": 0.5554084747146647, + "learning_rate": 1.7929285291201237e-05, + "loss": 0.4385, + "step": 4247 + }, + { + "epoch": 0.8375394321766562, + "grad_norm": 0.5699771546905784, + "learning_rate": 1.7928340742408236e-05, + "loss": 0.4457, + "step": 4248 + }, + { + "epoch": 0.8377365930599369, + "grad_norm": 0.6851558987403872, + "learning_rate": 1.792739600313046e-05, + "loss": 0.4285, + "step": 4249 + }, + { + "epoch": 0.8379337539432177, + "grad_norm": 0.5710755557207288, + "learning_rate": 1.7926451073390612e-05, + "loss": 0.4773, + "step": 4250 + }, + { + "epoch": 0.8381309148264984, + "grad_norm": 0.6090925217214507, + "learning_rate": 1.7925505953211394e-05, + "loss": 0.4597, + "step": 4251 + }, + { + "epoch": 0.8383280757097792, + "grad_norm": 0.5459422698014952, + "learning_rate": 1.792456064261551e-05, + "loss": 0.4478, + "step": 4252 + }, + { + "epoch": 0.8385252365930599, + "grad_norm": 0.5617534088004623, + "learning_rate": 1.7923615141625677e-05, + "loss": 0.4396, + "step": 4253 + }, + { + "epoch": 0.8387223974763407, + "grad_norm": 0.6076401507399524, + "learning_rate": 1.792266945026461e-05, + "loss": 0.4723, + "step": 4254 + }, + { + "epoch": 0.8389195583596214, + "grad_norm": 0.5960645754305288, + "learning_rate": 1.792172356855503e-05, + "loss": 0.4686, + "step": 4255 + }, + { + "epoch": 0.8391167192429022, + "grad_norm": 0.5490789054602314, + "learning_rate": 1.7920777496519665e-05, + "loss": 0.4362, + "step": 4256 + }, + { + "epoch": 0.839313880126183, + "grad_norm": 0.5627710935612827, + "learning_rate": 1.7919831234181234e-05, + "loss": 0.4299, + "step": 4257 + }, + { + "epoch": 0.8395110410094637, + "grad_norm": 0.5861974351027728, + "learning_rate": 1.7918884781562486e-05, + "loss": 0.444, + "step": 4258 + }, + { + "epoch": 0.8397082018927445, + "grad_norm": 0.5865858099699106, + "learning_rate": 1.7917938138686152e-05, + "loss": 0.4527, + "step": 4259 + }, + { + "epoch": 0.8399053627760252, + "grad_norm": 0.6002524896560638, + "learning_rate": 1.791699130557498e-05, + "loss": 0.4515, + "step": 4260 + }, + { + "epoch": 0.840102523659306, + "grad_norm": 1.3685110134791254, + "learning_rate": 1.7916044282251713e-05, + "loss": 0.3769, + "step": 4261 + }, + { + "epoch": 0.8402996845425867, + "grad_norm": 0.5968798080121339, + "learning_rate": 1.7915097068739108e-05, + "loss": 0.443, + "step": 4262 + }, + { + "epoch": 0.8404968454258676, + "grad_norm": 0.916862721842038, + "learning_rate": 1.7914149665059922e-05, + "loss": 0.4274, + "step": 4263 + }, + { + "epoch": 0.8406940063091483, + "grad_norm": 0.5597685865087868, + "learning_rate": 1.791320207123692e-05, + "loss": 0.4155, + "step": 4264 + }, + { + "epoch": 0.8408911671924291, + "grad_norm": 0.7515342823351008, + "learning_rate": 1.7912254287292863e-05, + "loss": 0.4606, + "step": 4265 + }, + { + "epoch": 0.8410883280757098, + "grad_norm": 0.5225927715886619, + "learning_rate": 1.7911306313250523e-05, + "loss": 0.4025, + "step": 4266 + }, + { + "epoch": 0.8412854889589906, + "grad_norm": 0.5557086212596624, + "learning_rate": 1.7910358149132682e-05, + "loss": 0.4528, + "step": 4267 + }, + { + "epoch": 0.8414826498422713, + "grad_norm": 0.5735148560669096, + "learning_rate": 1.7909409794962115e-05, + "loss": 0.4348, + "step": 4268 + }, + { + "epoch": 0.8416798107255521, + "grad_norm": 0.5535257331063189, + "learning_rate": 1.790846125076161e-05, + "loss": 0.4406, + "step": 4269 + }, + { + "epoch": 0.8418769716088328, + "grad_norm": 0.6394946265678507, + "learning_rate": 1.790751251655395e-05, + "loss": 0.4316, + "step": 4270 + }, + { + "epoch": 0.8420741324921136, + "grad_norm": 0.5519239974498845, + "learning_rate": 1.7906563592361935e-05, + "loss": 0.4457, + "step": 4271 + }, + { + "epoch": 0.8422712933753943, + "grad_norm": 0.5760757282643882, + "learning_rate": 1.7905614478208363e-05, + "loss": 0.4369, + "step": 4272 + }, + { + "epoch": 0.8424684542586751, + "grad_norm": 0.5792308751257407, + "learning_rate": 1.7904665174116038e-05, + "loss": 0.4549, + "step": 4273 + }, + { + "epoch": 0.8426656151419558, + "grad_norm": 0.5683056364517756, + "learning_rate": 1.790371568010777e-05, + "loss": 0.4554, + "step": 4274 + }, + { + "epoch": 0.8428627760252366, + "grad_norm": 0.5615230001406172, + "learning_rate": 1.7902765996206364e-05, + "loss": 0.4459, + "step": 4275 + }, + { + "epoch": 0.8430599369085173, + "grad_norm": 0.5564542215994093, + "learning_rate": 1.790181612243464e-05, + "loss": 0.45, + "step": 4276 + }, + { + "epoch": 0.8432570977917981, + "grad_norm": 0.5401445915473098, + "learning_rate": 1.7900866058815424e-05, + "loss": 0.4439, + "step": 4277 + }, + { + "epoch": 0.8434542586750788, + "grad_norm": 0.5386599600784427, + "learning_rate": 1.7899915805371536e-05, + "loss": 0.4422, + "step": 4278 + }, + { + "epoch": 0.8436514195583596, + "grad_norm": 0.5120099294044634, + "learning_rate": 1.789896536212581e-05, + "loss": 0.4206, + "step": 4279 + }, + { + "epoch": 0.8438485804416404, + "grad_norm": 0.5832867234998125, + "learning_rate": 1.7898014729101077e-05, + "loss": 0.4727, + "step": 4280 + }, + { + "epoch": 0.8440457413249212, + "grad_norm": 0.5192015921845785, + "learning_rate": 1.7897063906320182e-05, + "loss": 0.4239, + "step": 4281 + }, + { + "epoch": 0.8442429022082019, + "grad_norm": 0.5700985443501763, + "learning_rate": 1.7896112893805967e-05, + "loss": 0.4813, + "step": 4282 + }, + { + "epoch": 0.8444400630914827, + "grad_norm": 0.5274314100347487, + "learning_rate": 1.789516169158128e-05, + "loss": 0.4251, + "step": 4283 + }, + { + "epoch": 0.8446372239747634, + "grad_norm": 0.5003375182641053, + "learning_rate": 1.7894210299668977e-05, + "loss": 0.3926, + "step": 4284 + }, + { + "epoch": 0.8448343848580442, + "grad_norm": 0.5327663026050636, + "learning_rate": 1.7893258718091916e-05, + "loss": 0.4379, + "step": 4285 + }, + { + "epoch": 0.8450315457413249, + "grad_norm": 0.5526141806841286, + "learning_rate": 1.7892306946872952e-05, + "loss": 0.4765, + "step": 4286 + }, + { + "epoch": 0.8452287066246057, + "grad_norm": 0.5253955764429079, + "learning_rate": 1.7891354986034964e-05, + "loss": 0.4083, + "step": 4287 + }, + { + "epoch": 0.8454258675078864, + "grad_norm": 0.9128365999475107, + "learning_rate": 1.7890402835600814e-05, + "loss": 0.4293, + "step": 4288 + }, + { + "epoch": 0.8456230283911672, + "grad_norm": 0.50508451474393, + "learning_rate": 1.7889450495593386e-05, + "loss": 0.4281, + "step": 4289 + }, + { + "epoch": 0.8458201892744479, + "grad_norm": 0.5667695632508356, + "learning_rate": 1.7888497966035552e-05, + "loss": 0.4551, + "step": 4290 + }, + { + "epoch": 0.8460173501577287, + "grad_norm": 0.5140164942376172, + "learning_rate": 1.7887545246950204e-05, + "loss": 0.4283, + "step": 4291 + }, + { + "epoch": 0.8462145110410094, + "grad_norm": 0.5547368089832118, + "learning_rate": 1.7886592338360227e-05, + "loss": 0.4204, + "step": 4292 + }, + { + "epoch": 0.8464116719242902, + "grad_norm": 0.5107722071951123, + "learning_rate": 1.7885639240288523e-05, + "loss": 0.4269, + "step": 4293 + }, + { + "epoch": 0.8466088328075709, + "grad_norm": 0.5099322891614341, + "learning_rate": 1.788468595275798e-05, + "loss": 0.3775, + "step": 4294 + }, + { + "epoch": 0.8468059936908517, + "grad_norm": 0.5424856816385994, + "learning_rate": 1.7883732475791512e-05, + "loss": 0.4256, + "step": 4295 + }, + { + "epoch": 0.8470031545741324, + "grad_norm": 0.5241853108328052, + "learning_rate": 1.7882778809412024e-05, + "loss": 0.4292, + "step": 4296 + }, + { + "epoch": 0.8472003154574133, + "grad_norm": 0.542588881145842, + "learning_rate": 1.7881824953642423e-05, + "loss": 0.4074, + "step": 4297 + }, + { + "epoch": 0.847397476340694, + "grad_norm": 0.7626469717720238, + "learning_rate": 1.788087090850563e-05, + "loss": 0.4365, + "step": 4298 + }, + { + "epoch": 0.8475946372239748, + "grad_norm": 0.6333662401521095, + "learning_rate": 1.787991667402457e-05, + "loss": 0.4631, + "step": 4299 + }, + { + "epoch": 0.8477917981072555, + "grad_norm": 0.5258355942220182, + "learning_rate": 1.787896225022216e-05, + "loss": 0.4423, + "step": 4300 + }, + { + "epoch": 0.8479889589905363, + "grad_norm": 0.572162435469079, + "learning_rate": 1.7878007637121344e-05, + "loss": 0.4635, + "step": 4301 + }, + { + "epoch": 0.848186119873817, + "grad_norm": 0.6051771551039156, + "learning_rate": 1.7877052834745048e-05, + "loss": 0.5096, + "step": 4302 + }, + { + "epoch": 0.8483832807570978, + "grad_norm": 0.5953955107671394, + "learning_rate": 1.7876097843116214e-05, + "loss": 0.4335, + "step": 4303 + }, + { + "epoch": 0.8485804416403786, + "grad_norm": 0.5157443177704121, + "learning_rate": 1.7875142662257788e-05, + "loss": 0.4226, + "step": 4304 + }, + { + "epoch": 0.8487776025236593, + "grad_norm": 0.5783953306524755, + "learning_rate": 1.7874187292192716e-05, + "loss": 0.4105, + "step": 4305 + }, + { + "epoch": 0.8489747634069401, + "grad_norm": 3.512906038519687, + "learning_rate": 1.7873231732943954e-05, + "loss": 0.4794, + "step": 4306 + }, + { + "epoch": 0.8491719242902208, + "grad_norm": 0.6607264573754198, + "learning_rate": 1.787227598453446e-05, + "loss": 0.4226, + "step": 4307 + }, + { + "epoch": 0.8493690851735016, + "grad_norm": 0.5184733373485347, + "learning_rate": 1.7871320046987195e-05, + "loss": 0.4468, + "step": 4308 + }, + { + "epoch": 0.8495662460567823, + "grad_norm": 0.6207995208356605, + "learning_rate": 1.7870363920325126e-05, + "loss": 0.4391, + "step": 4309 + }, + { + "epoch": 0.8497634069400631, + "grad_norm": 0.9804052510102346, + "learning_rate": 1.7869407604571228e-05, + "loss": 0.4196, + "step": 4310 + }, + { + "epoch": 0.8499605678233438, + "grad_norm": 0.5996292455245761, + "learning_rate": 1.7868451099748473e-05, + "loss": 0.4297, + "step": 4311 + }, + { + "epoch": 0.8501577287066246, + "grad_norm": 0.5472017368916122, + "learning_rate": 1.7867494405879847e-05, + "loss": 0.4032, + "step": 4312 + }, + { + "epoch": 0.8503548895899053, + "grad_norm": 0.7587762660324456, + "learning_rate": 1.786653752298833e-05, + "loss": 0.4218, + "step": 4313 + }, + { + "epoch": 0.8505520504731862, + "grad_norm": 0.5433647751228663, + "learning_rate": 1.7865580451096912e-05, + "loss": 0.4439, + "step": 4314 + }, + { + "epoch": 0.8507492113564669, + "grad_norm": 0.5467356597687231, + "learning_rate": 1.7864623190228592e-05, + "loss": 0.4679, + "step": 4315 + }, + { + "epoch": 0.8509463722397477, + "grad_norm": 0.5811637139524398, + "learning_rate": 1.7863665740406367e-05, + "loss": 0.4204, + "step": 4316 + }, + { + "epoch": 0.8511435331230284, + "grad_norm": 0.5301517872977239, + "learning_rate": 1.786270810165324e-05, + "loss": 0.4555, + "step": 4317 + }, + { + "epoch": 0.8513406940063092, + "grad_norm": 0.5854692352465581, + "learning_rate": 1.7861750273992216e-05, + "loss": 0.4597, + "step": 4318 + }, + { + "epoch": 0.8515378548895899, + "grad_norm": 0.5663117363358948, + "learning_rate": 1.7860792257446315e-05, + "loss": 0.4437, + "step": 4319 + }, + { + "epoch": 0.8517350157728707, + "grad_norm": 0.5681785500111551, + "learning_rate": 1.785983405203855e-05, + "loss": 0.4747, + "step": 4320 + }, + { + "epoch": 0.8519321766561514, + "grad_norm": 0.6863444614061993, + "learning_rate": 1.7858875657791937e-05, + "loss": 0.4423, + "step": 4321 + }, + { + "epoch": 0.8521293375394322, + "grad_norm": 0.8893916870868374, + "learning_rate": 1.7857917074729513e-05, + "loss": 0.4413, + "step": 4322 + }, + { + "epoch": 0.8523264984227129, + "grad_norm": 0.5878530748957967, + "learning_rate": 1.78569583028743e-05, + "loss": 0.4737, + "step": 4323 + }, + { + "epoch": 0.8525236593059937, + "grad_norm": 0.5738260255184614, + "learning_rate": 1.7855999342249338e-05, + "loss": 0.4325, + "step": 4324 + }, + { + "epoch": 0.8527208201892744, + "grad_norm": 0.5660103845510339, + "learning_rate": 1.7855040192877666e-05, + "loss": 0.4242, + "step": 4325 + }, + { + "epoch": 0.8529179810725552, + "grad_norm": 0.5644476271832222, + "learning_rate": 1.7854080854782324e-05, + "loss": 0.4465, + "step": 4326 + }, + { + "epoch": 0.8531151419558359, + "grad_norm": 0.5507139670335401, + "learning_rate": 1.7853121327986368e-05, + "loss": 0.452, + "step": 4327 + }, + { + "epoch": 0.8533123028391167, + "grad_norm": 0.5645022256444481, + "learning_rate": 1.785216161251285e-05, + "loss": 0.4225, + "step": 4328 + }, + { + "epoch": 0.8535094637223974, + "grad_norm": 2.5107638932499734, + "learning_rate": 1.7851201708384823e-05, + "loss": 0.466, + "step": 4329 + }, + { + "epoch": 0.8537066246056783, + "grad_norm": 0.6115304333104712, + "learning_rate": 1.785024161562535e-05, + "loss": 0.4656, + "step": 4330 + }, + { + "epoch": 0.853903785488959, + "grad_norm": 0.5583226024777117, + "learning_rate": 1.7849281334257504e-05, + "loss": 0.4285, + "step": 4331 + }, + { + "epoch": 0.8541009463722398, + "grad_norm": 0.5123230160651777, + "learning_rate": 1.784832086430435e-05, + "loss": 0.3872, + "step": 4332 + }, + { + "epoch": 0.8542981072555205, + "grad_norm": 0.6949006788224281, + "learning_rate": 1.784736020578897e-05, + "loss": 0.4535, + "step": 4333 + }, + { + "epoch": 0.8544952681388013, + "grad_norm": 1.427386384185727, + "learning_rate": 1.784639935873444e-05, + "loss": 0.4704, + "step": 4334 + }, + { + "epoch": 0.854692429022082, + "grad_norm": 1.106108233363933, + "learning_rate": 1.784543832316385e-05, + "loss": 0.4372, + "step": 4335 + }, + { + "epoch": 0.8548895899053628, + "grad_norm": 0.5901770882082108, + "learning_rate": 1.7844477099100282e-05, + "loss": 0.4699, + "step": 4336 + }, + { + "epoch": 0.8550867507886435, + "grad_norm": 0.544525043882823, + "learning_rate": 1.784351568656684e-05, + "loss": 0.4027, + "step": 4337 + }, + { + "epoch": 0.8552839116719243, + "grad_norm": 0.5934533266780927, + "learning_rate": 1.7842554085586613e-05, + "loss": 0.4838, + "step": 4338 + }, + { + "epoch": 0.855481072555205, + "grad_norm": 0.5596803431341593, + "learning_rate": 1.7841592296182705e-05, + "loss": 0.4621, + "step": 4339 + }, + { + "epoch": 0.8556782334384858, + "grad_norm": 0.5394622987888912, + "learning_rate": 1.7840630318378233e-05, + "loss": 0.397, + "step": 4340 + }, + { + "epoch": 0.8558753943217665, + "grad_norm": 0.5455394824500505, + "learning_rate": 1.78396681521963e-05, + "loss": 0.4519, + "step": 4341 + }, + { + "epoch": 0.8560725552050473, + "grad_norm": 0.5203051216706605, + "learning_rate": 1.7838705797660033e-05, + "loss": 0.4413, + "step": 4342 + }, + { + "epoch": 0.856269716088328, + "grad_norm": 0.5539835315564265, + "learning_rate": 1.783774325479254e-05, + "loss": 0.4285, + "step": 4343 + }, + { + "epoch": 0.8564668769716088, + "grad_norm": 0.546518036363135, + "learning_rate": 1.7836780523616957e-05, + "loss": 0.4444, + "step": 4344 + }, + { + "epoch": 0.8566640378548895, + "grad_norm": 0.5381032645999315, + "learning_rate": 1.7835817604156407e-05, + "loss": 0.4187, + "step": 4345 + }, + { + "epoch": 0.8568611987381703, + "grad_norm": 0.6118133777534611, + "learning_rate": 1.7834854496434032e-05, + "loss": 0.4176, + "step": 4346 + }, + { + "epoch": 0.857058359621451, + "grad_norm": 0.6427676574290028, + "learning_rate": 1.7833891200472967e-05, + "loss": 0.5017, + "step": 4347 + }, + { + "epoch": 0.8572555205047319, + "grad_norm": 0.5031944020117598, + "learning_rate": 1.7832927716296357e-05, + "loss": 0.3831, + "step": 4348 + }, + { + "epoch": 0.8574526813880127, + "grad_norm": 0.5370545377340153, + "learning_rate": 1.7831964043927355e-05, + "loss": 0.4165, + "step": 4349 + }, + { + "epoch": 0.8576498422712934, + "grad_norm": 0.5878309901877112, + "learning_rate": 1.7831000183389107e-05, + "loss": 0.4848, + "step": 4350 + }, + { + "epoch": 0.8578470031545742, + "grad_norm": 0.5608856336311973, + "learning_rate": 1.783003613470477e-05, + "loss": 0.4418, + "step": 4351 + }, + { + "epoch": 0.8580441640378549, + "grad_norm": 0.5690843999597316, + "learning_rate": 1.7829071897897515e-05, + "loss": 0.4331, + "step": 4352 + }, + { + "epoch": 0.8582413249211357, + "grad_norm": 0.5542069316816697, + "learning_rate": 1.7828107472990498e-05, + "loss": 0.4454, + "step": 4353 + }, + { + "epoch": 0.8584384858044164, + "grad_norm": 0.5267592858399887, + "learning_rate": 1.78271428600069e-05, + "loss": 0.4271, + "step": 4354 + }, + { + "epoch": 0.8586356466876972, + "grad_norm": 0.5686441971248088, + "learning_rate": 1.7826178058969884e-05, + "loss": 0.4757, + "step": 4355 + }, + { + "epoch": 0.8588328075709779, + "grad_norm": 0.5227257642875571, + "learning_rate": 1.7825213069902646e-05, + "loss": 0.4574, + "step": 4356 + }, + { + "epoch": 0.8590299684542587, + "grad_norm": 0.5300979103639493, + "learning_rate": 1.782424789282836e-05, + "loss": 0.4289, + "step": 4357 + }, + { + "epoch": 0.8592271293375394, + "grad_norm": 0.5849541511404455, + "learning_rate": 1.7823282527770214e-05, + "loss": 0.4209, + "step": 4358 + }, + { + "epoch": 0.8594242902208202, + "grad_norm": 0.48637468314860755, + "learning_rate": 1.782231697475141e-05, + "loss": 0.4429, + "step": 4359 + }, + { + "epoch": 0.8596214511041009, + "grad_norm": 0.5419865357900749, + "learning_rate": 1.7821351233795135e-05, + "loss": 0.4511, + "step": 4360 + }, + { + "epoch": 0.8598186119873817, + "grad_norm": 1.4858320044760873, + "learning_rate": 1.7820385304924602e-05, + "loss": 0.4831, + "step": 4361 + }, + { + "epoch": 0.8600157728706624, + "grad_norm": 0.5226021841678236, + "learning_rate": 1.7819419188163015e-05, + "loss": 0.4268, + "step": 4362 + }, + { + "epoch": 0.8602129337539433, + "grad_norm": 0.5958597819761025, + "learning_rate": 1.7818452883533587e-05, + "loss": 0.4404, + "step": 4363 + }, + { + "epoch": 0.860410094637224, + "grad_norm": 0.5282803263222494, + "learning_rate": 1.781748639105953e-05, + "loss": 0.4351, + "step": 4364 + }, + { + "epoch": 0.8606072555205048, + "grad_norm": 0.5691278859949257, + "learning_rate": 1.7816519710764065e-05, + "loss": 0.4288, + "step": 4365 + }, + { + "epoch": 0.8608044164037855, + "grad_norm": 0.8711980455369287, + "learning_rate": 1.7815552842670424e-05, + "loss": 0.4175, + "step": 4366 + }, + { + "epoch": 0.8610015772870663, + "grad_norm": 0.48580191160854, + "learning_rate": 1.7814585786801826e-05, + "loss": 0.4037, + "step": 4367 + }, + { + "epoch": 0.861198738170347, + "grad_norm": 0.5079270568376504, + "learning_rate": 1.7813618543181515e-05, + "loss": 0.4153, + "step": 4368 + }, + { + "epoch": 0.8613958990536278, + "grad_norm": 0.5060650682920903, + "learning_rate": 1.781265111183273e-05, + "loss": 0.4223, + "step": 4369 + }, + { + "epoch": 0.8615930599369085, + "grad_norm": 0.5229358894402564, + "learning_rate": 1.7811683492778704e-05, + "loss": 0.4267, + "step": 4370 + }, + { + "epoch": 0.8617902208201893, + "grad_norm": 0.5485854868559356, + "learning_rate": 1.7810715686042694e-05, + "loss": 0.4611, + "step": 4371 + }, + { + "epoch": 0.86198738170347, + "grad_norm": 0.7504801541100838, + "learning_rate": 1.7809747691647947e-05, + "loss": 0.4411, + "step": 4372 + }, + { + "epoch": 0.8621845425867508, + "grad_norm": 0.5652888768887478, + "learning_rate": 1.7808779509617726e-05, + "loss": 0.4544, + "step": 4373 + }, + { + "epoch": 0.8623817034700315, + "grad_norm": 0.5229601149145549, + "learning_rate": 1.7807811139975287e-05, + "loss": 0.432, + "step": 4374 + }, + { + "epoch": 0.8625788643533123, + "grad_norm": 0.5693809303118241, + "learning_rate": 1.78068425827439e-05, + "loss": 0.4531, + "step": 4375 + }, + { + "epoch": 0.862776025236593, + "grad_norm": 0.5324905030720758, + "learning_rate": 1.7805873837946833e-05, + "loss": 0.4503, + "step": 4376 + }, + { + "epoch": 0.8629731861198738, + "grad_norm": 0.5981637783741762, + "learning_rate": 1.780490490560736e-05, + "loss": 0.4517, + "step": 4377 + }, + { + "epoch": 0.8631703470031545, + "grad_norm": 0.5398817339769384, + "learning_rate": 1.7803935785748758e-05, + "loss": 0.4098, + "step": 4378 + }, + { + "epoch": 0.8633675078864353, + "grad_norm": 0.6324773282334193, + "learning_rate": 1.7802966478394318e-05, + "loss": 0.4814, + "step": 4379 + }, + { + "epoch": 0.863564668769716, + "grad_norm": 0.5574386589246891, + "learning_rate": 1.7801996983567325e-05, + "loss": 0.446, + "step": 4380 + }, + { + "epoch": 0.8637618296529969, + "grad_norm": 0.5038265095672974, + "learning_rate": 1.780102730129107e-05, + "loss": 0.4219, + "step": 4381 + }, + { + "epoch": 0.8639589905362776, + "grad_norm": 0.5471006623991824, + "learning_rate": 1.7800057431588852e-05, + "loss": 0.4301, + "step": 4382 + }, + { + "epoch": 0.8641561514195584, + "grad_norm": 0.562249614894374, + "learning_rate": 1.7799087374483974e-05, + "loss": 0.4629, + "step": 4383 + }, + { + "epoch": 0.8643533123028391, + "grad_norm": 0.5815465050068923, + "learning_rate": 1.7798117129999738e-05, + "loss": 0.4361, + "step": 4384 + }, + { + "epoch": 0.8645504731861199, + "grad_norm": 0.5771439496581714, + "learning_rate": 1.779714669815946e-05, + "loss": 0.4527, + "step": 4385 + }, + { + "epoch": 0.8647476340694006, + "grad_norm": 0.5527264398916031, + "learning_rate": 1.7796176078986458e-05, + "loss": 0.4552, + "step": 4386 + }, + { + "epoch": 0.8649447949526814, + "grad_norm": 0.564880759780719, + "learning_rate": 1.7795205272504044e-05, + "loss": 0.4528, + "step": 4387 + }, + { + "epoch": 0.8651419558359621, + "grad_norm": 0.5384822591936045, + "learning_rate": 1.7794234278735544e-05, + "loss": 0.4476, + "step": 4388 + }, + { + "epoch": 0.8653391167192429, + "grad_norm": 1.03147273350115, + "learning_rate": 1.779326309770429e-05, + "loss": 0.427, + "step": 4389 + }, + { + "epoch": 0.8655362776025236, + "grad_norm": 0.5033337533381047, + "learning_rate": 1.7792291729433615e-05, + "loss": 0.422, + "step": 4390 + }, + { + "epoch": 0.8657334384858044, + "grad_norm": 0.8142231011342511, + "learning_rate": 1.7791320173946857e-05, + "loss": 0.4298, + "step": 4391 + }, + { + "epoch": 0.8659305993690851, + "grad_norm": 0.5307387314489819, + "learning_rate": 1.7790348431267353e-05, + "loss": 0.4549, + "step": 4392 + }, + { + "epoch": 0.8661277602523659, + "grad_norm": 0.5806932313493961, + "learning_rate": 1.7789376501418457e-05, + "loss": 0.4667, + "step": 4393 + }, + { + "epoch": 0.8663249211356467, + "grad_norm": 0.516262259612544, + "learning_rate": 1.778840438442352e-05, + "loss": 0.4056, + "step": 4394 + }, + { + "epoch": 0.8665220820189274, + "grad_norm": 0.5715000920752781, + "learning_rate": 1.7787432080305895e-05, + "loss": 0.4327, + "step": 4395 + }, + { + "epoch": 0.8667192429022083, + "grad_norm": 0.549118836810879, + "learning_rate": 1.7786459589088942e-05, + "loss": 0.4196, + "step": 4396 + }, + { + "epoch": 0.866916403785489, + "grad_norm": 0.5185987495530604, + "learning_rate": 1.778548691079603e-05, + "loss": 0.3998, + "step": 4397 + }, + { + "epoch": 0.8671135646687698, + "grad_norm": 0.594091330595555, + "learning_rate": 1.7784514045450518e-05, + "loss": 0.4366, + "step": 4398 + }, + { + "epoch": 0.8673107255520505, + "grad_norm": 0.5512970340203216, + "learning_rate": 1.7783540993075793e-05, + "loss": 0.4249, + "step": 4399 + }, + { + "epoch": 0.8675078864353313, + "grad_norm": 0.6495070525009865, + "learning_rate": 1.7782567753695227e-05, + "loss": 0.4518, + "step": 4400 + }, + { + "epoch": 0.867705047318612, + "grad_norm": 0.5475014771103609, + "learning_rate": 1.7781594327332203e-05, + "loss": 0.4358, + "step": 4401 + }, + { + "epoch": 0.8679022082018928, + "grad_norm": 0.5446824793376154, + "learning_rate": 1.7780620714010108e-05, + "loss": 0.4317, + "step": 4402 + }, + { + "epoch": 0.8680993690851735, + "grad_norm": 0.5672459127571157, + "learning_rate": 1.7779646913752334e-05, + "loss": 0.4306, + "step": 4403 + }, + { + "epoch": 0.8682965299684543, + "grad_norm": 0.5159309255044556, + "learning_rate": 1.7778672926582277e-05, + "loss": 0.4356, + "step": 4404 + }, + { + "epoch": 0.868493690851735, + "grad_norm": 0.5706253535740993, + "learning_rate": 1.777769875252334e-05, + "loss": 0.4624, + "step": 4405 + }, + { + "epoch": 0.8686908517350158, + "grad_norm": 0.5468735157960866, + "learning_rate": 1.7776724391598928e-05, + "loss": 0.4204, + "step": 4406 + }, + { + "epoch": 0.8688880126182965, + "grad_norm": 0.6041214464176105, + "learning_rate": 1.7775749843832454e-05, + "loss": 0.4493, + "step": 4407 + }, + { + "epoch": 0.8690851735015773, + "grad_norm": 0.5209820736307841, + "learning_rate": 1.777477510924732e-05, + "loss": 0.4042, + "step": 4408 + }, + { + "epoch": 0.869282334384858, + "grad_norm": 0.5604674535426473, + "learning_rate": 1.777380018786696e-05, + "loss": 0.4276, + "step": 4409 + }, + { + "epoch": 0.8694794952681388, + "grad_norm": 0.5079309439621922, + "learning_rate": 1.7772825079714788e-05, + "loss": 0.4476, + "step": 4410 + }, + { + "epoch": 0.8696766561514195, + "grad_norm": 0.6476704699217136, + "learning_rate": 1.7771849784814232e-05, + "loss": 0.4578, + "step": 4411 + }, + { + "epoch": 0.8698738170347003, + "grad_norm": 0.5924379738519122, + "learning_rate": 1.7770874303188727e-05, + "loss": 0.4311, + "step": 4412 + }, + { + "epoch": 0.870070977917981, + "grad_norm": 0.524148406557284, + "learning_rate": 1.776989863486171e-05, + "loss": 0.389, + "step": 4413 + }, + { + "epoch": 0.8702681388012619, + "grad_norm": 0.5327802595222653, + "learning_rate": 1.776892277985662e-05, + "loss": 0.4413, + "step": 4414 + }, + { + "epoch": 0.8704652996845426, + "grad_norm": 0.62581900914729, + "learning_rate": 1.7767946738196903e-05, + "loss": 0.4319, + "step": 4415 + }, + { + "epoch": 0.8706624605678234, + "grad_norm": 0.55431472515896, + "learning_rate": 1.7766970509906014e-05, + "loss": 0.4486, + "step": 4416 + }, + { + "epoch": 0.8708596214511041, + "grad_norm": 0.5876652626007826, + "learning_rate": 1.77659940950074e-05, + "loss": 0.4344, + "step": 4417 + }, + { + "epoch": 0.8710567823343849, + "grad_norm": 0.5551588366167218, + "learning_rate": 1.7765017493524526e-05, + "loss": 0.4568, + "step": 4418 + }, + { + "epoch": 0.8712539432176656, + "grad_norm": 0.5251199442258818, + "learning_rate": 1.776404070548085e-05, + "loss": 0.444, + "step": 4419 + }, + { + "epoch": 0.8714511041009464, + "grad_norm": 0.5098007039746633, + "learning_rate": 1.7763063730899846e-05, + "loss": 0.4078, + "step": 4420 + }, + { + "epoch": 0.8716482649842271, + "grad_norm": 0.5270323086561973, + "learning_rate": 1.776208656980499e-05, + "loss": 0.4585, + "step": 4421 + }, + { + "epoch": 0.8718454258675079, + "grad_norm": 0.575464987069899, + "learning_rate": 1.7761109222219747e-05, + "loss": 0.4532, + "step": 4422 + }, + { + "epoch": 0.8720425867507886, + "grad_norm": 0.5312918974599236, + "learning_rate": 1.7760131688167606e-05, + "loss": 0.4247, + "step": 4423 + }, + { + "epoch": 0.8722397476340694, + "grad_norm": 0.5487949802158028, + "learning_rate": 1.775915396767205e-05, + "loss": 0.4636, + "step": 4424 + }, + { + "epoch": 0.8724369085173501, + "grad_norm": 0.6044316638894782, + "learning_rate": 1.7758176060756572e-05, + "loss": 0.4606, + "step": 4425 + }, + { + "epoch": 0.8726340694006309, + "grad_norm": 0.5816796983719151, + "learning_rate": 1.775719796744467e-05, + "loss": 0.4585, + "step": 4426 + }, + { + "epoch": 0.8728312302839116, + "grad_norm": 0.5731233568405086, + "learning_rate": 1.775621968775984e-05, + "loss": 0.4263, + "step": 4427 + }, + { + "epoch": 0.8730283911671924, + "grad_norm": 0.596194849375308, + "learning_rate": 1.7755241221725583e-05, + "loss": 0.4631, + "step": 4428 + }, + { + "epoch": 0.8732255520504731, + "grad_norm": 0.5635206736546373, + "learning_rate": 1.7754262569365413e-05, + "loss": 0.4385, + "step": 4429 + }, + { + "epoch": 0.873422712933754, + "grad_norm": 0.5479175042231295, + "learning_rate": 1.7753283730702837e-05, + "loss": 0.431, + "step": 4430 + }, + { + "epoch": 0.8736198738170347, + "grad_norm": 0.5279639808442017, + "learning_rate": 1.7752304705761377e-05, + "loss": 0.4326, + "step": 4431 + }, + { + "epoch": 0.8738170347003155, + "grad_norm": 0.5194599602636989, + "learning_rate": 1.7751325494564556e-05, + "loss": 0.4215, + "step": 4432 + }, + { + "epoch": 0.8740141955835962, + "grad_norm": 0.5615746974328736, + "learning_rate": 1.7750346097135896e-05, + "loss": 0.4653, + "step": 4433 + }, + { + "epoch": 0.874211356466877, + "grad_norm": 0.5428620740818375, + "learning_rate": 1.774936651349893e-05, + "loss": 0.4014, + "step": 4434 + }, + { + "epoch": 0.8744085173501577, + "grad_norm": 0.4968476884769345, + "learning_rate": 1.774838674367719e-05, + "loss": 0.4188, + "step": 4435 + }, + { + "epoch": 0.8746056782334385, + "grad_norm": 0.5130271919066233, + "learning_rate": 1.7747406787694222e-05, + "loss": 0.4324, + "step": 4436 + }, + { + "epoch": 0.8748028391167192, + "grad_norm": 0.7695088300581262, + "learning_rate": 1.774642664557357e-05, + "loss": 0.4302, + "step": 4437 + }, + { + "epoch": 0.875, + "grad_norm": 0.5482718548743002, + "learning_rate": 1.7745446317338773e-05, + "loss": 0.4285, + "step": 4438 + }, + { + "epoch": 0.8751971608832808, + "grad_norm": 0.5602436646595499, + "learning_rate": 1.7744465803013394e-05, + "loss": 0.4519, + "step": 4439 + }, + { + "epoch": 0.8753943217665615, + "grad_norm": 0.6144891088977394, + "learning_rate": 1.774348510262099e-05, + "loss": 0.4275, + "step": 4440 + }, + { + "epoch": 0.8755914826498423, + "grad_norm": 0.56029801618344, + "learning_rate": 1.774250421618511e-05, + "loss": 0.3993, + "step": 4441 + }, + { + "epoch": 0.875788643533123, + "grad_norm": 0.5547908674306443, + "learning_rate": 1.7741523143729344e-05, + "loss": 0.4449, + "step": 4442 + }, + { + "epoch": 0.8759858044164038, + "grad_norm": 0.5349957311996212, + "learning_rate": 1.7740541885277243e-05, + "loss": 0.4173, + "step": 4443 + }, + { + "epoch": 0.8761829652996845, + "grad_norm": 0.5813115159456966, + "learning_rate": 1.773956044085239e-05, + "loss": 0.4598, + "step": 4444 + }, + { + "epoch": 0.8763801261829653, + "grad_norm": 0.5607295785609846, + "learning_rate": 1.773857881047837e-05, + "loss": 0.4563, + "step": 4445 + }, + { + "epoch": 0.876577287066246, + "grad_norm": 0.535106190739021, + "learning_rate": 1.773759699417876e-05, + "loss": 0.4743, + "step": 4446 + }, + { + "epoch": 0.8767744479495269, + "grad_norm": 0.5125347386594389, + "learning_rate": 1.773661499197715e-05, + "loss": 0.4162, + "step": 4447 + }, + { + "epoch": 0.8769716088328076, + "grad_norm": 0.49637878407034497, + "learning_rate": 1.7735632803897135e-05, + "loss": 0.4293, + "step": 4448 + }, + { + "epoch": 0.8771687697160884, + "grad_norm": 0.5116773726384485, + "learning_rate": 1.773465042996231e-05, + "loss": 0.449, + "step": 4449 + }, + { + "epoch": 0.8773659305993691, + "grad_norm": 0.5442212905587093, + "learning_rate": 1.7733667870196282e-05, + "loss": 0.433, + "step": 4450 + }, + { + "epoch": 0.8775630914826499, + "grad_norm": 0.5755384800113009, + "learning_rate": 1.7732685124622656e-05, + "loss": 0.4539, + "step": 4451 + }, + { + "epoch": 0.8777602523659306, + "grad_norm": 0.5372496581928871, + "learning_rate": 1.773170219326504e-05, + "loss": 0.436, + "step": 4452 + }, + { + "epoch": 0.8779574132492114, + "grad_norm": 0.5189362257725963, + "learning_rate": 1.7730719076147057e-05, + "loss": 0.4433, + "step": 4453 + }, + { + "epoch": 0.8781545741324921, + "grad_norm": 0.548560113980181, + "learning_rate": 1.7729735773292322e-05, + "loss": 0.4596, + "step": 4454 + }, + { + "epoch": 0.8783517350157729, + "grad_norm": 0.5494525538977173, + "learning_rate": 1.7728752284724454e-05, + "loss": 0.4452, + "step": 4455 + }, + { + "epoch": 0.8785488958990536, + "grad_norm": 0.5269179148603672, + "learning_rate": 1.7727768610467097e-05, + "loss": 0.4617, + "step": 4456 + }, + { + "epoch": 0.8787460567823344, + "grad_norm": 0.6517832670327971, + "learning_rate": 1.7726784750543867e-05, + "loss": 0.4377, + "step": 4457 + }, + { + "epoch": 0.8789432176656151, + "grad_norm": 0.5812600325676386, + "learning_rate": 1.7725800704978416e-05, + "loss": 0.4803, + "step": 4458 + }, + { + "epoch": 0.8791403785488959, + "grad_norm": 0.5376422098313386, + "learning_rate": 1.772481647379438e-05, + "loss": 0.4467, + "step": 4459 + }, + { + "epoch": 0.8793375394321766, + "grad_norm": 0.5404291511379831, + "learning_rate": 1.7723832057015413e-05, + "loss": 0.4662, + "step": 4460 + }, + { + "epoch": 0.8795347003154574, + "grad_norm": 0.6308719185145436, + "learning_rate": 1.7722847454665156e-05, + "loss": 0.456, + "step": 4461 + }, + { + "epoch": 0.8797318611987381, + "grad_norm": 0.5257173961466224, + "learning_rate": 1.772186266676727e-05, + "loss": 0.4427, + "step": 4462 + }, + { + "epoch": 0.879929022082019, + "grad_norm": 0.6053907583720101, + "learning_rate": 1.7720877693345414e-05, + "loss": 0.4045, + "step": 4463 + }, + { + "epoch": 0.8801261829652997, + "grad_norm": 0.6644310590594147, + "learning_rate": 1.7719892534423255e-05, + "loss": 0.4652, + "step": 4464 + }, + { + "epoch": 0.8803233438485805, + "grad_norm": 0.5704624224624693, + "learning_rate": 1.7718907190024462e-05, + "loss": 0.4775, + "step": 4465 + }, + { + "epoch": 0.8805205047318612, + "grad_norm": 0.5410836564451232, + "learning_rate": 1.7717921660172708e-05, + "loss": 0.4499, + "step": 4466 + }, + { + "epoch": 0.880717665615142, + "grad_norm": 0.5228027698546309, + "learning_rate": 1.771693594489167e-05, + "loss": 0.4313, + "step": 4467 + }, + { + "epoch": 0.8809148264984227, + "grad_norm": 0.5265433161743185, + "learning_rate": 1.771595004420503e-05, + "loss": 0.404, + "step": 4468 + }, + { + "epoch": 0.8811119873817035, + "grad_norm": 0.5812483523416486, + "learning_rate": 1.771496395813648e-05, + "loss": 0.4359, + "step": 4469 + }, + { + "epoch": 0.8813091482649842, + "grad_norm": 0.5446967314373959, + "learning_rate": 1.7713977686709706e-05, + "loss": 0.4476, + "step": 4470 + }, + { + "epoch": 0.881506309148265, + "grad_norm": 0.5031631759781324, + "learning_rate": 1.7712991229948405e-05, + "loss": 0.3942, + "step": 4471 + }, + { + "epoch": 0.8817034700315457, + "grad_norm": 0.5115052245818232, + "learning_rate": 1.7712004587876278e-05, + "loss": 0.4236, + "step": 4472 + }, + { + "epoch": 0.8819006309148265, + "grad_norm": 0.5212693828848795, + "learning_rate": 1.7711017760517033e-05, + "loss": 0.4075, + "step": 4473 + }, + { + "epoch": 0.8820977917981072, + "grad_norm": 0.5593095083849272, + "learning_rate": 1.7710030747894375e-05, + "loss": 0.4372, + "step": 4474 + }, + { + "epoch": 0.882294952681388, + "grad_norm": 0.5103075891986539, + "learning_rate": 1.770904355003202e-05, + "loss": 0.424, + "step": 4475 + }, + { + "epoch": 0.8824921135646687, + "grad_norm": 0.5391202190499896, + "learning_rate": 1.7708056166953684e-05, + "loss": 0.4861, + "step": 4476 + }, + { + "epoch": 0.8826892744479495, + "grad_norm": 0.5138735176424362, + "learning_rate": 1.7707068598683095e-05, + "loss": 0.4461, + "step": 4477 + }, + { + "epoch": 0.8828864353312302, + "grad_norm": 0.5261969184630167, + "learning_rate": 1.7706080845243975e-05, + "loss": 0.4413, + "step": 4478 + }, + { + "epoch": 0.883083596214511, + "grad_norm": 0.5319173341858092, + "learning_rate": 1.7705092906660054e-05, + "loss": 0.4574, + "step": 4479 + }, + { + "epoch": 0.8832807570977917, + "grad_norm": 0.5410201601688506, + "learning_rate": 1.7704104782955074e-05, + "loss": 0.4396, + "step": 4480 + }, + { + "epoch": 0.8834779179810726, + "grad_norm": 0.5346889247807762, + "learning_rate": 1.770311647415277e-05, + "loss": 0.4465, + "step": 4481 + }, + { + "epoch": 0.8836750788643533, + "grad_norm": 0.5216713453775305, + "learning_rate": 1.7702127980276893e-05, + "loss": 0.439, + "step": 4482 + }, + { + "epoch": 0.8838722397476341, + "grad_norm": 0.5794595199940287, + "learning_rate": 1.7701139301351187e-05, + "loss": 0.4277, + "step": 4483 + }, + { + "epoch": 0.8840694006309149, + "grad_norm": 0.5173700020686466, + "learning_rate": 1.7700150437399405e-05, + "loss": 0.4404, + "step": 4484 + }, + { + "epoch": 0.8842665615141956, + "grad_norm": 0.7338285688890706, + "learning_rate": 1.7699161388445313e-05, + "loss": 0.4936, + "step": 4485 + }, + { + "epoch": 0.8844637223974764, + "grad_norm": 0.49830492751234945, + "learning_rate": 1.7698172154512666e-05, + "loss": 0.4316, + "step": 4486 + }, + { + "epoch": 0.8846608832807571, + "grad_norm": 0.5574600419347723, + "learning_rate": 1.7697182735625233e-05, + "loss": 0.448, + "step": 4487 + }, + { + "epoch": 0.8848580441640379, + "grad_norm": 0.5158020909340794, + "learning_rate": 1.7696193131806786e-05, + "loss": 0.4157, + "step": 4488 + }, + { + "epoch": 0.8850552050473186, + "grad_norm": 0.5156523644377932, + "learning_rate": 1.76952033430811e-05, + "loss": 0.4277, + "step": 4489 + }, + { + "epoch": 0.8852523659305994, + "grad_norm": 0.4951200681741904, + "learning_rate": 1.769421336947196e-05, + "loss": 0.41, + "step": 4490 + }, + { + "epoch": 0.8854495268138801, + "grad_norm": 0.5210924234785088, + "learning_rate": 1.769322321100315e-05, + "loss": 0.4321, + "step": 4491 + }, + { + "epoch": 0.8856466876971609, + "grad_norm": 0.5546404645348809, + "learning_rate": 1.769223286769845e-05, + "loss": 0.46, + "step": 4492 + }, + { + "epoch": 0.8858438485804416, + "grad_norm": 0.5116058758291511, + "learning_rate": 1.7691242339581664e-05, + "loss": 0.4255, + "step": 4493 + }, + { + "epoch": 0.8860410094637224, + "grad_norm": 0.5259991490113677, + "learning_rate": 1.769025162667659e-05, + "loss": 0.4601, + "step": 4494 + }, + { + "epoch": 0.8862381703470031, + "grad_norm": 0.6096603798487029, + "learning_rate": 1.7689260729007025e-05, + "loss": 0.4446, + "step": 4495 + }, + { + "epoch": 0.886435331230284, + "grad_norm": 0.5502457929004166, + "learning_rate": 1.768826964659678e-05, + "loss": 0.4437, + "step": 4496 + }, + { + "epoch": 0.8866324921135647, + "grad_norm": 0.4955351951661073, + "learning_rate": 1.7687278379469665e-05, + "loss": 0.4367, + "step": 4497 + }, + { + "epoch": 0.8868296529968455, + "grad_norm": 0.48794238779718313, + "learning_rate": 1.7686286927649493e-05, + "loss": 0.3898, + "step": 4498 + }, + { + "epoch": 0.8870268138801262, + "grad_norm": 0.5403525717354225, + "learning_rate": 1.768529529116009e-05, + "loss": 0.4431, + "step": 4499 + }, + { + "epoch": 0.887223974763407, + "grad_norm": 0.5376973399491151, + "learning_rate": 1.768430347002528e-05, + "loss": 0.444, + "step": 4500 + }, + { + "epoch": 0.8874211356466877, + "grad_norm": 0.5272976538103022, + "learning_rate": 1.768331146426889e-05, + "loss": 0.4236, + "step": 4501 + }, + { + "epoch": 0.8876182965299685, + "grad_norm": 0.5344980125641151, + "learning_rate": 1.7682319273914755e-05, + "loss": 0.4426, + "step": 4502 + }, + { + "epoch": 0.8878154574132492, + "grad_norm": 0.49953537905007334, + "learning_rate": 1.7681326898986713e-05, + "loss": 0.4209, + "step": 4503 + }, + { + "epoch": 0.88801261829653, + "grad_norm": 0.5371135502177788, + "learning_rate": 1.7680334339508604e-05, + "loss": 0.4334, + "step": 4504 + }, + { + "epoch": 0.8882097791798107, + "grad_norm": 0.5315742083699853, + "learning_rate": 1.767934159550428e-05, + "loss": 0.4259, + "step": 4505 + }, + { + "epoch": 0.8884069400630915, + "grad_norm": 0.5965034470720473, + "learning_rate": 1.767834866699759e-05, + "loss": 0.4426, + "step": 4506 + }, + { + "epoch": 0.8886041009463722, + "grad_norm": 0.5836671704413614, + "learning_rate": 1.767735555401239e-05, + "loss": 0.4287, + "step": 4507 + }, + { + "epoch": 0.888801261829653, + "grad_norm": 0.5625765431157921, + "learning_rate": 1.767636225657254e-05, + "loss": 0.4459, + "step": 4508 + }, + { + "epoch": 0.8889984227129337, + "grad_norm": 0.7370740412563955, + "learning_rate": 1.7675368774701906e-05, + "loss": 0.426, + "step": 4509 + }, + { + "epoch": 0.8891955835962145, + "grad_norm": 0.5477180104542236, + "learning_rate": 1.7674375108424354e-05, + "loss": 0.4451, + "step": 4510 + }, + { + "epoch": 0.8893927444794952, + "grad_norm": 0.5330930261842161, + "learning_rate": 1.767338125776376e-05, + "loss": 0.3926, + "step": 4511 + }, + { + "epoch": 0.889589905362776, + "grad_norm": 0.5310418495745474, + "learning_rate": 1.7672387222744e-05, + "loss": 0.4434, + "step": 4512 + }, + { + "epoch": 0.8897870662460567, + "grad_norm": 0.5341421770335281, + "learning_rate": 1.7671393003388964e-05, + "loss": 0.4476, + "step": 4513 + }, + { + "epoch": 0.8899842271293376, + "grad_norm": 0.50982098147757, + "learning_rate": 1.7670398599722533e-05, + "loss": 0.4392, + "step": 4514 + }, + { + "epoch": 0.8901813880126183, + "grad_norm": 0.6050964967523266, + "learning_rate": 1.7669404011768596e-05, + "loss": 0.466, + "step": 4515 + }, + { + "epoch": 0.8903785488958991, + "grad_norm": 0.5082170155704829, + "learning_rate": 1.766840923955105e-05, + "loss": 0.3896, + "step": 4516 + }, + { + "epoch": 0.8905757097791798, + "grad_norm": 0.542583849270664, + "learning_rate": 1.76674142830938e-05, + "loss": 0.4499, + "step": 4517 + }, + { + "epoch": 0.8907728706624606, + "grad_norm": 0.549649515755227, + "learning_rate": 1.7666419142420746e-05, + "loss": 0.4205, + "step": 4518 + }, + { + "epoch": 0.8909700315457413, + "grad_norm": 0.5575951769960029, + "learning_rate": 1.76654238175558e-05, + "loss": 0.4465, + "step": 4519 + }, + { + "epoch": 0.8911671924290221, + "grad_norm": 0.5453782064529624, + "learning_rate": 1.766442830852287e-05, + "loss": 0.4769, + "step": 4520 + }, + { + "epoch": 0.8913643533123028, + "grad_norm": 0.5101131437540427, + "learning_rate": 1.766343261534588e-05, + "loss": 0.4017, + "step": 4521 + }, + { + "epoch": 0.8915615141955836, + "grad_norm": 0.6136158485523702, + "learning_rate": 1.766243673804875e-05, + "loss": 0.4098, + "step": 4522 + }, + { + "epoch": 0.8917586750788643, + "grad_norm": 0.5292957632786092, + "learning_rate": 1.7661440676655407e-05, + "loss": 0.4223, + "step": 4523 + }, + { + "epoch": 0.8919558359621451, + "grad_norm": 0.5540759577823543, + "learning_rate": 1.766044443118978e-05, + "loss": 0.4593, + "step": 4524 + }, + { + "epoch": 0.8921529968454258, + "grad_norm": 0.5392743345482487, + "learning_rate": 1.765944800167581e-05, + "loss": 0.4235, + "step": 4525 + }, + { + "epoch": 0.8923501577287066, + "grad_norm": 0.6346400393371832, + "learning_rate": 1.7658451388137432e-05, + "loss": 0.4231, + "step": 4526 + }, + { + "epoch": 0.8925473186119873, + "grad_norm": 0.5201806359082028, + "learning_rate": 1.7657454590598594e-05, + "loss": 0.4375, + "step": 4527 + }, + { + "epoch": 0.8927444794952681, + "grad_norm": 0.5175630311254862, + "learning_rate": 1.765645760908324e-05, + "loss": 0.4408, + "step": 4528 + }, + { + "epoch": 0.892941640378549, + "grad_norm": 0.5592078904199146, + "learning_rate": 1.7655460443615327e-05, + "loss": 0.4714, + "step": 4529 + }, + { + "epoch": 0.8931388012618297, + "grad_norm": 0.595148154874751, + "learning_rate": 1.7654463094218813e-05, + "loss": 0.4265, + "step": 4530 + }, + { + "epoch": 0.8933359621451105, + "grad_norm": 0.5367589779182992, + "learning_rate": 1.7653465560917656e-05, + "loss": 0.4664, + "step": 4531 + }, + { + "epoch": 0.8935331230283912, + "grad_norm": 0.5404275654069878, + "learning_rate": 1.7652467843735828e-05, + "loss": 0.4304, + "step": 4532 + }, + { + "epoch": 0.893730283911672, + "grad_norm": 0.5426667005563901, + "learning_rate": 1.7651469942697296e-05, + "loss": 0.4187, + "step": 4533 + }, + { + "epoch": 0.8939274447949527, + "grad_norm": 0.5355183086191119, + "learning_rate": 1.7650471857826038e-05, + "loss": 0.42, + "step": 4534 + }, + { + "epoch": 0.8941246056782335, + "grad_norm": 0.5434987852881384, + "learning_rate": 1.7649473589146032e-05, + "loss": 0.4129, + "step": 4535 + }, + { + "epoch": 0.8943217665615142, + "grad_norm": 0.49571415053957635, + "learning_rate": 1.7648475136681265e-05, + "loss": 0.4332, + "step": 4536 + }, + { + "epoch": 0.894518927444795, + "grad_norm": 0.545085011797932, + "learning_rate": 1.7647476500455723e-05, + "loss": 0.4722, + "step": 4537 + }, + { + "epoch": 0.8947160883280757, + "grad_norm": 0.5458717696446517, + "learning_rate": 1.76464776804934e-05, + "loss": 0.4437, + "step": 4538 + }, + { + "epoch": 0.8949132492113565, + "grad_norm": 0.5198965233821335, + "learning_rate": 1.764547867681829e-05, + "loss": 0.4335, + "step": 4539 + }, + { + "epoch": 0.8951104100946372, + "grad_norm": 0.5098927604754595, + "learning_rate": 1.76444794894544e-05, + "loss": 0.4281, + "step": 4540 + }, + { + "epoch": 0.895307570977918, + "grad_norm": 0.5476572140939739, + "learning_rate": 1.7643480118425733e-05, + "loss": 0.4575, + "step": 4541 + }, + { + "epoch": 0.8955047318611987, + "grad_norm": 0.5506996953424919, + "learning_rate": 1.7642480563756305e-05, + "loss": 0.4308, + "step": 4542 + }, + { + "epoch": 0.8957018927444795, + "grad_norm": 0.5418834197677749, + "learning_rate": 1.7641480825470123e-05, + "loss": 0.3992, + "step": 4543 + }, + { + "epoch": 0.8958990536277602, + "grad_norm": 0.5330741568842563, + "learning_rate": 1.764048090359121e-05, + "loss": 0.4314, + "step": 4544 + }, + { + "epoch": 0.896096214511041, + "grad_norm": 0.5364576977743398, + "learning_rate": 1.7639480798143593e-05, + "loss": 0.4311, + "step": 4545 + }, + { + "epoch": 0.8962933753943217, + "grad_norm": 0.5244339795995854, + "learning_rate": 1.7638480509151297e-05, + "loss": 0.4576, + "step": 4546 + }, + { + "epoch": 0.8964905362776026, + "grad_norm": 0.563203046041122, + "learning_rate": 1.7637480036638356e-05, + "loss": 0.4828, + "step": 4547 + }, + { + "epoch": 0.8966876971608833, + "grad_norm": 0.5070783658444626, + "learning_rate": 1.7636479380628806e-05, + "loss": 0.4226, + "step": 4548 + }, + { + "epoch": 0.8968848580441641, + "grad_norm": 0.5595950137316498, + "learning_rate": 1.7635478541146687e-05, + "loss": 0.459, + "step": 4549 + }, + { + "epoch": 0.8970820189274448, + "grad_norm": 0.5564160601048573, + "learning_rate": 1.763447751821605e-05, + "loss": 0.4069, + "step": 4550 + }, + { + "epoch": 0.8972791798107256, + "grad_norm": 0.5397157852142757, + "learning_rate": 1.763347631186094e-05, + "loss": 0.4336, + "step": 4551 + }, + { + "epoch": 0.8974763406940063, + "grad_norm": 0.5028947071543393, + "learning_rate": 1.7632474922105416e-05, + "loss": 0.4111, + "step": 4552 + }, + { + "epoch": 0.8976735015772871, + "grad_norm": 0.6653704389356556, + "learning_rate": 1.7631473348973537e-05, + "loss": 0.4817, + "step": 4553 + }, + { + "epoch": 0.8978706624605678, + "grad_norm": 0.6954841719181198, + "learning_rate": 1.7630471592489366e-05, + "loss": 0.4422, + "step": 4554 + }, + { + "epoch": 0.8980678233438486, + "grad_norm": 0.5904197333916205, + "learning_rate": 1.7629469652676965e-05, + "loss": 0.4538, + "step": 4555 + }, + { + "epoch": 0.8982649842271293, + "grad_norm": 1.260867911052856, + "learning_rate": 1.7628467529560417e-05, + "loss": 0.4514, + "step": 4556 + }, + { + "epoch": 0.8984621451104101, + "grad_norm": 1.0289033380566197, + "learning_rate": 1.762746522316379e-05, + "loss": 0.4128, + "step": 4557 + }, + { + "epoch": 0.8986593059936908, + "grad_norm": 0.5244592891802363, + "learning_rate": 1.762646273351117e-05, + "loss": 0.4338, + "step": 4558 + }, + { + "epoch": 0.8988564668769716, + "grad_norm": 0.5044945072717523, + "learning_rate": 1.7625460060626644e-05, + "loss": 0.4117, + "step": 4559 + }, + { + "epoch": 0.8990536277602523, + "grad_norm": 0.7264810983583718, + "learning_rate": 1.7624457204534292e-05, + "loss": 0.4583, + "step": 4560 + }, + { + "epoch": 0.8992507886435331, + "grad_norm": 0.5588409715862622, + "learning_rate": 1.762345416525822e-05, + "loss": 0.4361, + "step": 4561 + }, + { + "epoch": 0.8994479495268138, + "grad_norm": 0.4900029305965899, + "learning_rate": 1.7622450942822524e-05, + "loss": 0.4037, + "step": 4562 + }, + { + "epoch": 0.8996451104100947, + "grad_norm": 0.6753821625040557, + "learning_rate": 1.7621447537251307e-05, + "loss": 0.4286, + "step": 4563 + }, + { + "epoch": 0.8998422712933754, + "grad_norm": 0.5928729012070617, + "learning_rate": 1.762044394856867e-05, + "loss": 0.4391, + "step": 4564 + }, + { + "epoch": 0.9000394321766562, + "grad_norm": 0.5403084220268555, + "learning_rate": 1.7619440176798733e-05, + "loss": 0.4305, + "step": 4565 + }, + { + "epoch": 0.9002365930599369, + "grad_norm": 0.5023737487603344, + "learning_rate": 1.761843622196561e-05, + "loss": 0.432, + "step": 4566 + }, + { + "epoch": 0.9004337539432177, + "grad_norm": 0.8214257640360723, + "learning_rate": 1.7617432084093424e-05, + "loss": 0.4732, + "step": 4567 + }, + { + "epoch": 0.9006309148264984, + "grad_norm": 0.6157040111093859, + "learning_rate": 1.7616427763206294e-05, + "loss": 0.4106, + "step": 4568 + }, + { + "epoch": 0.9008280757097792, + "grad_norm": 0.6386520150246892, + "learning_rate": 1.7615423259328356e-05, + "loss": 0.4317, + "step": 4569 + }, + { + "epoch": 0.9010252365930599, + "grad_norm": 0.5233550893573012, + "learning_rate": 1.761441857248374e-05, + "loss": 0.428, + "step": 4570 + }, + { + "epoch": 0.9012223974763407, + "grad_norm": 0.5816027891775581, + "learning_rate": 1.7613413702696584e-05, + "loss": 0.4624, + "step": 4571 + }, + { + "epoch": 0.9014195583596214, + "grad_norm": 0.5226310374487503, + "learning_rate": 1.7612408649991037e-05, + "loss": 0.4252, + "step": 4572 + }, + { + "epoch": 0.9016167192429022, + "grad_norm": 0.7039706825623913, + "learning_rate": 1.761140341439124e-05, + "loss": 0.4465, + "step": 4573 + }, + { + "epoch": 0.901813880126183, + "grad_norm": 0.6335577419820779, + "learning_rate": 1.7610397995921348e-05, + "loss": 0.4193, + "step": 4574 + }, + { + "epoch": 0.9020110410094637, + "grad_norm": 0.5647597040461998, + "learning_rate": 1.760939239460551e-05, + "loss": 0.4354, + "step": 4575 + }, + { + "epoch": 0.9022082018927445, + "grad_norm": 0.5628152685174825, + "learning_rate": 1.7608386610467898e-05, + "loss": 0.4757, + "step": 4576 + }, + { + "epoch": 0.9024053627760252, + "grad_norm": 3.2929032363930917, + "learning_rate": 1.7607380643532667e-05, + "loss": 0.4282, + "step": 4577 + }, + { + "epoch": 0.902602523659306, + "grad_norm": 0.7504633569369734, + "learning_rate": 1.7606374493823993e-05, + "loss": 0.4495, + "step": 4578 + }, + { + "epoch": 0.9027996845425867, + "grad_norm": 0.5656614412611999, + "learning_rate": 1.7605368161366043e-05, + "loss": 0.4242, + "step": 4579 + }, + { + "epoch": 0.9029968454258676, + "grad_norm": 0.5437572557240907, + "learning_rate": 1.7604361646183004e-05, + "loss": 0.4365, + "step": 4580 + }, + { + "epoch": 0.9031940063091483, + "grad_norm": 0.5704886873684303, + "learning_rate": 1.760335494829905e-05, + "loss": 0.4378, + "step": 4581 + }, + { + "epoch": 0.9033911671924291, + "grad_norm": 0.5670944223065693, + "learning_rate": 1.7602348067738367e-05, + "loss": 0.4222, + "step": 4582 + }, + { + "epoch": 0.9035883280757098, + "grad_norm": 0.571465040387628, + "learning_rate": 1.760134100452515e-05, + "loss": 0.4459, + "step": 4583 + }, + { + "epoch": 0.9037854889589906, + "grad_norm": 0.5411774398722019, + "learning_rate": 1.7600333758683598e-05, + "loss": 0.4373, + "step": 4584 + }, + { + "epoch": 0.9039826498422713, + "grad_norm": 0.5164990496084579, + "learning_rate": 1.7599326330237906e-05, + "loss": 0.3954, + "step": 4585 + }, + { + "epoch": 0.9041798107255521, + "grad_norm": 0.5954434300630094, + "learning_rate": 1.7598318719212274e-05, + "loss": 0.4541, + "step": 4586 + }, + { + "epoch": 0.9043769716088328, + "grad_norm": 0.5629588725994905, + "learning_rate": 1.7597310925630922e-05, + "loss": 0.4696, + "step": 4587 + }, + { + "epoch": 0.9045741324921136, + "grad_norm": 1.0533510364008871, + "learning_rate": 1.7596302949518054e-05, + "loss": 0.4087, + "step": 4588 + }, + { + "epoch": 0.9047712933753943, + "grad_norm": 0.5848783608010563, + "learning_rate": 1.759529479089789e-05, + "loss": 0.4562, + "step": 4589 + }, + { + "epoch": 0.9049684542586751, + "grad_norm": 0.5359425519217931, + "learning_rate": 1.7594286449794655e-05, + "loss": 0.4139, + "step": 4590 + }, + { + "epoch": 0.9051656151419558, + "grad_norm": 0.5766734785111527, + "learning_rate": 1.759327792623257e-05, + "loss": 0.4232, + "step": 4591 + }, + { + "epoch": 0.9053627760252366, + "grad_norm": 0.5120340477249785, + "learning_rate": 1.759226922023587e-05, + "loss": 0.4266, + "step": 4592 + }, + { + "epoch": 0.9055599369085173, + "grad_norm": 0.5398891426274068, + "learning_rate": 1.7591260331828785e-05, + "loss": 0.4179, + "step": 4593 + }, + { + "epoch": 0.9057570977917981, + "grad_norm": 0.5184539964235813, + "learning_rate": 1.759025126103556e-05, + "loss": 0.4125, + "step": 4594 + }, + { + "epoch": 0.9059542586750788, + "grad_norm": 0.6029467871180186, + "learning_rate": 1.7589242007880435e-05, + "loss": 0.4991, + "step": 4595 + }, + { + "epoch": 0.9061514195583596, + "grad_norm": 0.5757959829050494, + "learning_rate": 1.7588232572387657e-05, + "loss": 0.4294, + "step": 4596 + }, + { + "epoch": 0.9063485804416404, + "grad_norm": 0.5612525679026711, + "learning_rate": 1.7587222954581483e-05, + "loss": 0.4779, + "step": 4597 + }, + { + "epoch": 0.9065457413249212, + "grad_norm": 0.5047317331482842, + "learning_rate": 1.758621315448617e-05, + "loss": 0.3938, + "step": 4598 + }, + { + "epoch": 0.9067429022082019, + "grad_norm": 0.5667514613061159, + "learning_rate": 1.7585203172125972e-05, + "loss": 0.4599, + "step": 4599 + }, + { + "epoch": 0.9069400630914827, + "grad_norm": 0.5293002056349737, + "learning_rate": 1.758419300752516e-05, + "loss": 0.4219, + "step": 4600 + }, + { + "epoch": 0.9071372239747634, + "grad_norm": 0.5080066799539775, + "learning_rate": 1.758318266070801e-05, + "loss": 0.3933, + "step": 4601 + }, + { + "epoch": 0.9073343848580442, + "grad_norm": 0.5576817270172586, + "learning_rate": 1.758217213169878e-05, + "loss": 0.4389, + "step": 4602 + }, + { + "epoch": 0.9075315457413249, + "grad_norm": 0.5264486271047948, + "learning_rate": 1.7581161420521765e-05, + "loss": 0.4522, + "step": 4603 + }, + { + "epoch": 0.9077287066246057, + "grad_norm": 0.5894471822557419, + "learning_rate": 1.758015052720124e-05, + "loss": 0.4453, + "step": 4604 + }, + { + "epoch": 0.9079258675078864, + "grad_norm": 0.5534225411257014, + "learning_rate": 1.7579139451761495e-05, + "loss": 0.4532, + "step": 4605 + }, + { + "epoch": 0.9081230283911672, + "grad_norm": 0.5369235116741078, + "learning_rate": 1.7578128194226823e-05, + "loss": 0.4251, + "step": 4606 + }, + { + "epoch": 0.9083201892744479, + "grad_norm": 0.4830071277784729, + "learning_rate": 1.7577116754621512e-05, + "loss": 0.4189, + "step": 4607 + }, + { + "epoch": 0.9085173501577287, + "grad_norm": 0.7383285750265494, + "learning_rate": 1.7576105132969874e-05, + "loss": 0.4759, + "step": 4608 + }, + { + "epoch": 0.9087145110410094, + "grad_norm": 0.5395821153111766, + "learning_rate": 1.757509332929621e-05, + "loss": 0.4595, + "step": 4609 + }, + { + "epoch": 0.9089116719242902, + "grad_norm": 0.5971105354676437, + "learning_rate": 1.7574081343624827e-05, + "loss": 0.4501, + "step": 4610 + }, + { + "epoch": 0.9091088328075709, + "grad_norm": 0.5322422724423118, + "learning_rate": 1.757306917598004e-05, + "loss": 0.4432, + "step": 4611 + }, + { + "epoch": 0.9093059936908517, + "grad_norm": 0.6382374433003123, + "learning_rate": 1.7572056826386167e-05, + "loss": 0.4918, + "step": 4612 + }, + { + "epoch": 0.9095031545741324, + "grad_norm": 1.0874753951271274, + "learning_rate": 1.7571044294867533e-05, + "loss": 0.4519, + "step": 4613 + }, + { + "epoch": 0.9097003154574133, + "grad_norm": 1.0844704783301853, + "learning_rate": 1.757003158144846e-05, + "loss": 0.4352, + "step": 4614 + }, + { + "epoch": 0.909897476340694, + "grad_norm": 0.522978838373464, + "learning_rate": 1.7569018686153286e-05, + "loss": 0.4233, + "step": 4615 + }, + { + "epoch": 0.9100946372239748, + "grad_norm": 0.5933248972486103, + "learning_rate": 1.756800560900634e-05, + "loss": 0.4419, + "step": 4616 + }, + { + "epoch": 0.9102917981072555, + "grad_norm": 1.216489202161031, + "learning_rate": 1.7566992350031965e-05, + "loss": 0.4741, + "step": 4617 + }, + { + "epoch": 0.9104889589905363, + "grad_norm": 0.5819684520560912, + "learning_rate": 1.7565978909254508e-05, + "loss": 0.4278, + "step": 4618 + }, + { + "epoch": 0.910686119873817, + "grad_norm": 0.580755399529423, + "learning_rate": 1.756496528669831e-05, + "loss": 0.4708, + "step": 4619 + }, + { + "epoch": 0.9108832807570978, + "grad_norm": 0.5363608983821153, + "learning_rate": 1.7563951482387733e-05, + "loss": 0.4008, + "step": 4620 + }, + { + "epoch": 0.9110804416403786, + "grad_norm": 0.5949632674163584, + "learning_rate": 1.7562937496347126e-05, + "loss": 0.4663, + "step": 4621 + }, + { + "epoch": 0.9112776025236593, + "grad_norm": 0.7340546915226606, + "learning_rate": 1.756192332860086e-05, + "loss": 0.4453, + "step": 4622 + }, + { + "epoch": 0.9114747634069401, + "grad_norm": 0.5539458382888915, + "learning_rate": 1.7560908979173294e-05, + "loss": 0.4457, + "step": 4623 + }, + { + "epoch": 0.9116719242902208, + "grad_norm": 0.5614114939363166, + "learning_rate": 1.7559894448088802e-05, + "loss": 0.4023, + "step": 4624 + }, + { + "epoch": 0.9118690851735016, + "grad_norm": 0.5710583292679061, + "learning_rate": 1.7558879735371753e-05, + "loss": 0.4688, + "step": 4625 + }, + { + "epoch": 0.9120662460567823, + "grad_norm": 0.5550240437903982, + "learning_rate": 1.755786484104654e-05, + "loss": 0.3906, + "step": 4626 + }, + { + "epoch": 0.9122634069400631, + "grad_norm": 0.4837275585777602, + "learning_rate": 1.755684976513753e-05, + "loss": 0.3919, + "step": 4627 + }, + { + "epoch": 0.9124605678233438, + "grad_norm": 0.5844614371573156, + "learning_rate": 1.7555834507669124e-05, + "loss": 0.4086, + "step": 4628 + }, + { + "epoch": 0.9126577287066246, + "grad_norm": 0.5180930399677969, + "learning_rate": 1.7554819068665707e-05, + "loss": 0.4039, + "step": 4629 + }, + { + "epoch": 0.9128548895899053, + "grad_norm": 0.5819268556226904, + "learning_rate": 1.7553803448151678e-05, + "loss": 0.4102, + "step": 4630 + }, + { + "epoch": 0.9130520504731862, + "grad_norm": 0.5158313726358502, + "learning_rate": 1.755278764615144e-05, + "loss": 0.4266, + "step": 4631 + }, + { + "epoch": 0.9132492113564669, + "grad_norm": 0.5930149183798764, + "learning_rate": 1.7551771662689393e-05, + "loss": 0.4468, + "step": 4632 + }, + { + "epoch": 0.9134463722397477, + "grad_norm": 0.5517958419973356, + "learning_rate": 1.7550755497789955e-05, + "loss": 0.4381, + "step": 4633 + }, + { + "epoch": 0.9136435331230284, + "grad_norm": 0.6790482699026756, + "learning_rate": 1.754973915147753e-05, + "loss": 0.4301, + "step": 4634 + }, + { + "epoch": 0.9138406940063092, + "grad_norm": 0.5451328047356113, + "learning_rate": 1.7548722623776547e-05, + "loss": 0.455, + "step": 4635 + }, + { + "epoch": 0.9140378548895899, + "grad_norm": 0.5153803270688035, + "learning_rate": 1.754770591471142e-05, + "loss": 0.4178, + "step": 4636 + }, + { + "epoch": 0.9142350157728707, + "grad_norm": 0.5355618034950106, + "learning_rate": 1.7546689024306585e-05, + "loss": 0.4328, + "step": 4637 + }, + { + "epoch": 0.9144321766561514, + "grad_norm": 0.5793723146542517, + "learning_rate": 1.7545671952586464e-05, + "loss": 0.4498, + "step": 4638 + }, + { + "epoch": 0.9146293375394322, + "grad_norm": 0.49690302528171043, + "learning_rate": 1.75446546995755e-05, + "loss": 0.4184, + "step": 4639 + }, + { + "epoch": 0.9148264984227129, + "grad_norm": 0.5093059262044622, + "learning_rate": 1.7543637265298136e-05, + "loss": 0.3955, + "step": 4640 + }, + { + "epoch": 0.9150236593059937, + "grad_norm": 0.5592098323156065, + "learning_rate": 1.7542619649778804e-05, + "loss": 0.4376, + "step": 4641 + }, + { + "epoch": 0.9152208201892744, + "grad_norm": 0.5525412638322774, + "learning_rate": 1.7541601853041963e-05, + "loss": 0.4554, + "step": 4642 + }, + { + "epoch": 0.9154179810725552, + "grad_norm": 0.5262471397287505, + "learning_rate": 1.7540583875112065e-05, + "loss": 0.4337, + "step": 4643 + }, + { + "epoch": 0.9156151419558359, + "grad_norm": 0.5235757919665939, + "learning_rate": 1.753956571601357e-05, + "loss": 0.4335, + "step": 4644 + }, + { + "epoch": 0.9158123028391167, + "grad_norm": 0.5494919507343168, + "learning_rate": 1.7538547375770934e-05, + "loss": 0.4198, + "step": 4645 + }, + { + "epoch": 0.9160094637223974, + "grad_norm": 0.5155811541332505, + "learning_rate": 1.7537528854408625e-05, + "loss": 0.4006, + "step": 4646 + }, + { + "epoch": 0.9162066246056783, + "grad_norm": 0.619279355812408, + "learning_rate": 1.753651015195112e-05, + "loss": 0.4524, + "step": 4647 + }, + { + "epoch": 0.916403785488959, + "grad_norm": 0.5234709511842155, + "learning_rate": 1.7535491268422885e-05, + "loss": 0.4365, + "step": 4648 + }, + { + "epoch": 0.9166009463722398, + "grad_norm": 0.5760967298774721, + "learning_rate": 1.7534472203848402e-05, + "loss": 0.421, + "step": 4649 + }, + { + "epoch": 0.9167981072555205, + "grad_norm": 0.5744984868953806, + "learning_rate": 1.7533452958252164e-05, + "loss": 0.4393, + "step": 4650 + }, + { + "epoch": 0.9169952681388013, + "grad_norm": 0.5325399090464799, + "learning_rate": 1.7532433531658646e-05, + "loss": 0.4407, + "step": 4651 + }, + { + "epoch": 0.917192429022082, + "grad_norm": 0.5340699591886052, + "learning_rate": 1.7531413924092347e-05, + "loss": 0.4295, + "step": 4652 + }, + { + "epoch": 0.9173895899053628, + "grad_norm": 0.5494041102557055, + "learning_rate": 1.7530394135577768e-05, + "loss": 0.4489, + "step": 4653 + }, + { + "epoch": 0.9175867507886435, + "grad_norm": 0.5182867692860177, + "learning_rate": 1.75293741661394e-05, + "loss": 0.4204, + "step": 4654 + }, + { + "epoch": 0.9177839116719243, + "grad_norm": 0.5728288091111668, + "learning_rate": 1.752835401580176e-05, + "loss": 0.4238, + "step": 4655 + }, + { + "epoch": 0.917981072555205, + "grad_norm": 0.49589426702980033, + "learning_rate": 1.752733368458935e-05, + "loss": 0.4539, + "step": 4656 + }, + { + "epoch": 0.9181782334384858, + "grad_norm": 0.48458956489019156, + "learning_rate": 1.7526313172526687e-05, + "loss": 0.3936, + "step": 4657 + }, + { + "epoch": 0.9183753943217665, + "grad_norm": 0.5274678434361841, + "learning_rate": 1.7525292479638286e-05, + "loss": 0.4484, + "step": 4658 + }, + { + "epoch": 0.9185725552050473, + "grad_norm": 0.5211344047356368, + "learning_rate": 1.7524271605948676e-05, + "loss": 0.4461, + "step": 4659 + }, + { + "epoch": 0.918769716088328, + "grad_norm": 0.511296727944466, + "learning_rate": 1.752325055148238e-05, + "loss": 0.4185, + "step": 4660 + }, + { + "epoch": 0.9189668769716088, + "grad_norm": 0.5647095561867232, + "learning_rate": 1.752222931626393e-05, + "loss": 0.4473, + "step": 4661 + }, + { + "epoch": 0.9191640378548895, + "grad_norm": 0.513631356257729, + "learning_rate": 1.7521207900317866e-05, + "loss": 0.4403, + "step": 4662 + }, + { + "epoch": 0.9193611987381703, + "grad_norm": 0.5155278543678777, + "learning_rate": 1.7520186303668722e-05, + "loss": 0.4393, + "step": 4663 + }, + { + "epoch": 0.919558359621451, + "grad_norm": 0.5019934238237479, + "learning_rate": 1.751916452634105e-05, + "loss": 0.4006, + "step": 4664 + }, + { + "epoch": 0.9197555205047319, + "grad_norm": 0.5133536326390901, + "learning_rate": 1.7518142568359395e-05, + "loss": 0.4422, + "step": 4665 + }, + { + "epoch": 0.9199526813880127, + "grad_norm": 1.1425170395813518, + "learning_rate": 1.7517120429748305e-05, + "loss": 0.4254, + "step": 4666 + }, + { + "epoch": 0.9201498422712934, + "grad_norm": 0.5062808556522904, + "learning_rate": 1.751609811053235e-05, + "loss": 0.4093, + "step": 4667 + }, + { + "epoch": 0.9203470031545742, + "grad_norm": 0.5329727124070021, + "learning_rate": 1.7515075610736077e-05, + "loss": 0.445, + "step": 4668 + }, + { + "epoch": 0.9205441640378549, + "grad_norm": 0.5773802180358756, + "learning_rate": 1.751405293038407e-05, + "loss": 0.4348, + "step": 4669 + }, + { + "epoch": 0.9207413249211357, + "grad_norm": 0.5692775257801391, + "learning_rate": 1.7513030069500885e-05, + "loss": 0.4427, + "step": 4670 + }, + { + "epoch": 0.9209384858044164, + "grad_norm": 0.5036100073808859, + "learning_rate": 1.7512007028111103e-05, + "loss": 0.4376, + "step": 4671 + }, + { + "epoch": 0.9211356466876972, + "grad_norm": 0.5271798471163804, + "learning_rate": 1.75109838062393e-05, + "loss": 0.4504, + "step": 4672 + }, + { + "epoch": 0.9213328075709779, + "grad_norm": 0.5357814969525453, + "learning_rate": 1.750996040391007e-05, + "loss": 0.4528, + "step": 4673 + }, + { + "epoch": 0.9215299684542587, + "grad_norm": 0.5702199988441526, + "learning_rate": 1.7508936821147986e-05, + "loss": 0.453, + "step": 4674 + }, + { + "epoch": 0.9217271293375394, + "grad_norm": 0.5404783434059391, + "learning_rate": 1.750791305797765e-05, + "loss": 0.4315, + "step": 4675 + }, + { + "epoch": 0.9219242902208202, + "grad_norm": 0.5084713263493303, + "learning_rate": 1.7506889114423658e-05, + "loss": 0.4159, + "step": 4676 + }, + { + "epoch": 0.9221214511041009, + "grad_norm": 0.5201149594816759, + "learning_rate": 1.750586499051061e-05, + "loss": 0.4029, + "step": 4677 + }, + { + "epoch": 0.9223186119873817, + "grad_norm": 0.5436352106831215, + "learning_rate": 1.750484068626311e-05, + "loss": 0.4239, + "step": 4678 + }, + { + "epoch": 0.9225157728706624, + "grad_norm": 0.5231124883960365, + "learning_rate": 1.7503816201705772e-05, + "loss": 0.4557, + "step": 4679 + }, + { + "epoch": 0.9227129337539433, + "grad_norm": 0.5377568165510164, + "learning_rate": 1.75027915368632e-05, + "loss": 0.4332, + "step": 4680 + }, + { + "epoch": 0.922910094637224, + "grad_norm": 0.49556159503049685, + "learning_rate": 1.7501766691760027e-05, + "loss": 0.3759, + "step": 4681 + }, + { + "epoch": 0.9231072555205048, + "grad_norm": 0.5366737464738414, + "learning_rate": 1.7500741666420863e-05, + "loss": 0.4272, + "step": 4682 + }, + { + "epoch": 0.9233044164037855, + "grad_norm": 0.5333368284829383, + "learning_rate": 1.749971646087034e-05, + "loss": 0.4893, + "step": 4683 + }, + { + "epoch": 0.9235015772870663, + "grad_norm": 0.5191084944970333, + "learning_rate": 1.7498691075133094e-05, + "loss": 0.4537, + "step": 4684 + }, + { + "epoch": 0.923698738170347, + "grad_norm": 0.4874538317773819, + "learning_rate": 1.7497665509233753e-05, + "loss": 0.4073, + "step": 4685 + }, + { + "epoch": 0.9238958990536278, + "grad_norm": 0.5841148468262191, + "learning_rate": 1.7496639763196965e-05, + "loss": 0.4568, + "step": 4686 + }, + { + "epoch": 0.9240930599369085, + "grad_norm": 0.46497152647935325, + "learning_rate": 1.7495613837047362e-05, + "loss": 0.3768, + "step": 4687 + }, + { + "epoch": 0.9242902208201893, + "grad_norm": 0.6951617732456002, + "learning_rate": 1.7494587730809603e-05, + "loss": 0.4369, + "step": 4688 + }, + { + "epoch": 0.92448738170347, + "grad_norm": 0.5439225603081767, + "learning_rate": 1.749356144450834e-05, + "loss": 0.4204, + "step": 4689 + }, + { + "epoch": 0.9246845425867508, + "grad_norm": 0.5545139274444996, + "learning_rate": 1.749253497816823e-05, + "loss": 0.4533, + "step": 4690 + }, + { + "epoch": 0.9248817034700315, + "grad_norm": 0.5645877878868146, + "learning_rate": 1.7491508331813928e-05, + "loss": 0.4588, + "step": 4691 + }, + { + "epoch": 0.9250788643533123, + "grad_norm": 0.5001014226230943, + "learning_rate": 1.7490481505470112e-05, + "loss": 0.413, + "step": 4692 + }, + { + "epoch": 0.925276025236593, + "grad_norm": 0.5340731127970042, + "learning_rate": 1.748945449916144e-05, + "loss": 0.4355, + "step": 4693 + }, + { + "epoch": 0.9254731861198738, + "grad_norm": 0.4812452465843482, + "learning_rate": 1.7488427312912596e-05, + "loss": 0.4345, + "step": 4694 + }, + { + "epoch": 0.9256703470031545, + "grad_norm": 1.0890928667586521, + "learning_rate": 1.7487399946748253e-05, + "loss": 0.4734, + "step": 4695 + }, + { + "epoch": 0.9258675078864353, + "grad_norm": 0.5505478980835827, + "learning_rate": 1.74863724006931e-05, + "loss": 0.4808, + "step": 4696 + }, + { + "epoch": 0.926064668769716, + "grad_norm": 0.5008122123586376, + "learning_rate": 1.7485344674771817e-05, + "loss": 0.4225, + "step": 4697 + }, + { + "epoch": 0.9262618296529969, + "grad_norm": 0.6702157966102137, + "learning_rate": 1.7484316769009105e-05, + "loss": 0.4698, + "step": 4698 + }, + { + "epoch": 0.9264589905362776, + "grad_norm": 0.5272899289334974, + "learning_rate": 1.7483288683429655e-05, + "loss": 0.414, + "step": 4699 + }, + { + "epoch": 0.9266561514195584, + "grad_norm": 0.5858391308330164, + "learning_rate": 1.7482260418058167e-05, + "loss": 0.4288, + "step": 4700 + }, + { + "epoch": 0.9268533123028391, + "grad_norm": 0.5833034743596007, + "learning_rate": 1.7481231972919346e-05, + "loss": 0.4879, + "step": 4701 + }, + { + "epoch": 0.9270504731861199, + "grad_norm": 0.5320561242621449, + "learning_rate": 1.74802033480379e-05, + "loss": 0.4144, + "step": 4702 + }, + { + "epoch": 0.9272476340694006, + "grad_norm": 0.5387587973540497, + "learning_rate": 1.7479174543438547e-05, + "loss": 0.4487, + "step": 4703 + }, + { + "epoch": 0.9274447949526814, + "grad_norm": 0.5997202359893333, + "learning_rate": 1.7478145559146002e-05, + "loss": 0.4796, + "step": 4704 + }, + { + "epoch": 0.9276419558359621, + "grad_norm": 0.5857148110448572, + "learning_rate": 1.747711639518499e-05, + "loss": 0.4235, + "step": 4705 + }, + { + "epoch": 0.9278391167192429, + "grad_norm": 0.5798656081956509, + "learning_rate": 1.7476087051580235e-05, + "loss": 0.4518, + "step": 4706 + }, + { + "epoch": 0.9280362776025236, + "grad_norm": 0.5831794755793467, + "learning_rate": 1.7475057528356466e-05, + "loss": 0.4613, + "step": 4707 + }, + { + "epoch": 0.9282334384858044, + "grad_norm": 0.5068716865871977, + "learning_rate": 1.7474027825538422e-05, + "loss": 0.4025, + "step": 4708 + }, + { + "epoch": 0.9284305993690851, + "grad_norm": 0.5949150495584267, + "learning_rate": 1.747299794315084e-05, + "loss": 0.4116, + "step": 4709 + }, + { + "epoch": 0.9286277602523659, + "grad_norm": 0.5125219184067067, + "learning_rate": 1.7471967881218466e-05, + "loss": 0.4338, + "step": 4710 + }, + { + "epoch": 0.9288249211356467, + "grad_norm": 0.5270990266912229, + "learning_rate": 1.7470937639766042e-05, + "loss": 0.4209, + "step": 4711 + }, + { + "epoch": 0.9290220820189274, + "grad_norm": 0.5495969435777367, + "learning_rate": 1.746990721881833e-05, + "loss": 0.4161, + "step": 4712 + }, + { + "epoch": 0.9292192429022083, + "grad_norm": 0.519716074091138, + "learning_rate": 1.746887661840008e-05, + "loss": 0.4226, + "step": 4713 + }, + { + "epoch": 0.929416403785489, + "grad_norm": 0.5108308327628213, + "learning_rate": 1.7467845838536054e-05, + "loss": 0.4335, + "step": 4714 + }, + { + "epoch": 0.9296135646687698, + "grad_norm": 4.264832132353777, + "learning_rate": 1.746681487925102e-05, + "loss": 0.5218, + "step": 4715 + }, + { + "epoch": 0.9298107255520505, + "grad_norm": 0.6179886893181556, + "learning_rate": 1.746578374056974e-05, + "loss": 0.4544, + "step": 4716 + }, + { + "epoch": 0.9300078864353313, + "grad_norm": 0.5304906705707955, + "learning_rate": 1.7464752422516996e-05, + "loss": 0.4232, + "step": 4717 + }, + { + "epoch": 0.930205047318612, + "grad_norm": 0.5331325354477906, + "learning_rate": 1.7463720925117565e-05, + "loss": 0.3829, + "step": 4718 + }, + { + "epoch": 0.9304022082018928, + "grad_norm": 0.5765730607350507, + "learning_rate": 1.7462689248396228e-05, + "loss": 0.4317, + "step": 4719 + }, + { + "epoch": 0.9305993690851735, + "grad_norm": 0.5313090247910929, + "learning_rate": 1.7461657392377772e-05, + "loss": 0.4628, + "step": 4720 + }, + { + "epoch": 0.9307965299684543, + "grad_norm": 0.5604970128075755, + "learning_rate": 1.746062535708699e-05, + "loss": 0.4345, + "step": 4721 + }, + { + "epoch": 0.930993690851735, + "grad_norm": 0.6273618716282534, + "learning_rate": 1.7459593142548674e-05, + "loss": 0.4068, + "step": 4722 + }, + { + "epoch": 0.9311908517350158, + "grad_norm": 0.5829766636894154, + "learning_rate": 1.7458560748787625e-05, + "loss": 0.4577, + "step": 4723 + }, + { + "epoch": 0.9313880126182965, + "grad_norm": 0.6025168414566998, + "learning_rate": 1.7457528175828648e-05, + "loss": 0.4054, + "step": 4724 + }, + { + "epoch": 0.9315851735015773, + "grad_norm": 1.0019218230150604, + "learning_rate": 1.745649542369655e-05, + "loss": 0.4536, + "step": 4725 + }, + { + "epoch": 0.931782334384858, + "grad_norm": 0.5862918544220463, + "learning_rate": 1.7455462492416148e-05, + "loss": 0.4495, + "step": 4726 + }, + { + "epoch": 0.9319794952681388, + "grad_norm": 0.5461382110764923, + "learning_rate": 1.7454429382012255e-05, + "loss": 0.4391, + "step": 4727 + }, + { + "epoch": 0.9321766561514195, + "grad_norm": 0.5936092061621204, + "learning_rate": 1.745339609250969e-05, + "loss": 0.4482, + "step": 4728 + }, + { + "epoch": 0.9323738170347003, + "grad_norm": 0.5346698342600564, + "learning_rate": 1.7452362623933283e-05, + "loss": 0.4358, + "step": 4729 + }, + { + "epoch": 0.932570977917981, + "grad_norm": 0.5604545243460433, + "learning_rate": 1.7451328976307864e-05, + "loss": 0.4072, + "step": 4730 + }, + { + "epoch": 0.9327681388012619, + "grad_norm": 0.5624845340732466, + "learning_rate": 1.7450295149658265e-05, + "loss": 0.4933, + "step": 4731 + }, + { + "epoch": 0.9329652996845426, + "grad_norm": 0.5321758225204737, + "learning_rate": 1.7449261144009325e-05, + "loss": 0.4828, + "step": 4732 + }, + { + "epoch": 0.9331624605678234, + "grad_norm": 0.5442798225790718, + "learning_rate": 1.744822695938589e-05, + "loss": 0.4275, + "step": 4733 + }, + { + "epoch": 0.9333596214511041, + "grad_norm": 2.0994490161932124, + "learning_rate": 1.74471925958128e-05, + "loss": 0.4325, + "step": 4734 + }, + { + "epoch": 0.9335567823343849, + "grad_norm": 0.5447286623554133, + "learning_rate": 1.744615805331491e-05, + "loss": 0.4577, + "step": 4735 + }, + { + "epoch": 0.9337539432176656, + "grad_norm": 1.0389460335213794, + "learning_rate": 1.744512333191708e-05, + "loss": 0.4268, + "step": 4736 + }, + { + "epoch": 0.9339511041009464, + "grad_norm": 0.5585776839463066, + "learning_rate": 1.7444088431644166e-05, + "loss": 0.4436, + "step": 4737 + }, + { + "epoch": 0.9341482649842271, + "grad_norm": 0.5415927198264985, + "learning_rate": 1.7443053352521032e-05, + "loss": 0.4297, + "step": 4738 + }, + { + "epoch": 0.9343454258675079, + "grad_norm": 0.7020791964781086, + "learning_rate": 1.7442018094572546e-05, + "loss": 0.3968, + "step": 4739 + }, + { + "epoch": 0.9345425867507886, + "grad_norm": 0.5293642076390404, + "learning_rate": 1.7440982657823583e-05, + "loss": 0.4414, + "step": 4740 + }, + { + "epoch": 0.9347397476340694, + "grad_norm": 0.5699485347614668, + "learning_rate": 1.743994704229902e-05, + "loss": 0.4452, + "step": 4741 + }, + { + "epoch": 0.9349369085173501, + "grad_norm": 0.9714674676297455, + "learning_rate": 1.743891124802374e-05, + "loss": 0.4315, + "step": 4742 + }, + { + "epoch": 0.9351340694006309, + "grad_norm": 0.5220962127597865, + "learning_rate": 1.7437875275022622e-05, + "loss": 0.4145, + "step": 4743 + }, + { + "epoch": 0.9353312302839116, + "grad_norm": 0.538501745627749, + "learning_rate": 1.7436839123320566e-05, + "loss": 0.46, + "step": 4744 + }, + { + "epoch": 0.9355283911671924, + "grad_norm": 0.49639504847696175, + "learning_rate": 1.743580279294246e-05, + "loss": 0.4082, + "step": 4745 + }, + { + "epoch": 0.9357255520504731, + "grad_norm": 0.531129670237936, + "learning_rate": 1.74347662839132e-05, + "loss": 0.4302, + "step": 4746 + }, + { + "epoch": 0.935922712933754, + "grad_norm": 0.6286351478232094, + "learning_rate": 1.7433729596257694e-05, + "loss": 0.4533, + "step": 4747 + }, + { + "epoch": 0.9361198738170347, + "grad_norm": 0.5416558622895243, + "learning_rate": 1.743269273000085e-05, + "loss": 0.4097, + "step": 4748 + }, + { + "epoch": 0.9363170347003155, + "grad_norm": 0.5339455229568808, + "learning_rate": 1.7431655685167578e-05, + "loss": 0.439, + "step": 4749 + }, + { + "epoch": 0.9365141955835962, + "grad_norm": 0.5760351851798424, + "learning_rate": 1.743061846178279e-05, + "loss": 0.4818, + "step": 4750 + }, + { + "epoch": 0.936711356466877, + "grad_norm": 0.5917480235250198, + "learning_rate": 1.742958105987141e-05, + "loss": 0.4568, + "step": 4751 + }, + { + "epoch": 0.9369085173501577, + "grad_norm": 0.8339420837901182, + "learning_rate": 1.7428543479458367e-05, + "loss": 0.4443, + "step": 4752 + }, + { + "epoch": 0.9371056782334385, + "grad_norm": 0.5697309446607353, + "learning_rate": 1.7427505720568583e-05, + "loss": 0.4565, + "step": 4753 + }, + { + "epoch": 0.9373028391167192, + "grad_norm": 0.5415366469073645, + "learning_rate": 1.7426467783226992e-05, + "loss": 0.4764, + "step": 4754 + }, + { + "epoch": 0.9375, + "grad_norm": 0.6019352545618148, + "learning_rate": 1.742542966745853e-05, + "loss": 0.4492, + "step": 4755 + }, + { + "epoch": 0.9376971608832808, + "grad_norm": 0.6685857679022774, + "learning_rate": 1.7424391373288142e-05, + "loss": 0.4581, + "step": 4756 + }, + { + "epoch": 0.9378943217665615, + "grad_norm": 0.5894795119442221, + "learning_rate": 1.742335290074077e-05, + "loss": 0.4677, + "step": 4757 + }, + { + "epoch": 0.9380914826498423, + "grad_norm": 0.5528329269741018, + "learning_rate": 1.7422314249841373e-05, + "loss": 0.4342, + "step": 4758 + }, + { + "epoch": 0.938288643533123, + "grad_norm": 0.5633875242021811, + "learning_rate": 1.7421275420614895e-05, + "loss": 0.4297, + "step": 4759 + }, + { + "epoch": 0.9384858044164038, + "grad_norm": 0.6057160126806879, + "learning_rate": 1.7420236413086298e-05, + "loss": 0.4469, + "step": 4760 + }, + { + "epoch": 0.9386829652996845, + "grad_norm": 0.5062209536970022, + "learning_rate": 1.7419197227280545e-05, + "loss": 0.413, + "step": 4761 + }, + { + "epoch": 0.9388801261829653, + "grad_norm": 0.5947801340129587, + "learning_rate": 1.7418157863222608e-05, + "loss": 0.4505, + "step": 4762 + }, + { + "epoch": 0.939077287066246, + "grad_norm": 0.5473326180648399, + "learning_rate": 1.7417118320937452e-05, + "loss": 0.4298, + "step": 4763 + }, + { + "epoch": 0.9392744479495269, + "grad_norm": 0.5842555110402521, + "learning_rate": 1.7416078600450053e-05, + "loss": 0.4366, + "step": 4764 + }, + { + "epoch": 0.9394716088328076, + "grad_norm": 6.596709214102324, + "learning_rate": 1.7415038701785397e-05, + "loss": 0.4933, + "step": 4765 + }, + { + "epoch": 0.9396687697160884, + "grad_norm": 0.5983220979388622, + "learning_rate": 1.741399862496846e-05, + "loss": 0.4303, + "step": 4766 + }, + { + "epoch": 0.9398659305993691, + "grad_norm": 0.5605426934566005, + "learning_rate": 1.741295837002424e-05, + "loss": 0.4376, + "step": 4767 + }, + { + "epoch": 0.9400630914826499, + "grad_norm": 0.6623850194449121, + "learning_rate": 1.7411917936977728e-05, + "loss": 0.4891, + "step": 4768 + }, + { + "epoch": 0.9402602523659306, + "grad_norm": 0.5643318735416755, + "learning_rate": 1.7410877325853914e-05, + "loss": 0.4342, + "step": 4769 + }, + { + "epoch": 0.9404574132492114, + "grad_norm": 0.510419807668752, + "learning_rate": 1.7409836536677804e-05, + "loss": 0.4094, + "step": 4770 + }, + { + "epoch": 0.9406545741324921, + "grad_norm": 0.5957515521253349, + "learning_rate": 1.7408795569474407e-05, + "loss": 0.4534, + "step": 4771 + }, + { + "epoch": 0.9408517350157729, + "grad_norm": 0.5938740829653067, + "learning_rate": 1.7407754424268727e-05, + "loss": 0.4209, + "step": 4772 + }, + { + "epoch": 0.9410488958990536, + "grad_norm": 0.5430175081242069, + "learning_rate": 1.7406713101085782e-05, + "loss": 0.4428, + "step": 4773 + }, + { + "epoch": 0.9412460567823344, + "grad_norm": 0.6092122034861576, + "learning_rate": 1.7405671599950593e-05, + "loss": 0.4552, + "step": 4774 + }, + { + "epoch": 0.9414432176656151, + "grad_norm": 0.5301372981781962, + "learning_rate": 1.7404629920888178e-05, + "loss": 0.3949, + "step": 4775 + }, + { + "epoch": 0.9416403785488959, + "grad_norm": 0.874324067325636, + "learning_rate": 1.7403588063923565e-05, + "loss": 0.4171, + "step": 4776 + }, + { + "epoch": 0.9418375394321766, + "grad_norm": 0.562779555203244, + "learning_rate": 1.7402546029081793e-05, + "loss": 0.4508, + "step": 4777 + }, + { + "epoch": 0.9420347003154574, + "grad_norm": 0.5751034428164898, + "learning_rate": 1.7401503816387886e-05, + "loss": 0.4648, + "step": 4778 + }, + { + "epoch": 0.9422318611987381, + "grad_norm": 0.4928153903001422, + "learning_rate": 1.740046142586689e-05, + "loss": 0.4094, + "step": 4779 + }, + { + "epoch": 0.942429022082019, + "grad_norm": 0.5593941847892724, + "learning_rate": 1.7399418857543848e-05, + "loss": 0.4448, + "step": 4780 + }, + { + "epoch": 0.9426261829652997, + "grad_norm": 0.5120731327364189, + "learning_rate": 1.739837611144381e-05, + "loss": 0.4433, + "step": 4781 + }, + { + "epoch": 0.9428233438485805, + "grad_norm": 0.5547010617938658, + "learning_rate": 1.739733318759183e-05, + "loss": 0.4517, + "step": 4782 + }, + { + "epoch": 0.9430205047318612, + "grad_norm": 0.5279709568184927, + "learning_rate": 1.739629008601296e-05, + "loss": 0.4414, + "step": 4783 + }, + { + "epoch": 0.943217665615142, + "grad_norm": 0.6546356596831223, + "learning_rate": 1.7395246806732266e-05, + "loss": 0.502, + "step": 4784 + }, + { + "epoch": 0.9434148264984227, + "grad_norm": 0.5551495693288206, + "learning_rate": 1.739420334977481e-05, + "loss": 0.4212, + "step": 4785 + }, + { + "epoch": 0.9436119873817035, + "grad_norm": 0.5092496805523852, + "learning_rate": 1.7393159715165668e-05, + "loss": 0.405, + "step": 4786 + }, + { + "epoch": 0.9438091482649842, + "grad_norm": 0.5118148863889752, + "learning_rate": 1.739211590292991e-05, + "loss": 0.4185, + "step": 4787 + }, + { + "epoch": 0.944006309148265, + "grad_norm": 0.5543188761300978, + "learning_rate": 1.739107191309261e-05, + "loss": 0.4229, + "step": 4788 + }, + { + "epoch": 0.9442034700315457, + "grad_norm": 0.4947381983906698, + "learning_rate": 1.7390027745678857e-05, + "loss": 0.4347, + "step": 4789 + }, + { + "epoch": 0.9444006309148265, + "grad_norm": 0.5333604588136764, + "learning_rate": 1.7388983400713736e-05, + "loss": 0.4273, + "step": 4790 + }, + { + "epoch": 0.9445977917981072, + "grad_norm": 0.48436693257142593, + "learning_rate": 1.7387938878222337e-05, + "loss": 0.3842, + "step": 4791 + }, + { + "epoch": 0.944794952681388, + "grad_norm": 0.48098662178856766, + "learning_rate": 1.7386894178229764e-05, + "loss": 0.4078, + "step": 4792 + }, + { + "epoch": 0.9449921135646687, + "grad_norm": 0.5927563970559201, + "learning_rate": 1.7385849300761104e-05, + "loss": 0.4479, + "step": 4793 + }, + { + "epoch": 0.9451892744479495, + "grad_norm": 0.5292188971068847, + "learning_rate": 1.7384804245841468e-05, + "loss": 0.4466, + "step": 4794 + }, + { + "epoch": 0.9453864353312302, + "grad_norm": 0.5495901269166898, + "learning_rate": 1.7383759013495965e-05, + "loss": 0.423, + "step": 4795 + }, + { + "epoch": 0.945583596214511, + "grad_norm": 0.5124360410567629, + "learning_rate": 1.73827136037497e-05, + "loss": 0.4669, + "step": 4796 + }, + { + "epoch": 0.9457807570977917, + "grad_norm": 0.5728281769441116, + "learning_rate": 1.7381668016627798e-05, + "loss": 0.4712, + "step": 4797 + }, + { + "epoch": 0.9459779179810726, + "grad_norm": 0.48078417126183814, + "learning_rate": 1.738062225215538e-05, + "loss": 0.4534, + "step": 4798 + }, + { + "epoch": 0.9461750788643533, + "grad_norm": 0.4972157768011806, + "learning_rate": 1.7379576310357568e-05, + "loss": 0.4335, + "step": 4799 + }, + { + "epoch": 0.9463722397476341, + "grad_norm": 0.6537133826858609, + "learning_rate": 1.7378530191259492e-05, + "loss": 0.4604, + "step": 4800 + }, + { + "epoch": 0.9465694006309149, + "grad_norm": 0.5206764483089402, + "learning_rate": 1.7377483894886285e-05, + "loss": 0.4377, + "step": 4801 + }, + { + "epoch": 0.9467665615141956, + "grad_norm": 0.5571810897809417, + "learning_rate": 1.7376437421263088e-05, + "loss": 0.4732, + "step": 4802 + }, + { + "epoch": 0.9469637223974764, + "grad_norm": 0.5154485693291944, + "learning_rate": 1.737539077041504e-05, + "loss": 0.4422, + "step": 4803 + }, + { + "epoch": 0.9471608832807571, + "grad_norm": 2.538605874424299, + "learning_rate": 1.737434394236729e-05, + "loss": 0.4926, + "step": 4804 + }, + { + "epoch": 0.9473580441640379, + "grad_norm": 0.5854347379433069, + "learning_rate": 1.737329693714499e-05, + "loss": 0.4397, + "step": 4805 + }, + { + "epoch": 0.9475552050473186, + "grad_norm": 0.5692750771949014, + "learning_rate": 1.7372249754773292e-05, + "loss": 0.4892, + "step": 4806 + }, + { + "epoch": 0.9477523659305994, + "grad_norm": 0.5453924056164228, + "learning_rate": 1.7371202395277357e-05, + "loss": 0.4488, + "step": 4807 + }, + { + "epoch": 0.9479495268138801, + "grad_norm": 0.5329921263116298, + "learning_rate": 1.7370154858682347e-05, + "loss": 0.4305, + "step": 4808 + }, + { + "epoch": 0.9481466876971609, + "grad_norm": 0.5070927581481749, + "learning_rate": 1.736910714501343e-05, + "loss": 0.3879, + "step": 4809 + }, + { + "epoch": 0.9483438485804416, + "grad_norm": 0.49761316318511184, + "learning_rate": 1.7368059254295783e-05, + "loss": 0.4439, + "step": 4810 + }, + { + "epoch": 0.9485410094637224, + "grad_norm": 0.6116580466663731, + "learning_rate": 1.736701118655458e-05, + "loss": 0.4213, + "step": 4811 + }, + { + "epoch": 0.9487381703470031, + "grad_norm": 0.6059773769906507, + "learning_rate": 1.7365962941814998e-05, + "loss": 0.4447, + "step": 4812 + }, + { + "epoch": 0.948935331230284, + "grad_norm": 0.5302599878215019, + "learning_rate": 1.7364914520102223e-05, + "loss": 0.4219, + "step": 4813 + }, + { + "epoch": 0.9491324921135647, + "grad_norm": 0.5346407557209393, + "learning_rate": 1.736386592144145e-05, + "loss": 0.4452, + "step": 4814 + }, + { + "epoch": 0.9493296529968455, + "grad_norm": 2.5973339435009675, + "learning_rate": 1.7362817145857866e-05, + "loss": 0.5188, + "step": 4815 + }, + { + "epoch": 0.9495268138801262, + "grad_norm": 0.6369521233467632, + "learning_rate": 1.736176819337667e-05, + "loss": 0.4557, + "step": 4816 + }, + { + "epoch": 0.949723974763407, + "grad_norm": 0.7229696457122112, + "learning_rate": 1.7360719064023067e-05, + "loss": 0.4351, + "step": 4817 + }, + { + "epoch": 0.9499211356466877, + "grad_norm": 0.5548249000280323, + "learning_rate": 1.7359669757822256e-05, + "loss": 0.4495, + "step": 4818 + }, + { + "epoch": 0.9501182965299685, + "grad_norm": 0.5371920142295181, + "learning_rate": 1.7358620274799455e-05, + "loss": 0.4468, + "step": 4819 + }, + { + "epoch": 0.9503154574132492, + "grad_norm": 0.5768920071321693, + "learning_rate": 1.7357570614979878e-05, + "loss": 0.4429, + "step": 4820 + }, + { + "epoch": 0.95051261829653, + "grad_norm": 0.5491663645524869, + "learning_rate": 1.735652077838874e-05, + "loss": 0.4645, + "step": 4821 + }, + { + "epoch": 0.9507097791798107, + "grad_norm": 0.5993991809124819, + "learning_rate": 1.735547076505127e-05, + "loss": 0.4071, + "step": 4822 + }, + { + "epoch": 0.9509069400630915, + "grad_norm": 0.4880554858272554, + "learning_rate": 1.7354420574992686e-05, + "loss": 0.4059, + "step": 4823 + }, + { + "epoch": 0.9511041009463722, + "grad_norm": 0.903588480396732, + "learning_rate": 1.7353370208238226e-05, + "loss": 0.4422, + "step": 4824 + }, + { + "epoch": 0.951301261829653, + "grad_norm": 0.5459492044215943, + "learning_rate": 1.7352319664813126e-05, + "loss": 0.4284, + "step": 4825 + }, + { + "epoch": 0.9514984227129337, + "grad_norm": 0.531596665432785, + "learning_rate": 1.7351268944742626e-05, + "loss": 0.4111, + "step": 4826 + }, + { + "epoch": 0.9516955835962145, + "grad_norm": 0.5420852235661492, + "learning_rate": 1.735021804805197e-05, + "loss": 0.4238, + "step": 4827 + }, + { + "epoch": 0.9518927444794952, + "grad_norm": 0.5118772611762697, + "learning_rate": 1.7349166974766407e-05, + "loss": 0.4063, + "step": 4828 + }, + { + "epoch": 0.952089905362776, + "grad_norm": 0.6928236531905183, + "learning_rate": 1.7348115724911188e-05, + "loss": 0.4473, + "step": 4829 + }, + { + "epoch": 0.9522870662460567, + "grad_norm": 0.5459811584499307, + "learning_rate": 1.734706429851157e-05, + "loss": 0.4339, + "step": 4830 + }, + { + "epoch": 0.9524842271293376, + "grad_norm": 0.5306998499179901, + "learning_rate": 1.7346012695592817e-05, + "loss": 0.4496, + "step": 4831 + }, + { + "epoch": 0.9526813880126183, + "grad_norm": 0.5488004261374878, + "learning_rate": 1.7344960916180192e-05, + "loss": 0.4504, + "step": 4832 + }, + { + "epoch": 0.9528785488958991, + "grad_norm": 0.5667953843020231, + "learning_rate": 1.734390896029897e-05, + "loss": 0.4086, + "step": 4833 + }, + { + "epoch": 0.9530757097791798, + "grad_norm": 0.5323191653010501, + "learning_rate": 1.7342856827974417e-05, + "loss": 0.4353, + "step": 4834 + }, + { + "epoch": 0.9532728706624606, + "grad_norm": 0.5423134586875568, + "learning_rate": 1.7341804519231815e-05, + "loss": 0.4411, + "step": 4835 + }, + { + "epoch": 0.9534700315457413, + "grad_norm": 0.548006233665663, + "learning_rate": 1.734075203409645e-05, + "loss": 0.4433, + "step": 4836 + }, + { + "epoch": 0.9536671924290221, + "grad_norm": 0.5273485527910695, + "learning_rate": 1.7339699372593605e-05, + "loss": 0.4673, + "step": 4837 + }, + { + "epoch": 0.9538643533123028, + "grad_norm": 0.5406256570631455, + "learning_rate": 1.7338646534748572e-05, + "loss": 0.4424, + "step": 4838 + }, + { + "epoch": 0.9540615141955836, + "grad_norm": 0.5228289639533955, + "learning_rate": 1.7337593520586645e-05, + "loss": 0.4113, + "step": 4839 + }, + { + "epoch": 0.9542586750788643, + "grad_norm": 0.5199695216281296, + "learning_rate": 1.7336540330133126e-05, + "loss": 0.4334, + "step": 4840 + }, + { + "epoch": 0.9544558359621451, + "grad_norm": 0.5157055642134136, + "learning_rate": 1.7335486963413318e-05, + "loss": 0.4114, + "step": 4841 + }, + { + "epoch": 0.9546529968454258, + "grad_norm": 0.5627963986729889, + "learning_rate": 1.7334433420452527e-05, + "loss": 0.454, + "step": 4842 + }, + { + "epoch": 0.9548501577287066, + "grad_norm": 0.5354466461718106, + "learning_rate": 1.7333379701276068e-05, + "loss": 0.4418, + "step": 4843 + }, + { + "epoch": 0.9550473186119873, + "grad_norm": 0.9673410881580022, + "learning_rate": 1.7332325805909256e-05, + "loss": 0.4306, + "step": 4844 + }, + { + "epoch": 0.9552444794952681, + "grad_norm": 5.372837853533415, + "learning_rate": 1.733127173437741e-05, + "loss": 0.4094, + "step": 4845 + }, + { + "epoch": 0.955441640378549, + "grad_norm": 0.6273908422804465, + "learning_rate": 1.7330217486705862e-05, + "loss": 0.4191, + "step": 4846 + }, + { + "epoch": 0.9556388012618297, + "grad_norm": 0.5769927479813597, + "learning_rate": 1.732916306291993e-05, + "loss": 0.4635, + "step": 4847 + }, + { + "epoch": 0.9558359621451105, + "grad_norm": 0.5345159528878386, + "learning_rate": 1.7328108463044953e-05, + "loss": 0.4228, + "step": 4848 + }, + { + "epoch": 0.9560331230283912, + "grad_norm": 0.5196926420397846, + "learning_rate": 1.7327053687106273e-05, + "loss": 0.3951, + "step": 4849 + }, + { + "epoch": 0.956230283911672, + "grad_norm": 0.4976003610694764, + "learning_rate": 1.7325998735129227e-05, + "loss": 0.3933, + "step": 4850 + }, + { + "epoch": 0.9564274447949527, + "grad_norm": 0.5629160034402856, + "learning_rate": 1.7324943607139158e-05, + "loss": 0.4207, + "step": 4851 + }, + { + "epoch": 0.9566246056782335, + "grad_norm": 0.541499287915572, + "learning_rate": 1.7323888303161422e-05, + "loss": 0.4565, + "step": 4852 + }, + { + "epoch": 0.9568217665615142, + "grad_norm": 0.5494402412976808, + "learning_rate": 1.732283282322137e-05, + "loss": 0.4487, + "step": 4853 + }, + { + "epoch": 0.957018927444795, + "grad_norm": 0.5145315890387855, + "learning_rate": 1.7321777167344367e-05, + "loss": 0.4067, + "step": 4854 + }, + { + "epoch": 0.9572160883280757, + "grad_norm": 0.5356564633940517, + "learning_rate": 1.732072133555577e-05, + "loss": 0.4432, + "step": 4855 + }, + { + "epoch": 0.9574132492113565, + "grad_norm": 0.498975459571729, + "learning_rate": 1.7319665327880945e-05, + "loss": 0.3735, + "step": 4856 + }, + { + "epoch": 0.9576104100946372, + "grad_norm": 0.618141773791518, + "learning_rate": 1.7318609144345265e-05, + "loss": 0.4099, + "step": 4857 + }, + { + "epoch": 0.957807570977918, + "grad_norm": 0.5160325448737755, + "learning_rate": 1.7317552784974113e-05, + "loss": 0.3971, + "step": 4858 + }, + { + "epoch": 0.9580047318611987, + "grad_norm": 0.5514233004043473, + "learning_rate": 1.7316496249792857e-05, + "loss": 0.46, + "step": 4859 + }, + { + "epoch": 0.9582018927444795, + "grad_norm": 0.54586722386132, + "learning_rate": 1.7315439538826887e-05, + "loss": 0.4271, + "step": 4860 + }, + { + "epoch": 0.9583990536277602, + "grad_norm": 0.50615085528196, + "learning_rate": 1.7314382652101595e-05, + "loss": 0.4251, + "step": 4861 + }, + { + "epoch": 0.958596214511041, + "grad_norm": 0.5311049525152373, + "learning_rate": 1.7313325589642363e-05, + "loss": 0.4354, + "step": 4862 + }, + { + "epoch": 0.9587933753943217, + "grad_norm": 0.5243846048285586, + "learning_rate": 1.7312268351474603e-05, + "loss": 0.423, + "step": 4863 + }, + { + "epoch": 0.9589905362776026, + "grad_norm": 0.5359735257750494, + "learning_rate": 1.73112109376237e-05, + "loss": 0.4092, + "step": 4864 + }, + { + "epoch": 0.9591876971608833, + "grad_norm": 0.5599828574009664, + "learning_rate": 1.7310153348115068e-05, + "loss": 0.4465, + "step": 4865 + }, + { + "epoch": 0.9593848580441641, + "grad_norm": 0.53367931056808, + "learning_rate": 1.7309095582974115e-05, + "loss": 0.4485, + "step": 4866 + }, + { + "epoch": 0.9595820189274448, + "grad_norm": 0.5485569037489243, + "learning_rate": 1.7308037642226258e-05, + "loss": 0.4181, + "step": 4867 + }, + { + "epoch": 0.9597791798107256, + "grad_norm": 0.5325054492925666, + "learning_rate": 1.7306979525896907e-05, + "loss": 0.4338, + "step": 4868 + }, + { + "epoch": 0.9599763406940063, + "grad_norm": 1.2447432969875094, + "learning_rate": 1.730592123401149e-05, + "loss": 0.447, + "step": 4869 + }, + { + "epoch": 0.9601735015772871, + "grad_norm": 0.5405855805944394, + "learning_rate": 1.7304862766595433e-05, + "loss": 0.4556, + "step": 4870 + }, + { + "epoch": 0.9603706624605678, + "grad_norm": 0.5014468926425727, + "learning_rate": 1.7303804123674165e-05, + "loss": 0.4127, + "step": 4871 + }, + { + "epoch": 0.9605678233438486, + "grad_norm": 0.5242180756855537, + "learning_rate": 1.730274530527312e-05, + "loss": 0.4488, + "step": 4872 + }, + { + "epoch": 0.9607649842271293, + "grad_norm": 0.5590887485992455, + "learning_rate": 1.730168631141774e-05, + "loss": 0.4408, + "step": 4873 + }, + { + "epoch": 0.9609621451104101, + "grad_norm": 0.5590641148729568, + "learning_rate": 1.7300627142133466e-05, + "loss": 0.4421, + "step": 4874 + }, + { + "epoch": 0.9611593059936908, + "grad_norm": 0.5381918633394875, + "learning_rate": 1.7299567797445744e-05, + "loss": 0.4152, + "step": 4875 + }, + { + "epoch": 0.9613564668769716, + "grad_norm": 0.5116503025098527, + "learning_rate": 1.729850827738003e-05, + "loss": 0.4137, + "step": 4876 + }, + { + "epoch": 0.9615536277602523, + "grad_norm": 0.601103697573897, + "learning_rate": 1.7297448581961775e-05, + "loss": 0.4367, + "step": 4877 + }, + { + "epoch": 0.9617507886435331, + "grad_norm": 0.5234000253181185, + "learning_rate": 1.7296388711216442e-05, + "loss": 0.4264, + "step": 4878 + }, + { + "epoch": 0.9619479495268138, + "grad_norm": 0.5489406687628315, + "learning_rate": 1.7295328665169495e-05, + "loss": 0.4522, + "step": 4879 + }, + { + "epoch": 0.9621451104100947, + "grad_norm": 0.5304149080662556, + "learning_rate": 1.7294268443846403e-05, + "loss": 0.4531, + "step": 4880 + }, + { + "epoch": 0.9623422712933754, + "grad_norm": 0.5277630403000341, + "learning_rate": 1.7293208047272635e-05, + "loss": 0.4233, + "step": 4881 + }, + { + "epoch": 0.9625394321766562, + "grad_norm": 0.5216948283800477, + "learning_rate": 1.729214747547367e-05, + "loss": 0.4634, + "step": 4882 + }, + { + "epoch": 0.9627365930599369, + "grad_norm": 0.5128948306308777, + "learning_rate": 1.7291086728474992e-05, + "loss": 0.3948, + "step": 4883 + }, + { + "epoch": 0.9629337539432177, + "grad_norm": 0.4864061144860882, + "learning_rate": 1.729002580630208e-05, + "loss": 0.4011, + "step": 4884 + }, + { + "epoch": 0.9631309148264984, + "grad_norm": 1.3934757331709215, + "learning_rate": 1.7288964708980432e-05, + "loss": 0.4406, + "step": 4885 + }, + { + "epoch": 0.9633280757097792, + "grad_norm": 0.5314428535537528, + "learning_rate": 1.7287903436535535e-05, + "loss": 0.4282, + "step": 4886 + }, + { + "epoch": 0.9635252365930599, + "grad_norm": 0.5113959395669175, + "learning_rate": 1.728684198899289e-05, + "loss": 0.4459, + "step": 4887 + }, + { + "epoch": 0.9637223974763407, + "grad_norm": 0.7377416915879886, + "learning_rate": 1.7285780366377998e-05, + "loss": 0.4226, + "step": 4888 + }, + { + "epoch": 0.9639195583596214, + "grad_norm": 0.625103270758122, + "learning_rate": 1.7284718568716362e-05, + "loss": 0.4857, + "step": 4889 + }, + { + "epoch": 0.9641167192429022, + "grad_norm": 0.49384862341141844, + "learning_rate": 1.7283656596033502e-05, + "loss": 0.4377, + "step": 4890 + }, + { + "epoch": 0.964313880126183, + "grad_norm": 0.8265103239871188, + "learning_rate": 1.7282594448354922e-05, + "loss": 0.4826, + "step": 4891 + }, + { + "epoch": 0.9645110410094637, + "grad_norm": 0.5660829434512461, + "learning_rate": 1.728153212570615e-05, + "loss": 0.436, + "step": 4892 + }, + { + "epoch": 0.9647082018927445, + "grad_norm": 0.49876988837276626, + "learning_rate": 1.7280469628112698e-05, + "loss": 0.4192, + "step": 4893 + }, + { + "epoch": 0.9649053627760252, + "grad_norm": 0.6150615512750844, + "learning_rate": 1.7279406955600107e-05, + "loss": 0.4016, + "step": 4894 + }, + { + "epoch": 0.965102523659306, + "grad_norm": 0.536942351647561, + "learning_rate": 1.7278344108193897e-05, + "loss": 0.4207, + "step": 4895 + }, + { + "epoch": 0.9652996845425867, + "grad_norm": 0.9470023911925746, + "learning_rate": 1.7277281085919613e-05, + "loss": 0.4274, + "step": 4896 + }, + { + "epoch": 0.9654968454258676, + "grad_norm": 0.5235369204502012, + "learning_rate": 1.727621788880279e-05, + "loss": 0.4244, + "step": 4897 + }, + { + "epoch": 0.9656940063091483, + "grad_norm": 0.6071731227037889, + "learning_rate": 1.727515451686897e-05, + "loss": 0.3893, + "step": 4898 + }, + { + "epoch": 0.9658911671924291, + "grad_norm": 0.5903477067942368, + "learning_rate": 1.7274090970143705e-05, + "loss": 0.4356, + "step": 4899 + }, + { + "epoch": 0.9660883280757098, + "grad_norm": 0.5376060698400171, + "learning_rate": 1.7273027248652545e-05, + "loss": 0.4561, + "step": 4900 + }, + { + "epoch": 0.9662854889589906, + "grad_norm": 0.5838617039015437, + "learning_rate": 1.727196335242105e-05, + "loss": 0.4383, + "step": 4901 + }, + { + "epoch": 0.9664826498422713, + "grad_norm": 0.5167788808435415, + "learning_rate": 1.727089928147478e-05, + "loss": 0.4024, + "step": 4902 + }, + { + "epoch": 0.9666798107255521, + "grad_norm": 0.5624366894471787, + "learning_rate": 1.72698350358393e-05, + "loss": 0.4309, + "step": 4903 + }, + { + "epoch": 0.9668769716088328, + "grad_norm": 0.753504301288875, + "learning_rate": 1.726877061554018e-05, + "loss": 0.3958, + "step": 4904 + }, + { + "epoch": 0.9670741324921136, + "grad_norm": 0.5684825023726245, + "learning_rate": 1.726770602060299e-05, + "loss": 0.4328, + "step": 4905 + }, + { + "epoch": 0.9672712933753943, + "grad_norm": 0.6897233391043007, + "learning_rate": 1.726664125105331e-05, + "loss": 0.4256, + "step": 4906 + }, + { + "epoch": 0.9674684542586751, + "grad_norm": 0.5640572708097991, + "learning_rate": 1.726557630691672e-05, + "loss": 0.4343, + "step": 4907 + }, + { + "epoch": 0.9676656151419558, + "grad_norm": 0.5552212931426607, + "learning_rate": 1.7264511188218812e-05, + "loss": 0.4653, + "step": 4908 + }, + { + "epoch": 0.9678627760252366, + "grad_norm": 1.0633900724505037, + "learning_rate": 1.726344589498517e-05, + "loss": 0.4826, + "step": 4909 + }, + { + "epoch": 0.9680599369085173, + "grad_norm": 0.7383588772446893, + "learning_rate": 1.7262380427241394e-05, + "loss": 0.4684, + "step": 4910 + }, + { + "epoch": 0.9682570977917981, + "grad_norm": 0.6826171176254585, + "learning_rate": 1.7261314785013078e-05, + "loss": 0.4237, + "step": 4911 + }, + { + "epoch": 0.9684542586750788, + "grad_norm": 0.6364722722252665, + "learning_rate": 1.7260248968325828e-05, + "loss": 0.4489, + "step": 4912 + }, + { + "epoch": 0.9686514195583596, + "grad_norm": 0.5663790368688596, + "learning_rate": 1.7259182977205248e-05, + "loss": 0.4189, + "step": 4913 + }, + { + "epoch": 0.9688485804416404, + "grad_norm": 0.5420023619747468, + "learning_rate": 1.7258116811676956e-05, + "loss": 0.4295, + "step": 4914 + }, + { + "epoch": 0.9690457413249212, + "grad_norm": 0.5377721001533368, + "learning_rate": 1.7257050471766558e-05, + "loss": 0.4638, + "step": 4915 + }, + { + "epoch": 0.9692429022082019, + "grad_norm": 0.6046459705426991, + "learning_rate": 1.7255983957499676e-05, + "loss": 0.4886, + "step": 4916 + }, + { + "epoch": 0.9694400630914827, + "grad_norm": 0.5236164202200155, + "learning_rate": 1.7254917268901942e-05, + "loss": 0.4251, + "step": 4917 + }, + { + "epoch": 0.9696372239747634, + "grad_norm": 0.5378880369504965, + "learning_rate": 1.7253850405998976e-05, + "loss": 0.4662, + "step": 4918 + }, + { + "epoch": 0.9698343848580442, + "grad_norm": 0.5749445692065384, + "learning_rate": 1.7252783368816413e-05, + "loss": 0.4855, + "step": 4919 + }, + { + "epoch": 0.9700315457413249, + "grad_norm": 0.49857545471871095, + "learning_rate": 1.7251716157379887e-05, + "loss": 0.4235, + "step": 4920 + }, + { + "epoch": 0.9702287066246057, + "grad_norm": 0.5825438945665429, + "learning_rate": 1.725064877171504e-05, + "loss": 0.4292, + "step": 4921 + }, + { + "epoch": 0.9704258675078864, + "grad_norm": 0.7116226202261247, + "learning_rate": 1.724958121184752e-05, + "loss": 0.4942, + "step": 4922 + }, + { + "epoch": 0.9706230283911672, + "grad_norm": 0.5595515787710044, + "learning_rate": 1.7248513477802973e-05, + "loss": 0.4251, + "step": 4923 + }, + { + "epoch": 0.9708201892744479, + "grad_norm": 0.5609266287961348, + "learning_rate": 1.724744556960705e-05, + "loss": 0.4362, + "step": 4924 + }, + { + "epoch": 0.9710173501577287, + "grad_norm": 0.5332114412325697, + "learning_rate": 1.7246377487285415e-05, + "loss": 0.4222, + "step": 4925 + }, + { + "epoch": 0.9712145110410094, + "grad_norm": 0.5579951518622186, + "learning_rate": 1.7245309230863723e-05, + "loss": 0.4351, + "step": 4926 + }, + { + "epoch": 0.9714116719242902, + "grad_norm": 0.5504796489083765, + "learning_rate": 1.7244240800367642e-05, + "loss": 0.466, + "step": 4927 + }, + { + "epoch": 0.9716088328075709, + "grad_norm": 0.5364866603592925, + "learning_rate": 1.724317219582284e-05, + "loss": 0.4356, + "step": 4928 + }, + { + "epoch": 0.9718059936908517, + "grad_norm": 0.5307686124133709, + "learning_rate": 1.7242103417255e-05, + "loss": 0.3943, + "step": 4929 + }, + { + "epoch": 0.9720031545741324, + "grad_norm": 0.535779232578516, + "learning_rate": 1.724103446468979e-05, + "loss": 0.4479, + "step": 4930 + }, + { + "epoch": 0.9722003154574133, + "grad_norm": 0.8527430234546016, + "learning_rate": 1.723996533815289e-05, + "loss": 0.4271, + "step": 4931 + }, + { + "epoch": 0.972397476340694, + "grad_norm": 0.4893815242972822, + "learning_rate": 1.723889603767e-05, + "loss": 0.4294, + "step": 4932 + }, + { + "epoch": 0.9725946372239748, + "grad_norm": 0.5142240975494096, + "learning_rate": 1.7237826563266797e-05, + "loss": 0.4111, + "step": 4933 + }, + { + "epoch": 0.9727917981072555, + "grad_norm": 0.5056050398485274, + "learning_rate": 1.7236756914968985e-05, + "loss": 0.4459, + "step": 4934 + }, + { + "epoch": 0.9729889589905363, + "grad_norm": 0.6875911603497251, + "learning_rate": 1.723568709280226e-05, + "loss": 0.4393, + "step": 4935 + }, + { + "epoch": 0.973186119873817, + "grad_norm": 0.5410138293600987, + "learning_rate": 1.7234617096792328e-05, + "loss": 0.4337, + "step": 4936 + }, + { + "epoch": 0.9733832807570978, + "grad_norm": 0.5406394954943095, + "learning_rate": 1.723354692696489e-05, + "loss": 0.4787, + "step": 4937 + }, + { + "epoch": 0.9735804416403786, + "grad_norm": 0.5039617193695565, + "learning_rate": 1.7232476583345667e-05, + "loss": 0.4269, + "step": 4938 + }, + { + "epoch": 0.9737776025236593, + "grad_norm": 0.5667126358374103, + "learning_rate": 1.7231406065960365e-05, + "loss": 0.4314, + "step": 4939 + }, + { + "epoch": 0.9739747634069401, + "grad_norm": 0.5223138203259416, + "learning_rate": 1.723033537483471e-05, + "loss": 0.4238, + "step": 4940 + }, + { + "epoch": 0.9741719242902208, + "grad_norm": 0.5134992034885483, + "learning_rate": 1.722926450999443e-05, + "loss": 0.435, + "step": 4941 + }, + { + "epoch": 0.9743690851735016, + "grad_norm": 0.5335314712456779, + "learning_rate": 1.7228193471465243e-05, + "loss": 0.4448, + "step": 4942 + }, + { + "epoch": 0.9745662460567823, + "grad_norm": 0.5187220320984581, + "learning_rate": 1.722712225927289e-05, + "loss": 0.4305, + "step": 4943 + }, + { + "epoch": 0.9747634069400631, + "grad_norm": 0.5236993505503822, + "learning_rate": 1.7226050873443103e-05, + "loss": 0.4324, + "step": 4944 + }, + { + "epoch": 0.9749605678233438, + "grad_norm": 0.5501948139908517, + "learning_rate": 1.7224979314001623e-05, + "loss": 0.4445, + "step": 4945 + }, + { + "epoch": 0.9751577287066246, + "grad_norm": 0.5025205140860234, + "learning_rate": 1.72239075809742e-05, + "loss": 0.4067, + "step": 4946 + }, + { + "epoch": 0.9753548895899053, + "grad_norm": 0.509021011008656, + "learning_rate": 1.722283567438658e-05, + "loss": 0.4127, + "step": 4947 + }, + { + "epoch": 0.9755520504731862, + "grad_norm": 0.5169322695492775, + "learning_rate": 1.7221763594264513e-05, + "loss": 0.4183, + "step": 4948 + }, + { + "epoch": 0.9757492113564669, + "grad_norm": 0.5893693443229753, + "learning_rate": 1.7220691340633762e-05, + "loss": 0.413, + "step": 4949 + }, + { + "epoch": 0.9759463722397477, + "grad_norm": 0.5032678433324692, + "learning_rate": 1.7219618913520086e-05, + "loss": 0.3999, + "step": 4950 + }, + { + "epoch": 0.9761435331230284, + "grad_norm": 0.5611832448812477, + "learning_rate": 1.7218546312949255e-05, + "loss": 0.4208, + "step": 4951 + }, + { + "epoch": 0.9763406940063092, + "grad_norm": 0.5402039539918267, + "learning_rate": 1.7217473538947032e-05, + "loss": 0.4592, + "step": 4952 + }, + { + "epoch": 0.9765378548895899, + "grad_norm": 0.5312684422263533, + "learning_rate": 1.7216400591539194e-05, + "loss": 0.3958, + "step": 4953 + }, + { + "epoch": 0.9767350157728707, + "grad_norm": 0.4838095794471189, + "learning_rate": 1.7215327470751525e-05, + "loss": 0.4012, + "step": 4954 + }, + { + "epoch": 0.9769321766561514, + "grad_norm": 0.5459043063215803, + "learning_rate": 1.72142541766098e-05, + "loss": 0.4889, + "step": 4955 + }, + { + "epoch": 0.9771293375394322, + "grad_norm": 0.4762352325018086, + "learning_rate": 1.721318070913981e-05, + "loss": 0.4194, + "step": 4956 + }, + { + "epoch": 0.9773264984227129, + "grad_norm": 1.1041727399927024, + "learning_rate": 1.7212107068367343e-05, + "loss": 0.4595, + "step": 4957 + }, + { + "epoch": 0.9775236593059937, + "grad_norm": 0.5072331377511347, + "learning_rate": 1.7211033254318195e-05, + "loss": 0.4192, + "step": 4958 + }, + { + "epoch": 0.9777208201892744, + "grad_norm": 0.5051945491324831, + "learning_rate": 1.720995926701817e-05, + "loss": 0.4018, + "step": 4959 + }, + { + "epoch": 0.9779179810725552, + "grad_norm": 0.5317945948669217, + "learning_rate": 1.7208885106493068e-05, + "loss": 0.4245, + "step": 4960 + }, + { + "epoch": 0.9781151419558359, + "grad_norm": 0.5003276873104907, + "learning_rate": 1.7207810772768692e-05, + "loss": 0.4175, + "step": 4961 + }, + { + "epoch": 0.9783123028391167, + "grad_norm": 0.5129068918599115, + "learning_rate": 1.720673626587086e-05, + "loss": 0.4263, + "step": 4962 + }, + { + "epoch": 0.9785094637223974, + "grad_norm": 0.49567393969405865, + "learning_rate": 1.7205661585825385e-05, + "loss": 0.4224, + "step": 4963 + }, + { + "epoch": 0.9787066246056783, + "grad_norm": 0.5210701609022403, + "learning_rate": 1.7204586732658088e-05, + "loss": 0.4054, + "step": 4964 + }, + { + "epoch": 0.978903785488959, + "grad_norm": 0.5237810329795967, + "learning_rate": 1.720351170639479e-05, + "loss": 0.4373, + "step": 4965 + }, + { + "epoch": 0.9791009463722398, + "grad_norm": 0.49172980877968686, + "learning_rate": 1.7202436507061327e-05, + "loss": 0.3981, + "step": 4966 + }, + { + "epoch": 0.9792981072555205, + "grad_norm": 0.5033156694120647, + "learning_rate": 1.7201361134683522e-05, + "loss": 0.405, + "step": 4967 + }, + { + "epoch": 0.9794952681388013, + "grad_norm": 0.532232285017258, + "learning_rate": 1.720028558928722e-05, + "loss": 0.4189, + "step": 4968 + }, + { + "epoch": 0.979692429022082, + "grad_norm": 0.5084865188261308, + "learning_rate": 1.7199209870898257e-05, + "loss": 0.426, + "step": 4969 + }, + { + "epoch": 0.9798895899053628, + "grad_norm": 0.5195794855402261, + "learning_rate": 1.719813397954248e-05, + "loss": 0.4279, + "step": 4970 + }, + { + "epoch": 0.9800867507886435, + "grad_norm": 0.5120454170989818, + "learning_rate": 1.7197057915245738e-05, + "loss": 0.4465, + "step": 4971 + }, + { + "epoch": 0.9802839116719243, + "grad_norm": 0.5104835439357177, + "learning_rate": 1.7195981678033883e-05, + "loss": 0.4551, + "step": 4972 + }, + { + "epoch": 0.980481072555205, + "grad_norm": 0.545753139194094, + "learning_rate": 1.7194905267932775e-05, + "loss": 0.4515, + "step": 4973 + }, + { + "epoch": 0.9806782334384858, + "grad_norm": 0.48063719552920575, + "learning_rate": 1.719382868496827e-05, + "loss": 0.3959, + "step": 4974 + }, + { + "epoch": 0.9808753943217665, + "grad_norm": 0.5081444056915589, + "learning_rate": 1.7192751929166237e-05, + "loss": 0.4267, + "step": 4975 + }, + { + "epoch": 0.9810725552050473, + "grad_norm": 0.7889320206894174, + "learning_rate": 1.7191675000552552e-05, + "loss": 0.4256, + "step": 4976 + }, + { + "epoch": 0.981269716088328, + "grad_norm": 0.5161493421048737, + "learning_rate": 1.719059789915308e-05, + "loss": 0.3944, + "step": 4977 + }, + { + "epoch": 0.9814668769716088, + "grad_norm": 0.5526599869469584, + "learning_rate": 1.7189520624993706e-05, + "loss": 0.4733, + "step": 4978 + }, + { + "epoch": 0.9816640378548895, + "grad_norm": 0.5921214088474026, + "learning_rate": 1.7188443178100306e-05, + "loss": 0.4402, + "step": 4979 + }, + { + "epoch": 0.9818611987381703, + "grad_norm": 0.4899387375326017, + "learning_rate": 1.7187365558498772e-05, + "loss": 0.403, + "step": 4980 + }, + { + "epoch": 0.982058359621451, + "grad_norm": 0.6284691568651574, + "learning_rate": 1.7186287766214992e-05, + "loss": 0.4247, + "step": 4981 + }, + { + "epoch": 0.9822555205047319, + "grad_norm": 0.5219769363772543, + "learning_rate": 1.7185209801274863e-05, + "loss": 0.4378, + "step": 4982 + }, + { + "epoch": 0.9824526813880127, + "grad_norm": 0.561058268499637, + "learning_rate": 1.718413166370428e-05, + "loss": 0.4402, + "step": 4983 + }, + { + "epoch": 0.9826498422712934, + "grad_norm": 0.5228751281665158, + "learning_rate": 1.7183053353529146e-05, + "loss": 0.4342, + "step": 4984 + }, + { + "epoch": 0.9828470031545742, + "grad_norm": 0.6359423300026139, + "learning_rate": 1.7181974870775374e-05, + "loss": 0.4791, + "step": 4985 + }, + { + "epoch": 0.9830441640378549, + "grad_norm": 0.5452791966413713, + "learning_rate": 1.718089621546887e-05, + "loss": 0.4311, + "step": 4986 + }, + { + "epoch": 0.9832413249211357, + "grad_norm": 0.5585163320278087, + "learning_rate": 1.7179817387635552e-05, + "loss": 0.3905, + "step": 4987 + }, + { + "epoch": 0.9834384858044164, + "grad_norm": 1.1312602954702684, + "learning_rate": 1.7178738387301342e-05, + "loss": 0.5146, + "step": 4988 + }, + { + "epoch": 0.9836356466876972, + "grad_norm": 0.544619439438417, + "learning_rate": 1.7177659214492162e-05, + "loss": 0.4722, + "step": 4989 + }, + { + "epoch": 0.9838328075709779, + "grad_norm": 0.48850131893298554, + "learning_rate": 1.7176579869233935e-05, + "loss": 0.387, + "step": 4990 + }, + { + "epoch": 0.9840299684542587, + "grad_norm": 0.646046327237621, + "learning_rate": 1.71755003515526e-05, + "loss": 0.4457, + "step": 4991 + }, + { + "epoch": 0.9842271293375394, + "grad_norm": 0.5156158640893732, + "learning_rate": 1.717442066147409e-05, + "loss": 0.4206, + "step": 4992 + }, + { + "epoch": 0.9844242902208202, + "grad_norm": 0.5424189331510925, + "learning_rate": 1.7173340799024346e-05, + "loss": 0.397, + "step": 4993 + }, + { + "epoch": 0.9846214511041009, + "grad_norm": 0.5235819964470497, + "learning_rate": 1.7172260764229312e-05, + "loss": 0.4467, + "step": 4994 + }, + { + "epoch": 0.9848186119873817, + "grad_norm": 0.5909753498690635, + "learning_rate": 1.717118055711494e-05, + "loss": 0.4398, + "step": 4995 + }, + { + "epoch": 0.9850157728706624, + "grad_norm": 0.4987198779431939, + "learning_rate": 1.7170100177707177e-05, + "loss": 0.4332, + "step": 4996 + }, + { + "epoch": 0.9852129337539433, + "grad_norm": 0.5563239550956313, + "learning_rate": 1.7169019626031985e-05, + "loss": 0.4193, + "step": 4997 + }, + { + "epoch": 0.985410094637224, + "grad_norm": 0.542094709088602, + "learning_rate": 1.7167938902115323e-05, + "loss": 0.4354, + "step": 4998 + }, + { + "epoch": 0.9856072555205048, + "grad_norm": 0.6222848829402873, + "learning_rate": 1.7166858005983154e-05, + "loss": 0.4455, + "step": 4999 + }, + { + "epoch": 0.9858044164037855, + "grad_norm": 0.5278021757853874, + "learning_rate": 1.7165776937661453e-05, + "loss": 0.4299, + "step": 5000 + }, + { + "epoch": 0.9860015772870663, + "grad_norm": 0.6100419080088035, + "learning_rate": 1.716469569717619e-05, + "loss": 0.3916, + "step": 5001 + }, + { + "epoch": 0.986198738170347, + "grad_norm": 0.5391831742527743, + "learning_rate": 1.716361428455334e-05, + "loss": 0.4103, + "step": 5002 + }, + { + "epoch": 0.9863958990536278, + "grad_norm": 0.5826789977832045, + "learning_rate": 1.7162532699818893e-05, + "loss": 0.4617, + "step": 5003 + }, + { + "epoch": 0.9865930599369085, + "grad_norm": 0.4977680784543337, + "learning_rate": 1.7161450942998827e-05, + "loss": 0.4102, + "step": 5004 + }, + { + "epoch": 0.9867902208201893, + "grad_norm": 0.5793799937360323, + "learning_rate": 1.7160369014119136e-05, + "loss": 0.424, + "step": 5005 + }, + { + "epoch": 0.98698738170347, + "grad_norm": 0.5113962294204524, + "learning_rate": 1.7159286913205813e-05, + "loss": 0.4317, + "step": 5006 + }, + { + "epoch": 0.9871845425867508, + "grad_norm": 0.5398003533758813, + "learning_rate": 1.7158204640284855e-05, + "loss": 0.445, + "step": 5007 + }, + { + "epoch": 0.9873817034700315, + "grad_norm": 0.5979699074869115, + "learning_rate": 1.7157122195382267e-05, + "loss": 0.452, + "step": 5008 + }, + { + "epoch": 0.9875788643533123, + "grad_norm": 0.5285476936820424, + "learning_rate": 1.7156039578524055e-05, + "loss": 0.4193, + "step": 5009 + }, + { + "epoch": 0.987776025236593, + "grad_norm": 0.5910776224079434, + "learning_rate": 1.715495678973623e-05, + "loss": 0.4859, + "step": 5010 + }, + { + "epoch": 0.9879731861198738, + "grad_norm": 0.5036681780427391, + "learning_rate": 1.7153873829044805e-05, + "loss": 0.4086, + "step": 5011 + }, + { + "epoch": 0.9881703470031545, + "grad_norm": 0.5363578632899204, + "learning_rate": 1.7152790696475804e-05, + "loss": 0.4109, + "step": 5012 + }, + { + "epoch": 0.9883675078864353, + "grad_norm": 0.5058850256040565, + "learning_rate": 1.715170739205524e-05, + "loss": 0.417, + "step": 5013 + }, + { + "epoch": 0.988564668769716, + "grad_norm": 0.5343518165708425, + "learning_rate": 1.7150623915809154e-05, + "loss": 0.4317, + "step": 5014 + }, + { + "epoch": 0.9887618296529969, + "grad_norm": 0.4732197281950188, + "learning_rate": 1.7149540267763566e-05, + "loss": 0.419, + "step": 5015 + }, + { + "epoch": 0.9889589905362776, + "grad_norm": 0.5460018301214419, + "learning_rate": 1.7148456447944514e-05, + "loss": 0.4371, + "step": 5016 + }, + { + "epoch": 0.9891561514195584, + "grad_norm": 0.5228662093888589, + "learning_rate": 1.714737245637804e-05, + "loss": 0.4348, + "step": 5017 + }, + { + "epoch": 0.9893533123028391, + "grad_norm": 0.5136606982937066, + "learning_rate": 1.7146288293090187e-05, + "loss": 0.4207, + "step": 5018 + }, + { + "epoch": 0.9895504731861199, + "grad_norm": 0.49381060992162196, + "learning_rate": 1.7145203958107005e-05, + "loss": 0.4386, + "step": 5019 + }, + { + "epoch": 0.9897476340694006, + "grad_norm": 0.5097224069523834, + "learning_rate": 1.714411945145454e-05, + "loss": 0.449, + "step": 5020 + }, + { + "epoch": 0.9899447949526814, + "grad_norm": 0.48982369331094644, + "learning_rate": 1.714303477315886e-05, + "loss": 0.4135, + "step": 5021 + }, + { + "epoch": 0.9901419558359621, + "grad_norm": 0.5348342417259278, + "learning_rate": 1.7141949923246007e-05, + "loss": 0.4645, + "step": 5022 + }, + { + "epoch": 0.9903391167192429, + "grad_norm": 0.4935854194155088, + "learning_rate": 1.7140864901742062e-05, + "loss": 0.4156, + "step": 5023 + }, + { + "epoch": 0.9905362776025236, + "grad_norm": 0.538994627268955, + "learning_rate": 1.7139779708673084e-05, + "loss": 0.434, + "step": 5024 + }, + { + "epoch": 0.9907334384858044, + "grad_norm": 0.5072487466974747, + "learning_rate": 1.7138694344065152e-05, + "loss": 0.4265, + "step": 5025 + }, + { + "epoch": 0.9909305993690851, + "grad_norm": 14.35107865712939, + "learning_rate": 1.7137608807944337e-05, + "loss": 0.4277, + "step": 5026 + }, + { + "epoch": 0.9911277602523659, + "grad_norm": 0.5905049052128314, + "learning_rate": 1.7136523100336725e-05, + "loss": 0.4204, + "step": 5027 + }, + { + "epoch": 0.9913249211356467, + "grad_norm": 0.5954561128927663, + "learning_rate": 1.7135437221268397e-05, + "loss": 0.4066, + "step": 5028 + }, + { + "epoch": 0.9915220820189274, + "grad_norm": 0.5481450959305048, + "learning_rate": 1.7134351170765443e-05, + "loss": 0.434, + "step": 5029 + }, + { + "epoch": 0.9917192429022083, + "grad_norm": 0.5757944206123269, + "learning_rate": 1.7133264948853957e-05, + "loss": 0.4157, + "step": 5030 + }, + { + "epoch": 0.991916403785489, + "grad_norm": 0.5811091665563136, + "learning_rate": 1.7132178555560038e-05, + "loss": 0.463, + "step": 5031 + }, + { + "epoch": 0.9921135646687698, + "grad_norm": 0.5095692078366557, + "learning_rate": 1.7131091990909786e-05, + "loss": 0.4403, + "step": 5032 + }, + { + "epoch": 0.9923107255520505, + "grad_norm": 0.5285063457801826, + "learning_rate": 1.71300052549293e-05, + "loss": 0.4018, + "step": 5033 + }, + { + "epoch": 0.9925078864353313, + "grad_norm": 0.5743875490503324, + "learning_rate": 1.7128918347644704e-05, + "loss": 0.463, + "step": 5034 + }, + { + "epoch": 0.992705047318612, + "grad_norm": 0.5553066045950534, + "learning_rate": 1.7127831269082103e-05, + "loss": 0.4358, + "step": 5035 + }, + { + "epoch": 0.9929022082018928, + "grad_norm": 0.5201145328150542, + "learning_rate": 1.712674401926761e-05, + "loss": 0.3973, + "step": 5036 + }, + { + "epoch": 0.9930993690851735, + "grad_norm": 0.5308940515290245, + "learning_rate": 1.7125656598227357e-05, + "loss": 0.4505, + "step": 5037 + }, + { + "epoch": 0.9932965299684543, + "grad_norm": 0.5307597719587108, + "learning_rate": 1.7124569005987466e-05, + "loss": 0.4296, + "step": 5038 + }, + { + "epoch": 0.993493690851735, + "grad_norm": 0.5368886391034855, + "learning_rate": 1.7123481242574066e-05, + "loss": 0.4627, + "step": 5039 + }, + { + "epoch": 0.9936908517350158, + "grad_norm": 0.5304804270019259, + "learning_rate": 1.7122393308013294e-05, + "loss": 0.4513, + "step": 5040 + }, + { + "epoch": 0.9938880126182965, + "grad_norm": 0.4837269924708032, + "learning_rate": 1.7121305202331284e-05, + "loss": 0.4263, + "step": 5041 + }, + { + "epoch": 0.9940851735015773, + "grad_norm": 0.538409666606145, + "learning_rate": 1.7120216925554185e-05, + "loss": 0.4424, + "step": 5042 + }, + { + "epoch": 0.994282334384858, + "grad_norm": 0.5107171099135106, + "learning_rate": 1.7119128477708137e-05, + "loss": 0.4143, + "step": 5043 + }, + { + "epoch": 0.9944794952681388, + "grad_norm": 0.5109327464115445, + "learning_rate": 1.7118039858819297e-05, + "loss": 0.4406, + "step": 5044 + }, + { + "epoch": 0.9946766561514195, + "grad_norm": 0.528632568268682, + "learning_rate": 1.711695106891382e-05, + "loss": 0.4322, + "step": 5045 + }, + { + "epoch": 0.9948738170347003, + "grad_norm": 0.5281250909959354, + "learning_rate": 1.711586210801786e-05, + "loss": 0.4486, + "step": 5046 + }, + { + "epoch": 0.995070977917981, + "grad_norm": 0.5636600965669576, + "learning_rate": 1.7114772976157578e-05, + "loss": 0.4576, + "step": 5047 + }, + { + "epoch": 0.9952681388012619, + "grad_norm": 0.5721847302432999, + "learning_rate": 1.711368367335915e-05, + "loss": 0.4841, + "step": 5048 + }, + { + "epoch": 0.9954652996845426, + "grad_norm": 0.4984992031526205, + "learning_rate": 1.7112594199648742e-05, + "loss": 0.4029, + "step": 5049 + }, + { + "epoch": 0.9956624605678234, + "grad_norm": 0.4985204521738928, + "learning_rate": 1.7111504555052533e-05, + "loss": 0.4062, + "step": 5050 + }, + { + "epoch": 0.9958596214511041, + "grad_norm": 0.510291189780098, + "learning_rate": 1.7110414739596697e-05, + "loss": 0.4291, + "step": 5051 + }, + { + "epoch": 0.9960567823343849, + "grad_norm": 0.5643661497979349, + "learning_rate": 1.710932475330742e-05, + "loss": 0.429, + "step": 5052 + }, + { + "epoch": 0.9962539432176656, + "grad_norm": 0.5118534697302934, + "learning_rate": 1.7108234596210892e-05, + "loss": 0.4399, + "step": 5053 + }, + { + "epoch": 0.9964511041009464, + "grad_norm": 0.49932647918668954, + "learning_rate": 1.7107144268333307e-05, + "loss": 0.383, + "step": 5054 + }, + { + "epoch": 0.9966482649842271, + "grad_norm": 0.5207920247222175, + "learning_rate": 1.7106053769700855e-05, + "loss": 0.4259, + "step": 5055 + }, + { + "epoch": 0.9968454258675079, + "grad_norm": 0.5040712672787065, + "learning_rate": 1.7104963100339738e-05, + "loss": 0.4396, + "step": 5056 + }, + { + "epoch": 0.9970425867507886, + "grad_norm": 0.49364985751506657, + "learning_rate": 1.7103872260276163e-05, + "loss": 0.3975, + "step": 5057 + }, + { + "epoch": 0.9972397476340694, + "grad_norm": 0.5562215293332458, + "learning_rate": 1.7102781249536333e-05, + "loss": 0.4626, + "step": 5058 + }, + { + "epoch": 0.9974369085173501, + "grad_norm": 0.5298513969515571, + "learning_rate": 1.7101690068146466e-05, + "loss": 0.4142, + "step": 5059 + }, + { + "epoch": 0.9976340694006309, + "grad_norm": 0.54683487357073, + "learning_rate": 1.7100598716132775e-05, + "loss": 0.4327, + "step": 5060 + }, + { + "epoch": 0.9978312302839116, + "grad_norm": 0.5440040494015383, + "learning_rate": 1.7099507193521482e-05, + "loss": 0.462, + "step": 5061 + }, + { + "epoch": 0.9980283911671924, + "grad_norm": 0.49509844160455657, + "learning_rate": 1.709841550033881e-05, + "loss": 0.4183, + "step": 5062 + }, + { + "epoch": 0.9982255520504731, + "grad_norm": 1.056087374364193, + "learning_rate": 1.7097323636610992e-05, + "loss": 0.4725, + "step": 5063 + }, + { + "epoch": 0.998422712933754, + "grad_norm": 0.5237038370525577, + "learning_rate": 1.7096231602364257e-05, + "loss": 0.414, + "step": 5064 + }, + { + "epoch": 0.9986198738170347, + "grad_norm": 0.5948843667990235, + "learning_rate": 1.7095139397624843e-05, + "loss": 0.4527, + "step": 5065 + }, + { + "epoch": 0.9988170347003155, + "grad_norm": 0.5150733173683878, + "learning_rate": 1.7094047022418995e-05, + "loss": 0.4904, + "step": 5066 + }, + { + "epoch": 0.9990141955835962, + "grad_norm": 0.7227117490491202, + "learning_rate": 1.709295447677295e-05, + "loss": 0.4206, + "step": 5067 + }, + { + "epoch": 0.999211356466877, + "grad_norm": 0.5268898131828004, + "learning_rate": 1.7091861760712963e-05, + "loss": 0.4402, + "step": 5068 + }, + { + "epoch": 0.9994085173501577, + "grad_norm": 0.4895461883001276, + "learning_rate": 1.7090768874265285e-05, + "loss": 0.43, + "step": 5069 + }, + { + "epoch": 0.9996056782334385, + "grad_norm": 0.5224645141727187, + "learning_rate": 1.7089675817456175e-05, + "loss": 0.4484, + "step": 5070 + }, + { + "epoch": 0.9998028391167192, + "grad_norm": 0.5380192178833433, + "learning_rate": 1.7088582590311896e-05, + "loss": 0.4619, + "step": 5071 + }, + { + "epoch": 1.0, + "grad_norm": 0.48949137873370546, + "learning_rate": 1.708748919285871e-05, + "loss": 0.4243, + "step": 5072 + }, + { + "epoch": 1.0, + "eval_loss": 0.4332016706466675, + "eval_runtime": 344.5029, + "eval_samples_per_second": 23.599, + "eval_steps_per_second": 1.477, + "step": 5072 + }, + { + "epoch": 1.0001971608832807, + "grad_norm": 0.5212823866085835, + "learning_rate": 1.7086395625122888e-05, + "loss": 0.4414, + "step": 5073 + }, + { + "epoch": 1.0001971608832807, + "grad_norm": 0.5373287163559608, + "learning_rate": 1.7085301887130708e-05, + "loss": 0.3227, + "step": 5074 + }, + { + "epoch": 1.0003943217665616, + "grad_norm": 0.6038566158503993, + "learning_rate": 1.708420797890844e-05, + "loss": 0.3653, + "step": 5075 + }, + { + "epoch": 1.0005914826498423, + "grad_norm": 0.6164839814322967, + "learning_rate": 1.7083113900482374e-05, + "loss": 0.3345, + "step": 5076 + }, + { + "epoch": 1.000788643533123, + "grad_norm": 0.5898548067122023, + "learning_rate": 1.708201965187879e-05, + "loss": 0.2978, + "step": 5077 + }, + { + "epoch": 1.0009858044164037, + "grad_norm": 0.6086636984908, + "learning_rate": 1.708092523312398e-05, + "loss": 0.3338, + "step": 5078 + }, + { + "epoch": 1.0011829652996846, + "grad_norm": 0.85830944121542, + "learning_rate": 1.707983064424424e-05, + "loss": 0.3628, + "step": 5079 + }, + { + "epoch": 1.0013801261829653, + "grad_norm": 0.7446958830040732, + "learning_rate": 1.7078735885265872e-05, + "loss": 0.3397, + "step": 5080 + }, + { + "epoch": 1.001577287066246, + "grad_norm": 0.5753953072853087, + "learning_rate": 1.707764095621517e-05, + "loss": 0.3338, + "step": 5081 + }, + { + "epoch": 1.0017744479495267, + "grad_norm": 0.5313937087435588, + "learning_rate": 1.707654585711844e-05, + "loss": 0.3294, + "step": 5082 + }, + { + "epoch": 1.0019716088328077, + "grad_norm": 0.5433192534151587, + "learning_rate": 1.7075450588002004e-05, + "loss": 0.3005, + "step": 5083 + }, + { + "epoch": 1.0021687697160884, + "grad_norm": 0.5974671475978584, + "learning_rate": 1.7074355148892167e-05, + "loss": 0.3355, + "step": 5084 + }, + { + "epoch": 1.002365930599369, + "grad_norm": 0.5292036690619668, + "learning_rate": 1.707325953981525e-05, + "loss": 0.3133, + "step": 5085 + }, + { + "epoch": 1.0025630914826498, + "grad_norm": 0.5791264643946715, + "learning_rate": 1.707216376079758e-05, + "loss": 0.3628, + "step": 5086 + }, + { + "epoch": 1.0027602523659307, + "grad_norm": 0.5367457017037465, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.323, + "step": 5087 + }, + { + "epoch": 1.0029574132492114, + "grad_norm": 0.5463141350198112, + "learning_rate": 1.7069971693045276e-05, + "loss": 0.3191, + "step": 5088 + }, + { + "epoch": 1.003154574132492, + "grad_norm": 0.5683958910749297, + "learning_rate": 1.706887540436331e-05, + "loss": 0.3338, + "step": 5089 + }, + { + "epoch": 1.0033517350157728, + "grad_norm": 2.555600213008143, + "learning_rate": 1.7067778945845923e-05, + "loss": 0.3227, + "step": 5090 + }, + { + "epoch": 1.0035488958990537, + "grad_norm": 0.602035566632611, + "learning_rate": 1.7066682317519453e-05, + "loss": 0.3194, + "step": 5091 + }, + { + "epoch": 1.0037460567823344, + "grad_norm": 0.5294253122717554, + "learning_rate": 1.7065585519410253e-05, + "loss": 0.3259, + "step": 5092 + }, + { + "epoch": 1.0039432176656151, + "grad_norm": 0.534836832974196, + "learning_rate": 1.706448855154467e-05, + "loss": 0.3003, + "step": 5093 + }, + { + "epoch": 1.0041403785488958, + "grad_norm": 0.5351347670561281, + "learning_rate": 1.7063391413949056e-05, + "loss": 0.3055, + "step": 5094 + }, + { + "epoch": 1.0043375394321767, + "grad_norm": 0.5619292444300296, + "learning_rate": 1.7062294106649777e-05, + "loss": 0.3276, + "step": 5095 + }, + { + "epoch": 1.0045347003154574, + "grad_norm": 0.5124532518746542, + "learning_rate": 1.7061196629673198e-05, + "loss": 0.3216, + "step": 5096 + }, + { + "epoch": 1.0047318611987381, + "grad_norm": 0.5657829767971062, + "learning_rate": 1.706009898304568e-05, + "loss": 0.3475, + "step": 5097 + }, + { + "epoch": 1.0049290220820188, + "grad_norm": 0.5118747834196178, + "learning_rate": 1.7059001166793604e-05, + "loss": 0.3161, + "step": 5098 + }, + { + "epoch": 1.0051261829652998, + "grad_norm": 0.5506568794847809, + "learning_rate": 1.7057903180943334e-05, + "loss": 0.3096, + "step": 5099 + }, + { + "epoch": 1.0053233438485805, + "grad_norm": 0.831367439521069, + "learning_rate": 1.7056805025521258e-05, + "loss": 0.3331, + "step": 5100 + }, + { + "epoch": 1.0055205047318612, + "grad_norm": 1.070362368316711, + "learning_rate": 1.705570670055376e-05, + "loss": 0.316, + "step": 5101 + }, + { + "epoch": 1.0057176656151419, + "grad_norm": 0.5858155470361337, + "learning_rate": 1.7054608206067225e-05, + "loss": 0.3482, + "step": 5102 + }, + { + "epoch": 1.0059148264984228, + "grad_norm": 1.0138659359886917, + "learning_rate": 1.705350954208805e-05, + "loss": 0.3311, + "step": 5103 + }, + { + "epoch": 1.0061119873817035, + "grad_norm": 0.5577957395835643, + "learning_rate": 1.705241070864262e-05, + "loss": 0.3392, + "step": 5104 + }, + { + "epoch": 1.0063091482649842, + "grad_norm": 0.5829503084317307, + "learning_rate": 1.7051311705757353e-05, + "loss": 0.3484, + "step": 5105 + }, + { + "epoch": 1.0065063091482649, + "grad_norm": 0.5629335133052799, + "learning_rate": 1.7050212533458637e-05, + "loss": 0.3403, + "step": 5106 + }, + { + "epoch": 1.0067034700315458, + "grad_norm": 0.545530969036501, + "learning_rate": 1.7049113191772892e-05, + "loss": 0.3129, + "step": 5107 + }, + { + "epoch": 1.0069006309148265, + "grad_norm": 0.5766133922035589, + "learning_rate": 1.7048013680726524e-05, + "loss": 0.3228, + "step": 5108 + }, + { + "epoch": 1.0070977917981072, + "grad_norm": 0.49331230147259286, + "learning_rate": 1.7046914000345955e-05, + "loss": 0.2823, + "step": 5109 + }, + { + "epoch": 1.007294952681388, + "grad_norm": 10.932286242518197, + "learning_rate": 1.7045814150657597e-05, + "loss": 0.3303, + "step": 5110 + }, + { + "epoch": 1.0074921135646688, + "grad_norm": 0.6743010499249699, + "learning_rate": 1.704471413168788e-05, + "loss": 0.3184, + "step": 5111 + }, + { + "epoch": 1.0076892744479495, + "grad_norm": 0.5417814723560809, + "learning_rate": 1.7043613943463236e-05, + "loss": 0.3313, + "step": 5112 + }, + { + "epoch": 1.0078864353312302, + "grad_norm": 0.6255338295712346, + "learning_rate": 1.7042513586010096e-05, + "loss": 0.3199, + "step": 5113 + }, + { + "epoch": 1.008083596214511, + "grad_norm": 0.5321460876030243, + "learning_rate": 1.7041413059354893e-05, + "loss": 0.3015, + "step": 5114 + }, + { + "epoch": 1.0082807570977919, + "grad_norm": 0.5526276788887993, + "learning_rate": 1.704031236352407e-05, + "loss": 0.3266, + "step": 5115 + }, + { + "epoch": 1.0084779179810726, + "grad_norm": 0.6188868104429317, + "learning_rate": 1.7039211498544075e-05, + "loss": 0.347, + "step": 5116 + }, + { + "epoch": 1.0086750788643533, + "grad_norm": 0.5552679635253073, + "learning_rate": 1.7038110464441354e-05, + "loss": 0.3225, + "step": 5117 + }, + { + "epoch": 1.008872239747634, + "grad_norm": 0.525568513553716, + "learning_rate": 1.703700926124236e-05, + "loss": 0.3194, + "step": 5118 + }, + { + "epoch": 1.0090694006309149, + "grad_norm": 0.6134298315197882, + "learning_rate": 1.7035907888973556e-05, + "loss": 0.3509, + "step": 5119 + }, + { + "epoch": 1.0092665615141956, + "grad_norm": 0.5390374770864321, + "learning_rate": 1.7034806347661398e-05, + "loss": 0.3143, + "step": 5120 + }, + { + "epoch": 1.0094637223974763, + "grad_norm": 0.5293103732186882, + "learning_rate": 1.703370463733235e-05, + "loss": 0.3249, + "step": 5121 + }, + { + "epoch": 1.0096608832807572, + "grad_norm": 0.7725038009082341, + "learning_rate": 1.7032602758012884e-05, + "loss": 0.3411, + "step": 5122 + }, + { + "epoch": 1.009858044164038, + "grad_norm": 0.5621329825507172, + "learning_rate": 1.703150070972947e-05, + "loss": 0.3549, + "step": 5123 + }, + { + "epoch": 1.0100552050473186, + "grad_norm": 0.5337193860173397, + "learning_rate": 1.7030398492508595e-05, + "loss": 0.3117, + "step": 5124 + }, + { + "epoch": 1.0102523659305993, + "grad_norm": 0.5648883470891011, + "learning_rate": 1.7029296106376732e-05, + "loss": 0.3125, + "step": 5125 + }, + { + "epoch": 1.0104495268138802, + "grad_norm": 0.5193119110839971, + "learning_rate": 1.702819355136037e-05, + "loss": 0.3176, + "step": 5126 + }, + { + "epoch": 1.010646687697161, + "grad_norm": 1.1538498444708232, + "learning_rate": 1.7027090827486e-05, + "loss": 0.3035, + "step": 5127 + }, + { + "epoch": 1.0108438485804416, + "grad_norm": 0.5536713733939184, + "learning_rate": 1.702598793478011e-05, + "loss": 0.3113, + "step": 5128 + }, + { + "epoch": 1.0110410094637223, + "grad_norm": 0.5121398955428231, + "learning_rate": 1.7024884873269206e-05, + "loss": 0.2913, + "step": 5129 + }, + { + "epoch": 1.0112381703470033, + "grad_norm": 0.7433927419882325, + "learning_rate": 1.7023781642979786e-05, + "loss": 0.3399, + "step": 5130 + }, + { + "epoch": 1.011435331230284, + "grad_norm": 0.5206836608515906, + "learning_rate": 1.7022678243938352e-05, + "loss": 0.2917, + "step": 5131 + }, + { + "epoch": 1.0116324921135647, + "grad_norm": 0.652680038981418, + "learning_rate": 1.7021574676171418e-05, + "loss": 0.3144, + "step": 5132 + }, + { + "epoch": 1.0118296529968454, + "grad_norm": 0.5271509086644123, + "learning_rate": 1.70204709397055e-05, + "loss": 0.3219, + "step": 5133 + }, + { + "epoch": 1.0120268138801263, + "grad_norm": 0.6052127133577494, + "learning_rate": 1.7019367034567115e-05, + "loss": 0.3163, + "step": 5134 + }, + { + "epoch": 1.012223974763407, + "grad_norm": 0.4968211645367436, + "learning_rate": 1.7018262960782783e-05, + "loss": 0.312, + "step": 5135 + }, + { + "epoch": 1.0124211356466877, + "grad_norm": 0.5506646508851154, + "learning_rate": 1.701715871837903e-05, + "loss": 0.3277, + "step": 5136 + }, + { + "epoch": 1.0126182965299684, + "grad_norm": 0.5474780371203971, + "learning_rate": 1.7016054307382387e-05, + "loss": 0.3104, + "step": 5137 + }, + { + "epoch": 1.0128154574132493, + "grad_norm": 0.5652503845654994, + "learning_rate": 1.7014949727819395e-05, + "loss": 0.3375, + "step": 5138 + }, + { + "epoch": 1.01301261829653, + "grad_norm": 0.5762231960940658, + "learning_rate": 1.701384497971658e-05, + "loss": 0.3515, + "step": 5139 + }, + { + "epoch": 1.0132097791798107, + "grad_norm": 0.5555119994103955, + "learning_rate": 1.7012740063100495e-05, + "loss": 0.304, + "step": 5140 + }, + { + "epoch": 1.0134069400630914, + "grad_norm": 0.5638374760070589, + "learning_rate": 1.7011634977997683e-05, + "loss": 0.3496, + "step": 5141 + }, + { + "epoch": 1.0136041009463723, + "grad_norm": 0.5643948965503662, + "learning_rate": 1.701052972443469e-05, + "loss": 0.3397, + "step": 5142 + }, + { + "epoch": 1.013801261829653, + "grad_norm": 0.5135163466842045, + "learning_rate": 1.700942430243808e-05, + "loss": 0.301, + "step": 5143 + }, + { + "epoch": 1.0139984227129337, + "grad_norm": 0.5539748285463232, + "learning_rate": 1.7008318712034405e-05, + "loss": 0.3244, + "step": 5144 + }, + { + "epoch": 1.0141955835962144, + "grad_norm": 0.5740027085216346, + "learning_rate": 1.700721295325023e-05, + "loss": 0.314, + "step": 5145 + }, + { + "epoch": 1.0143927444794953, + "grad_norm": 0.4898966401251555, + "learning_rate": 1.7006107026112117e-05, + "loss": 0.3106, + "step": 5146 + }, + { + "epoch": 1.014589905362776, + "grad_norm": 0.5414399089916463, + "learning_rate": 1.7005000930646643e-05, + "loss": 0.2973, + "step": 5147 + }, + { + "epoch": 1.0147870662460567, + "grad_norm": 0.5389719439899145, + "learning_rate": 1.700389466688038e-05, + "loss": 0.3285, + "step": 5148 + }, + { + "epoch": 1.0149842271293374, + "grad_norm": 0.5180514047808813, + "learning_rate": 1.7002788234839908e-05, + "loss": 0.3141, + "step": 5149 + }, + { + "epoch": 1.0151813880126184, + "grad_norm": 0.5437706206122538, + "learning_rate": 1.7001681634551813e-05, + "loss": 0.3277, + "step": 5150 + }, + { + "epoch": 1.015378548895899, + "grad_norm": 1.396200116042807, + "learning_rate": 1.700057486604267e-05, + "loss": 0.3558, + "step": 5151 + }, + { + "epoch": 1.0155757097791798, + "grad_norm": 0.563327018363544, + "learning_rate": 1.6999467929339086e-05, + "loss": 0.3421, + "step": 5152 + }, + { + "epoch": 1.0157728706624605, + "grad_norm": 0.5184359383889053, + "learning_rate": 1.6998360824467644e-05, + "loss": 0.3295, + "step": 5153 + }, + { + "epoch": 1.0159700315457414, + "grad_norm": 0.5208557657594138, + "learning_rate": 1.6997253551454948e-05, + "loss": 0.2896, + "step": 5154 + }, + { + "epoch": 1.016167192429022, + "grad_norm": 0.5291211490149857, + "learning_rate": 1.6996146110327604e-05, + "loss": 0.3234, + "step": 5155 + }, + { + "epoch": 1.0163643533123028, + "grad_norm": 0.7659645289802868, + "learning_rate": 1.699503850111221e-05, + "loss": 0.3266, + "step": 5156 + }, + { + "epoch": 1.0165615141955835, + "grad_norm": 0.5341642375091761, + "learning_rate": 1.699393072383539e-05, + "loss": 0.3227, + "step": 5157 + }, + { + "epoch": 1.0167586750788644, + "grad_norm": 0.5237990761161321, + "learning_rate": 1.6992822778523745e-05, + "loss": 0.3156, + "step": 5158 + }, + { + "epoch": 1.0169558359621451, + "grad_norm": 0.5556636270933202, + "learning_rate": 1.6991714665203905e-05, + "loss": 0.3382, + "step": 5159 + }, + { + "epoch": 1.0171529968454258, + "grad_norm": 0.5169961857756208, + "learning_rate": 1.699060638390249e-05, + "loss": 0.3083, + "step": 5160 + }, + { + "epoch": 1.0173501577287065, + "grad_norm": 0.577712432873463, + "learning_rate": 1.6989497934646128e-05, + "loss": 0.3235, + "step": 5161 + }, + { + "epoch": 1.0175473186119874, + "grad_norm": 0.48166753712469484, + "learning_rate": 1.6988389317461448e-05, + "loss": 0.3296, + "step": 5162 + }, + { + "epoch": 1.0177444794952681, + "grad_norm": 0.5588468932069051, + "learning_rate": 1.6987280532375082e-05, + "loss": 0.3205, + "step": 5163 + }, + { + "epoch": 1.0179416403785488, + "grad_norm": 0.5022075140943705, + "learning_rate": 1.698617157941368e-05, + "loss": 0.3183, + "step": 5164 + }, + { + "epoch": 1.0181388012618298, + "grad_norm": 0.5824010042694542, + "learning_rate": 1.698506245860388e-05, + "loss": 0.3339, + "step": 5165 + }, + { + "epoch": 1.0183359621451105, + "grad_norm": 0.48671865197581843, + "learning_rate": 1.6983953169972333e-05, + "loss": 0.3063, + "step": 5166 + }, + { + "epoch": 1.0185331230283912, + "grad_norm": 0.5664051734703313, + "learning_rate": 1.6982843713545678e-05, + "loss": 0.318, + "step": 5167 + }, + { + "epoch": 1.0187302839116719, + "grad_norm": 0.5307701667465677, + "learning_rate": 1.6981734089350585e-05, + "loss": 0.3375, + "step": 5168 + }, + { + "epoch": 1.0189274447949528, + "grad_norm": 0.5241759654651865, + "learning_rate": 1.698062429741371e-05, + "loss": 0.3219, + "step": 5169 + }, + { + "epoch": 1.0191246056782335, + "grad_norm": 0.5450182402988996, + "learning_rate": 1.697951433776171e-05, + "loss": 0.3373, + "step": 5170 + }, + { + "epoch": 1.0193217665615142, + "grad_norm": 0.5065193637889227, + "learning_rate": 1.6978404210421257e-05, + "loss": 0.3224, + "step": 5171 + }, + { + "epoch": 1.0195189274447949, + "grad_norm": 0.5440437100960368, + "learning_rate": 1.6977293915419025e-05, + "loss": 0.346, + "step": 5172 + }, + { + "epoch": 1.0197160883280758, + "grad_norm": 0.5167797629051604, + "learning_rate": 1.697618345278169e-05, + "loss": 0.3114, + "step": 5173 + }, + { + "epoch": 1.0199132492113565, + "grad_norm": 0.5242186802344402, + "learning_rate": 1.6975072822535924e-05, + "loss": 0.3092, + "step": 5174 + }, + { + "epoch": 1.0201104100946372, + "grad_norm": 0.5201227294258078, + "learning_rate": 1.6973962024708425e-05, + "loss": 0.331, + "step": 5175 + }, + { + "epoch": 1.020307570977918, + "grad_norm": 0.5072548524687776, + "learning_rate": 1.6972851059325866e-05, + "loss": 0.3226, + "step": 5176 + }, + { + "epoch": 1.0205047318611988, + "grad_norm": 0.5003313380328426, + "learning_rate": 1.6971739926414946e-05, + "loss": 0.3188, + "step": 5177 + }, + { + "epoch": 1.0207018927444795, + "grad_norm": 0.5360098965820386, + "learning_rate": 1.6970628626002362e-05, + "loss": 0.3379, + "step": 5178 + }, + { + "epoch": 1.0208990536277602, + "grad_norm": 0.5006169156718928, + "learning_rate": 1.6969517158114807e-05, + "loss": 0.33, + "step": 5179 + }, + { + "epoch": 1.021096214511041, + "grad_norm": 0.4797157904337746, + "learning_rate": 1.6968405522778996e-05, + "loss": 0.3035, + "step": 5180 + }, + { + "epoch": 1.0212933753943219, + "grad_norm": 0.5189338955937625, + "learning_rate": 1.6967293720021628e-05, + "loss": 0.3063, + "step": 5181 + }, + { + "epoch": 1.0214905362776026, + "grad_norm": 0.5284709213206259, + "learning_rate": 1.6966181749869417e-05, + "loss": 0.3307, + "step": 5182 + }, + { + "epoch": 1.0216876971608833, + "grad_norm": 0.5586108885453471, + "learning_rate": 1.6965069612349082e-05, + "loss": 0.3354, + "step": 5183 + }, + { + "epoch": 1.021884858044164, + "grad_norm": 0.4825892059596201, + "learning_rate": 1.6963957307487337e-05, + "loss": 0.3116, + "step": 5184 + }, + { + "epoch": 1.0220820189274449, + "grad_norm": 0.5626137147948211, + "learning_rate": 1.6962844835310912e-05, + "loss": 0.3449, + "step": 5185 + }, + { + "epoch": 1.0222791798107256, + "grad_norm": 0.4957252119548732, + "learning_rate": 1.696173219584653e-05, + "loss": 0.3201, + "step": 5186 + }, + { + "epoch": 1.0224763406940063, + "grad_norm": 0.5997034402929716, + "learning_rate": 1.696061938912093e-05, + "loss": 0.3329, + "step": 5187 + }, + { + "epoch": 1.022673501577287, + "grad_norm": 0.5074547201736874, + "learning_rate": 1.6959506415160838e-05, + "loss": 0.3124, + "step": 5188 + }, + { + "epoch": 1.022870662460568, + "grad_norm": 0.525820644885496, + "learning_rate": 1.6958393273993004e-05, + "loss": 0.3149, + "step": 5189 + }, + { + "epoch": 1.0230678233438486, + "grad_norm": 0.49149463015492956, + "learning_rate": 1.695727996564417e-05, + "loss": 0.3033, + "step": 5190 + }, + { + "epoch": 1.0232649842271293, + "grad_norm": 0.5812486165845497, + "learning_rate": 1.6956166490141076e-05, + "loss": 0.3592, + "step": 5191 + }, + { + "epoch": 1.02346214511041, + "grad_norm": 0.47448602856705274, + "learning_rate": 1.6955052847510486e-05, + "loss": 0.298, + "step": 5192 + }, + { + "epoch": 1.023659305993691, + "grad_norm": 0.4933808645190031, + "learning_rate": 1.6953939037779147e-05, + "loss": 0.3174, + "step": 5193 + }, + { + "epoch": 1.0238564668769716, + "grad_norm": 0.49672191971801205, + "learning_rate": 1.6952825060973826e-05, + "loss": 0.3035, + "step": 5194 + }, + { + "epoch": 1.0240536277602523, + "grad_norm": 0.5303109710476851, + "learning_rate": 1.695171091712128e-05, + "loss": 0.3174, + "step": 5195 + }, + { + "epoch": 1.024250788643533, + "grad_norm": 0.4983343105685653, + "learning_rate": 1.6950596606248283e-05, + "loss": 0.325, + "step": 5196 + }, + { + "epoch": 1.024447949526814, + "grad_norm": 0.5149354624056497, + "learning_rate": 1.6949482128381607e-05, + "loss": 0.3172, + "step": 5197 + }, + { + "epoch": 1.0246451104100947, + "grad_norm": 0.505149696080138, + "learning_rate": 1.694836748354802e-05, + "loss": 0.3276, + "step": 5198 + }, + { + "epoch": 1.0248422712933754, + "grad_norm": 0.5800181685156677, + "learning_rate": 1.6947252671774317e-05, + "loss": 0.3669, + "step": 5199 + }, + { + "epoch": 1.025039432176656, + "grad_norm": 0.530041440210273, + "learning_rate": 1.694613769308727e-05, + "loss": 0.3162, + "step": 5200 + }, + { + "epoch": 1.025236593059937, + "grad_norm": 0.5328709074168889, + "learning_rate": 1.6945022547513672e-05, + "loss": 0.3364, + "step": 5201 + }, + { + "epoch": 1.0254337539432177, + "grad_norm": 0.5659050716130124, + "learning_rate": 1.694390723508031e-05, + "loss": 0.352, + "step": 5202 + }, + { + "epoch": 1.0256309148264984, + "grad_norm": 0.47684840324515954, + "learning_rate": 1.694279175581399e-05, + "loss": 0.2927, + "step": 5203 + }, + { + "epoch": 1.025828075709779, + "grad_norm": 0.5261599552118007, + "learning_rate": 1.6941676109741506e-05, + "loss": 0.3474, + "step": 5204 + }, + { + "epoch": 1.02602523659306, + "grad_norm": 0.5295012959661953, + "learning_rate": 1.694056029688966e-05, + "loss": 0.3451, + "step": 5205 + }, + { + "epoch": 1.0262223974763407, + "grad_norm": 0.520003663405905, + "learning_rate": 1.693944431728527e-05, + "loss": 0.3223, + "step": 5206 + }, + { + "epoch": 1.0264195583596214, + "grad_norm": 0.4972533544450635, + "learning_rate": 1.693832817095514e-05, + "loss": 0.329, + "step": 5207 + }, + { + "epoch": 1.0266167192429023, + "grad_norm": 0.49743818639590465, + "learning_rate": 1.6937211857926087e-05, + "loss": 0.3246, + "step": 5208 + }, + { + "epoch": 1.026813880126183, + "grad_norm": 0.5108957204240979, + "learning_rate": 1.693609537822493e-05, + "loss": 0.3604, + "step": 5209 + }, + { + "epoch": 1.0270110410094637, + "grad_norm": 0.5027559180142284, + "learning_rate": 1.69349787318785e-05, + "loss": 0.3127, + "step": 5210 + }, + { + "epoch": 1.0272082018927444, + "grad_norm": 0.5060763508870827, + "learning_rate": 1.6933861918913617e-05, + "loss": 0.3157, + "step": 5211 + }, + { + "epoch": 1.0274053627760253, + "grad_norm": 0.5587497322838068, + "learning_rate": 1.693274493935712e-05, + "loss": 0.3335, + "step": 5212 + }, + { + "epoch": 1.027602523659306, + "grad_norm": 0.5328215936145846, + "learning_rate": 1.6931627793235845e-05, + "loss": 0.3401, + "step": 5213 + }, + { + "epoch": 1.0277996845425867, + "grad_norm": 0.4746075756485485, + "learning_rate": 1.693051048057663e-05, + "loss": 0.2946, + "step": 5214 + }, + { + "epoch": 1.0279968454258674, + "grad_norm": 0.5138896173325503, + "learning_rate": 1.6929393001406317e-05, + "loss": 0.3183, + "step": 5215 + }, + { + "epoch": 1.0281940063091484, + "grad_norm": 0.6224304648800318, + "learning_rate": 1.6928275355751758e-05, + "loss": 0.3385, + "step": 5216 + }, + { + "epoch": 1.028391167192429, + "grad_norm": 0.4778598347974904, + "learning_rate": 1.69271575436398e-05, + "loss": 0.2942, + "step": 5217 + }, + { + "epoch": 1.0285883280757098, + "grad_norm": 0.5206657983017704, + "learning_rate": 1.6926039565097313e-05, + "loss": 0.3232, + "step": 5218 + }, + { + "epoch": 1.0287854889589905, + "grad_norm": 0.517735918520925, + "learning_rate": 1.6924921420151143e-05, + "loss": 0.3415, + "step": 5219 + }, + { + "epoch": 1.0289826498422714, + "grad_norm": 0.49436293737722586, + "learning_rate": 1.6923803108828155e-05, + "loss": 0.2997, + "step": 5220 + }, + { + "epoch": 1.029179810725552, + "grad_norm": 0.48751091503344113, + "learning_rate": 1.6922684631155226e-05, + "loss": 0.3047, + "step": 5221 + }, + { + "epoch": 1.0293769716088328, + "grad_norm": 0.520881620751667, + "learning_rate": 1.6921565987159226e-05, + "loss": 0.335, + "step": 5222 + }, + { + "epoch": 1.0295741324921135, + "grad_norm": 0.6347905848505744, + "learning_rate": 1.6920447176867022e-05, + "loss": 0.2997, + "step": 5223 + }, + { + "epoch": 1.0297712933753944, + "grad_norm": 0.5493020000879799, + "learning_rate": 1.6919328200305507e-05, + "loss": 0.3646, + "step": 5224 + }, + { + "epoch": 1.0299684542586751, + "grad_norm": 0.5057286317589824, + "learning_rate": 1.691820905750156e-05, + "loss": 0.3125, + "step": 5225 + }, + { + "epoch": 1.0301656151419558, + "grad_norm": 0.5040769564600079, + "learning_rate": 1.691708974848207e-05, + "loss": 0.3069, + "step": 5226 + }, + { + "epoch": 1.0303627760252365, + "grad_norm": 0.5449234431968593, + "learning_rate": 1.6915970273273927e-05, + "loss": 0.3322, + "step": 5227 + }, + { + "epoch": 1.0305599369085174, + "grad_norm": 0.5763354754529781, + "learning_rate": 1.6914850631904027e-05, + "loss": 0.3389, + "step": 5228 + }, + { + "epoch": 1.0307570977917981, + "grad_norm": 0.51550599843538, + "learning_rate": 1.6913730824399274e-05, + "loss": 0.3618, + "step": 5229 + }, + { + "epoch": 1.0309542586750788, + "grad_norm": 0.5422495461192716, + "learning_rate": 1.691261085078657e-05, + "loss": 0.328, + "step": 5230 + }, + { + "epoch": 1.0311514195583595, + "grad_norm": 0.49409053708036643, + "learning_rate": 1.6911490711092824e-05, + "loss": 0.3175, + "step": 5231 + }, + { + "epoch": 1.0313485804416405, + "grad_norm": 0.5082339134425249, + "learning_rate": 1.6910370405344948e-05, + "loss": 0.3372, + "step": 5232 + }, + { + "epoch": 1.0315457413249212, + "grad_norm": 0.5304668788200992, + "learning_rate": 1.6909249933569856e-05, + "loss": 0.3237, + "step": 5233 + }, + { + "epoch": 1.0317429022082019, + "grad_norm": 0.5133955618334345, + "learning_rate": 1.690812929579447e-05, + "loss": 0.3391, + "step": 5234 + }, + { + "epoch": 1.0319400630914826, + "grad_norm": 0.5657397770422078, + "learning_rate": 1.690700849204572e-05, + "loss": 0.3315, + "step": 5235 + }, + { + "epoch": 1.0321372239747635, + "grad_norm": 0.5017068565390534, + "learning_rate": 1.6905887522350522e-05, + "loss": 0.3137, + "step": 5236 + }, + { + "epoch": 1.0323343848580442, + "grad_norm": 0.49468986516238367, + "learning_rate": 1.690476638673582e-05, + "loss": 0.3166, + "step": 5237 + }, + { + "epoch": 1.0325315457413249, + "grad_norm": 0.4900916730092504, + "learning_rate": 1.690364508522854e-05, + "loss": 0.3041, + "step": 5238 + }, + { + "epoch": 1.0327287066246056, + "grad_norm": 0.4786911698428022, + "learning_rate": 1.6902523617855633e-05, + "loss": 0.3154, + "step": 5239 + }, + { + "epoch": 1.0329258675078865, + "grad_norm": 0.5146263841992783, + "learning_rate": 1.6901401984644034e-05, + "loss": 0.2831, + "step": 5240 + }, + { + "epoch": 1.0331230283911672, + "grad_norm": 0.5189552944117162, + "learning_rate": 1.6900280185620697e-05, + "loss": 0.31, + "step": 5241 + }, + { + "epoch": 1.033320189274448, + "grad_norm": 0.5532426804255108, + "learning_rate": 1.689915822081257e-05, + "loss": 0.329, + "step": 5242 + }, + { + "epoch": 1.0335173501577286, + "grad_norm": 0.8076083652448988, + "learning_rate": 1.689803609024661e-05, + "loss": 0.3378, + "step": 5243 + }, + { + "epoch": 1.0337145110410095, + "grad_norm": 0.5725048421083666, + "learning_rate": 1.6896913793949782e-05, + "loss": 0.3247, + "step": 5244 + }, + { + "epoch": 1.0339116719242902, + "grad_norm": 0.49855659279047376, + "learning_rate": 1.6895791331949045e-05, + "loss": 0.32, + "step": 5245 + }, + { + "epoch": 1.034108832807571, + "grad_norm": 0.6416688834979003, + "learning_rate": 1.6894668704271363e-05, + "loss": 0.3466, + "step": 5246 + }, + { + "epoch": 1.0343059936908516, + "grad_norm": 0.51271995997633, + "learning_rate": 1.6893545910943718e-05, + "loss": 0.3314, + "step": 5247 + }, + { + "epoch": 1.0345031545741326, + "grad_norm": 0.5357629294597502, + "learning_rate": 1.689242295199308e-05, + "loss": 0.3238, + "step": 5248 + }, + { + "epoch": 1.0347003154574133, + "grad_norm": 2.907322545828229, + "learning_rate": 1.6891299827446428e-05, + "loss": 0.38, + "step": 5249 + }, + { + "epoch": 1.034897476340694, + "grad_norm": 0.5492499750569434, + "learning_rate": 1.689017653733075e-05, + "loss": 0.3136, + "step": 5250 + }, + { + "epoch": 1.0350946372239747, + "grad_norm": 0.5411422818363292, + "learning_rate": 1.688905308167303e-05, + "loss": 0.3282, + "step": 5251 + }, + { + "epoch": 1.0352917981072556, + "grad_norm": 0.5143337197907343, + "learning_rate": 1.6887929460500264e-05, + "loss": 0.2836, + "step": 5252 + }, + { + "epoch": 1.0354889589905363, + "grad_norm": 0.5153934469793959, + "learning_rate": 1.688680567383945e-05, + "loss": 0.3336, + "step": 5253 + }, + { + "epoch": 1.035686119873817, + "grad_norm": 0.5296091136769037, + "learning_rate": 1.6885681721717575e-05, + "loss": 0.3246, + "step": 5254 + }, + { + "epoch": 1.0358832807570977, + "grad_norm": 0.5144127492160583, + "learning_rate": 1.688455760416166e-05, + "loss": 0.3193, + "step": 5255 + }, + { + "epoch": 1.0360804416403786, + "grad_norm": 0.5575440600913101, + "learning_rate": 1.6883433321198697e-05, + "loss": 0.3244, + "step": 5256 + }, + { + "epoch": 1.0362776025236593, + "grad_norm": 0.4886892524188935, + "learning_rate": 1.688230887285571e-05, + "loss": 0.2849, + "step": 5257 + }, + { + "epoch": 1.03647476340694, + "grad_norm": 0.5024265088647863, + "learning_rate": 1.6881184259159708e-05, + "loss": 0.3105, + "step": 5258 + }, + { + "epoch": 1.036671924290221, + "grad_norm": 0.5130152621541834, + "learning_rate": 1.6880059480137715e-05, + "loss": 0.3275, + "step": 5259 + }, + { + "epoch": 1.0368690851735016, + "grad_norm": 0.4909264681540603, + "learning_rate": 1.687893453581675e-05, + "loss": 0.3102, + "step": 5260 + }, + { + "epoch": 1.0370662460567823, + "grad_norm": 0.5196024238559768, + "learning_rate": 1.6877809426223846e-05, + "loss": 0.3231, + "step": 5261 + }, + { + "epoch": 1.037263406940063, + "grad_norm": 0.5799374687724274, + "learning_rate": 1.6876684151386028e-05, + "loss": 0.3441, + "step": 5262 + }, + { + "epoch": 1.037460567823344, + "grad_norm": 0.5397818543610164, + "learning_rate": 1.687555871133034e-05, + "loss": 0.3306, + "step": 5263 + }, + { + "epoch": 1.0376577287066246, + "grad_norm": 8.650236468601472, + "learning_rate": 1.6874433106083815e-05, + "loss": 0.3565, + "step": 5264 + }, + { + "epoch": 1.0378548895899053, + "grad_norm": 0.653217018292931, + "learning_rate": 1.6873307335673498e-05, + "loss": 0.3491, + "step": 5265 + }, + { + "epoch": 1.038052050473186, + "grad_norm": 0.5706483442294323, + "learning_rate": 1.6872181400126434e-05, + "loss": 0.3686, + "step": 5266 + }, + { + "epoch": 1.038249211356467, + "grad_norm": 1.8045203865710944, + "learning_rate": 1.6871055299469686e-05, + "loss": 0.3007, + "step": 5267 + }, + { + "epoch": 1.0384463722397477, + "grad_norm": 0.5787568141280821, + "learning_rate": 1.6869929033730294e-05, + "loss": 0.339, + "step": 5268 + }, + { + "epoch": 1.0386435331230284, + "grad_norm": 0.5659159080692836, + "learning_rate": 1.6868802602935327e-05, + "loss": 0.3399, + "step": 5269 + }, + { + "epoch": 1.038840694006309, + "grad_norm": 0.6818154220606805, + "learning_rate": 1.6867676007111847e-05, + "loss": 0.3008, + "step": 5270 + }, + { + "epoch": 1.03903785488959, + "grad_norm": 0.617492098823351, + "learning_rate": 1.6866549246286918e-05, + "loss": 0.3438, + "step": 5271 + }, + { + "epoch": 1.0392350157728707, + "grad_norm": 0.49987762250168705, + "learning_rate": 1.6865422320487617e-05, + "loss": 0.295, + "step": 5272 + }, + { + "epoch": 1.0394321766561514, + "grad_norm": 0.5542454344482823, + "learning_rate": 1.6864295229741014e-05, + "loss": 0.3195, + "step": 5273 + }, + { + "epoch": 1.039629337539432, + "grad_norm": 0.5477043440442139, + "learning_rate": 1.686316797407419e-05, + "loss": 0.3124, + "step": 5274 + }, + { + "epoch": 1.039826498422713, + "grad_norm": 0.541129200628956, + "learning_rate": 1.6862040553514227e-05, + "loss": 0.3037, + "step": 5275 + }, + { + "epoch": 1.0400236593059937, + "grad_norm": 0.5301479545638493, + "learning_rate": 1.686091296808822e-05, + "loss": 0.3073, + "step": 5276 + }, + { + "epoch": 1.0402208201892744, + "grad_norm": 0.5425477118739194, + "learning_rate": 1.6859785217823247e-05, + "loss": 0.3094, + "step": 5277 + }, + { + "epoch": 1.0404179810725551, + "grad_norm": 20.30229094176611, + "learning_rate": 1.6858657302746412e-05, + "loss": 0.3857, + "step": 5278 + }, + { + "epoch": 1.040615141955836, + "grad_norm": 0.5535859868074469, + "learning_rate": 1.6857529222884813e-05, + "loss": 0.3197, + "step": 5279 + }, + { + "epoch": 1.0408123028391167, + "grad_norm": 0.5253753488109291, + "learning_rate": 1.685640097826555e-05, + "loss": 0.3229, + "step": 5280 + }, + { + "epoch": 1.0410094637223974, + "grad_norm": 0.5265804552077402, + "learning_rate": 1.6855272568915738e-05, + "loss": 0.3349, + "step": 5281 + }, + { + "epoch": 1.0412066246056781, + "grad_norm": 0.5239300594186814, + "learning_rate": 1.6854143994862476e-05, + "loss": 0.3265, + "step": 5282 + }, + { + "epoch": 1.041403785488959, + "grad_norm": 0.503539634551038, + "learning_rate": 1.685301525613289e-05, + "loss": 0.3356, + "step": 5283 + }, + { + "epoch": 1.0416009463722398, + "grad_norm": 0.540113881476233, + "learning_rate": 1.685188635275409e-05, + "loss": 0.3271, + "step": 5284 + }, + { + "epoch": 1.0417981072555205, + "grad_norm": 0.5497912101764829, + "learning_rate": 1.6850757284753202e-05, + "loss": 0.3238, + "step": 5285 + }, + { + "epoch": 1.0419952681388012, + "grad_norm": 0.5419609896862225, + "learning_rate": 1.6849628052157353e-05, + "loss": 0.328, + "step": 5286 + }, + { + "epoch": 1.042192429022082, + "grad_norm": 0.5282911482063344, + "learning_rate": 1.6848498654993676e-05, + "loss": 0.3391, + "step": 5287 + }, + { + "epoch": 1.0423895899053628, + "grad_norm": 0.5365488857264754, + "learning_rate": 1.6847369093289304e-05, + "loss": 0.3443, + "step": 5288 + }, + { + "epoch": 1.0425867507886435, + "grad_norm": 0.4884175319952028, + "learning_rate": 1.6846239367071372e-05, + "loss": 0.33, + "step": 5289 + }, + { + "epoch": 1.0427839116719242, + "grad_norm": 0.564983002840711, + "learning_rate": 1.684510947636703e-05, + "loss": 0.3469, + "step": 5290 + }, + { + "epoch": 1.0429810725552051, + "grad_norm": 0.5449133450818839, + "learning_rate": 1.684397942120342e-05, + "loss": 0.3259, + "step": 5291 + }, + { + "epoch": 1.0431782334384858, + "grad_norm": 0.5622289793546967, + "learning_rate": 1.684284920160769e-05, + "loss": 0.3485, + "step": 5292 + }, + { + "epoch": 1.0433753943217665, + "grad_norm": 0.5340801322832177, + "learning_rate": 1.6841718817607003e-05, + "loss": 0.3358, + "step": 5293 + }, + { + "epoch": 1.0435725552050472, + "grad_norm": 0.6083293183480287, + "learning_rate": 1.6840588269228507e-05, + "loss": 0.3317, + "step": 5294 + }, + { + "epoch": 1.0437697160883281, + "grad_norm": 0.4950617147026909, + "learning_rate": 1.6839457556499372e-05, + "loss": 0.3276, + "step": 5295 + }, + { + "epoch": 1.0439668769716088, + "grad_norm": 0.5662216265381399, + "learning_rate": 1.6838326679446756e-05, + "loss": 0.3515, + "step": 5296 + }, + { + "epoch": 1.0441640378548895, + "grad_norm": 0.5275768249043005, + "learning_rate": 1.683719563809784e-05, + "loss": 0.3291, + "step": 5297 + }, + { + "epoch": 1.0443611987381702, + "grad_norm": 0.5363502739274106, + "learning_rate": 1.683606443247979e-05, + "loss": 0.3348, + "step": 5298 + }, + { + "epoch": 1.0445583596214512, + "grad_norm": 0.5434941004906498, + "learning_rate": 1.6834933062619788e-05, + "loss": 0.3339, + "step": 5299 + }, + { + "epoch": 1.0447555205047319, + "grad_norm": 0.49586659679323425, + "learning_rate": 1.6833801528545016e-05, + "loss": 0.3006, + "step": 5300 + }, + { + "epoch": 1.0449526813880126, + "grad_norm": 0.5211848831043154, + "learning_rate": 1.6832669830282658e-05, + "loss": 0.3197, + "step": 5301 + }, + { + "epoch": 1.0451498422712935, + "grad_norm": 0.531819238704122, + "learning_rate": 1.6831537967859904e-05, + "loss": 0.3438, + "step": 5302 + }, + { + "epoch": 1.0453470031545742, + "grad_norm": 0.5489480069934828, + "learning_rate": 1.6830405941303948e-05, + "loss": 0.3295, + "step": 5303 + }, + { + "epoch": 1.0455441640378549, + "grad_norm": 0.5107386270726029, + "learning_rate": 1.6829273750641995e-05, + "loss": 0.3047, + "step": 5304 + }, + { + "epoch": 1.0457413249211356, + "grad_norm": 0.5400652269430585, + "learning_rate": 1.6828141395901236e-05, + "loss": 0.3339, + "step": 5305 + }, + { + "epoch": 1.0459384858044165, + "grad_norm": 0.5346562175161331, + "learning_rate": 1.682700887710888e-05, + "loss": 0.3171, + "step": 5306 + }, + { + "epoch": 1.0461356466876972, + "grad_norm": 0.5453238920962701, + "learning_rate": 1.682587619429214e-05, + "loss": 0.3333, + "step": 5307 + }, + { + "epoch": 1.046332807570978, + "grad_norm": 0.5404341648713517, + "learning_rate": 1.6824743347478224e-05, + "loss": 0.3207, + "step": 5308 + }, + { + "epoch": 1.0465299684542586, + "grad_norm": 0.48716882855128474, + "learning_rate": 1.682361033669436e-05, + "loss": 0.3093, + "step": 5309 + }, + { + "epoch": 1.0467271293375395, + "grad_norm": 0.5182039037513314, + "learning_rate": 1.6822477161967757e-05, + "loss": 0.3075, + "step": 5310 + }, + { + "epoch": 1.0469242902208202, + "grad_norm": 0.5052936292274123, + "learning_rate": 1.682134382332565e-05, + "loss": 0.3087, + "step": 5311 + }, + { + "epoch": 1.047121451104101, + "grad_norm": 0.5089202938636761, + "learning_rate": 1.682021032079526e-05, + "loss": 0.3342, + "step": 5312 + }, + { + "epoch": 1.0473186119873816, + "grad_norm": 0.5487804098811082, + "learning_rate": 1.681907665440383e-05, + "loss": 0.326, + "step": 5313 + }, + { + "epoch": 1.0475157728706626, + "grad_norm": 0.581936936175654, + "learning_rate": 1.6817942824178587e-05, + "loss": 0.3605, + "step": 5314 + }, + { + "epoch": 1.0477129337539433, + "grad_norm": 0.5264515076671745, + "learning_rate": 1.681680883014678e-05, + "loss": 0.3541, + "step": 5315 + }, + { + "epoch": 1.047910094637224, + "grad_norm": 0.49271519940647124, + "learning_rate": 1.6815674672335652e-05, + "loss": 0.3023, + "step": 5316 + }, + { + "epoch": 1.0481072555205047, + "grad_norm": 0.5364492375977938, + "learning_rate": 1.681454035077245e-05, + "loss": 0.3016, + "step": 5317 + }, + { + "epoch": 1.0483044164037856, + "grad_norm": 1.5271525514198456, + "learning_rate": 1.6813405865484426e-05, + "loss": 0.3644, + "step": 5318 + }, + { + "epoch": 1.0485015772870663, + "grad_norm": 0.5513325850528616, + "learning_rate": 1.6812271216498842e-05, + "loss": 0.3354, + "step": 5319 + }, + { + "epoch": 1.048698738170347, + "grad_norm": 0.5098889426381192, + "learning_rate": 1.6811136403842955e-05, + "loss": 0.3291, + "step": 5320 + }, + { + "epoch": 1.0488958990536277, + "grad_norm": 0.5852649954559482, + "learning_rate": 1.681000142754403e-05, + "loss": 0.2934, + "step": 5321 + }, + { + "epoch": 1.0490930599369086, + "grad_norm": 0.5187903351376137, + "learning_rate": 1.680886628762934e-05, + "loss": 0.29, + "step": 5322 + }, + { + "epoch": 1.0492902208201893, + "grad_norm": 0.5150689333720393, + "learning_rate": 1.6807730984126155e-05, + "loss": 0.3159, + "step": 5323 + }, + { + "epoch": 1.04948738170347, + "grad_norm": 0.5865797200646796, + "learning_rate": 1.6806595517061744e-05, + "loss": 0.3345, + "step": 5324 + }, + { + "epoch": 1.0496845425867507, + "grad_norm": 0.5411960758909427, + "learning_rate": 1.68054598864634e-05, + "loss": 0.3213, + "step": 5325 + }, + { + "epoch": 1.0498817034700316, + "grad_norm": 0.5402846419962667, + "learning_rate": 1.6804324092358402e-05, + "loss": 0.32, + "step": 5326 + }, + { + "epoch": 1.0500788643533123, + "grad_norm": 0.5039112744373472, + "learning_rate": 1.6803188134774037e-05, + "loss": 0.3259, + "step": 5327 + }, + { + "epoch": 1.050276025236593, + "grad_norm": 0.5462197388737413, + "learning_rate": 1.6802052013737595e-05, + "loss": 0.3309, + "step": 5328 + }, + { + "epoch": 1.0504731861198737, + "grad_norm": 0.5522764205643538, + "learning_rate": 1.680091572927638e-05, + "loss": 0.3086, + "step": 5329 + }, + { + "epoch": 1.0506703470031546, + "grad_norm": 0.5647445368693557, + "learning_rate": 1.6799779281417685e-05, + "loss": 0.349, + "step": 5330 + }, + { + "epoch": 1.0508675078864353, + "grad_norm": 0.7627307676176236, + "learning_rate": 1.679864267018882e-05, + "loss": 0.3259, + "step": 5331 + }, + { + "epoch": 1.051064668769716, + "grad_norm": 0.5107897902631611, + "learning_rate": 1.6797505895617087e-05, + "loss": 0.2968, + "step": 5332 + }, + { + "epoch": 1.0512618296529967, + "grad_norm": 0.525494347575112, + "learning_rate": 1.6796368957729802e-05, + "loss": 0.3038, + "step": 5333 + }, + { + "epoch": 1.0514589905362777, + "grad_norm": 0.5371290338339879, + "learning_rate": 1.679523185655428e-05, + "loss": 0.3366, + "step": 5334 + }, + { + "epoch": 1.0516561514195584, + "grad_norm": 0.5533840964718997, + "learning_rate": 1.6794094592117846e-05, + "loss": 0.3443, + "step": 5335 + }, + { + "epoch": 1.051853312302839, + "grad_norm": 0.571584471042435, + "learning_rate": 1.6792957164447807e-05, + "loss": 0.3503, + "step": 5336 + }, + { + "epoch": 1.0520504731861198, + "grad_norm": 0.5233336933797986, + "learning_rate": 1.6791819573571507e-05, + "loss": 0.3158, + "step": 5337 + }, + { + "epoch": 1.0522476340694007, + "grad_norm": 0.5494692589886215, + "learning_rate": 1.6790681819516275e-05, + "loss": 0.3101, + "step": 5338 + }, + { + "epoch": 1.0524447949526814, + "grad_norm": 0.5188119339090779, + "learning_rate": 1.6789543902309443e-05, + "loss": 0.3182, + "step": 5339 + }, + { + "epoch": 1.052641955835962, + "grad_norm": 0.4981089705542774, + "learning_rate": 1.6788405821978347e-05, + "loss": 0.3138, + "step": 5340 + }, + { + "epoch": 1.0528391167192428, + "grad_norm": 0.5222423077867616, + "learning_rate": 1.6787267578550338e-05, + "loss": 0.3465, + "step": 5341 + }, + { + "epoch": 1.0530362776025237, + "grad_norm": 0.6072170278417297, + "learning_rate": 1.678612917205276e-05, + "loss": 0.3759, + "step": 5342 + }, + { + "epoch": 1.0532334384858044, + "grad_norm": 0.49978080972584255, + "learning_rate": 1.6784990602512962e-05, + "loss": 0.3196, + "step": 5343 + }, + { + "epoch": 1.0534305993690851, + "grad_norm": 0.5037981316390495, + "learning_rate": 1.67838518699583e-05, + "loss": 0.3545, + "step": 5344 + }, + { + "epoch": 1.053627760252366, + "grad_norm": 0.48805755679759366, + "learning_rate": 1.6782712974416136e-05, + "loss": 0.3178, + "step": 5345 + }, + { + "epoch": 1.0538249211356467, + "grad_norm": 0.528992909703354, + "learning_rate": 1.678157391591383e-05, + "loss": 0.3609, + "step": 5346 + }, + { + "epoch": 1.0540220820189274, + "grad_norm": 0.48711439504089044, + "learning_rate": 1.6780434694478748e-05, + "loss": 0.3306, + "step": 5347 + }, + { + "epoch": 1.0542192429022081, + "grad_norm": 0.5482559735722008, + "learning_rate": 1.6779295310138264e-05, + "loss": 0.3241, + "step": 5348 + }, + { + "epoch": 1.054416403785489, + "grad_norm": 0.526517838176848, + "learning_rate": 1.677815576291975e-05, + "loss": 0.3523, + "step": 5349 + }, + { + "epoch": 1.0546135646687698, + "grad_norm": 0.560425688696832, + "learning_rate": 1.6777016052850586e-05, + "loss": 0.3306, + "step": 5350 + }, + { + "epoch": 1.0548107255520505, + "grad_norm": 0.5204711823514945, + "learning_rate": 1.6775876179958154e-05, + "loss": 0.31, + "step": 5351 + }, + { + "epoch": 1.0550078864353312, + "grad_norm": 0.5230775365579958, + "learning_rate": 1.677473614426984e-05, + "loss": 0.3317, + "step": 5352 + }, + { + "epoch": 1.055205047318612, + "grad_norm": 0.503669553362592, + "learning_rate": 1.6773595945813033e-05, + "loss": 0.31, + "step": 5353 + }, + { + "epoch": 1.0554022082018928, + "grad_norm": 0.5353586843398921, + "learning_rate": 1.677245558461513e-05, + "loss": 0.34, + "step": 5354 + }, + { + "epoch": 1.0555993690851735, + "grad_norm": 0.5027562913026351, + "learning_rate": 1.6771315060703525e-05, + "loss": 0.3345, + "step": 5355 + }, + { + "epoch": 1.0557965299684542, + "grad_norm": 0.5357732118957388, + "learning_rate": 1.6770174374105626e-05, + "loss": 0.3313, + "step": 5356 + }, + { + "epoch": 1.0559936908517351, + "grad_norm": 0.5261857108517175, + "learning_rate": 1.6769033524848833e-05, + "loss": 0.3115, + "step": 5357 + }, + { + "epoch": 1.0561908517350158, + "grad_norm": 0.5009914049381263, + "learning_rate": 1.6767892512960565e-05, + "loss": 0.3274, + "step": 5358 + }, + { + "epoch": 1.0563880126182965, + "grad_norm": 0.5301353617492294, + "learning_rate": 1.6766751338468222e-05, + "loss": 0.3338, + "step": 5359 + }, + { + "epoch": 1.0565851735015772, + "grad_norm": 0.4889759349749068, + "learning_rate": 1.6765610001399232e-05, + "loss": 0.3063, + "step": 5360 + }, + { + "epoch": 1.0567823343848581, + "grad_norm": 0.5153052654541606, + "learning_rate": 1.676446850178101e-05, + "loss": 0.3212, + "step": 5361 + }, + { + "epoch": 1.0569794952681388, + "grad_norm": 0.5432847626107267, + "learning_rate": 1.6763326839640993e-05, + "loss": 0.3401, + "step": 5362 + }, + { + "epoch": 1.0571766561514195, + "grad_norm": 0.5095940207939735, + "learning_rate": 1.6762185015006597e-05, + "loss": 0.3035, + "step": 5363 + }, + { + "epoch": 1.0573738170347002, + "grad_norm": 0.5082084315576471, + "learning_rate": 1.676104302790526e-05, + "loss": 0.3164, + "step": 5364 + }, + { + "epoch": 1.0575709779179812, + "grad_norm": 0.5317861846949568, + "learning_rate": 1.675990087836442e-05, + "loss": 0.3234, + "step": 5365 + }, + { + "epoch": 1.0577681388012619, + "grad_norm": 0.5438204191584657, + "learning_rate": 1.6758758566411516e-05, + "loss": 0.3276, + "step": 5366 + }, + { + "epoch": 1.0579652996845426, + "grad_norm": 0.5378516453459048, + "learning_rate": 1.6757616092073993e-05, + "loss": 0.3213, + "step": 5367 + }, + { + "epoch": 1.0581624605678233, + "grad_norm": 0.5241736593808551, + "learning_rate": 1.6756473455379306e-05, + "loss": 0.347, + "step": 5368 + }, + { + "epoch": 1.0583596214511042, + "grad_norm": 0.5281704964843222, + "learning_rate": 1.67553306563549e-05, + "loss": 0.3509, + "step": 5369 + }, + { + "epoch": 1.0585567823343849, + "grad_norm": 0.519218991265461, + "learning_rate": 1.675418769502824e-05, + "loss": 0.3438, + "step": 5370 + }, + { + "epoch": 1.0587539432176656, + "grad_norm": 0.5056062844302116, + "learning_rate": 1.6753044571426777e-05, + "loss": 0.3209, + "step": 5371 + }, + { + "epoch": 1.0589511041009463, + "grad_norm": 0.547727929795884, + "learning_rate": 1.6751901285577986e-05, + "loss": 0.3459, + "step": 5372 + }, + { + "epoch": 1.0591482649842272, + "grad_norm": 0.5147703062425203, + "learning_rate": 1.675075783750932e-05, + "loss": 0.309, + "step": 5373 + }, + { + "epoch": 1.059345425867508, + "grad_norm": 0.5136539353424504, + "learning_rate": 1.6749614227248265e-05, + "loss": 0.3356, + "step": 5374 + }, + { + "epoch": 1.0595425867507886, + "grad_norm": 0.5065991374201936, + "learning_rate": 1.6748470454822295e-05, + "loss": 0.3117, + "step": 5375 + }, + { + "epoch": 1.0597397476340693, + "grad_norm": 0.4802852456122994, + "learning_rate": 1.6747326520258884e-05, + "loss": 0.2974, + "step": 5376 + }, + { + "epoch": 1.0599369085173502, + "grad_norm": 0.590323007451885, + "learning_rate": 1.6746182423585524e-05, + "loss": 0.3275, + "step": 5377 + }, + { + "epoch": 1.060134069400631, + "grad_norm": 0.5031627981781183, + "learning_rate": 1.6745038164829695e-05, + "loss": 0.3443, + "step": 5378 + }, + { + "epoch": 1.0603312302839116, + "grad_norm": 0.5335220847872871, + "learning_rate": 1.6743893744018892e-05, + "loss": 0.3522, + "step": 5379 + }, + { + "epoch": 1.0605283911671923, + "grad_norm": 0.4886051328326671, + "learning_rate": 1.6742749161180614e-05, + "loss": 0.3241, + "step": 5380 + }, + { + "epoch": 1.0607255520504733, + "grad_norm": 0.5426979534289905, + "learning_rate": 1.6741604416342355e-05, + "loss": 0.3307, + "step": 5381 + }, + { + "epoch": 1.060922712933754, + "grad_norm": 0.4747697116176086, + "learning_rate": 1.674045950953162e-05, + "loss": 0.3072, + "step": 5382 + }, + { + "epoch": 1.0611198738170347, + "grad_norm": 0.4990309113836239, + "learning_rate": 1.673931444077592e-05, + "loss": 0.3243, + "step": 5383 + }, + { + "epoch": 1.0613170347003154, + "grad_norm": 0.5391230910740937, + "learning_rate": 1.6738169210102765e-05, + "loss": 0.3258, + "step": 5384 + }, + { + "epoch": 1.0615141955835963, + "grad_norm": 0.5003732086840169, + "learning_rate": 1.6737023817539665e-05, + "loss": 0.3081, + "step": 5385 + }, + { + "epoch": 1.061711356466877, + "grad_norm": 0.511666602281063, + "learning_rate": 1.6735878263114146e-05, + "loss": 0.3251, + "step": 5386 + }, + { + "epoch": 1.0619085173501577, + "grad_norm": 0.5342493746181992, + "learning_rate": 1.673473254685372e-05, + "loss": 0.3242, + "step": 5387 + }, + { + "epoch": 1.0621056782334386, + "grad_norm": 0.5066446089845047, + "learning_rate": 1.6733586668785926e-05, + "loss": 0.3415, + "step": 5388 + }, + { + "epoch": 1.0623028391167193, + "grad_norm": 0.49933317033206337, + "learning_rate": 1.6732440628938293e-05, + "loss": 0.3069, + "step": 5389 + }, + { + "epoch": 1.0625, + "grad_norm": 0.5395443938451984, + "learning_rate": 1.6731294427338344e-05, + "loss": 0.3343, + "step": 5390 + }, + { + "epoch": 1.0626971608832807, + "grad_norm": 0.5154874242963391, + "learning_rate": 1.6730148064013633e-05, + "loss": 0.3353, + "step": 5391 + }, + { + "epoch": 1.0628943217665614, + "grad_norm": 0.549850655755667, + "learning_rate": 1.672900153899169e-05, + "loss": 0.3339, + "step": 5392 + }, + { + "epoch": 1.0630914826498423, + "grad_norm": 0.5437692223908803, + "learning_rate": 1.6727854852300073e-05, + "loss": 0.325, + "step": 5393 + }, + { + "epoch": 1.063288643533123, + "grad_norm": 0.5319120551752757, + "learning_rate": 1.672670800396632e-05, + "loss": 0.3333, + "step": 5394 + }, + { + "epoch": 1.0634858044164037, + "grad_norm": 0.5888809384452421, + "learning_rate": 1.672556099401799e-05, + "loss": 0.3289, + "step": 5395 + }, + { + "epoch": 1.0636829652996846, + "grad_norm": 0.5174224708917468, + "learning_rate": 1.672441382248264e-05, + "loss": 0.325, + "step": 5396 + }, + { + "epoch": 1.0638801261829653, + "grad_norm": 0.4940106410321626, + "learning_rate": 1.6723266489387837e-05, + "loss": 0.3277, + "step": 5397 + }, + { + "epoch": 1.064077287066246, + "grad_norm": 0.5924938542192701, + "learning_rate": 1.672211899476114e-05, + "loss": 0.3159, + "step": 5398 + }, + { + "epoch": 1.0642744479495267, + "grad_norm": 0.6030406417673295, + "learning_rate": 1.672097133863012e-05, + "loss": 0.352, + "step": 5399 + }, + { + "epoch": 1.0644716088328077, + "grad_norm": 0.46675304029243586, + "learning_rate": 1.6719823521022355e-05, + "loss": 0.273, + "step": 5400 + }, + { + "epoch": 1.0646687697160884, + "grad_norm": 10.313913415266276, + "learning_rate": 1.6718675541965413e-05, + "loss": 0.3712, + "step": 5401 + }, + { + "epoch": 1.064865930599369, + "grad_norm": 0.650331375307367, + "learning_rate": 1.6717527401486882e-05, + "loss": 0.3173, + "step": 5402 + }, + { + "epoch": 1.0650630914826498, + "grad_norm": 0.5516904225558585, + "learning_rate": 1.6716379099614348e-05, + "loss": 0.3332, + "step": 5403 + }, + { + "epoch": 1.0652602523659307, + "grad_norm": 0.5826270309682446, + "learning_rate": 1.6715230636375397e-05, + "loss": 0.3162, + "step": 5404 + }, + { + "epoch": 1.0654574132492114, + "grad_norm": 0.5567258991880055, + "learning_rate": 1.6714082011797625e-05, + "loss": 0.3091, + "step": 5405 + }, + { + "epoch": 1.065654574132492, + "grad_norm": 0.5797543264480705, + "learning_rate": 1.6712933225908618e-05, + "loss": 0.3638, + "step": 5406 + }, + { + "epoch": 1.0658517350157728, + "grad_norm": 0.500686608596155, + "learning_rate": 1.6711784278735993e-05, + "loss": 0.3183, + "step": 5407 + }, + { + "epoch": 1.0660488958990537, + "grad_norm": 0.5482037516123067, + "learning_rate": 1.6710635170307336e-05, + "loss": 0.3282, + "step": 5408 + }, + { + "epoch": 1.0662460567823344, + "grad_norm": 0.5003783186696478, + "learning_rate": 1.6709485900650274e-05, + "loss": 0.2917, + "step": 5409 + }, + { + "epoch": 1.0664432176656151, + "grad_norm": 0.5886317793002753, + "learning_rate": 1.6708336469792407e-05, + "loss": 0.3297, + "step": 5410 + }, + { + "epoch": 1.0666403785488958, + "grad_norm": 0.5466590442325215, + "learning_rate": 1.670718687776135e-05, + "loss": 0.3549, + "step": 5411 + }, + { + "epoch": 1.0668375394321767, + "grad_norm": 0.5540316157067651, + "learning_rate": 1.670603712458473e-05, + "loss": 0.3215, + "step": 5412 + }, + { + "epoch": 1.0670347003154574, + "grad_norm": 0.593181096332669, + "learning_rate": 1.670488721029017e-05, + "loss": 0.3648, + "step": 5413 + }, + { + "epoch": 1.0672318611987381, + "grad_norm": 0.5482843023549779, + "learning_rate": 1.6703737134905296e-05, + "loss": 0.3269, + "step": 5414 + }, + { + "epoch": 1.0674290220820188, + "grad_norm": 0.5958232910254453, + "learning_rate": 1.6702586898457737e-05, + "loss": 0.3219, + "step": 5415 + }, + { + "epoch": 1.0676261829652998, + "grad_norm": 0.5248785667847379, + "learning_rate": 1.6701436500975127e-05, + "loss": 0.3113, + "step": 5416 + }, + { + "epoch": 1.0678233438485805, + "grad_norm": 0.506343180110072, + "learning_rate": 1.6700285942485113e-05, + "loss": 0.2969, + "step": 5417 + }, + { + "epoch": 1.0680205047318612, + "grad_norm": 0.6323082926274546, + "learning_rate": 1.669913522301533e-05, + "loss": 0.3197, + "step": 5418 + }, + { + "epoch": 1.0682176656151419, + "grad_norm": 0.5020570452865531, + "learning_rate": 1.669798434259343e-05, + "loss": 0.3101, + "step": 5419 + }, + { + "epoch": 1.0684148264984228, + "grad_norm": 0.7538065179489385, + "learning_rate": 1.669683330124706e-05, + "loss": 0.3097, + "step": 5420 + }, + { + "epoch": 1.0686119873817035, + "grad_norm": 0.7336379735027907, + "learning_rate": 1.669568209900388e-05, + "loss": 0.2805, + "step": 5421 + }, + { + "epoch": 1.0688091482649842, + "grad_norm": 0.5148666894817214, + "learning_rate": 1.6694530735891548e-05, + "loss": 0.3298, + "step": 5422 + }, + { + "epoch": 1.0690063091482649, + "grad_norm": 0.5938791350921867, + "learning_rate": 1.6693379211937717e-05, + "loss": 0.3594, + "step": 5423 + }, + { + "epoch": 1.0692034700315458, + "grad_norm": 0.5268352695165609, + "learning_rate": 1.6692227527170067e-05, + "loss": 0.3245, + "step": 5424 + }, + { + "epoch": 1.0694006309148265, + "grad_norm": 0.8124267042610954, + "learning_rate": 1.6691075681616257e-05, + "loss": 0.3672, + "step": 5425 + }, + { + "epoch": 1.0695977917981072, + "grad_norm": 0.5680106061709315, + "learning_rate": 1.6689923675303967e-05, + "loss": 0.3314, + "step": 5426 + }, + { + "epoch": 1.069794952681388, + "grad_norm": 0.5607346328108409, + "learning_rate": 1.6688771508260876e-05, + "loss": 0.3461, + "step": 5427 + }, + { + "epoch": 1.0699921135646688, + "grad_norm": 0.5433165552798762, + "learning_rate": 1.668761918051466e-05, + "loss": 0.3425, + "step": 5428 + }, + { + "epoch": 1.0701892744479495, + "grad_norm": 0.5476574461852847, + "learning_rate": 1.6686466692093007e-05, + "loss": 0.3172, + "step": 5429 + }, + { + "epoch": 1.0703864353312302, + "grad_norm": 0.6111021096749791, + "learning_rate": 1.6685314043023608e-05, + "loss": 0.3577, + "step": 5430 + }, + { + "epoch": 1.0705835962145112, + "grad_norm": 0.5552410097887198, + "learning_rate": 1.6684161233334157e-05, + "loss": 0.3154, + "step": 5431 + }, + { + "epoch": 1.0707807570977919, + "grad_norm": 0.5107392149662423, + "learning_rate": 1.6683008263052344e-05, + "loss": 0.3275, + "step": 5432 + }, + { + "epoch": 1.0709779179810726, + "grad_norm": 0.6074720035217149, + "learning_rate": 1.6681855132205882e-05, + "loss": 0.3597, + "step": 5433 + }, + { + "epoch": 1.0711750788643533, + "grad_norm": 0.515248985051321, + "learning_rate": 1.6680701840822468e-05, + "loss": 0.3278, + "step": 5434 + }, + { + "epoch": 1.071372239747634, + "grad_norm": 0.5676731678796564, + "learning_rate": 1.667954838892981e-05, + "loss": 0.324, + "step": 5435 + }, + { + "epoch": 1.0715694006309149, + "grad_norm": 0.5266114021796704, + "learning_rate": 1.6678394776555625e-05, + "loss": 0.3298, + "step": 5436 + }, + { + "epoch": 1.0717665615141956, + "grad_norm": 0.515711098109902, + "learning_rate": 1.667724100372763e-05, + "loss": 0.3301, + "step": 5437 + }, + { + "epoch": 1.0719637223974763, + "grad_norm": 0.5413115348636542, + "learning_rate": 1.667608707047354e-05, + "loss": 0.329, + "step": 5438 + }, + { + "epoch": 1.0721608832807572, + "grad_norm": 0.5133409836464313, + "learning_rate": 1.6674932976821078e-05, + "loss": 0.3062, + "step": 5439 + }, + { + "epoch": 1.072358044164038, + "grad_norm": 0.5514269531982732, + "learning_rate": 1.667377872279798e-05, + "loss": 0.3224, + "step": 5440 + }, + { + "epoch": 1.0725552050473186, + "grad_norm": 0.5381267123295007, + "learning_rate": 1.6672624308431977e-05, + "loss": 0.3554, + "step": 5441 + }, + { + "epoch": 1.0727523659305993, + "grad_norm": 0.5357973892732231, + "learning_rate": 1.6671469733750795e-05, + "loss": 0.3306, + "step": 5442 + }, + { + "epoch": 1.0729495268138802, + "grad_norm": 0.5334845958350081, + "learning_rate": 1.6670314998782183e-05, + "loss": 0.3522, + "step": 5443 + }, + { + "epoch": 1.073146687697161, + "grad_norm": 0.5078507960335631, + "learning_rate": 1.6669160103553884e-05, + "loss": 0.3192, + "step": 5444 + }, + { + "epoch": 1.0733438485804416, + "grad_norm": 0.526273076314691, + "learning_rate": 1.666800504809364e-05, + "loss": 0.3295, + "step": 5445 + }, + { + "epoch": 1.0735410094637223, + "grad_norm": 0.5501746473063156, + "learning_rate": 1.6666849832429207e-05, + "loss": 0.3559, + "step": 5446 + }, + { + "epoch": 1.0737381703470033, + "grad_norm": 0.5417331807201895, + "learning_rate": 1.6665694456588335e-05, + "loss": 0.3395, + "step": 5447 + }, + { + "epoch": 1.073935331230284, + "grad_norm": 0.4892217439505845, + "learning_rate": 1.666453892059879e-05, + "loss": 0.2989, + "step": 5448 + }, + { + "epoch": 1.0741324921135647, + "grad_norm": 0.5427686650470855, + "learning_rate": 1.6663383224488323e-05, + "loss": 0.3249, + "step": 5449 + }, + { + "epoch": 1.0743296529968454, + "grad_norm": 0.5467284567115355, + "learning_rate": 1.6662227368284716e-05, + "loss": 0.3353, + "step": 5450 + }, + { + "epoch": 1.0745268138801263, + "grad_norm": 0.49209796772640735, + "learning_rate": 1.6661071352015725e-05, + "loss": 0.3205, + "step": 5451 + }, + { + "epoch": 1.074723974763407, + "grad_norm": 0.5566384176423259, + "learning_rate": 1.6659915175709135e-05, + "loss": 0.3221, + "step": 5452 + }, + { + "epoch": 1.0749211356466877, + "grad_norm": 0.526499393616949, + "learning_rate": 1.665875883939272e-05, + "loss": 0.3263, + "step": 5453 + }, + { + "epoch": 1.0751182965299684, + "grad_norm": 2.739695761851264, + "learning_rate": 1.665760234309426e-05, + "loss": 0.3506, + "step": 5454 + }, + { + "epoch": 1.0753154574132493, + "grad_norm": 0.6593642459442813, + "learning_rate": 1.665644568684154e-05, + "loss": 0.3426, + "step": 5455 + }, + { + "epoch": 1.07551261829653, + "grad_norm": 0.5151614291220461, + "learning_rate": 1.6655288870662354e-05, + "loss": 0.328, + "step": 5456 + }, + { + "epoch": 1.0757097791798107, + "grad_norm": 0.5714376826042502, + "learning_rate": 1.6654131894584494e-05, + "loss": 0.318, + "step": 5457 + }, + { + "epoch": 1.0759069400630914, + "grad_norm": 0.5326114164538318, + "learning_rate": 1.665297475863576e-05, + "loss": 0.3268, + "step": 5458 + }, + { + "epoch": 1.0761041009463723, + "grad_norm": 0.5318642965648936, + "learning_rate": 1.6651817462843944e-05, + "loss": 0.3393, + "step": 5459 + }, + { + "epoch": 1.076301261829653, + "grad_norm": 0.5208615095763689, + "learning_rate": 1.665066000723686e-05, + "loss": 0.3048, + "step": 5460 + }, + { + "epoch": 1.0764984227129337, + "grad_norm": 0.5112237680335591, + "learning_rate": 1.6649502391842313e-05, + "loss": 0.3306, + "step": 5461 + }, + { + "epoch": 1.0766955835962144, + "grad_norm": 0.5020109988963485, + "learning_rate": 1.6648344616688116e-05, + "loss": 0.3079, + "step": 5462 + }, + { + "epoch": 1.0768927444794953, + "grad_norm": 0.5141839034905135, + "learning_rate": 1.664718668180208e-05, + "loss": 0.3433, + "step": 5463 + }, + { + "epoch": 1.077089905362776, + "grad_norm": 0.522715217511301, + "learning_rate": 1.664602858721204e-05, + "loss": 0.3273, + "step": 5464 + }, + { + "epoch": 1.0772870662460567, + "grad_norm": 0.4873672081469081, + "learning_rate": 1.6644870332945807e-05, + "loss": 0.3131, + "step": 5465 + }, + { + "epoch": 1.0774842271293374, + "grad_norm": 0.5208359970585005, + "learning_rate": 1.6643711919031217e-05, + "loss": 0.3326, + "step": 5466 + }, + { + "epoch": 1.0776813880126184, + "grad_norm": 0.5085175763798949, + "learning_rate": 1.6642553345496093e-05, + "loss": 0.336, + "step": 5467 + }, + { + "epoch": 1.077878548895899, + "grad_norm": 0.5395303777396451, + "learning_rate": 1.664139461236828e-05, + "loss": 0.3519, + "step": 5468 + }, + { + "epoch": 1.0780757097791798, + "grad_norm": 0.4939691741186544, + "learning_rate": 1.6640235719675607e-05, + "loss": 0.3149, + "step": 5469 + }, + { + "epoch": 1.0782728706624605, + "grad_norm": 0.5045822941059132, + "learning_rate": 1.663907666744593e-05, + "loss": 0.3257, + "step": 5470 + }, + { + "epoch": 1.0784700315457414, + "grad_norm": 0.4842189935972799, + "learning_rate": 1.6637917455707085e-05, + "loss": 0.2941, + "step": 5471 + }, + { + "epoch": 1.078667192429022, + "grad_norm": 0.488411947048396, + "learning_rate": 1.663675808448693e-05, + "loss": 0.3263, + "step": 5472 + }, + { + "epoch": 1.0788643533123028, + "grad_norm": 0.5204936040738815, + "learning_rate": 1.663559855381332e-05, + "loss": 0.337, + "step": 5473 + }, + { + "epoch": 1.0790615141955835, + "grad_norm": 0.4907540263785109, + "learning_rate": 1.6634438863714108e-05, + "loss": 0.3341, + "step": 5474 + }, + { + "epoch": 1.0792586750788644, + "grad_norm": 0.5341500420019042, + "learning_rate": 1.6633279014217158e-05, + "loss": 0.361, + "step": 5475 + }, + { + "epoch": 1.0794558359621451, + "grad_norm": 0.566795481318772, + "learning_rate": 1.663211900535034e-05, + "loss": 0.3565, + "step": 5476 + }, + { + "epoch": 1.0796529968454258, + "grad_norm": 0.5104378663095975, + "learning_rate": 1.663095883714152e-05, + "loss": 0.3172, + "step": 5477 + }, + { + "epoch": 1.0798501577287065, + "grad_norm": 0.7836320347832308, + "learning_rate": 1.6629798509618575e-05, + "loss": 0.3279, + "step": 5478 + }, + { + "epoch": 1.0800473186119874, + "grad_norm": 0.4941220351531997, + "learning_rate": 1.6628638022809384e-05, + "loss": 0.3237, + "step": 5479 + }, + { + "epoch": 1.0802444794952681, + "grad_norm": 0.538070953999582, + "learning_rate": 1.6627477376741824e-05, + "loss": 0.3186, + "step": 5480 + }, + { + "epoch": 1.0804416403785488, + "grad_norm": 0.49651925324616386, + "learning_rate": 1.6626316571443782e-05, + "loss": 0.3328, + "step": 5481 + }, + { + "epoch": 1.0806388012618298, + "grad_norm": 0.6699435537682001, + "learning_rate": 1.662515560694315e-05, + "loss": 0.3353, + "step": 5482 + }, + { + "epoch": 1.0808359621451105, + "grad_norm": 0.5278612363315855, + "learning_rate": 1.6623994483267823e-05, + "loss": 0.3275, + "step": 5483 + }, + { + "epoch": 1.0810331230283912, + "grad_norm": 0.5521883665813071, + "learning_rate": 1.6622833200445688e-05, + "loss": 0.3395, + "step": 5484 + }, + { + "epoch": 1.0812302839116719, + "grad_norm": 0.5101814960888886, + "learning_rate": 1.6621671758504656e-05, + "loss": 0.3229, + "step": 5485 + }, + { + "epoch": 1.0814274447949526, + "grad_norm": 0.5082133461105458, + "learning_rate": 1.6620510157472626e-05, + "loss": 0.3416, + "step": 5486 + }, + { + "epoch": 1.0816246056782335, + "grad_norm": 0.5152393878133755, + "learning_rate": 1.6619348397377508e-05, + "loss": 0.2995, + "step": 5487 + }, + { + "epoch": 1.0818217665615142, + "grad_norm": 0.5030381994487839, + "learning_rate": 1.6618186478247214e-05, + "loss": 0.3219, + "step": 5488 + }, + { + "epoch": 1.0820189274447949, + "grad_norm": 0.5033251374070803, + "learning_rate": 1.661702440010966e-05, + "loss": 0.3278, + "step": 5489 + }, + { + "epoch": 1.0822160883280758, + "grad_norm": 0.4858411621778881, + "learning_rate": 1.6615862162992765e-05, + "loss": 0.321, + "step": 5490 + }, + { + "epoch": 1.0824132492113565, + "grad_norm": 0.5299981365610831, + "learning_rate": 1.6614699766924457e-05, + "loss": 0.334, + "step": 5491 + }, + { + "epoch": 1.0826104100946372, + "grad_norm": 0.5398683026062827, + "learning_rate": 1.661353721193266e-05, + "loss": 0.3679, + "step": 5492 + }, + { + "epoch": 1.082807570977918, + "grad_norm": 0.4922123143204083, + "learning_rate": 1.6612374498045303e-05, + "loss": 0.3252, + "step": 5493 + }, + { + "epoch": 1.0830047318611988, + "grad_norm": 0.5402134649137565, + "learning_rate": 1.6611211625290328e-05, + "loss": 0.3488, + "step": 5494 + }, + { + "epoch": 1.0832018927444795, + "grad_norm": 0.49308565622911255, + "learning_rate": 1.6610048593695665e-05, + "loss": 0.3274, + "step": 5495 + }, + { + "epoch": 1.0833990536277602, + "grad_norm": 0.5330415075689533, + "learning_rate": 1.6608885403289264e-05, + "loss": 0.3416, + "step": 5496 + }, + { + "epoch": 1.083596214511041, + "grad_norm": 0.5546315403447396, + "learning_rate": 1.6607722054099066e-05, + "loss": 0.3428, + "step": 5497 + }, + { + "epoch": 1.0837933753943219, + "grad_norm": 0.548153275553841, + "learning_rate": 1.6606558546153027e-05, + "loss": 0.3274, + "step": 5498 + }, + { + "epoch": 1.0839905362776026, + "grad_norm": 0.5741893560590743, + "learning_rate": 1.6605394879479102e-05, + "loss": 0.3408, + "step": 5499 + }, + { + "epoch": 1.0841876971608833, + "grad_norm": 0.563655275701286, + "learning_rate": 1.660423105410524e-05, + "loss": 0.3535, + "step": 5500 + }, + { + "epoch": 1.084384858044164, + "grad_norm": 0.5292670828512388, + "learning_rate": 1.6603067070059413e-05, + "loss": 0.3451, + "step": 5501 + }, + { + "epoch": 1.0845820189274449, + "grad_norm": 0.4943062953455165, + "learning_rate": 1.6601902927369577e-05, + "loss": 0.305, + "step": 5502 + }, + { + "epoch": 1.0847791798107256, + "grad_norm": 0.5684207519421706, + "learning_rate": 1.660073862606371e-05, + "loss": 0.3544, + "step": 5503 + }, + { + "epoch": 1.0849763406940063, + "grad_norm": 0.5269370109869425, + "learning_rate": 1.6599574166169783e-05, + "loss": 0.3364, + "step": 5504 + }, + { + "epoch": 1.085173501577287, + "grad_norm": 0.5930908127100334, + "learning_rate": 1.659840954771577e-05, + "loss": 0.3293, + "step": 5505 + }, + { + "epoch": 1.085370662460568, + "grad_norm": 0.5064001248459025, + "learning_rate": 1.6597244770729656e-05, + "loss": 0.3364, + "step": 5506 + }, + { + "epoch": 1.0855678233438486, + "grad_norm": 0.5583107476093074, + "learning_rate": 1.659607983523942e-05, + "loss": 0.344, + "step": 5507 + }, + { + "epoch": 1.0857649842271293, + "grad_norm": 0.5186990686312586, + "learning_rate": 1.6594914741273058e-05, + "loss": 0.3203, + "step": 5508 + }, + { + "epoch": 1.08596214511041, + "grad_norm": 0.5412209056965988, + "learning_rate": 1.6593749488858554e-05, + "loss": 0.3514, + "step": 5509 + }, + { + "epoch": 1.086159305993691, + "grad_norm": 0.5847946899685581, + "learning_rate": 1.6592584078023915e-05, + "loss": 0.3498, + "step": 5510 + }, + { + "epoch": 1.0863564668769716, + "grad_norm": 0.49519738287188864, + "learning_rate": 1.659141850879713e-05, + "loss": 0.3065, + "step": 5511 + }, + { + "epoch": 1.0865536277602523, + "grad_norm": 0.49468808903487677, + "learning_rate": 1.659025278120621e-05, + "loss": 0.3129, + "step": 5512 + }, + { + "epoch": 1.086750788643533, + "grad_norm": 1.0581729165868015, + "learning_rate": 1.6589086895279156e-05, + "loss": 0.3673, + "step": 5513 + }, + { + "epoch": 1.086947949526814, + "grad_norm": 0.49002563953861666, + "learning_rate": 1.6587920851043986e-05, + "loss": 0.3223, + "step": 5514 + }, + { + "epoch": 1.0871451104100947, + "grad_norm": 0.4905578731396364, + "learning_rate": 1.6586754648528712e-05, + "loss": 0.3043, + "step": 5515 + }, + { + "epoch": 1.0873422712933754, + "grad_norm": 0.667527289823531, + "learning_rate": 1.658558828776135e-05, + "loss": 0.2788, + "step": 5516 + }, + { + "epoch": 1.087539432176656, + "grad_norm": 0.8459394709559949, + "learning_rate": 1.6584421768769933e-05, + "loss": 0.3415, + "step": 5517 + }, + { + "epoch": 1.087736593059937, + "grad_norm": 0.47906959750711475, + "learning_rate": 1.6583255091582474e-05, + "loss": 0.3405, + "step": 5518 + }, + { + "epoch": 1.0879337539432177, + "grad_norm": 0.48428073694586943, + "learning_rate": 1.658208825622701e-05, + "loss": 0.3322, + "step": 5519 + }, + { + "epoch": 1.0881309148264984, + "grad_norm": 0.7968161512793981, + "learning_rate": 1.6580921262731582e-05, + "loss": 0.3291, + "step": 5520 + }, + { + "epoch": 1.088328075709779, + "grad_norm": 0.519690983389764, + "learning_rate": 1.6579754111124215e-05, + "loss": 0.3248, + "step": 5521 + }, + { + "epoch": 1.08852523659306, + "grad_norm": 0.5091809572661937, + "learning_rate": 1.6578586801432958e-05, + "loss": 0.3133, + "step": 5522 + }, + { + "epoch": 1.0887223974763407, + "grad_norm": 0.5549122652560055, + "learning_rate": 1.6577419333685855e-05, + "loss": 0.3462, + "step": 5523 + }, + { + "epoch": 1.0889195583596214, + "grad_norm": 6.326360585378315, + "learning_rate": 1.6576251707910955e-05, + "loss": 0.3586, + "step": 5524 + }, + { + "epoch": 1.0891167192429023, + "grad_norm": 0.594210310368939, + "learning_rate": 1.6575083924136313e-05, + "loss": 0.2988, + "step": 5525 + }, + { + "epoch": 1.089313880126183, + "grad_norm": 0.5653001651310844, + "learning_rate": 1.6573915982389986e-05, + "loss": 0.3427, + "step": 5526 + }, + { + "epoch": 1.0895110410094637, + "grad_norm": 0.6419933562592661, + "learning_rate": 1.6572747882700034e-05, + "loss": 0.3479, + "step": 5527 + }, + { + "epoch": 1.0897082018927444, + "grad_norm": 0.549229419744898, + "learning_rate": 1.657157962509452e-05, + "loss": 0.3367, + "step": 5528 + }, + { + "epoch": 1.0899053627760251, + "grad_norm": 0.5649927832731517, + "learning_rate": 1.6570411209601515e-05, + "loss": 0.3288, + "step": 5529 + }, + { + "epoch": 1.090102523659306, + "grad_norm": 0.5593298618542406, + "learning_rate": 1.656924263624909e-05, + "loss": 0.3552, + "step": 5530 + }, + { + "epoch": 1.0902996845425867, + "grad_norm": 0.5487752889109879, + "learning_rate": 1.6568073905065313e-05, + "loss": 0.3467, + "step": 5531 + }, + { + "epoch": 1.0904968454258674, + "grad_norm": 0.517747474702925, + "learning_rate": 1.656690501607828e-05, + "loss": 0.3165, + "step": 5532 + }, + { + "epoch": 1.0906940063091484, + "grad_norm": 0.5187685507630857, + "learning_rate": 1.656573596931606e-05, + "loss": 0.3284, + "step": 5533 + }, + { + "epoch": 1.090891167192429, + "grad_norm": 0.5501636063381948, + "learning_rate": 1.656456676480675e-05, + "loss": 0.3071, + "step": 5534 + }, + { + "epoch": 1.0910883280757098, + "grad_norm": 0.5456326413445273, + "learning_rate": 1.6563397402578432e-05, + "loss": 0.3125, + "step": 5535 + }, + { + "epoch": 1.0912854889589905, + "grad_norm": 0.5648819649096508, + "learning_rate": 1.6562227882659213e-05, + "loss": 0.3379, + "step": 5536 + }, + { + "epoch": 1.0914826498422714, + "grad_norm": 0.5019198980389612, + "learning_rate": 1.656105820507718e-05, + "loss": 0.32, + "step": 5537 + }, + { + "epoch": 1.091679810725552, + "grad_norm": 0.5549566520131515, + "learning_rate": 1.655988836986044e-05, + "loss": 0.352, + "step": 5538 + }, + { + "epoch": 1.0918769716088328, + "grad_norm": 0.47299201163230054, + "learning_rate": 1.65587183770371e-05, + "loss": 0.2692, + "step": 5539 + }, + { + "epoch": 1.0920741324921135, + "grad_norm": 0.54789761189984, + "learning_rate": 1.6557548226635266e-05, + "loss": 0.3395, + "step": 5540 + }, + { + "epoch": 1.0922712933753944, + "grad_norm": 0.5853498159297642, + "learning_rate": 1.655637791868306e-05, + "loss": 0.336, + "step": 5541 + }, + { + "epoch": 1.0924684542586751, + "grad_norm": 0.630929601289797, + "learning_rate": 1.6555207453208596e-05, + "loss": 0.3531, + "step": 5542 + }, + { + "epoch": 1.0926656151419558, + "grad_norm": 0.5228836259363071, + "learning_rate": 1.655403683023999e-05, + "loss": 0.3291, + "step": 5543 + }, + { + "epoch": 1.0928627760252365, + "grad_norm": 0.5556250366890163, + "learning_rate": 1.655286604980537e-05, + "loss": 0.3462, + "step": 5544 + }, + { + "epoch": 1.0930599369085174, + "grad_norm": 0.5731618356985932, + "learning_rate": 1.655169511193287e-05, + "loss": 0.3276, + "step": 5545 + }, + { + "epoch": 1.0932570977917981, + "grad_norm": 0.5172208442087932, + "learning_rate": 1.6550524016650617e-05, + "loss": 0.3295, + "step": 5546 + }, + { + "epoch": 1.0934542586750788, + "grad_norm": 0.5971863550898441, + "learning_rate": 1.6549352763986747e-05, + "loss": 0.3635, + "step": 5547 + }, + { + "epoch": 1.0936514195583595, + "grad_norm": 0.5043253634738109, + "learning_rate": 1.6548181353969407e-05, + "loss": 0.3164, + "step": 5548 + }, + { + "epoch": 1.0938485804416405, + "grad_norm": 0.5441825059408084, + "learning_rate": 1.654700978662674e-05, + "loss": 0.3326, + "step": 5549 + }, + { + "epoch": 1.0940457413249212, + "grad_norm": 0.5397553446197998, + "learning_rate": 1.654583806198688e-05, + "loss": 0.3658, + "step": 5550 + }, + { + "epoch": 1.0942429022082019, + "grad_norm": 0.5504743804813628, + "learning_rate": 1.6544666180077996e-05, + "loss": 0.3526, + "step": 5551 + }, + { + "epoch": 1.0944400630914826, + "grad_norm": 0.5424844935779093, + "learning_rate": 1.6543494140928236e-05, + "loss": 0.3307, + "step": 5552 + }, + { + "epoch": 1.0946372239747635, + "grad_norm": 0.526220112929215, + "learning_rate": 1.654232194456576e-05, + "loss": 0.3308, + "step": 5553 + }, + { + "epoch": 1.0948343848580442, + "grad_norm": 0.5236749400742969, + "learning_rate": 1.6541149591018727e-05, + "loss": 0.3453, + "step": 5554 + }, + { + "epoch": 1.0950315457413249, + "grad_norm": 0.5266507318689269, + "learning_rate": 1.6539977080315313e-05, + "loss": 0.3305, + "step": 5555 + }, + { + "epoch": 1.0952287066246056, + "grad_norm": 0.5216109610169218, + "learning_rate": 1.653880441248368e-05, + "loss": 0.3316, + "step": 5556 + }, + { + "epoch": 1.0954258675078865, + "grad_norm": 0.5163493651525781, + "learning_rate": 1.6537631587552007e-05, + "loss": 0.342, + "step": 5557 + }, + { + "epoch": 1.0956230283911672, + "grad_norm": 0.5078186202086805, + "learning_rate": 1.6536458605548467e-05, + "loss": 0.3249, + "step": 5558 + }, + { + "epoch": 1.095820189274448, + "grad_norm": 0.5276901659588409, + "learning_rate": 1.6535285466501247e-05, + "loss": 0.3326, + "step": 5559 + }, + { + "epoch": 1.0960173501577286, + "grad_norm": 0.5355919928397173, + "learning_rate": 1.653411217043853e-05, + "loss": 0.352, + "step": 5560 + }, + { + "epoch": 1.0962145110410095, + "grad_norm": 0.4763053400285673, + "learning_rate": 1.653293871738851e-05, + "loss": 0.2937, + "step": 5561 + }, + { + "epoch": 1.0964116719242902, + "grad_norm": 0.5380622063499523, + "learning_rate": 1.6531765107379374e-05, + "loss": 0.3585, + "step": 5562 + }, + { + "epoch": 1.096608832807571, + "grad_norm": 0.5111119725625164, + "learning_rate": 1.653059134043932e-05, + "loss": 0.3336, + "step": 5563 + }, + { + "epoch": 1.0968059936908516, + "grad_norm": 0.49455292690513286, + "learning_rate": 1.652941741659655e-05, + "loss": 0.3136, + "step": 5564 + }, + { + "epoch": 1.0970031545741326, + "grad_norm": 0.6845117374932369, + "learning_rate": 1.652824333587927e-05, + "loss": 0.332, + "step": 5565 + }, + { + "epoch": 1.0972003154574133, + "grad_norm": 0.6523179647836078, + "learning_rate": 1.652706909831569e-05, + "loss": 0.3441, + "step": 5566 + }, + { + "epoch": 1.097397476340694, + "grad_norm": 0.4963174475713189, + "learning_rate": 1.6525894703934013e-05, + "loss": 0.3077, + "step": 5567 + }, + { + "epoch": 1.0975946372239749, + "grad_norm": 0.508004655738834, + "learning_rate": 1.6524720152762462e-05, + "loss": 0.3201, + "step": 5568 + }, + { + "epoch": 1.0977917981072556, + "grad_norm": 0.5984239179107959, + "learning_rate": 1.6523545444829254e-05, + "loss": 0.3592, + "step": 5569 + }, + { + "epoch": 1.0979889589905363, + "grad_norm": 0.5360407905331267, + "learning_rate": 1.6522370580162614e-05, + "loss": 0.3261, + "step": 5570 + }, + { + "epoch": 1.098186119873817, + "grad_norm": 0.5556220479614998, + "learning_rate": 1.652119555879077e-05, + "loss": 0.3431, + "step": 5571 + }, + { + "epoch": 1.0983832807570977, + "grad_norm": 0.5604439658652886, + "learning_rate": 1.652002038074195e-05, + "loss": 0.3145, + "step": 5572 + }, + { + "epoch": 1.0985804416403786, + "grad_norm": 0.5477226089327527, + "learning_rate": 1.651884504604439e-05, + "loss": 0.3404, + "step": 5573 + }, + { + "epoch": 1.0987776025236593, + "grad_norm": 0.5145353614083515, + "learning_rate": 1.6517669554726327e-05, + "loss": 0.3197, + "step": 5574 + }, + { + "epoch": 1.09897476340694, + "grad_norm": 0.5111044242152223, + "learning_rate": 1.6516493906816005e-05, + "loss": 0.3053, + "step": 5575 + }, + { + "epoch": 1.099171924290221, + "grad_norm": 0.5405424477707778, + "learning_rate": 1.6515318102341672e-05, + "loss": 0.3324, + "step": 5576 + }, + { + "epoch": 1.0993690851735016, + "grad_norm": 0.5193363546526818, + "learning_rate": 1.651414214133157e-05, + "loss": 0.3423, + "step": 5577 + }, + { + "epoch": 1.0995662460567823, + "grad_norm": 0.5144317929362779, + "learning_rate": 1.651296602381396e-05, + "loss": 0.3228, + "step": 5578 + }, + { + "epoch": 1.099763406940063, + "grad_norm": 0.5119676049482131, + "learning_rate": 1.6511789749817095e-05, + "loss": 0.3118, + "step": 5579 + }, + { + "epoch": 1.099960567823344, + "grad_norm": 0.5105599242984479, + "learning_rate": 1.651061331936924e-05, + "loss": 0.3323, + "step": 5580 + }, + { + "epoch": 1.1001577287066246, + "grad_norm": 0.5080653593030011, + "learning_rate": 1.6509436732498656e-05, + "loss": 0.3318, + "step": 5581 + }, + { + "epoch": 1.1003548895899053, + "grad_norm": 0.5031898165168965, + "learning_rate": 1.650825998923361e-05, + "loss": 0.3222, + "step": 5582 + }, + { + "epoch": 1.100552050473186, + "grad_norm": 0.5226955063741691, + "learning_rate": 1.650708308960238e-05, + "loss": 0.355, + "step": 5583 + }, + { + "epoch": 1.100749211356467, + "grad_norm": 0.5038434520036276, + "learning_rate": 1.6505906033633236e-05, + "loss": 0.3391, + "step": 5584 + }, + { + "epoch": 1.1009463722397477, + "grad_norm": 0.5059125745144066, + "learning_rate": 1.650472882135446e-05, + "loss": 0.3039, + "step": 5585 + }, + { + "epoch": 1.1011435331230284, + "grad_norm": 0.5040045595649187, + "learning_rate": 1.650355145279434e-05, + "loss": 0.3477, + "step": 5586 + }, + { + "epoch": 1.101340694006309, + "grad_norm": 0.5192804609775385, + "learning_rate": 1.6502373927981154e-05, + "loss": 0.33, + "step": 5587 + }, + { + "epoch": 1.10153785488959, + "grad_norm": 0.5922051341334427, + "learning_rate": 1.6501196246943202e-05, + "loss": 0.3541, + "step": 5588 + }, + { + "epoch": 1.1017350157728707, + "grad_norm": 0.49875642200196624, + "learning_rate": 1.650001840970877e-05, + "loss": 0.2838, + "step": 5589 + }, + { + "epoch": 1.1019321766561514, + "grad_norm": 0.4856484105077422, + "learning_rate": 1.6498840416306168e-05, + "loss": 0.3083, + "step": 5590 + }, + { + "epoch": 1.102129337539432, + "grad_norm": 0.5132652807308573, + "learning_rate": 1.6497662266763685e-05, + "loss": 0.315, + "step": 5591 + }, + { + "epoch": 1.102326498422713, + "grad_norm": 0.5193099983968429, + "learning_rate": 1.6496483961109638e-05, + "loss": 0.3029, + "step": 5592 + }, + { + "epoch": 1.1025236593059937, + "grad_norm": 0.5331749318093294, + "learning_rate": 1.649530549937233e-05, + "loss": 0.3383, + "step": 5593 + }, + { + "epoch": 1.1027208201892744, + "grad_norm": 0.5453618564420977, + "learning_rate": 1.6494126881580077e-05, + "loss": 0.3239, + "step": 5594 + }, + { + "epoch": 1.1029179810725551, + "grad_norm": 0.49856539989644866, + "learning_rate": 1.64929481077612e-05, + "loss": 0.3423, + "step": 5595 + }, + { + "epoch": 1.103115141955836, + "grad_norm": 0.5012478925347682, + "learning_rate": 1.649176917794401e-05, + "loss": 0.3283, + "step": 5596 + }, + { + "epoch": 1.1033123028391167, + "grad_norm": 0.5109791832923377, + "learning_rate": 1.649059009215684e-05, + "loss": 0.3039, + "step": 5597 + }, + { + "epoch": 1.1035094637223974, + "grad_norm": 0.5323841890352232, + "learning_rate": 1.6489410850428017e-05, + "loss": 0.3441, + "step": 5598 + }, + { + "epoch": 1.1037066246056781, + "grad_norm": 0.5666261888064066, + "learning_rate": 1.6488231452785867e-05, + "loss": 0.3586, + "step": 5599 + }, + { + "epoch": 1.103903785488959, + "grad_norm": 0.49589632413833346, + "learning_rate": 1.6487051899258738e-05, + "loss": 0.3359, + "step": 5600 + }, + { + "epoch": 1.1041009463722398, + "grad_norm": 0.522410156012698, + "learning_rate": 1.6485872189874962e-05, + "loss": 0.3242, + "step": 5601 + }, + { + "epoch": 1.1042981072555205, + "grad_norm": 0.5598921164556169, + "learning_rate": 1.6484692324662883e-05, + "loss": 0.3353, + "step": 5602 + }, + { + "epoch": 1.1044952681388012, + "grad_norm": 0.565537446173838, + "learning_rate": 1.6483512303650847e-05, + "loss": 0.3196, + "step": 5603 + }, + { + "epoch": 1.104692429022082, + "grad_norm": 0.5611042309344503, + "learning_rate": 1.648233212686721e-05, + "loss": 0.3531, + "step": 5604 + }, + { + "epoch": 1.1048895899053628, + "grad_norm": 0.6104352248715608, + "learning_rate": 1.648115179434032e-05, + "loss": 0.3411, + "step": 5605 + }, + { + "epoch": 1.1050867507886435, + "grad_norm": 0.5182267968343534, + "learning_rate": 1.647997130609854e-05, + "loss": 0.3328, + "step": 5606 + }, + { + "epoch": 1.1052839116719242, + "grad_norm": 0.6629279110385724, + "learning_rate": 1.647879066217023e-05, + "loss": 0.3664, + "step": 5607 + }, + { + "epoch": 1.1054810725552051, + "grad_norm": 0.48783718160836653, + "learning_rate": 1.6477609862583758e-05, + "loss": 0.3184, + "step": 5608 + }, + { + "epoch": 1.1056782334384858, + "grad_norm": 0.5388589810531627, + "learning_rate": 1.6476428907367497e-05, + "loss": 0.322, + "step": 5609 + }, + { + "epoch": 1.1058753943217665, + "grad_norm": 0.6019162312791431, + "learning_rate": 1.647524779654981e-05, + "loss": 0.35, + "step": 5610 + }, + { + "epoch": 1.1060725552050474, + "grad_norm": 0.5227308598811116, + "learning_rate": 1.6474066530159083e-05, + "loss": 0.346, + "step": 5611 + }, + { + "epoch": 1.1062697160883281, + "grad_norm": 0.5024291222077405, + "learning_rate": 1.6472885108223694e-05, + "loss": 0.3115, + "step": 5612 + }, + { + "epoch": 1.1064668769716088, + "grad_norm": 0.5971378494359267, + "learning_rate": 1.647170353077203e-05, + "loss": 0.3421, + "step": 5613 + }, + { + "epoch": 1.1066640378548895, + "grad_norm": 0.5323688610636911, + "learning_rate": 1.647052179783247e-05, + "loss": 0.3685, + "step": 5614 + }, + { + "epoch": 1.1068611987381702, + "grad_norm": 0.5494630004599759, + "learning_rate": 1.646933990943342e-05, + "loss": 0.3596, + "step": 5615 + }, + { + "epoch": 1.1070583596214512, + "grad_norm": 0.5315790166072238, + "learning_rate": 1.6468157865603265e-05, + "loss": 0.326, + "step": 5616 + }, + { + "epoch": 1.1072555205047319, + "grad_norm": 1.095318117653315, + "learning_rate": 1.646697566637041e-05, + "loss": 0.3391, + "step": 5617 + }, + { + "epoch": 1.1074526813880126, + "grad_norm": 0.4910427373938156, + "learning_rate": 1.6465793311763255e-05, + "loss": 0.3082, + "step": 5618 + }, + { + "epoch": 1.1076498422712935, + "grad_norm": 0.5479397492318889, + "learning_rate": 1.646461080181021e-05, + "loss": 0.3344, + "step": 5619 + }, + { + "epoch": 1.1078470031545742, + "grad_norm": 0.5103248145847307, + "learning_rate": 1.6463428136539684e-05, + "loss": 0.324, + "step": 5620 + }, + { + "epoch": 1.1080441640378549, + "grad_norm": 0.5630855688757884, + "learning_rate": 1.6462245315980094e-05, + "loss": 0.3275, + "step": 5621 + }, + { + "epoch": 1.1082413249211356, + "grad_norm": 0.5263509918106348, + "learning_rate": 1.6461062340159853e-05, + "loss": 0.3189, + "step": 5622 + }, + { + "epoch": 1.1084384858044165, + "grad_norm": 0.6493282607434927, + "learning_rate": 1.6459879209107394e-05, + "loss": 0.3394, + "step": 5623 + }, + { + "epoch": 1.1086356466876972, + "grad_norm": 0.6049538585958488, + "learning_rate": 1.6458695922851126e-05, + "loss": 0.3308, + "step": 5624 + }, + { + "epoch": 1.108832807570978, + "grad_norm": 0.531824713225462, + "learning_rate": 1.6457512481419492e-05, + "loss": 0.3412, + "step": 5625 + }, + { + "epoch": 1.1090299684542586, + "grad_norm": 0.5404382119881224, + "learning_rate": 1.6456328884840917e-05, + "loss": 0.3427, + "step": 5626 + }, + { + "epoch": 1.1092271293375395, + "grad_norm": 0.5405048280095781, + "learning_rate": 1.6455145133143843e-05, + "loss": 0.3322, + "step": 5627 + }, + { + "epoch": 1.1094242902208202, + "grad_norm": 0.5808034796183005, + "learning_rate": 1.645396122635671e-05, + "loss": 0.3764, + "step": 5628 + }, + { + "epoch": 1.109621451104101, + "grad_norm": 0.5655517916406795, + "learning_rate": 1.6452777164507957e-05, + "loss": 0.3236, + "step": 5629 + }, + { + "epoch": 1.1098186119873816, + "grad_norm": 0.5394522259534293, + "learning_rate": 1.6451592947626043e-05, + "loss": 0.3317, + "step": 5630 + }, + { + "epoch": 1.1100157728706626, + "grad_norm": 0.5151963061557067, + "learning_rate": 1.6450408575739407e-05, + "loss": 0.3467, + "step": 5631 + }, + { + "epoch": 1.1102129337539433, + "grad_norm": 0.5279890856885844, + "learning_rate": 1.6449224048876512e-05, + "loss": 0.3258, + "step": 5632 + }, + { + "epoch": 1.110410094637224, + "grad_norm": 0.5779114047767783, + "learning_rate": 1.6448039367065816e-05, + "loss": 0.3662, + "step": 5633 + }, + { + "epoch": 1.1106072555205047, + "grad_norm": 0.5019131557840137, + "learning_rate": 1.6446854530335783e-05, + "loss": 0.3201, + "step": 5634 + }, + { + "epoch": 1.1108044164037856, + "grad_norm": 0.5524102955169621, + "learning_rate": 1.6445669538714878e-05, + "loss": 0.3678, + "step": 5635 + }, + { + "epoch": 1.1110015772870663, + "grad_norm": 0.5274386217024158, + "learning_rate": 1.6444484392231574e-05, + "loss": 0.3452, + "step": 5636 + }, + { + "epoch": 1.111198738170347, + "grad_norm": 0.5065419330922859, + "learning_rate": 1.6443299090914336e-05, + "loss": 0.3135, + "step": 5637 + }, + { + "epoch": 1.1113958990536277, + "grad_norm": 0.5321762740305643, + "learning_rate": 1.6442113634791653e-05, + "loss": 0.36, + "step": 5638 + }, + { + "epoch": 1.1115930599369086, + "grad_norm": 0.4997951806163294, + "learning_rate": 1.6440928023892e-05, + "loss": 0.3222, + "step": 5639 + }, + { + "epoch": 1.1117902208201893, + "grad_norm": 0.4876569661311848, + "learning_rate": 1.643974225824387e-05, + "loss": 0.3212, + "step": 5640 + }, + { + "epoch": 1.11198738170347, + "grad_norm": 0.5216914599856519, + "learning_rate": 1.643855633787574e-05, + "loss": 0.3373, + "step": 5641 + }, + { + "epoch": 1.1121845425867507, + "grad_norm": 0.48155980759723177, + "learning_rate": 1.643737026281611e-05, + "loss": 0.3003, + "step": 5642 + }, + { + "epoch": 1.1123817034700316, + "grad_norm": 0.5168478481124772, + "learning_rate": 1.643618403309348e-05, + "loss": 0.3435, + "step": 5643 + }, + { + "epoch": 1.1125788643533123, + "grad_norm": 0.5105355384691713, + "learning_rate": 1.643499764873634e-05, + "loss": 0.3098, + "step": 5644 + }, + { + "epoch": 1.112776025236593, + "grad_norm": 0.5395714096049974, + "learning_rate": 1.6433811109773202e-05, + "loss": 0.3495, + "step": 5645 + }, + { + "epoch": 1.1129731861198737, + "grad_norm": 0.5042000786890821, + "learning_rate": 1.643262441623257e-05, + "loss": 0.3231, + "step": 5646 + }, + { + "epoch": 1.1131703470031546, + "grad_norm": 0.5278311430339191, + "learning_rate": 1.6431437568142956e-05, + "loss": 0.3529, + "step": 5647 + }, + { + "epoch": 1.1133675078864353, + "grad_norm": 0.5180324474360586, + "learning_rate": 1.6430250565532878e-05, + "loss": 0.3456, + "step": 5648 + }, + { + "epoch": 1.113564668769716, + "grad_norm": 0.5081894704748161, + "learning_rate": 1.642906340843085e-05, + "loss": 0.3288, + "step": 5649 + }, + { + "epoch": 1.1137618296529967, + "grad_norm": 0.6664115588691337, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.3455, + "step": 5650 + }, + { + "epoch": 1.1139589905362777, + "grad_norm": 3.5585025573079028, + "learning_rate": 1.642668863086504e-05, + "loss": 0.3337, + "step": 5651 + }, + { + "epoch": 1.1141561514195584, + "grad_norm": 0.5420249391026032, + "learning_rate": 1.642550101045832e-05, + "loss": 0.3192, + "step": 5652 + }, + { + "epoch": 1.114353312302839, + "grad_norm": 0.5862664575213457, + "learning_rate": 1.6424313235673758e-05, + "loss": 0.3308, + "step": 5653 + }, + { + "epoch": 1.1145504731861198, + "grad_norm": 1.8318481156437365, + "learning_rate": 1.6423125306539903e-05, + "loss": 0.3252, + "step": 5654 + }, + { + "epoch": 1.1147476340694007, + "grad_norm": 0.6180923856298878, + "learning_rate": 1.6421937223085284e-05, + "loss": 0.3386, + "step": 5655 + }, + { + "epoch": 1.1149447949526814, + "grad_norm": 0.5427448619206314, + "learning_rate": 1.6420748985338454e-05, + "loss": 0.3212, + "step": 5656 + }, + { + "epoch": 1.115141955835962, + "grad_norm": 0.5267157193078928, + "learning_rate": 1.641956059332796e-05, + "loss": 0.3326, + "step": 5657 + }, + { + "epoch": 1.1153391167192428, + "grad_norm": 0.5512555264071693, + "learning_rate": 1.6418372047082352e-05, + "loss": 0.3224, + "step": 5658 + }, + { + "epoch": 1.1155362776025237, + "grad_norm": 0.5720422317475261, + "learning_rate": 1.6417183346630188e-05, + "loss": 0.334, + "step": 5659 + }, + { + "epoch": 1.1157334384858044, + "grad_norm": 0.5754561614830481, + "learning_rate": 1.6415994492000026e-05, + "loss": 0.3141, + "step": 5660 + }, + { + "epoch": 1.1159305993690851, + "grad_norm": 0.6375296150804535, + "learning_rate": 1.6414805483220426e-05, + "loss": 0.3275, + "step": 5661 + }, + { + "epoch": 1.116127760252366, + "grad_norm": 0.509049237537517, + "learning_rate": 1.641361632031996e-05, + "loss": 0.3239, + "step": 5662 + }, + { + "epoch": 1.1163249211356467, + "grad_norm": 0.5128636795892693, + "learning_rate": 1.64124270033272e-05, + "loss": 0.3159, + "step": 5663 + }, + { + "epoch": 1.1165220820189274, + "grad_norm": 0.4912807390670567, + "learning_rate": 1.6411237532270718e-05, + "loss": 0.3144, + "step": 5664 + }, + { + "epoch": 1.1167192429022081, + "grad_norm": 0.5397224874275655, + "learning_rate": 1.641004790717909e-05, + "loss": 0.3313, + "step": 5665 + }, + { + "epoch": 1.1169164037854888, + "grad_norm": 0.5253163708246812, + "learning_rate": 1.64088581280809e-05, + "loss": 0.3363, + "step": 5666 + }, + { + "epoch": 1.1171135646687698, + "grad_norm": 0.6246187071629071, + "learning_rate": 1.6407668195004726e-05, + "loss": 0.3517, + "step": 5667 + }, + { + "epoch": 1.1173107255520505, + "grad_norm": 0.510212683182266, + "learning_rate": 1.640647810797917e-05, + "loss": 0.3423, + "step": 5668 + }, + { + "epoch": 1.1175078864353312, + "grad_norm": 0.5433994977386873, + "learning_rate": 1.6405287867032816e-05, + "loss": 0.3364, + "step": 5669 + }, + { + "epoch": 1.117705047318612, + "grad_norm": 0.5114129587959397, + "learning_rate": 1.6404097472194264e-05, + "loss": 0.3212, + "step": 5670 + }, + { + "epoch": 1.1179022082018928, + "grad_norm": 0.6184701890817311, + "learning_rate": 1.6402906923492113e-05, + "loss": 0.3586, + "step": 5671 + }, + { + "epoch": 1.1180993690851735, + "grad_norm": 0.5157737817133912, + "learning_rate": 1.6401716220954968e-05, + "loss": 0.3466, + "step": 5672 + }, + { + "epoch": 1.1182965299684542, + "grad_norm": 0.5239425894073806, + "learning_rate": 1.640052536461143e-05, + "loss": 0.3367, + "step": 5673 + }, + { + "epoch": 1.1184936908517351, + "grad_norm": 0.6216718049365948, + "learning_rate": 1.6399334354490123e-05, + "loss": 0.3495, + "step": 5674 + }, + { + "epoch": 1.1186908517350158, + "grad_norm": 0.5111026021999556, + "learning_rate": 1.6398143190619648e-05, + "loss": 0.3212, + "step": 5675 + }, + { + "epoch": 1.1188880126182965, + "grad_norm": 0.49466574177659056, + "learning_rate": 1.6396951873028634e-05, + "loss": 0.3287, + "step": 5676 + }, + { + "epoch": 1.1190851735015772, + "grad_norm": 0.48459040865141534, + "learning_rate": 1.63957604017457e-05, + "loss": 0.3209, + "step": 5677 + }, + { + "epoch": 1.1192823343848581, + "grad_norm": 0.5002774049961711, + "learning_rate": 1.6394568776799472e-05, + "loss": 0.3325, + "step": 5678 + }, + { + "epoch": 1.1194794952681388, + "grad_norm": 0.49713568658847396, + "learning_rate": 1.6393376998218583e-05, + "loss": 0.3456, + "step": 5679 + }, + { + "epoch": 1.1196766561514195, + "grad_norm": 0.49493207831234715, + "learning_rate": 1.6392185066031657e-05, + "loss": 0.3127, + "step": 5680 + }, + { + "epoch": 1.1198738170347002, + "grad_norm": 0.5362871237694371, + "learning_rate": 1.639099298026734e-05, + "loss": 0.3451, + "step": 5681 + }, + { + "epoch": 1.1200709779179812, + "grad_norm": 0.5294651321602112, + "learning_rate": 1.6389800740954268e-05, + "loss": 0.362, + "step": 5682 + }, + { + "epoch": 1.1202681388012619, + "grad_norm": 0.48735850316187274, + "learning_rate": 1.6388608348121088e-05, + "loss": 0.341, + "step": 5683 + }, + { + "epoch": 1.1204652996845426, + "grad_norm": 0.5248544933902106, + "learning_rate": 1.638741580179645e-05, + "loss": 0.3176, + "step": 5684 + }, + { + "epoch": 1.1206624605678233, + "grad_norm": 0.5439891987403144, + "learning_rate": 1.6386223102009e-05, + "loss": 0.359, + "step": 5685 + }, + { + "epoch": 1.1208596214511042, + "grad_norm": 0.5011856573812568, + "learning_rate": 1.6385030248787402e-05, + "loss": 0.33, + "step": 5686 + }, + { + "epoch": 1.1210567823343849, + "grad_norm": 0.4973353261615954, + "learning_rate": 1.6383837242160304e-05, + "loss": 0.3253, + "step": 5687 + }, + { + "epoch": 1.1212539432176656, + "grad_norm": 0.5271878211430877, + "learning_rate": 1.638264408215638e-05, + "loss": 0.3273, + "step": 5688 + }, + { + "epoch": 1.1214511041009463, + "grad_norm": 0.5007462516405465, + "learning_rate": 1.6381450768804293e-05, + "loss": 0.3566, + "step": 5689 + }, + { + "epoch": 1.1216482649842272, + "grad_norm": 0.5032257739082565, + "learning_rate": 1.638025730213271e-05, + "loss": 0.3189, + "step": 5690 + }, + { + "epoch": 1.121845425867508, + "grad_norm": 0.5057418106825747, + "learning_rate": 1.637906368217031e-05, + "loss": 0.3296, + "step": 5691 + }, + { + "epoch": 1.1220425867507886, + "grad_norm": 0.5220864858440248, + "learning_rate": 1.6377869908945763e-05, + "loss": 0.3013, + "step": 5692 + }, + { + "epoch": 1.1222397476340693, + "grad_norm": 0.49972633540948586, + "learning_rate": 1.637667598248776e-05, + "loss": 0.3246, + "step": 5693 + }, + { + "epoch": 1.1224369085173502, + "grad_norm": 0.48521860052546617, + "learning_rate": 1.6375481902824975e-05, + "loss": 0.3067, + "step": 5694 + }, + { + "epoch": 1.122634069400631, + "grad_norm": 0.5137855325686259, + "learning_rate": 1.6374287669986108e-05, + "loss": 0.333, + "step": 5695 + }, + { + "epoch": 1.1228312302839116, + "grad_norm": 0.5053341438549699, + "learning_rate": 1.637309328399985e-05, + "loss": 0.3131, + "step": 5696 + }, + { + "epoch": 1.1230283911671923, + "grad_norm": 0.4811130427810911, + "learning_rate": 1.6371898744894888e-05, + "loss": 0.2932, + "step": 5697 + }, + { + "epoch": 1.1232255520504733, + "grad_norm": 0.4692455927554586, + "learning_rate": 1.6370704052699927e-05, + "loss": 0.3081, + "step": 5698 + }, + { + "epoch": 1.123422712933754, + "grad_norm": 0.46390772652017165, + "learning_rate": 1.6369509207443676e-05, + "loss": 0.2891, + "step": 5699 + }, + { + "epoch": 1.1236198738170347, + "grad_norm": 0.5387624182052491, + "learning_rate": 1.6368314209154836e-05, + "loss": 0.357, + "step": 5700 + }, + { + "epoch": 1.1238170347003154, + "grad_norm": 0.4959986328725572, + "learning_rate": 1.6367119057862115e-05, + "loss": 0.3191, + "step": 5701 + }, + { + "epoch": 1.1240141955835963, + "grad_norm": 0.5071814974603067, + "learning_rate": 1.636592375359423e-05, + "loss": 0.3324, + "step": 5702 + }, + { + "epoch": 1.124211356466877, + "grad_norm": 0.5104341041644923, + "learning_rate": 1.6364728296379904e-05, + "loss": 0.3506, + "step": 5703 + }, + { + "epoch": 1.1244085173501577, + "grad_norm": 0.5464624679656105, + "learning_rate": 1.6363532686247853e-05, + "loss": 0.3123, + "step": 5704 + }, + { + "epoch": 1.1246056782334386, + "grad_norm": 0.5028129404728894, + "learning_rate": 1.636233692322681e-05, + "loss": 0.307, + "step": 5705 + }, + { + "epoch": 1.1248028391167193, + "grad_norm": 0.6152702419910974, + "learning_rate": 1.6361141007345494e-05, + "loss": 0.3553, + "step": 5706 + }, + { + "epoch": 1.125, + "grad_norm": 0.48179193603259657, + "learning_rate": 1.6359944938632645e-05, + "loss": 0.3147, + "step": 5707 + }, + { + "epoch": 1.1251971608832807, + "grad_norm": 0.5598824963334975, + "learning_rate": 1.6358748717116993e-05, + "loss": 0.3402, + "step": 5708 + }, + { + "epoch": 1.1253943217665614, + "grad_norm": 0.5571114963157853, + "learning_rate": 1.6357552342827284e-05, + "loss": 0.3206, + "step": 5709 + }, + { + "epoch": 1.1255914826498423, + "grad_norm": 0.4925155006030104, + "learning_rate": 1.6356355815792263e-05, + "loss": 0.3077, + "step": 5710 + }, + { + "epoch": 1.125788643533123, + "grad_norm": 0.6517198961949976, + "learning_rate": 1.635515913604067e-05, + "loss": 0.3521, + "step": 5711 + }, + { + "epoch": 1.1259858044164037, + "grad_norm": 0.5747480536099305, + "learning_rate": 1.6353962303601266e-05, + "loss": 0.364, + "step": 5712 + }, + { + "epoch": 1.1261829652996846, + "grad_norm": 0.5712666247584104, + "learning_rate": 1.63527653185028e-05, + "loss": 0.3705, + "step": 5713 + }, + { + "epoch": 1.1263801261829653, + "grad_norm": 0.5587416233409346, + "learning_rate": 1.6351568180774033e-05, + "loss": 0.319, + "step": 5714 + }, + { + "epoch": 1.126577287066246, + "grad_norm": 0.5358691155957552, + "learning_rate": 1.635037089044372e-05, + "loss": 0.3336, + "step": 5715 + }, + { + "epoch": 1.1267744479495267, + "grad_norm": 0.5829710216948807, + "learning_rate": 1.6349173447540634e-05, + "loss": 0.3508, + "step": 5716 + }, + { + "epoch": 1.1269716088328074, + "grad_norm": 0.5281339705441374, + "learning_rate": 1.6347975852093544e-05, + "loss": 0.3266, + "step": 5717 + }, + { + "epoch": 1.1271687697160884, + "grad_norm": 0.5300750941636183, + "learning_rate": 1.6346778104131222e-05, + "loss": 0.3367, + "step": 5718 + }, + { + "epoch": 1.127365930599369, + "grad_norm": 0.5660626646152135, + "learning_rate": 1.634558020368245e-05, + "loss": 0.3543, + "step": 5719 + }, + { + "epoch": 1.1275630914826498, + "grad_norm": 0.4900690395273459, + "learning_rate": 1.6344382150775994e-05, + "loss": 0.2959, + "step": 5720 + }, + { + "epoch": 1.1277602523659307, + "grad_norm": 0.5929670034437629, + "learning_rate": 1.6343183945440656e-05, + "loss": 0.3446, + "step": 5721 + }, + { + "epoch": 1.1279574132492114, + "grad_norm": 0.5581429670822288, + "learning_rate": 1.634198558770521e-05, + "loss": 0.3424, + "step": 5722 + }, + { + "epoch": 1.128154574132492, + "grad_norm": 0.5554338618967727, + "learning_rate": 1.6340787077598456e-05, + "loss": 0.3407, + "step": 5723 + }, + { + "epoch": 1.1283517350157728, + "grad_norm": 0.4917238000036681, + "learning_rate": 1.6339588415149186e-05, + "loss": 0.3234, + "step": 5724 + }, + { + "epoch": 1.1285488958990537, + "grad_norm": 0.6106081461017013, + "learning_rate": 1.63383896003862e-05, + "loss": 0.3239, + "step": 5725 + }, + { + "epoch": 1.1287460567823344, + "grad_norm": 0.5010753713189542, + "learning_rate": 1.6337190633338294e-05, + "loss": 0.3191, + "step": 5726 + }, + { + "epoch": 1.1289432176656151, + "grad_norm": 0.48674816598759957, + "learning_rate": 1.6335991514034283e-05, + "loss": 0.2984, + "step": 5727 + }, + { + "epoch": 1.1291403785488958, + "grad_norm": 0.505286321715634, + "learning_rate": 1.6334792242502978e-05, + "loss": 0.3447, + "step": 5728 + }, + { + "epoch": 1.1293375394321767, + "grad_norm": 0.5524393690270945, + "learning_rate": 1.633359281877318e-05, + "loss": 0.3392, + "step": 5729 + }, + { + "epoch": 1.1295347003154574, + "grad_norm": 0.49196800593653534, + "learning_rate": 1.633239324287372e-05, + "loss": 0.3213, + "step": 5730 + }, + { + "epoch": 1.1297318611987381, + "grad_norm": 0.5143287994894329, + "learning_rate": 1.6331193514833413e-05, + "loss": 0.3173, + "step": 5731 + }, + { + "epoch": 1.1299290220820188, + "grad_norm": 0.5176908223126655, + "learning_rate": 1.632999363468108e-05, + "loss": 0.331, + "step": 5732 + }, + { + "epoch": 1.1301261829652998, + "grad_norm": 0.5267013800473567, + "learning_rate": 1.6328793602445553e-05, + "loss": 0.3564, + "step": 5733 + }, + { + "epoch": 1.1303233438485805, + "grad_norm": 0.5075643399395108, + "learning_rate": 1.6327593418155667e-05, + "loss": 0.3161, + "step": 5734 + }, + { + "epoch": 1.1305205047318612, + "grad_norm": 0.5262025072548631, + "learning_rate": 1.632639308184025e-05, + "loss": 0.3345, + "step": 5735 + }, + { + "epoch": 1.1307176656151419, + "grad_norm": 0.5292704648957198, + "learning_rate": 1.632519259352814e-05, + "loss": 0.3492, + "step": 5736 + }, + { + "epoch": 1.1309148264984228, + "grad_norm": 0.48829983713727304, + "learning_rate": 1.6323991953248192e-05, + "loss": 0.3171, + "step": 5737 + }, + { + "epoch": 1.1311119873817035, + "grad_norm": 0.4988512975455296, + "learning_rate": 1.6322791161029245e-05, + "loss": 0.3317, + "step": 5738 + }, + { + "epoch": 1.1313091482649842, + "grad_norm": 0.5269442131354157, + "learning_rate": 1.6321590216900146e-05, + "loss": 0.3308, + "step": 5739 + }, + { + "epoch": 1.1315063091482649, + "grad_norm": 0.4902800227785573, + "learning_rate": 1.632038912088975e-05, + "loss": 0.3061, + "step": 5740 + }, + { + "epoch": 1.1317034700315458, + "grad_norm": 0.5247877126830968, + "learning_rate": 1.6319187873026917e-05, + "loss": 0.3197, + "step": 5741 + }, + { + "epoch": 1.1319006309148265, + "grad_norm": 0.5541688059243428, + "learning_rate": 1.6317986473340504e-05, + "loss": 0.3295, + "step": 5742 + }, + { + "epoch": 1.1320977917981072, + "grad_norm": 0.5564999524149586, + "learning_rate": 1.631678492185938e-05, + "loss": 0.3419, + "step": 5743 + }, + { + "epoch": 1.132294952681388, + "grad_norm": 0.4751388780686587, + "learning_rate": 1.631558321861241e-05, + "loss": 0.3068, + "step": 5744 + }, + { + "epoch": 1.1324921135646688, + "grad_norm": 0.5251581221447832, + "learning_rate": 1.631438136362847e-05, + "loss": 0.3438, + "step": 5745 + }, + { + "epoch": 1.1326892744479495, + "grad_norm": 0.49557848543104777, + "learning_rate": 1.6313179356936432e-05, + "loss": 0.3336, + "step": 5746 + }, + { + "epoch": 1.1328864353312302, + "grad_norm": 0.5759205928280032, + "learning_rate": 1.631197719856518e-05, + "loss": 0.341, + "step": 5747 + }, + { + "epoch": 1.1330835962145112, + "grad_norm": 0.5279003018602465, + "learning_rate": 1.6310774888543584e-05, + "loss": 0.3361, + "step": 5748 + }, + { + "epoch": 1.1332807570977919, + "grad_norm": 0.5105221641137494, + "learning_rate": 1.6309572426900544e-05, + "loss": 0.3463, + "step": 5749 + }, + { + "epoch": 1.1334779179810726, + "grad_norm": 0.4979241157777894, + "learning_rate": 1.6308369813664945e-05, + "loss": 0.3316, + "step": 5750 + }, + { + "epoch": 1.1336750788643533, + "grad_norm": 0.480147781148586, + "learning_rate": 1.630716704886568e-05, + "loss": 0.3278, + "step": 5751 + }, + { + "epoch": 1.133872239747634, + "grad_norm": 0.5190158936682682, + "learning_rate": 1.630596413253165e-05, + "loss": 0.3487, + "step": 5752 + }, + { + "epoch": 1.1340694006309149, + "grad_norm": 0.48721620702672924, + "learning_rate": 1.6304761064691752e-05, + "loss": 0.331, + "step": 5753 + }, + { + "epoch": 1.1342665615141956, + "grad_norm": 0.48669878145102674, + "learning_rate": 1.6303557845374894e-05, + "loss": 0.3157, + "step": 5754 + }, + { + "epoch": 1.1344637223974763, + "grad_norm": 0.48762645113611275, + "learning_rate": 1.630235447460998e-05, + "loss": 0.3414, + "step": 5755 + }, + { + "epoch": 1.1346608832807572, + "grad_norm": 0.45531042840793384, + "learning_rate": 1.6301150952425925e-05, + "loss": 0.3097, + "step": 5756 + }, + { + "epoch": 1.134858044164038, + "grad_norm": 0.5085651272774692, + "learning_rate": 1.6299947278851644e-05, + "loss": 0.333, + "step": 5757 + }, + { + "epoch": 1.1350552050473186, + "grad_norm": 0.4893879791327469, + "learning_rate": 1.6298743453916057e-05, + "loss": 0.3044, + "step": 5758 + }, + { + "epoch": 1.1352523659305993, + "grad_norm": 0.5589441744381375, + "learning_rate": 1.6297539477648087e-05, + "loss": 0.3467, + "step": 5759 + }, + { + "epoch": 1.13544952681388, + "grad_norm": 0.4815754124533146, + "learning_rate": 1.6296335350076658e-05, + "loss": 0.3328, + "step": 5760 + }, + { + "epoch": 1.135646687697161, + "grad_norm": 0.4867892480819176, + "learning_rate": 1.6295131071230704e-05, + "loss": 0.3256, + "step": 5761 + }, + { + "epoch": 1.1358438485804416, + "grad_norm": 0.5067944820954798, + "learning_rate": 1.6293926641139154e-05, + "loss": 0.3323, + "step": 5762 + }, + { + "epoch": 1.1360410094637223, + "grad_norm": 0.5113632611165255, + "learning_rate": 1.629272205983095e-05, + "loss": 0.3311, + "step": 5763 + }, + { + "epoch": 1.1362381703470033, + "grad_norm": 0.5560655366878887, + "learning_rate": 1.6291517327335027e-05, + "loss": 0.3531, + "step": 5764 + }, + { + "epoch": 1.136435331230284, + "grad_norm": 0.5079516665115742, + "learning_rate": 1.6290312443680335e-05, + "loss": 0.3268, + "step": 5765 + }, + { + "epoch": 1.1366324921135647, + "grad_norm": 0.49476357370054935, + "learning_rate": 1.6289107408895827e-05, + "loss": 0.3199, + "step": 5766 + }, + { + "epoch": 1.1368296529968454, + "grad_norm": 0.5068561554934843, + "learning_rate": 1.6287902223010442e-05, + "loss": 0.3093, + "step": 5767 + }, + { + "epoch": 1.1370268138801263, + "grad_norm": 0.5241913684270388, + "learning_rate": 1.6286696886053146e-05, + "loss": 0.3442, + "step": 5768 + }, + { + "epoch": 1.137223974763407, + "grad_norm": 0.5303723352436314, + "learning_rate": 1.6285491398052892e-05, + "loss": 0.3473, + "step": 5769 + }, + { + "epoch": 1.1374211356466877, + "grad_norm": 0.5013219330856153, + "learning_rate": 1.6284285759038647e-05, + "loss": 0.3334, + "step": 5770 + }, + { + "epoch": 1.1376182965299684, + "grad_norm": 0.5116295193399402, + "learning_rate": 1.6283079969039376e-05, + "loss": 0.3348, + "step": 5771 + }, + { + "epoch": 1.1378154574132493, + "grad_norm": 0.5072830412189899, + "learning_rate": 1.6281874028084048e-05, + "loss": 0.3304, + "step": 5772 + }, + { + "epoch": 1.13801261829653, + "grad_norm": 0.5969770845470891, + "learning_rate": 1.6280667936201638e-05, + "loss": 0.3693, + "step": 5773 + }, + { + "epoch": 1.1382097791798107, + "grad_norm": 0.49707202160221337, + "learning_rate": 1.6279461693421122e-05, + "loss": 0.3092, + "step": 5774 + }, + { + "epoch": 1.1384069400630914, + "grad_norm": 0.48306958026968283, + "learning_rate": 1.6278255299771485e-05, + "loss": 0.3133, + "step": 5775 + }, + { + "epoch": 1.1386041009463723, + "grad_norm": 0.49732568766268076, + "learning_rate": 1.6277048755281706e-05, + "loss": 0.3133, + "step": 5776 + }, + { + "epoch": 1.138801261829653, + "grad_norm": 0.49440667619738315, + "learning_rate": 1.6275842059980777e-05, + "loss": 0.3081, + "step": 5777 + }, + { + "epoch": 1.1389984227129337, + "grad_norm": 0.49394133306858135, + "learning_rate": 1.627463521389769e-05, + "loss": 0.3075, + "step": 5778 + }, + { + "epoch": 1.1391955835962144, + "grad_norm": 0.5006133223548448, + "learning_rate": 1.6273428217061438e-05, + "loss": 0.3382, + "step": 5779 + }, + { + "epoch": 1.1393927444794953, + "grad_norm": 0.5094072983950105, + "learning_rate": 1.627222106950102e-05, + "loss": 0.3195, + "step": 5780 + }, + { + "epoch": 1.139589905362776, + "grad_norm": 0.4911846217605256, + "learning_rate": 1.627101377124544e-05, + "loss": 0.3157, + "step": 5781 + }, + { + "epoch": 1.1397870662460567, + "grad_norm": 0.5197657629535235, + "learning_rate": 1.626980632232371e-05, + "loss": 0.3455, + "step": 5782 + }, + { + "epoch": 1.1399842271293374, + "grad_norm": 0.5165791218182875, + "learning_rate": 1.6268598722764825e-05, + "loss": 0.3571, + "step": 5783 + }, + { + "epoch": 1.1401813880126184, + "grad_norm": 0.4911136281917946, + "learning_rate": 1.6267390972597808e-05, + "loss": 0.3258, + "step": 5784 + }, + { + "epoch": 1.140378548895899, + "grad_norm": 0.4791334939004954, + "learning_rate": 1.626618307185168e-05, + "loss": 0.313, + "step": 5785 + }, + { + "epoch": 1.1405757097791798, + "grad_norm": 0.507257163220278, + "learning_rate": 1.626497502055546e-05, + "loss": 0.3348, + "step": 5786 + }, + { + "epoch": 1.1407728706624605, + "grad_norm": 0.5095261380886309, + "learning_rate": 1.6263766818738163e-05, + "loss": 0.3352, + "step": 5787 + }, + { + "epoch": 1.1409700315457414, + "grad_norm": 0.489168264070942, + "learning_rate": 1.6262558466428827e-05, + "loss": 0.3181, + "step": 5788 + }, + { + "epoch": 1.141167192429022, + "grad_norm": 0.5758632623492379, + "learning_rate": 1.6261349963656482e-05, + "loss": 0.3221, + "step": 5789 + }, + { + "epoch": 1.1413643533123028, + "grad_norm": 0.5155189277830298, + "learning_rate": 1.6260141310450158e-05, + "loss": 0.3378, + "step": 5790 + }, + { + "epoch": 1.1415615141955837, + "grad_norm": 0.5361821367760298, + "learning_rate": 1.6258932506838903e-05, + "loss": 0.3548, + "step": 5791 + }, + { + "epoch": 1.1417586750788644, + "grad_norm": 0.510073475492786, + "learning_rate": 1.6257723552851752e-05, + "loss": 0.3259, + "step": 5792 + }, + { + "epoch": 1.1419558359621451, + "grad_norm": 0.5181336652283103, + "learning_rate": 1.6256514448517753e-05, + "loss": 0.3181, + "step": 5793 + }, + { + "epoch": 1.1421529968454258, + "grad_norm": 0.5218135564565195, + "learning_rate": 1.6255305193865957e-05, + "loss": 0.3269, + "step": 5794 + }, + { + "epoch": 1.1423501577287065, + "grad_norm": 0.6090159220142208, + "learning_rate": 1.6254095788925413e-05, + "loss": 0.3487, + "step": 5795 + }, + { + "epoch": 1.1425473186119874, + "grad_norm": 0.4918572565910208, + "learning_rate": 1.6252886233725186e-05, + "loss": 0.3222, + "step": 5796 + }, + { + "epoch": 1.1427444794952681, + "grad_norm": 0.5170092942493402, + "learning_rate": 1.625167652829433e-05, + "loss": 0.3075, + "step": 5797 + }, + { + "epoch": 1.1429416403785488, + "grad_norm": 0.5153552400826925, + "learning_rate": 1.625046667266191e-05, + "loss": 0.3552, + "step": 5798 + }, + { + "epoch": 1.1431388012618298, + "grad_norm": 0.5092426572265226, + "learning_rate": 1.6249256666856995e-05, + "loss": 0.2974, + "step": 5799 + }, + { + "epoch": 1.1433359621451105, + "grad_norm": 0.5381814851040011, + "learning_rate": 1.6248046510908654e-05, + "loss": 0.3186, + "step": 5800 + }, + { + "epoch": 1.1435331230283912, + "grad_norm": 0.5393871623205769, + "learning_rate": 1.6246836204845967e-05, + "loss": 0.3503, + "step": 5801 + }, + { + "epoch": 1.1437302839116719, + "grad_norm": 0.5047753301047979, + "learning_rate": 1.624562574869801e-05, + "loss": 0.3232, + "step": 5802 + }, + { + "epoch": 1.1439274447949526, + "grad_norm": 0.5134380411824181, + "learning_rate": 1.6244415142493867e-05, + "loss": 0.3374, + "step": 5803 + }, + { + "epoch": 1.1441246056782335, + "grad_norm": 0.5382530413103754, + "learning_rate": 1.6243204386262618e-05, + "loss": 0.3472, + "step": 5804 + }, + { + "epoch": 1.1443217665615142, + "grad_norm": 0.4860783558028898, + "learning_rate": 1.6241993480033353e-05, + "loss": 0.2894, + "step": 5805 + }, + { + "epoch": 1.1445189274447949, + "grad_norm": 0.5360606741317754, + "learning_rate": 1.6240782423835174e-05, + "loss": 0.3526, + "step": 5806 + }, + { + "epoch": 1.1447160883280758, + "grad_norm": 0.4950368935661035, + "learning_rate": 1.6239571217697164e-05, + "loss": 0.3282, + "step": 5807 + }, + { + "epoch": 1.1449132492113565, + "grad_norm": 0.5007041011557887, + "learning_rate": 1.6238359861648438e-05, + "loss": 0.3199, + "step": 5808 + }, + { + "epoch": 1.1451104100946372, + "grad_norm": 0.4866543775966263, + "learning_rate": 1.6237148355718092e-05, + "loss": 0.3306, + "step": 5809 + }, + { + "epoch": 1.145307570977918, + "grad_norm": 0.4824258516786147, + "learning_rate": 1.623593669993523e-05, + "loss": 0.3098, + "step": 5810 + }, + { + "epoch": 1.1455047318611988, + "grad_norm": 0.4805879120897678, + "learning_rate": 1.623472489432897e-05, + "loss": 0.3024, + "step": 5811 + }, + { + "epoch": 1.1457018927444795, + "grad_norm": 0.5450607872650646, + "learning_rate": 1.623351293892842e-05, + "loss": 0.3426, + "step": 5812 + }, + { + "epoch": 1.1458990536277602, + "grad_norm": 0.49414340389336664, + "learning_rate": 1.623230083376271e-05, + "loss": 0.3353, + "step": 5813 + }, + { + "epoch": 1.146096214511041, + "grad_norm": 0.5993703898368199, + "learning_rate": 1.6231088578860946e-05, + "loss": 0.3437, + "step": 5814 + }, + { + "epoch": 1.1462933753943219, + "grad_norm": 0.5174297786339515, + "learning_rate": 1.6229876174252265e-05, + "loss": 0.3284, + "step": 5815 + }, + { + "epoch": 1.1464905362776026, + "grad_norm": 0.49563315628009424, + "learning_rate": 1.6228663619965787e-05, + "loss": 0.316, + "step": 5816 + }, + { + "epoch": 1.1466876971608833, + "grad_norm": 0.5185380709534432, + "learning_rate": 1.6227450916030655e-05, + "loss": 0.3324, + "step": 5817 + }, + { + "epoch": 1.146884858044164, + "grad_norm": 0.517728568283838, + "learning_rate": 1.6226238062476e-05, + "loss": 0.3301, + "step": 5818 + }, + { + "epoch": 1.1470820189274449, + "grad_norm": 0.5337322717327413, + "learning_rate": 1.6225025059330954e-05, + "loss": 0.3549, + "step": 5819 + }, + { + "epoch": 1.1472791798107256, + "grad_norm": 0.51659644720446, + "learning_rate": 1.6223811906624675e-05, + "loss": 0.3304, + "step": 5820 + }, + { + "epoch": 1.1474763406940063, + "grad_norm": 0.5135534115538601, + "learning_rate": 1.6222598604386303e-05, + "loss": 0.3253, + "step": 5821 + }, + { + "epoch": 1.147673501577287, + "grad_norm": 0.5683240033324098, + "learning_rate": 1.6221385152644986e-05, + "loss": 0.3719, + "step": 5822 + }, + { + "epoch": 1.147870662460568, + "grad_norm": 0.5006723049086741, + "learning_rate": 1.622017155142988e-05, + "loss": 0.3163, + "step": 5823 + }, + { + "epoch": 1.1480678233438486, + "grad_norm": 0.5148405356777013, + "learning_rate": 1.6218957800770146e-05, + "loss": 0.3548, + "step": 5824 + }, + { + "epoch": 1.1482649842271293, + "grad_norm": 0.5750083625058464, + "learning_rate": 1.621774390069494e-05, + "loss": 0.3561, + "step": 5825 + }, + { + "epoch": 1.14846214511041, + "grad_norm": 0.5165474137396464, + "learning_rate": 1.621652985123343e-05, + "loss": 0.3339, + "step": 5826 + }, + { + "epoch": 1.148659305993691, + "grad_norm": 0.5084589221237357, + "learning_rate": 1.6215315652414786e-05, + "loss": 0.3318, + "step": 5827 + }, + { + "epoch": 1.1488564668769716, + "grad_norm": 0.5035248616427002, + "learning_rate": 1.6214101304268177e-05, + "loss": 0.338, + "step": 5828 + }, + { + "epoch": 1.1490536277602523, + "grad_norm": 0.5544001188077334, + "learning_rate": 1.621288680682278e-05, + "loss": 0.3366, + "step": 5829 + }, + { + "epoch": 1.149250788643533, + "grad_norm": 0.5160867252412951, + "learning_rate": 1.6211672160107776e-05, + "loss": 0.3518, + "step": 5830 + }, + { + "epoch": 1.149447949526814, + "grad_norm": 0.5010643439725077, + "learning_rate": 1.6210457364152345e-05, + "loss": 0.3436, + "step": 5831 + }, + { + "epoch": 1.1496451104100947, + "grad_norm": 0.5106652676242482, + "learning_rate": 1.6209242418985673e-05, + "loss": 0.3226, + "step": 5832 + }, + { + "epoch": 1.1498422712933754, + "grad_norm": 0.48330155765068944, + "learning_rate": 1.6208027324636956e-05, + "loss": 0.3091, + "step": 5833 + }, + { + "epoch": 1.1500394321766563, + "grad_norm": 0.5026909229195045, + "learning_rate": 1.620681208113538e-05, + "loss": 0.313, + "step": 5834 + }, + { + "epoch": 1.150236593059937, + "grad_norm": 0.4919091752987136, + "learning_rate": 1.6205596688510144e-05, + "loss": 0.3118, + "step": 5835 + }, + { + "epoch": 1.1504337539432177, + "grad_norm": 0.4987724584414972, + "learning_rate": 1.6204381146790452e-05, + "loss": 0.3159, + "step": 5836 + }, + { + "epoch": 1.1506309148264984, + "grad_norm": 0.47351362341946696, + "learning_rate": 1.6203165456005505e-05, + "loss": 0.3215, + "step": 5837 + }, + { + "epoch": 1.150828075709779, + "grad_norm": 0.5390815160396467, + "learning_rate": 1.6201949616184515e-05, + "loss": 0.3512, + "step": 5838 + }, + { + "epoch": 1.15102523659306, + "grad_norm": 0.4907164796379062, + "learning_rate": 1.620073362735669e-05, + "loss": 0.3351, + "step": 5839 + }, + { + "epoch": 1.1512223974763407, + "grad_norm": 0.5183309334339059, + "learning_rate": 1.6199517489551246e-05, + "loss": 0.3471, + "step": 5840 + }, + { + "epoch": 1.1514195583596214, + "grad_norm": 0.5094113386762072, + "learning_rate": 1.61983012027974e-05, + "loss": 0.3393, + "step": 5841 + }, + { + "epoch": 1.1516167192429023, + "grad_norm": 0.47916372568319, + "learning_rate": 1.6197084767124378e-05, + "loss": 0.3002, + "step": 5842 + }, + { + "epoch": 1.151813880126183, + "grad_norm": 0.5554961928781669, + "learning_rate": 1.619586818256141e-05, + "loss": 0.3465, + "step": 5843 + }, + { + "epoch": 1.1520110410094637, + "grad_norm": 0.4760523166959915, + "learning_rate": 1.6194651449137708e-05, + "loss": 0.315, + "step": 5844 + }, + { + "epoch": 1.1522082018927444, + "grad_norm": 0.49772863471397966, + "learning_rate": 1.6193434566882522e-05, + "loss": 0.3392, + "step": 5845 + }, + { + "epoch": 1.1524053627760251, + "grad_norm": 0.49931835312839523, + "learning_rate": 1.6192217535825084e-05, + "loss": 0.3259, + "step": 5846 + }, + { + "epoch": 1.152602523659306, + "grad_norm": 0.4935523430213534, + "learning_rate": 1.619100035599463e-05, + "loss": 0.3188, + "step": 5847 + }, + { + "epoch": 1.1527996845425867, + "grad_norm": 0.5009652113164703, + "learning_rate": 1.618978302742041e-05, + "loss": 0.3017, + "step": 5848 + }, + { + "epoch": 1.1529968454258674, + "grad_norm": 0.513184184558755, + "learning_rate": 1.6188565550131667e-05, + "loss": 0.3389, + "step": 5849 + }, + { + "epoch": 1.1531940063091484, + "grad_norm": 0.5120199704705086, + "learning_rate": 1.6187347924157654e-05, + "loss": 0.3272, + "step": 5850 + }, + { + "epoch": 1.153391167192429, + "grad_norm": 0.55709606253943, + "learning_rate": 1.618613014952762e-05, + "loss": 0.3511, + "step": 5851 + }, + { + "epoch": 1.1535883280757098, + "grad_norm": 0.528693155299282, + "learning_rate": 1.6184912226270833e-05, + "loss": 0.3246, + "step": 5852 + }, + { + "epoch": 1.1537854889589905, + "grad_norm": 0.4926792899919863, + "learning_rate": 1.6183694154416548e-05, + "loss": 0.3401, + "step": 5853 + }, + { + "epoch": 1.1539826498422712, + "grad_norm": 0.5247479545911596, + "learning_rate": 1.618247593399403e-05, + "loss": 0.3147, + "step": 5854 + }, + { + "epoch": 1.154179810725552, + "grad_norm": 0.5391139938461473, + "learning_rate": 1.6181257565032548e-05, + "loss": 0.3424, + "step": 5855 + }, + { + "epoch": 1.1543769716088328, + "grad_norm": 0.5379238676508531, + "learning_rate": 1.6180039047561375e-05, + "loss": 0.3489, + "step": 5856 + }, + { + "epoch": 1.1545741324921135, + "grad_norm": 0.5433587033942305, + "learning_rate": 1.6178820381609793e-05, + "loss": 0.3583, + "step": 5857 + }, + { + "epoch": 1.1547712933753944, + "grad_norm": 0.5515081493988264, + "learning_rate": 1.617760156720707e-05, + "loss": 0.3409, + "step": 5858 + }, + { + "epoch": 1.1549684542586751, + "grad_norm": 0.5211754285676179, + "learning_rate": 1.617638260438249e-05, + "loss": 0.3441, + "step": 5859 + }, + { + "epoch": 1.1551656151419558, + "grad_norm": 0.5728695227853102, + "learning_rate": 1.6175163493165353e-05, + "loss": 0.3692, + "step": 5860 + }, + { + "epoch": 1.1553627760252365, + "grad_norm": 0.5398536734238296, + "learning_rate": 1.6173944233584936e-05, + "loss": 0.3333, + "step": 5861 + }, + { + "epoch": 1.1555599369085174, + "grad_norm": 0.5261150430536126, + "learning_rate": 1.6172724825670537e-05, + "loss": 0.3279, + "step": 5862 + }, + { + "epoch": 1.1557570977917981, + "grad_norm": 0.5106454983015667, + "learning_rate": 1.6171505269451456e-05, + "loss": 0.339, + "step": 5863 + }, + { + "epoch": 1.1559542586750788, + "grad_norm": 0.5055443785259361, + "learning_rate": 1.617028556495699e-05, + "loss": 0.2946, + "step": 5864 + }, + { + "epoch": 1.1561514195583595, + "grad_norm": 0.5803137026274476, + "learning_rate": 1.6169065712216444e-05, + "loss": 0.3789, + "step": 5865 + }, + { + "epoch": 1.1563485804416405, + "grad_norm": 0.512878015699482, + "learning_rate": 1.6167845711259123e-05, + "loss": 0.3469, + "step": 5866 + }, + { + "epoch": 1.1565457413249212, + "grad_norm": 0.5349976563998219, + "learning_rate": 1.6166625562114347e-05, + "loss": 0.3428, + "step": 5867 + }, + { + "epoch": 1.1567429022082019, + "grad_norm": 0.46278679315128507, + "learning_rate": 1.616540526481142e-05, + "loss": 0.2879, + "step": 5868 + }, + { + "epoch": 1.1569400630914826, + "grad_norm": 0.5228615238771711, + "learning_rate": 1.6164184819379673e-05, + "loss": 0.33, + "step": 5869 + }, + { + "epoch": 1.1571372239747635, + "grad_norm": 0.5253461017754688, + "learning_rate": 1.6162964225848416e-05, + "loss": 0.3451, + "step": 5870 + }, + { + "epoch": 1.1573343848580442, + "grad_norm": 0.5368330566994861, + "learning_rate": 1.6161743484246987e-05, + "loss": 0.32, + "step": 5871 + }, + { + "epoch": 1.1575315457413249, + "grad_norm": 0.5655488658811362, + "learning_rate": 1.6160522594604704e-05, + "loss": 0.3267, + "step": 5872 + }, + { + "epoch": 1.1577287066246056, + "grad_norm": 0.553771815896319, + "learning_rate": 1.6159301556950904e-05, + "loss": 0.3338, + "step": 5873 + }, + { + "epoch": 1.1579258675078865, + "grad_norm": 0.49679032602454654, + "learning_rate": 1.6158080371314926e-05, + "loss": 0.3207, + "step": 5874 + }, + { + "epoch": 1.1581230283911672, + "grad_norm": 0.5110047440222282, + "learning_rate": 1.6156859037726108e-05, + "loss": 0.3273, + "step": 5875 + }, + { + "epoch": 1.158320189274448, + "grad_norm": 0.5164914172279298, + "learning_rate": 1.6155637556213793e-05, + "loss": 0.3248, + "step": 5876 + }, + { + "epoch": 1.1585173501577288, + "grad_norm": 0.5120868579780778, + "learning_rate": 1.6154415926807327e-05, + "loss": 0.3603, + "step": 5877 + }, + { + "epoch": 1.1587145110410095, + "grad_norm": 0.4585506611956392, + "learning_rate": 1.6153194149536064e-05, + "loss": 0.2974, + "step": 5878 + }, + { + "epoch": 1.1589116719242902, + "grad_norm": 0.5616942590634053, + "learning_rate": 1.6151972224429356e-05, + "loss": 0.3255, + "step": 5879 + }, + { + "epoch": 1.159108832807571, + "grad_norm": 0.5276396699788394, + "learning_rate": 1.615075015151656e-05, + "loss": 0.3336, + "step": 5880 + }, + { + "epoch": 1.1593059936908516, + "grad_norm": 0.48785185958924593, + "learning_rate": 1.6149527930827043e-05, + "loss": 0.3183, + "step": 5881 + }, + { + "epoch": 1.1595031545741326, + "grad_norm": 0.5070222632900729, + "learning_rate": 1.614830556239016e-05, + "loss": 0.3392, + "step": 5882 + }, + { + "epoch": 1.1597003154574133, + "grad_norm": 0.5357213010735683, + "learning_rate": 1.6147083046235287e-05, + "loss": 0.3302, + "step": 5883 + }, + { + "epoch": 1.159897476340694, + "grad_norm": 0.492115580680464, + "learning_rate": 1.6145860382391792e-05, + "loss": 0.3062, + "step": 5884 + }, + { + "epoch": 1.1600946372239749, + "grad_norm": 0.4775935270120072, + "learning_rate": 1.6144637570889055e-05, + "loss": 0.3084, + "step": 5885 + }, + { + "epoch": 1.1602917981072556, + "grad_norm": 0.5574502509553525, + "learning_rate": 1.6143414611756448e-05, + "loss": 0.3762, + "step": 5886 + }, + { + "epoch": 1.1604889589905363, + "grad_norm": 0.5324231837542485, + "learning_rate": 1.6142191505023362e-05, + "loss": 0.3394, + "step": 5887 + }, + { + "epoch": 1.160686119873817, + "grad_norm": 0.5540695998548545, + "learning_rate": 1.6140968250719177e-05, + "loss": 0.3334, + "step": 5888 + }, + { + "epoch": 1.1608832807570977, + "grad_norm": 0.5152035583057871, + "learning_rate": 1.6139744848873283e-05, + "loss": 0.333, + "step": 5889 + }, + { + "epoch": 1.1610804416403786, + "grad_norm": 0.5016308910529644, + "learning_rate": 1.613852129951508e-05, + "loss": 0.328, + "step": 5890 + }, + { + "epoch": 1.1612776025236593, + "grad_norm": 0.4671308137477831, + "learning_rate": 1.6137297602673955e-05, + "loss": 0.3006, + "step": 5891 + }, + { + "epoch": 1.16147476340694, + "grad_norm": 0.47717698267201425, + "learning_rate": 1.613607375837931e-05, + "loss": 0.3085, + "step": 5892 + }, + { + "epoch": 1.161671924290221, + "grad_norm": 0.5411199798727852, + "learning_rate": 1.6134849766660557e-05, + "loss": 0.3512, + "step": 5893 + }, + { + "epoch": 1.1618690851735016, + "grad_norm": 0.48072485668659576, + "learning_rate": 1.6133625627547096e-05, + "loss": 0.2991, + "step": 5894 + }, + { + "epoch": 1.1620662460567823, + "grad_norm": 0.49074020021120335, + "learning_rate": 1.613240134106834e-05, + "loss": 0.3185, + "step": 5895 + }, + { + "epoch": 1.162263406940063, + "grad_norm": 0.4742652414752287, + "learning_rate": 1.6131176907253703e-05, + "loss": 0.3219, + "step": 5896 + }, + { + "epoch": 1.1624605678233437, + "grad_norm": 0.4926211188349463, + "learning_rate": 1.6129952326132603e-05, + "loss": 0.3213, + "step": 5897 + }, + { + "epoch": 1.1626577287066246, + "grad_norm": 0.5260995869588211, + "learning_rate": 1.6128727597734465e-05, + "loss": 0.3266, + "step": 5898 + }, + { + "epoch": 1.1628548895899053, + "grad_norm": 0.5005689603012923, + "learning_rate": 1.6127502722088703e-05, + "loss": 0.3281, + "step": 5899 + }, + { + "epoch": 1.163052050473186, + "grad_norm": 0.5286226637548996, + "learning_rate": 1.612627769922476e-05, + "loss": 0.3383, + "step": 5900 + }, + { + "epoch": 1.163249211356467, + "grad_norm": 0.5241554407104515, + "learning_rate": 1.612505252917206e-05, + "loss": 0.3442, + "step": 5901 + }, + { + "epoch": 1.1634463722397477, + "grad_norm": 0.4969893983731258, + "learning_rate": 1.6123827211960044e-05, + "loss": 0.3355, + "step": 5902 + }, + { + "epoch": 1.1636435331230284, + "grad_norm": 0.5176767431545333, + "learning_rate": 1.6122601747618144e-05, + "loss": 0.3456, + "step": 5903 + }, + { + "epoch": 1.163840694006309, + "grad_norm": 0.5541524213460989, + "learning_rate": 1.612137613617581e-05, + "loss": 0.3082, + "step": 5904 + }, + { + "epoch": 1.16403785488959, + "grad_norm": 0.5130645123161176, + "learning_rate": 1.612015037766248e-05, + "loss": 0.3595, + "step": 5905 + }, + { + "epoch": 1.1642350157728707, + "grad_norm": 0.49077914052450783, + "learning_rate": 1.611892447210761e-05, + "loss": 0.3201, + "step": 5906 + }, + { + "epoch": 1.1644321766561514, + "grad_norm": 0.5154065030810983, + "learning_rate": 1.6117698419540655e-05, + "loss": 0.345, + "step": 5907 + }, + { + "epoch": 1.164629337539432, + "grad_norm": 0.5137017205525405, + "learning_rate": 1.6116472219991066e-05, + "loss": 0.321, + "step": 5908 + }, + { + "epoch": 1.164826498422713, + "grad_norm": 0.5185538568944038, + "learning_rate": 1.6115245873488308e-05, + "loss": 0.3349, + "step": 5909 + }, + { + "epoch": 1.1650236593059937, + "grad_norm": 0.47872637622833353, + "learning_rate": 1.6114019380061844e-05, + "loss": 0.3091, + "step": 5910 + }, + { + "epoch": 1.1652208201892744, + "grad_norm": 0.5349706635614739, + "learning_rate": 1.6112792739741138e-05, + "loss": 0.3416, + "step": 5911 + }, + { + "epoch": 1.1654179810725551, + "grad_norm": 0.494058437524682, + "learning_rate": 1.6111565952555666e-05, + "loss": 0.3411, + "step": 5912 + }, + { + "epoch": 1.165615141955836, + "grad_norm": 0.5148890708118689, + "learning_rate": 1.6110339018534898e-05, + "loss": 0.3379, + "step": 5913 + }, + { + "epoch": 1.1658123028391167, + "grad_norm": 0.48546285073114465, + "learning_rate": 1.6109111937708317e-05, + "loss": 0.3157, + "step": 5914 + }, + { + "epoch": 1.1660094637223974, + "grad_norm": 0.5386295226523584, + "learning_rate": 1.61078847101054e-05, + "loss": 0.3524, + "step": 5915 + }, + { + "epoch": 1.1662066246056781, + "grad_norm": 0.5357486231753327, + "learning_rate": 1.6106657335755636e-05, + "loss": 0.3296, + "step": 5916 + }, + { + "epoch": 1.166403785488959, + "grad_norm": 0.5071347852474872, + "learning_rate": 1.610542981468851e-05, + "loss": 0.2972, + "step": 5917 + }, + { + "epoch": 1.1666009463722398, + "grad_norm": 0.5136673972393638, + "learning_rate": 1.6104202146933517e-05, + "loss": 0.3427, + "step": 5918 + }, + { + "epoch": 1.1667981072555205, + "grad_norm": 0.5366052967903738, + "learning_rate": 1.6102974332520155e-05, + "loss": 0.3348, + "step": 5919 + }, + { + "epoch": 1.1669952681388012, + "grad_norm": 0.5142076244004472, + "learning_rate": 1.6101746371477915e-05, + "loss": 0.3409, + "step": 5920 + }, + { + "epoch": 1.167192429022082, + "grad_norm": 0.5156383871361846, + "learning_rate": 1.6100518263836305e-05, + "loss": 0.3441, + "step": 5921 + }, + { + "epoch": 1.1673895899053628, + "grad_norm": 0.48857227681411236, + "learning_rate": 1.609929000962483e-05, + "loss": 0.3249, + "step": 5922 + }, + { + "epoch": 1.1675867507886435, + "grad_norm": 0.5116415565941662, + "learning_rate": 1.6098061608873006e-05, + "loss": 0.3347, + "step": 5923 + }, + { + "epoch": 1.1677839116719242, + "grad_norm": 0.476427280273157, + "learning_rate": 1.609683306161034e-05, + "loss": 0.2935, + "step": 5924 + }, + { + "epoch": 1.1679810725552051, + "grad_norm": 0.5315870178041014, + "learning_rate": 1.6095604367866348e-05, + "loss": 0.3298, + "step": 5925 + }, + { + "epoch": 1.1681782334384858, + "grad_norm": 0.4885155093607642, + "learning_rate": 1.6094375527670553e-05, + "loss": 0.2914, + "step": 5926 + }, + { + "epoch": 1.1683753943217665, + "grad_norm": 0.4958137107083927, + "learning_rate": 1.6093146541052472e-05, + "loss": 0.3236, + "step": 5927 + }, + { + "epoch": 1.1685725552050474, + "grad_norm": 0.5186709145538795, + "learning_rate": 1.609191740804165e-05, + "loss": 0.3474, + "step": 5928 + }, + { + "epoch": 1.1687697160883281, + "grad_norm": 0.5431827154799177, + "learning_rate": 1.6090688128667597e-05, + "loss": 0.2922, + "step": 5929 + }, + { + "epoch": 1.1689668769716088, + "grad_norm": 0.5276177466035217, + "learning_rate": 1.608945870295986e-05, + "loss": 0.3273, + "step": 5930 + }, + { + "epoch": 1.1691640378548895, + "grad_norm": 0.5403429181736961, + "learning_rate": 1.6088229130947976e-05, + "loss": 0.3573, + "step": 5931 + }, + { + "epoch": 1.1693611987381702, + "grad_norm": 0.5682917162052413, + "learning_rate": 1.6086999412661483e-05, + "loss": 0.3577, + "step": 5932 + }, + { + "epoch": 1.1695583596214512, + "grad_norm": 0.5092581015992266, + "learning_rate": 1.6085769548129928e-05, + "loss": 0.3178, + "step": 5933 + }, + { + "epoch": 1.1697555205047319, + "grad_norm": 0.5341988017389373, + "learning_rate": 1.6084539537382853e-05, + "loss": 0.318, + "step": 5934 + }, + { + "epoch": 1.1699526813880126, + "grad_norm": 0.5508242925818101, + "learning_rate": 1.6083309380449822e-05, + "loss": 0.3399, + "step": 5935 + }, + { + "epoch": 1.1701498422712935, + "grad_norm": 0.5516468570148881, + "learning_rate": 1.6082079077360382e-05, + "loss": 0.3464, + "step": 5936 + }, + { + "epoch": 1.1703470031545742, + "grad_norm": 0.5310729616685209, + "learning_rate": 1.6080848628144097e-05, + "loss": 0.3549, + "step": 5937 + }, + { + "epoch": 1.1705441640378549, + "grad_norm": 0.5007231153163549, + "learning_rate": 1.6079618032830523e-05, + "loss": 0.317, + "step": 5938 + }, + { + "epoch": 1.1707413249211356, + "grad_norm": 0.5205324245736465, + "learning_rate": 1.6078387291449234e-05, + "loss": 0.3343, + "step": 5939 + }, + { + "epoch": 1.1709384858044163, + "grad_norm": 0.5293927543610932, + "learning_rate": 1.607715640402979e-05, + "loss": 0.3364, + "step": 5940 + }, + { + "epoch": 1.1711356466876972, + "grad_norm": 0.5437502522427697, + "learning_rate": 1.607592537060177e-05, + "loss": 0.3625, + "step": 5941 + }, + { + "epoch": 1.171332807570978, + "grad_norm": 0.5852873503945257, + "learning_rate": 1.6074694191194758e-05, + "loss": 0.343, + "step": 5942 + }, + { + "epoch": 1.1715299684542586, + "grad_norm": 0.5258888667294329, + "learning_rate": 1.607346286583832e-05, + "loss": 0.3525, + "step": 5943 + }, + { + "epoch": 1.1717271293375395, + "grad_norm": 0.49502127400893137, + "learning_rate": 1.6072231394562045e-05, + "loss": 0.3429, + "step": 5944 + }, + { + "epoch": 1.1719242902208202, + "grad_norm": 0.5340808573650885, + "learning_rate": 1.6070999777395522e-05, + "loss": 0.3265, + "step": 5945 + }, + { + "epoch": 1.172121451104101, + "grad_norm": 0.4913659047348541, + "learning_rate": 1.6069768014368344e-05, + "loss": 0.3264, + "step": 5946 + }, + { + "epoch": 1.1723186119873816, + "grad_norm": 0.5796915968301445, + "learning_rate": 1.6068536105510095e-05, + "loss": 0.3314, + "step": 5947 + }, + { + "epoch": 1.1725157728706626, + "grad_norm": 0.48015072710313483, + "learning_rate": 1.606730405085038e-05, + "loss": 0.3063, + "step": 5948 + }, + { + "epoch": 1.1727129337539433, + "grad_norm": 0.5798304016447527, + "learning_rate": 1.60660718504188e-05, + "loss": 0.3673, + "step": 5949 + }, + { + "epoch": 1.172910094637224, + "grad_norm": 0.49383450156062897, + "learning_rate": 1.6064839504244964e-05, + "loss": 0.3137, + "step": 5950 + }, + { + "epoch": 1.1731072555205047, + "grad_norm": 0.49990802164314824, + "learning_rate": 1.6063607012358474e-05, + "loss": 0.345, + "step": 5951 + }, + { + "epoch": 1.1733044164037856, + "grad_norm": 0.5710511478007725, + "learning_rate": 1.6062374374788938e-05, + "loss": 0.3489, + "step": 5952 + }, + { + "epoch": 1.1735015772870663, + "grad_norm": 0.5406553339299977, + "learning_rate": 1.6061141591565977e-05, + "loss": 0.3376, + "step": 5953 + }, + { + "epoch": 1.173698738170347, + "grad_norm": 0.5266705373762315, + "learning_rate": 1.605990866271921e-05, + "loss": 0.3169, + "step": 5954 + }, + { + "epoch": 1.1738958990536277, + "grad_norm": 0.5290688903751791, + "learning_rate": 1.605867558827825e-05, + "loss": 0.3314, + "step": 5955 + }, + { + "epoch": 1.1740930599369086, + "grad_norm": 0.4981019544107017, + "learning_rate": 1.605744236827274e-05, + "loss": 0.3499, + "step": 5956 + }, + { + "epoch": 1.1742902208201893, + "grad_norm": 0.4966110971975489, + "learning_rate": 1.6056209002732293e-05, + "loss": 0.3241, + "step": 5957 + }, + { + "epoch": 1.17448738170347, + "grad_norm": 0.5075484095401199, + "learning_rate": 1.605497549168655e-05, + "loss": 0.3164, + "step": 5958 + }, + { + "epoch": 1.1746845425867507, + "grad_norm": 0.7450970335942361, + "learning_rate": 1.6053741835165146e-05, + "loss": 0.3461, + "step": 5959 + }, + { + "epoch": 1.1748817034700316, + "grad_norm": 0.5291598106311712, + "learning_rate": 1.6052508033197713e-05, + "loss": 0.323, + "step": 5960 + }, + { + "epoch": 1.1750788643533123, + "grad_norm": 0.5970311083346598, + "learning_rate": 1.6051274085813906e-05, + "loss": 0.3433, + "step": 5961 + }, + { + "epoch": 1.175276025236593, + "grad_norm": 0.5257206942997955, + "learning_rate": 1.6050039993043366e-05, + "loss": 0.3312, + "step": 5962 + }, + { + "epoch": 1.1754731861198737, + "grad_norm": 0.5470667266403514, + "learning_rate": 1.604880575491574e-05, + "loss": 0.3241, + "step": 5963 + }, + { + "epoch": 1.1756703470031546, + "grad_norm": 0.526634277205355, + "learning_rate": 1.6047571371460688e-05, + "loss": 0.3486, + "step": 5964 + }, + { + "epoch": 1.1758675078864353, + "grad_norm": 0.5087896217443573, + "learning_rate": 1.6046336842707862e-05, + "loss": 0.3373, + "step": 5965 + }, + { + "epoch": 1.176064668769716, + "grad_norm": 0.5213596470506094, + "learning_rate": 1.6045102168686925e-05, + "loss": 0.3399, + "step": 5966 + }, + { + "epoch": 1.1762618296529967, + "grad_norm": 0.5025963895764581, + "learning_rate": 1.604386734942754e-05, + "loss": 0.3309, + "step": 5967 + }, + { + "epoch": 1.1764589905362777, + "grad_norm": 0.4947895294362806, + "learning_rate": 1.6042632384959377e-05, + "loss": 0.3266, + "step": 5968 + }, + { + "epoch": 1.1766561514195584, + "grad_norm": 0.5280726977614281, + "learning_rate": 1.6041397275312102e-05, + "loss": 0.3347, + "step": 5969 + }, + { + "epoch": 1.176853312302839, + "grad_norm": 0.49491475836802235, + "learning_rate": 1.6040162020515394e-05, + "loss": 0.3107, + "step": 5970 + }, + { + "epoch": 1.17705047318612, + "grad_norm": 0.50955563379636, + "learning_rate": 1.6038926620598924e-05, + "loss": 0.3438, + "step": 5971 + }, + { + "epoch": 1.1772476340694007, + "grad_norm": 0.47434081410915296, + "learning_rate": 1.6037691075592384e-05, + "loss": 0.2874, + "step": 5972 + }, + { + "epoch": 1.1774447949526814, + "grad_norm": 0.5265992460388663, + "learning_rate": 1.6036455385525452e-05, + "loss": 0.3138, + "step": 5973 + }, + { + "epoch": 1.177641955835962, + "grad_norm": 0.508538165601405, + "learning_rate": 1.603521955042782e-05, + "loss": 0.3364, + "step": 5974 + }, + { + "epoch": 1.1778391167192428, + "grad_norm": 0.536584712113376, + "learning_rate": 1.603398357032918e-05, + "loss": 0.3425, + "step": 5975 + }, + { + "epoch": 1.1780362776025237, + "grad_norm": 0.5637925361320564, + "learning_rate": 1.603274744525922e-05, + "loss": 0.3751, + "step": 5976 + }, + { + "epoch": 1.1782334384858044, + "grad_norm": 0.4828201596184613, + "learning_rate": 1.6031511175247648e-05, + "loss": 0.3267, + "step": 5977 + }, + { + "epoch": 1.1784305993690851, + "grad_norm": 0.5702750284269689, + "learning_rate": 1.6030274760324163e-05, + "loss": 0.3426, + "step": 5978 + }, + { + "epoch": 1.178627760252366, + "grad_norm": 0.5166744599492986, + "learning_rate": 1.602903820051847e-05, + "loss": 0.3443, + "step": 5979 + }, + { + "epoch": 1.1788249211356467, + "grad_norm": 0.4553996700869784, + "learning_rate": 1.602780149586028e-05, + "loss": 0.2921, + "step": 5980 + }, + { + "epoch": 1.1790220820189274, + "grad_norm": 0.5395408232911948, + "learning_rate": 1.60265646463793e-05, + "loss": 0.3487, + "step": 5981 + }, + { + "epoch": 1.1792192429022081, + "grad_norm": 0.5246852833105909, + "learning_rate": 1.6025327652105256e-05, + "loss": 0.353, + "step": 5982 + }, + { + "epoch": 1.1794164037854888, + "grad_norm": 0.5504345490202539, + "learning_rate": 1.6024090513067864e-05, + "loss": 0.3337, + "step": 5983 + }, + { + "epoch": 1.1796135646687698, + "grad_norm": 0.5273314886798296, + "learning_rate": 1.6022853229296844e-05, + "loss": 0.3613, + "step": 5984 + }, + { + "epoch": 1.1798107255520505, + "grad_norm": 0.4976467528520897, + "learning_rate": 1.6021615800821923e-05, + "loss": 0.3119, + "step": 5985 + }, + { + "epoch": 1.1800078864353312, + "grad_norm": 0.544586740515642, + "learning_rate": 1.6020378227672834e-05, + "loss": 0.3593, + "step": 5986 + }, + { + "epoch": 1.180205047318612, + "grad_norm": 0.5288888595664981, + "learning_rate": 1.6019140509879312e-05, + "loss": 0.365, + "step": 5987 + }, + { + "epoch": 1.1804022082018928, + "grad_norm": 0.5499369482970153, + "learning_rate": 1.601790264747109e-05, + "loss": 0.3673, + "step": 5988 + }, + { + "epoch": 1.1805993690851735, + "grad_norm": 0.5132871954718244, + "learning_rate": 1.6016664640477912e-05, + "loss": 0.349, + "step": 5989 + }, + { + "epoch": 1.1807965299684542, + "grad_norm": 0.5107401573623064, + "learning_rate": 1.601542648892952e-05, + "loss": 0.3328, + "step": 5990 + }, + { + "epoch": 1.1809936908517351, + "grad_norm": 0.5003005634953831, + "learning_rate": 1.6014188192855667e-05, + "loss": 0.3325, + "step": 5991 + }, + { + "epoch": 1.1811908517350158, + "grad_norm": 0.5419236213545058, + "learning_rate": 1.6012949752286093e-05, + "loss": 0.3567, + "step": 5992 + }, + { + "epoch": 1.1813880126182965, + "grad_norm": 0.46447030510821447, + "learning_rate": 1.6011711167250563e-05, + "loss": 0.3133, + "step": 5993 + }, + { + "epoch": 1.1815851735015772, + "grad_norm": 0.5146898936963578, + "learning_rate": 1.6010472437778827e-05, + "loss": 0.3507, + "step": 5994 + }, + { + "epoch": 1.1817823343848581, + "grad_norm": 1.9545816777357674, + "learning_rate": 1.6009233563900654e-05, + "loss": 0.3232, + "step": 5995 + }, + { + "epoch": 1.1819794952681388, + "grad_norm": 0.504150566251979, + "learning_rate": 1.6007994545645807e-05, + "loss": 0.3385, + "step": 5996 + }, + { + "epoch": 1.1821766561514195, + "grad_norm": 0.5178105405608192, + "learning_rate": 1.600675538304405e-05, + "loss": 0.3198, + "step": 5997 + }, + { + "epoch": 1.1823738170347002, + "grad_norm": 0.49033729293788897, + "learning_rate": 1.600551607612516e-05, + "loss": 0.3074, + "step": 5998 + }, + { + "epoch": 1.1825709779179812, + "grad_norm": 0.5216261093702694, + "learning_rate": 1.6004276624918906e-05, + "loss": 0.3524, + "step": 5999 + }, + { + "epoch": 1.1827681388012619, + "grad_norm": 0.48871872582944514, + "learning_rate": 1.600303702945507e-05, + "loss": 0.3333, + "step": 6000 + }, + { + "epoch": 1.1829652996845426, + "grad_norm": 0.5425430954135214, + "learning_rate": 1.600179728976344e-05, + "loss": 0.3351, + "step": 6001 + }, + { + "epoch": 1.1831624605678233, + "grad_norm": 0.4997582084309298, + "learning_rate": 1.6000557405873793e-05, + "loss": 0.3281, + "step": 6002 + }, + { + "epoch": 1.1833596214511042, + "grad_norm": 0.5468290480793452, + "learning_rate": 1.5999317377815927e-05, + "loss": 0.3504, + "step": 6003 + }, + { + "epoch": 1.1835567823343849, + "grad_norm": 0.5176645046959065, + "learning_rate": 1.5998077205619625e-05, + "loss": 0.3362, + "step": 6004 + }, + { + "epoch": 1.1837539432176656, + "grad_norm": 0.5013566738277708, + "learning_rate": 1.599683688931469e-05, + "loss": 0.3276, + "step": 6005 + }, + { + "epoch": 1.1839511041009463, + "grad_norm": 0.5694727653608741, + "learning_rate": 1.599559642893092e-05, + "loss": 0.346, + "step": 6006 + }, + { + "epoch": 1.1841482649842272, + "grad_norm": 0.49787309953663916, + "learning_rate": 1.5994355824498118e-05, + "loss": 0.3237, + "step": 6007 + }, + { + "epoch": 1.184345425867508, + "grad_norm": 0.5217262967923424, + "learning_rate": 1.5993115076046085e-05, + "loss": 0.3272, + "step": 6008 + }, + { + "epoch": 1.1845425867507886, + "grad_norm": 0.5171039380183391, + "learning_rate": 1.5991874183604638e-05, + "loss": 0.3409, + "step": 6009 + }, + { + "epoch": 1.1847397476340693, + "grad_norm": 0.5173593200967992, + "learning_rate": 1.5990633147203595e-05, + "loss": 0.3439, + "step": 6010 + }, + { + "epoch": 1.1849369085173502, + "grad_norm": 0.5600380235112447, + "learning_rate": 1.598939196687276e-05, + "loss": 0.3539, + "step": 6011 + }, + { + "epoch": 1.185134069400631, + "grad_norm": 0.47322318486676124, + "learning_rate": 1.5988150642641963e-05, + "loss": 0.3241, + "step": 6012 + }, + { + "epoch": 1.1853312302839116, + "grad_norm": 0.509209862620946, + "learning_rate": 1.598690917454102e-05, + "loss": 0.3384, + "step": 6013 + }, + { + "epoch": 1.1855283911671926, + "grad_norm": 0.5071917580583539, + "learning_rate": 1.598566756259977e-05, + "loss": 0.3306, + "step": 6014 + }, + { + "epoch": 1.1857255520504733, + "grad_norm": 0.5285407284823473, + "learning_rate": 1.598442580684803e-05, + "loss": 0.353, + "step": 6015 + }, + { + "epoch": 1.185922712933754, + "grad_norm": 0.5370829605446722, + "learning_rate": 1.598318390731564e-05, + "loss": 0.3423, + "step": 6016 + }, + { + "epoch": 1.1861198738170347, + "grad_norm": 0.4874835855626257, + "learning_rate": 1.5981941864032444e-05, + "loss": 0.3312, + "step": 6017 + }, + { + "epoch": 1.1863170347003154, + "grad_norm": 0.5441187518003243, + "learning_rate": 1.5980699677028276e-05, + "loss": 0.3475, + "step": 6018 + }, + { + "epoch": 1.1865141955835963, + "grad_norm": 0.5291193191006429, + "learning_rate": 1.597945734633298e-05, + "loss": 0.3461, + "step": 6019 + }, + { + "epoch": 1.186711356466877, + "grad_norm": 0.5254879886979995, + "learning_rate": 1.5978214871976408e-05, + "loss": 0.3455, + "step": 6020 + }, + { + "epoch": 1.1869085173501577, + "grad_norm": 0.5377765608865902, + "learning_rate": 1.597697225398841e-05, + "loss": 0.3352, + "step": 6021 + }, + { + "epoch": 1.1871056782334386, + "grad_norm": 0.5072825222986831, + "learning_rate": 1.5975729492398836e-05, + "loss": 0.3208, + "step": 6022 + }, + { + "epoch": 1.1873028391167193, + "grad_norm": 0.47380472273260626, + "learning_rate": 1.5974486587237554e-05, + "loss": 0.3119, + "step": 6023 + }, + { + "epoch": 1.1875, + "grad_norm": 0.5284173062047662, + "learning_rate": 1.5973243538534416e-05, + "loss": 0.348, + "step": 6024 + }, + { + "epoch": 1.1876971608832807, + "grad_norm": 0.532090130405166, + "learning_rate": 1.5972000346319296e-05, + "loss": 0.3584, + "step": 6025 + }, + { + "epoch": 1.1878943217665614, + "grad_norm": 0.5219678872457383, + "learning_rate": 1.5970757010622056e-05, + "loss": 0.3243, + "step": 6026 + }, + { + "epoch": 1.1880914826498423, + "grad_norm": 0.49984607033393075, + "learning_rate": 1.596951353147257e-05, + "loss": 0.3215, + "step": 6027 + }, + { + "epoch": 1.188288643533123, + "grad_norm": 0.49207605863384773, + "learning_rate": 1.5968269908900714e-05, + "loss": 0.2912, + "step": 6028 + }, + { + "epoch": 1.1884858044164037, + "grad_norm": 0.5613782227268997, + "learning_rate": 1.596702614293637e-05, + "loss": 0.3553, + "step": 6029 + }, + { + "epoch": 1.1886829652996846, + "grad_norm": 0.5250585921793455, + "learning_rate": 1.5965782233609416e-05, + "loss": 0.3738, + "step": 6030 + }, + { + "epoch": 1.1888801261829653, + "grad_norm": 0.5008108248618431, + "learning_rate": 1.5964538180949738e-05, + "loss": 0.3319, + "step": 6031 + }, + { + "epoch": 1.189077287066246, + "grad_norm": 0.5245356584082338, + "learning_rate": 1.596329398498723e-05, + "loss": 0.3239, + "step": 6032 + }, + { + "epoch": 1.1892744479495267, + "grad_norm": 0.5355385935858578, + "learning_rate": 1.5962049645751778e-05, + "loss": 0.339, + "step": 6033 + }, + { + "epoch": 1.1894716088328074, + "grad_norm": 0.6132239805769646, + "learning_rate": 1.5960805163273287e-05, + "loss": 0.365, + "step": 6034 + }, + { + "epoch": 1.1896687697160884, + "grad_norm": 9.838847324850864, + "learning_rate": 1.5959560537581646e-05, + "loss": 0.343, + "step": 6035 + }, + { + "epoch": 1.189865930599369, + "grad_norm": 0.5645786328771389, + "learning_rate": 1.5958315768706767e-05, + "loss": 0.3648, + "step": 6036 + }, + { + "epoch": 1.1900630914826498, + "grad_norm": 0.48867840037373045, + "learning_rate": 1.5957070856678553e-05, + "loss": 0.3233, + "step": 6037 + }, + { + "epoch": 1.1902602523659307, + "grad_norm": 0.529456568702109, + "learning_rate": 1.5955825801526918e-05, + "loss": 0.327, + "step": 6038 + }, + { + "epoch": 1.1904574132492114, + "grad_norm": 1.176277930833411, + "learning_rate": 1.5954580603281768e-05, + "loss": 0.3511, + "step": 6039 + }, + { + "epoch": 1.190654574132492, + "grad_norm": 0.5138247235703703, + "learning_rate": 1.5953335261973024e-05, + "loss": 0.3266, + "step": 6040 + }, + { + "epoch": 1.1908517350157728, + "grad_norm": 0.504680208228039, + "learning_rate": 1.5952089777630604e-05, + "loss": 0.3379, + "step": 6041 + }, + { + "epoch": 1.1910488958990537, + "grad_norm": 0.516549288250157, + "learning_rate": 1.5950844150284438e-05, + "loss": 0.3391, + "step": 6042 + }, + { + "epoch": 1.1912460567823344, + "grad_norm": 0.524867491432488, + "learning_rate": 1.5949598379964447e-05, + "loss": 0.3345, + "step": 6043 + }, + { + "epoch": 1.1914432176656151, + "grad_norm": 0.5311974639533232, + "learning_rate": 1.594835246670056e-05, + "loss": 0.3475, + "step": 6044 + }, + { + "epoch": 1.1916403785488958, + "grad_norm": 0.4916105988107756, + "learning_rate": 1.5947106410522722e-05, + "loss": 0.318, + "step": 6045 + }, + { + "epoch": 1.1918375394321767, + "grad_norm": 0.5324212670370398, + "learning_rate": 1.594586021146086e-05, + "loss": 0.3236, + "step": 6046 + }, + { + "epoch": 1.1920347003154574, + "grad_norm": 0.4964775871993211, + "learning_rate": 1.594461386954492e-05, + "loss": 0.3247, + "step": 6047 + }, + { + "epoch": 1.1922318611987381, + "grad_norm": 0.5471553857731185, + "learning_rate": 1.5943367384804842e-05, + "loss": 0.3463, + "step": 6048 + }, + { + "epoch": 1.1924290220820188, + "grad_norm": 0.7831995811222979, + "learning_rate": 1.5942120757270578e-05, + "loss": 0.3071, + "step": 6049 + }, + { + "epoch": 1.1926261829652998, + "grad_norm": 0.5010080635510805, + "learning_rate": 1.5940873986972078e-05, + "loss": 0.3082, + "step": 6050 + }, + { + "epoch": 1.1928233438485805, + "grad_norm": 0.49204619020050877, + "learning_rate": 1.5939627073939298e-05, + "loss": 0.325, + "step": 6051 + }, + { + "epoch": 1.1930205047318612, + "grad_norm": 0.5694016315454971, + "learning_rate": 1.593838001820219e-05, + "loss": 0.355, + "step": 6052 + }, + { + "epoch": 1.1932176656151419, + "grad_norm": 0.5103103854682324, + "learning_rate": 1.5937132819790722e-05, + "loss": 0.336, + "step": 6053 + }, + { + "epoch": 1.1934148264984228, + "grad_norm": 0.5971611394525508, + "learning_rate": 1.593588547873486e-05, + "loss": 0.2987, + "step": 6054 + }, + { + "epoch": 1.1936119873817035, + "grad_norm": 0.5324309908139868, + "learning_rate": 1.593463799506456e-05, + "loss": 0.3381, + "step": 6055 + }, + { + "epoch": 1.1938091482649842, + "grad_norm": 0.5513616850381257, + "learning_rate": 1.593339036880981e-05, + "loss": 0.3436, + "step": 6056 + }, + { + "epoch": 1.1940063091482649, + "grad_norm": 0.48370871862770526, + "learning_rate": 1.5932142600000577e-05, + "loss": 0.3131, + "step": 6057 + }, + { + "epoch": 1.1942034700315458, + "grad_norm": 0.5723593799005773, + "learning_rate": 1.5930894688666843e-05, + "loss": 0.3526, + "step": 6058 + }, + { + "epoch": 1.1944006309148265, + "grad_norm": 0.5244334828806275, + "learning_rate": 1.5929646634838583e-05, + "loss": 0.3277, + "step": 6059 + }, + { + "epoch": 1.1945977917981072, + "grad_norm": 0.5360911703031235, + "learning_rate": 1.5928398438545792e-05, + "loss": 0.3359, + "step": 6060 + }, + { + "epoch": 1.194794952681388, + "grad_norm": 0.5103692154019288, + "learning_rate": 1.5927150099818454e-05, + "loss": 0.3397, + "step": 6061 + }, + { + "epoch": 1.1949921135646688, + "grad_norm": 0.4920913803322767, + "learning_rate": 1.592590161868656e-05, + "loss": 0.2967, + "step": 6062 + }, + { + "epoch": 1.1951892744479495, + "grad_norm": 0.5338396158035961, + "learning_rate": 1.5924652995180106e-05, + "loss": 0.3377, + "step": 6063 + }, + { + "epoch": 1.1953864353312302, + "grad_norm": 0.4978817940429984, + "learning_rate": 1.5923404229329097e-05, + "loss": 0.3148, + "step": 6064 + }, + { + "epoch": 1.1955835962145112, + "grad_norm": 0.5108248246371543, + "learning_rate": 1.5922155321163528e-05, + "loss": 0.344, + "step": 6065 + }, + { + "epoch": 1.1957807570977919, + "grad_norm": 0.4962965845967901, + "learning_rate": 1.592090627071341e-05, + "loss": 0.32, + "step": 6066 + }, + { + "epoch": 1.1959779179810726, + "grad_norm": 0.5153462208898505, + "learning_rate": 1.591965707800875e-05, + "loss": 0.3368, + "step": 6067 + }, + { + "epoch": 1.1961750788643533, + "grad_norm": 0.5214640418478306, + "learning_rate": 1.5918407743079564e-05, + "loss": 0.3297, + "step": 6068 + }, + { + "epoch": 1.196372239747634, + "grad_norm": 0.5677250498709867, + "learning_rate": 1.5917158265955863e-05, + "loss": 0.3743, + "step": 6069 + }, + { + "epoch": 1.1965694006309149, + "grad_norm": 0.5268238165290028, + "learning_rate": 1.591590864666767e-05, + "loss": 0.3654, + "step": 6070 + }, + { + "epoch": 1.1967665615141956, + "grad_norm": 0.4961093876386208, + "learning_rate": 1.5914658885245006e-05, + "loss": 0.3319, + "step": 6071 + }, + { + "epoch": 1.1969637223974763, + "grad_norm": 0.5169884357870341, + "learning_rate": 1.5913408981717902e-05, + "loss": 0.355, + "step": 6072 + }, + { + "epoch": 1.1971608832807572, + "grad_norm": 0.4863348861419478, + "learning_rate": 1.5912158936116383e-05, + "loss": 0.3118, + "step": 6073 + }, + { + "epoch": 1.197358044164038, + "grad_norm": 0.49535517239582816, + "learning_rate": 1.5910908748470485e-05, + "loss": 0.2871, + "step": 6074 + }, + { + "epoch": 1.1975552050473186, + "grad_norm": 0.5012578788415777, + "learning_rate": 1.5909658418810246e-05, + "loss": 0.3124, + "step": 6075 + }, + { + "epoch": 1.1977523659305993, + "grad_norm": 6.581609483160163, + "learning_rate": 1.5908407947165704e-05, + "loss": 0.3924, + "step": 6076 + }, + { + "epoch": 1.19794952681388, + "grad_norm": 0.5416411980138617, + "learning_rate": 1.59071573335669e-05, + "loss": 0.3265, + "step": 6077 + }, + { + "epoch": 1.198146687697161, + "grad_norm": 5.137867502750501, + "learning_rate": 1.5905906578043892e-05, + "loss": 0.4025, + "step": 6078 + }, + { + "epoch": 1.1983438485804416, + "grad_norm": 0.498758781908228, + "learning_rate": 1.5904655680626712e-05, + "loss": 0.3361, + "step": 6079 + }, + { + "epoch": 1.1985410094637223, + "grad_norm": 0.5129283262777451, + "learning_rate": 1.590340464134543e-05, + "loss": 0.3302, + "step": 6080 + }, + { + "epoch": 1.1987381703470033, + "grad_norm": 0.7337508073690954, + "learning_rate": 1.5902153460230097e-05, + "loss": 0.3584, + "step": 6081 + }, + { + "epoch": 1.198935331230284, + "grad_norm": 0.4927136722041788, + "learning_rate": 1.5900902137310777e-05, + "loss": 0.315, + "step": 6082 + }, + { + "epoch": 1.1991324921135647, + "grad_norm": 0.4999312067571768, + "learning_rate": 1.5899650672617526e-05, + "loss": 0.3218, + "step": 6083 + }, + { + "epoch": 1.1993296529968454, + "grad_norm": 0.5108041586913794, + "learning_rate": 1.589839906618042e-05, + "loss": 0.3266, + "step": 6084 + }, + { + "epoch": 1.1995268138801263, + "grad_norm": 0.5344740208260175, + "learning_rate": 1.5897147318029524e-05, + "loss": 0.3587, + "step": 6085 + }, + { + "epoch": 1.199723974763407, + "grad_norm": 0.5888513085931902, + "learning_rate": 1.5895895428194915e-05, + "loss": 0.3455, + "step": 6086 + }, + { + "epoch": 1.1999211356466877, + "grad_norm": 0.4926690283535484, + "learning_rate": 1.5894643396706674e-05, + "loss": 0.3181, + "step": 6087 + }, + { + "epoch": 1.2001182965299684, + "grad_norm": 0.5047531232097675, + "learning_rate": 1.5893391223594873e-05, + "loss": 0.3396, + "step": 6088 + }, + { + "epoch": 1.2003154574132493, + "grad_norm": 0.5194620816609076, + "learning_rate": 1.5892138908889606e-05, + "loss": 0.3498, + "step": 6089 + }, + { + "epoch": 1.20051261829653, + "grad_norm": 0.49106900692891253, + "learning_rate": 1.589088645262096e-05, + "loss": 0.3156, + "step": 6090 + }, + { + "epoch": 1.2007097791798107, + "grad_norm": 0.4816616653634188, + "learning_rate": 1.5889633854819014e-05, + "loss": 0.3269, + "step": 6091 + }, + { + "epoch": 1.2009069400630914, + "grad_norm": 0.5018051104959026, + "learning_rate": 1.5888381115513878e-05, + "loss": 0.3261, + "step": 6092 + }, + { + "epoch": 1.2011041009463723, + "grad_norm": 0.5243143133854937, + "learning_rate": 1.5887128234735638e-05, + "loss": 0.3356, + "step": 6093 + }, + { + "epoch": 1.201301261829653, + "grad_norm": 0.5816921886358895, + "learning_rate": 1.5885875212514408e-05, + "loss": 0.3673, + "step": 6094 + }, + { + "epoch": 1.2014984227129337, + "grad_norm": 0.4886736205149519, + "learning_rate": 1.5884622048880283e-05, + "loss": 0.348, + "step": 6095 + }, + { + "epoch": 1.2016955835962144, + "grad_norm": 0.5352170470566625, + "learning_rate": 1.5883368743863376e-05, + "loss": 0.3531, + "step": 6096 + }, + { + "epoch": 1.2018927444794953, + "grad_norm": 0.5070353571291337, + "learning_rate": 1.5882115297493793e-05, + "loss": 0.3074, + "step": 6097 + }, + { + "epoch": 1.202089905362776, + "grad_norm": 0.5299404136107287, + "learning_rate": 1.588086170980166e-05, + "loss": 0.3548, + "step": 6098 + }, + { + "epoch": 1.2022870662460567, + "grad_norm": 0.5010029776145218, + "learning_rate": 1.5879607980817084e-05, + "loss": 0.3322, + "step": 6099 + }, + { + "epoch": 1.2024842271293374, + "grad_norm": 0.5443027119113688, + "learning_rate": 1.5878354110570188e-05, + "loss": 0.3445, + "step": 6100 + }, + { + "epoch": 1.2026813880126184, + "grad_norm": 0.5116498162874465, + "learning_rate": 1.5877100099091106e-05, + "loss": 0.35, + "step": 6101 + }, + { + "epoch": 1.202878548895899, + "grad_norm": 0.4990846323105568, + "learning_rate": 1.587584594640996e-05, + "loss": 0.303, + "step": 6102 + }, + { + "epoch": 1.2030757097791798, + "grad_norm": 0.6083340219487477, + "learning_rate": 1.5874591652556887e-05, + "loss": 0.3466, + "step": 6103 + }, + { + "epoch": 1.2032728706624605, + "grad_norm": 0.5699359859289749, + "learning_rate": 1.5873337217562012e-05, + "loss": 0.3492, + "step": 6104 + }, + { + "epoch": 1.2034700315457414, + "grad_norm": 0.5305845146338529, + "learning_rate": 1.5872082641455484e-05, + "loss": 0.3311, + "step": 6105 + }, + { + "epoch": 1.203667192429022, + "grad_norm": 0.4987574811623951, + "learning_rate": 1.5870827924267442e-05, + "loss": 0.3127, + "step": 6106 + }, + { + "epoch": 1.2038643533123028, + "grad_norm": 0.5542197601383188, + "learning_rate": 1.586957306602803e-05, + "loss": 0.3711, + "step": 6107 + }, + { + "epoch": 1.2040615141955837, + "grad_norm": 0.5296385507003382, + "learning_rate": 1.58683180667674e-05, + "loss": 0.3322, + "step": 6108 + }, + { + "epoch": 1.2042586750788644, + "grad_norm": 0.510771246311118, + "learning_rate": 1.5867062926515702e-05, + "loss": 0.3478, + "step": 6109 + }, + { + "epoch": 1.2044558359621451, + "grad_norm": 0.5083219009195116, + "learning_rate": 1.586580764530309e-05, + "loss": 0.3283, + "step": 6110 + }, + { + "epoch": 1.2046529968454258, + "grad_norm": 0.4984543211011751, + "learning_rate": 1.586455222315973e-05, + "loss": 0.3161, + "step": 6111 + }, + { + "epoch": 1.2048501577287065, + "grad_norm": 0.48449825361455534, + "learning_rate": 1.5863296660115778e-05, + "loss": 0.3235, + "step": 6112 + }, + { + "epoch": 1.2050473186119874, + "grad_norm": 0.4880911697740563, + "learning_rate": 1.58620409562014e-05, + "loss": 0.3221, + "step": 6113 + }, + { + "epoch": 1.2052444794952681, + "grad_norm": 0.6718849269017811, + "learning_rate": 1.586078511144677e-05, + "loss": 0.3891, + "step": 6114 + }, + { + "epoch": 1.2054416403785488, + "grad_norm": 0.536419239014414, + "learning_rate": 1.5859529125882058e-05, + "loss": 0.3603, + "step": 6115 + }, + { + "epoch": 1.2056388012618298, + "grad_norm": 0.5329763858491461, + "learning_rate": 1.585827299953744e-05, + "loss": 0.3488, + "step": 6116 + }, + { + "epoch": 1.2058359621451105, + "grad_norm": 1.3571017677348047, + "learning_rate": 1.5857016732443096e-05, + "loss": 0.331, + "step": 6117 + }, + { + "epoch": 1.2060331230283912, + "grad_norm": 0.5105153466144287, + "learning_rate": 1.5855760324629204e-05, + "loss": 0.324, + "step": 6118 + }, + { + "epoch": 1.2062302839116719, + "grad_norm": 0.4980780054716261, + "learning_rate": 1.585450377612596e-05, + "loss": 0.3108, + "step": 6119 + }, + { + "epoch": 1.2064274447949526, + "grad_norm": 0.969040129147523, + "learning_rate": 1.5853247086963546e-05, + "loss": 0.3388, + "step": 6120 + }, + { + "epoch": 1.2066246056782335, + "grad_norm": 0.5442833624434943, + "learning_rate": 1.585199025717216e-05, + "loss": 0.3501, + "step": 6121 + }, + { + "epoch": 1.2068217665615142, + "grad_norm": 0.50130789731431, + "learning_rate": 1.585073328678199e-05, + "loss": 0.33, + "step": 6122 + }, + { + "epoch": 1.2070189274447949, + "grad_norm": 0.5068279641638377, + "learning_rate": 1.5849476175823242e-05, + "loss": 0.3371, + "step": 6123 + }, + { + "epoch": 1.2072160883280758, + "grad_norm": 0.567218532987645, + "learning_rate": 1.584821892432612e-05, + "loss": 0.3313, + "step": 6124 + }, + { + "epoch": 1.2074132492113565, + "grad_norm": 0.5501597914721346, + "learning_rate": 1.5846961532320833e-05, + "loss": 0.3363, + "step": 6125 + }, + { + "epoch": 1.2076104100946372, + "grad_norm": 0.5130749844399497, + "learning_rate": 1.584570399983758e-05, + "loss": 0.3331, + "step": 6126 + }, + { + "epoch": 1.207807570977918, + "grad_norm": 0.7105268873576728, + "learning_rate": 1.5844446326906585e-05, + "loss": 0.3289, + "step": 6127 + }, + { + "epoch": 1.2080047318611988, + "grad_norm": 0.49590774482060557, + "learning_rate": 1.5843188513558056e-05, + "loss": 0.3159, + "step": 6128 + }, + { + "epoch": 1.2082018927444795, + "grad_norm": 0.522580528542394, + "learning_rate": 1.5841930559822222e-05, + "loss": 0.3258, + "step": 6129 + }, + { + "epoch": 1.2083990536277602, + "grad_norm": 0.5232730029770233, + "learning_rate": 1.58406724657293e-05, + "loss": 0.3242, + "step": 6130 + }, + { + "epoch": 1.208596214511041, + "grad_norm": 0.5564685376916361, + "learning_rate": 1.583941423130952e-05, + "loss": 0.3414, + "step": 6131 + }, + { + "epoch": 1.2087933753943219, + "grad_norm": 0.5155603754900965, + "learning_rate": 1.583815585659311e-05, + "loss": 0.3329, + "step": 6132 + }, + { + "epoch": 1.2089905362776026, + "grad_norm": 0.49929691738974885, + "learning_rate": 1.58368973416103e-05, + "loss": 0.3302, + "step": 6133 + }, + { + "epoch": 1.2091876971608833, + "grad_norm": 0.5307058011919017, + "learning_rate": 1.5835638686391338e-05, + "loss": 0.3232, + "step": 6134 + }, + { + "epoch": 1.209384858044164, + "grad_norm": 0.5116249657384987, + "learning_rate": 1.583437989096645e-05, + "loss": 0.3149, + "step": 6135 + }, + { + "epoch": 1.2095820189274449, + "grad_norm": 0.4958052917998363, + "learning_rate": 1.5833120955365894e-05, + "loss": 0.345, + "step": 6136 + }, + { + "epoch": 1.2097791798107256, + "grad_norm": 0.5772963968620416, + "learning_rate": 1.5831861879619904e-05, + "loss": 0.3658, + "step": 6137 + }, + { + "epoch": 1.2099763406940063, + "grad_norm": 0.5176663312841757, + "learning_rate": 1.5830602663758737e-05, + "loss": 0.3321, + "step": 6138 + }, + { + "epoch": 1.210173501577287, + "grad_norm": 0.5572969524001481, + "learning_rate": 1.582934330781265e-05, + "loss": 0.3646, + "step": 6139 + }, + { + "epoch": 1.210370662460568, + "grad_norm": 0.5149376440705687, + "learning_rate": 1.582808381181189e-05, + "loss": 0.3233, + "step": 6140 + }, + { + "epoch": 1.2105678233438486, + "grad_norm": 0.5060377765127014, + "learning_rate": 1.5826824175786724e-05, + "loss": 0.3241, + "step": 6141 + }, + { + "epoch": 1.2107649842271293, + "grad_norm": 0.5075915990283572, + "learning_rate": 1.5825564399767416e-05, + "loss": 0.3315, + "step": 6142 + }, + { + "epoch": 1.21096214511041, + "grad_norm": 0.48101366074112933, + "learning_rate": 1.5824304483784234e-05, + "loss": 0.3271, + "step": 6143 + }, + { + "epoch": 1.211159305993691, + "grad_norm": 0.5006150490244125, + "learning_rate": 1.5823044427867446e-05, + "loss": 0.3208, + "step": 6144 + }, + { + "epoch": 1.2113564668769716, + "grad_norm": 0.5083292731196384, + "learning_rate": 1.582178423204732e-05, + "loss": 0.3332, + "step": 6145 + }, + { + "epoch": 1.2115536277602523, + "grad_norm": 0.5431538616132132, + "learning_rate": 1.5820523896354146e-05, + "loss": 0.3188, + "step": 6146 + }, + { + "epoch": 1.211750788643533, + "grad_norm": 0.4949725843395628, + "learning_rate": 1.5819263420818198e-05, + "loss": 0.3279, + "step": 6147 + }, + { + "epoch": 1.211947949526814, + "grad_norm": 0.5283861058082052, + "learning_rate": 1.5818002805469758e-05, + "loss": 0.3466, + "step": 6148 + }, + { + "epoch": 1.2121451104100947, + "grad_norm": 0.49009428195008536, + "learning_rate": 1.581674205033912e-05, + "loss": 0.316, + "step": 6149 + }, + { + "epoch": 1.2123422712933754, + "grad_norm": 0.527558337367723, + "learning_rate": 1.5815481155456566e-05, + "loss": 0.3217, + "step": 6150 + }, + { + "epoch": 1.2125394321766563, + "grad_norm": 0.46986929303135894, + "learning_rate": 1.581422012085239e-05, + "loss": 0.3353, + "step": 6151 + }, + { + "epoch": 1.212736593059937, + "grad_norm": 0.5248790360545358, + "learning_rate": 1.5812958946556897e-05, + "loss": 0.3471, + "step": 6152 + }, + { + "epoch": 1.2129337539432177, + "grad_norm": 0.4914294810645838, + "learning_rate": 1.581169763260039e-05, + "loss": 0.3302, + "step": 6153 + }, + { + "epoch": 1.2131309148264984, + "grad_norm": 0.4763550637519196, + "learning_rate": 1.5810436179013158e-05, + "loss": 0.3128, + "step": 6154 + }, + { + "epoch": 1.213328075709779, + "grad_norm": 0.5141786955182022, + "learning_rate": 1.5809174585825523e-05, + "loss": 0.3347, + "step": 6155 + }, + { + "epoch": 1.21352523659306, + "grad_norm": 0.5181612279002368, + "learning_rate": 1.5807912853067787e-05, + "loss": 0.3764, + "step": 6156 + }, + { + "epoch": 1.2137223974763407, + "grad_norm": 0.5063765486686969, + "learning_rate": 1.5806650980770273e-05, + "loss": 0.3455, + "step": 6157 + }, + { + "epoch": 1.2139195583596214, + "grad_norm": 0.5010394541454587, + "learning_rate": 1.5805388968963286e-05, + "loss": 0.3283, + "step": 6158 + }, + { + "epoch": 1.2141167192429023, + "grad_norm": 0.4758924748836648, + "learning_rate": 1.5804126817677158e-05, + "loss": 0.31, + "step": 6159 + }, + { + "epoch": 1.214313880126183, + "grad_norm": 0.5025450657121269, + "learning_rate": 1.580286452694221e-05, + "loss": 0.3045, + "step": 6160 + }, + { + "epoch": 1.2145110410094637, + "grad_norm": 0.5146546499338246, + "learning_rate": 1.5801602096788768e-05, + "loss": 0.3355, + "step": 6161 + }, + { + "epoch": 1.2147082018927444, + "grad_norm": 0.5384444969117445, + "learning_rate": 1.5800339527247163e-05, + "loss": 0.3499, + "step": 6162 + }, + { + "epoch": 1.2149053627760251, + "grad_norm": 0.4940594458584491, + "learning_rate": 1.579907681834773e-05, + "loss": 0.3296, + "step": 6163 + }, + { + "epoch": 1.215102523659306, + "grad_norm": 0.5343762136162648, + "learning_rate": 1.579781397012081e-05, + "loss": 0.3352, + "step": 6164 + }, + { + "epoch": 1.2152996845425867, + "grad_norm": 0.48752315188028517, + "learning_rate": 1.5796550982596732e-05, + "loss": 0.3282, + "step": 6165 + }, + { + "epoch": 1.2154968454258674, + "grad_norm": 0.5547331237218581, + "learning_rate": 1.5795287855805853e-05, + "loss": 0.3829, + "step": 6166 + }, + { + "epoch": 1.2156940063091484, + "grad_norm": 0.5004995021041893, + "learning_rate": 1.5794024589778518e-05, + "loss": 0.328, + "step": 6167 + }, + { + "epoch": 1.215891167192429, + "grad_norm": 0.5175275340376262, + "learning_rate": 1.5792761184545076e-05, + "loss": 0.3307, + "step": 6168 + }, + { + "epoch": 1.2160883280757098, + "grad_norm": 0.5027473643529223, + "learning_rate": 1.579149764013588e-05, + "loss": 0.3395, + "step": 6169 + }, + { + "epoch": 1.2162854889589905, + "grad_norm": 0.4817141906134851, + "learning_rate": 1.579023395658129e-05, + "loss": 0.3249, + "step": 6170 + }, + { + "epoch": 1.2164826498422712, + "grad_norm": 0.46188472835340966, + "learning_rate": 1.578897013391167e-05, + "loss": 0.3147, + "step": 6171 + }, + { + "epoch": 1.216679810725552, + "grad_norm": 0.4988989117087451, + "learning_rate": 1.5787706172157374e-05, + "loss": 0.33, + "step": 6172 + }, + { + "epoch": 1.2168769716088328, + "grad_norm": 0.49979162808926375, + "learning_rate": 1.578644207134878e-05, + "loss": 0.335, + "step": 6173 + }, + { + "epoch": 1.2170741324921135, + "grad_norm": 9.87115802371154, + "learning_rate": 1.5785177831516255e-05, + "loss": 0.5038, + "step": 6174 + }, + { + "epoch": 1.2172712933753944, + "grad_norm": 0.49379194281460687, + "learning_rate": 1.5783913452690174e-05, + "loss": 0.3263, + "step": 6175 + }, + { + "epoch": 1.2174684542586751, + "grad_norm": 0.7283072054648132, + "learning_rate": 1.5782648934900915e-05, + "loss": 0.3288, + "step": 6176 + }, + { + "epoch": 1.2176656151419558, + "grad_norm": 0.5063345599140339, + "learning_rate": 1.5781384278178858e-05, + "loss": 0.3322, + "step": 6177 + }, + { + "epoch": 1.2178627760252365, + "grad_norm": 0.49525686855704687, + "learning_rate": 1.578011948255439e-05, + "loss": 0.3412, + "step": 6178 + }, + { + "epoch": 1.2180599369085174, + "grad_norm": 0.4937155808164859, + "learning_rate": 1.5778854548057893e-05, + "loss": 0.3404, + "step": 6179 + }, + { + "epoch": 1.2182570977917981, + "grad_norm": 0.5601096943034672, + "learning_rate": 1.5777589474719764e-05, + "loss": 0.3301, + "step": 6180 + }, + { + "epoch": 1.2184542586750788, + "grad_norm": 0.5134943557442895, + "learning_rate": 1.5776324262570394e-05, + "loss": 0.3036, + "step": 6181 + }, + { + "epoch": 1.2186514195583595, + "grad_norm": 0.5234618403194057, + "learning_rate": 1.577505891164018e-05, + "loss": 0.3487, + "step": 6182 + }, + { + "epoch": 1.2188485804416405, + "grad_norm": 0.5060468910052316, + "learning_rate": 1.5773793421959528e-05, + "loss": 0.3045, + "step": 6183 + }, + { + "epoch": 1.2190457413249212, + "grad_norm": 0.5385344505438445, + "learning_rate": 1.577252779355884e-05, + "loss": 0.3471, + "step": 6184 + }, + { + "epoch": 1.2192429022082019, + "grad_norm": 0.47439335012121125, + "learning_rate": 1.577126202646852e-05, + "loss": 0.3137, + "step": 6185 + }, + { + "epoch": 1.2194400630914826, + "grad_norm": 0.5634484011687884, + "learning_rate": 1.5769996120718985e-05, + "loss": 0.3504, + "step": 6186 + }, + { + "epoch": 1.2196372239747635, + "grad_norm": 0.4783159791866919, + "learning_rate": 1.5768730076340646e-05, + "loss": 0.3238, + "step": 6187 + }, + { + "epoch": 1.2198343848580442, + "grad_norm": 0.5282172252080681, + "learning_rate": 1.5767463893363925e-05, + "loss": 0.324, + "step": 6188 + }, + { + "epoch": 1.2200315457413249, + "grad_norm": 0.508650741550587, + "learning_rate": 1.5766197571819234e-05, + "loss": 0.3108, + "step": 6189 + }, + { + "epoch": 1.2202287066246056, + "grad_norm": 0.47948864235134103, + "learning_rate": 1.5764931111737005e-05, + "loss": 0.335, + "step": 6190 + }, + { + "epoch": 1.2204258675078865, + "grad_norm": 0.5276091720414406, + "learning_rate": 1.576366451314766e-05, + "loss": 0.3546, + "step": 6191 + }, + { + "epoch": 1.2206230283911672, + "grad_norm": 0.48975202677127483, + "learning_rate": 1.576239777608164e-05, + "loss": 0.3195, + "step": 6192 + }, + { + "epoch": 1.220820189274448, + "grad_norm": 0.5115397169665277, + "learning_rate": 1.576113090056937e-05, + "loss": 0.3552, + "step": 6193 + }, + { + "epoch": 1.2210173501577288, + "grad_norm": 0.5544373081682192, + "learning_rate": 1.575986388664129e-05, + "loss": 0.3388, + "step": 6194 + }, + { + "epoch": 1.2212145110410095, + "grad_norm": 0.4789487893265251, + "learning_rate": 1.5758596734327842e-05, + "loss": 0.3049, + "step": 6195 + }, + { + "epoch": 1.2214116719242902, + "grad_norm": 0.5085580816350456, + "learning_rate": 1.5757329443659468e-05, + "loss": 0.3337, + "step": 6196 + }, + { + "epoch": 1.221608832807571, + "grad_norm": 0.5018198548590921, + "learning_rate": 1.5756062014666622e-05, + "loss": 0.3602, + "step": 6197 + }, + { + "epoch": 1.2218059936908516, + "grad_norm": 0.5459993872154436, + "learning_rate": 1.5754794447379747e-05, + "loss": 0.3734, + "step": 6198 + }, + { + "epoch": 1.2220031545741326, + "grad_norm": 0.5069803136116705, + "learning_rate": 1.5753526741829302e-05, + "loss": 0.3292, + "step": 6199 + }, + { + "epoch": 1.2222003154574133, + "grad_norm": 0.5349842354961301, + "learning_rate": 1.5752258898045747e-05, + "loss": 0.3591, + "step": 6200 + }, + { + "epoch": 1.222397476340694, + "grad_norm": 0.5113741033129703, + "learning_rate": 1.5750990916059537e-05, + "loss": 0.3354, + "step": 6201 + }, + { + "epoch": 1.2225946372239749, + "grad_norm": 0.4764417943474872, + "learning_rate": 1.5749722795901142e-05, + "loss": 0.2776, + "step": 6202 + }, + { + "epoch": 1.2227917981072556, + "grad_norm": 0.4852541319749403, + "learning_rate": 1.574845453760102e-05, + "loss": 0.2877, + "step": 6203 + }, + { + "epoch": 1.2229889589905363, + "grad_norm": 0.5011535867185861, + "learning_rate": 1.5747186141189654e-05, + "loss": 0.3188, + "step": 6204 + }, + { + "epoch": 1.223186119873817, + "grad_norm": 0.5585191080381091, + "learning_rate": 1.574591760669751e-05, + "loss": 0.3278, + "step": 6205 + }, + { + "epoch": 1.2233832807570977, + "grad_norm": 0.5095163688965774, + "learning_rate": 1.574464893415507e-05, + "loss": 0.3388, + "step": 6206 + }, + { + "epoch": 1.2235804416403786, + "grad_norm": 0.489944289171812, + "learning_rate": 1.5743380123592815e-05, + "loss": 0.3298, + "step": 6207 + }, + { + "epoch": 1.2237776025236593, + "grad_norm": 0.56552053195053, + "learning_rate": 1.5742111175041222e-05, + "loss": 0.3271, + "step": 6208 + }, + { + "epoch": 1.22397476340694, + "grad_norm": 0.4975929434887488, + "learning_rate": 1.5740842088530788e-05, + "loss": 0.3171, + "step": 6209 + }, + { + "epoch": 1.224171924290221, + "grad_norm": 0.5635718664701436, + "learning_rate": 1.5739572864091995e-05, + "loss": 0.333, + "step": 6210 + }, + { + "epoch": 1.2243690851735016, + "grad_norm": 0.5466750621448705, + "learning_rate": 1.573830350175535e-05, + "loss": 0.3226, + "step": 6211 + }, + { + "epoch": 1.2245662460567823, + "grad_norm": 0.5179765647986949, + "learning_rate": 1.5737034001551336e-05, + "loss": 0.3451, + "step": 6212 + }, + { + "epoch": 1.224763406940063, + "grad_norm": 0.505297026337151, + "learning_rate": 1.573576436351046e-05, + "loss": 0.3263, + "step": 6213 + }, + { + "epoch": 1.2249605678233437, + "grad_norm": 0.5323927892854735, + "learning_rate": 1.573449458766323e-05, + "loss": 0.3348, + "step": 6214 + }, + { + "epoch": 1.2251577287066246, + "grad_norm": 0.4585385421095369, + "learning_rate": 1.573322467404015e-05, + "loss": 0.2889, + "step": 6215 + }, + { + "epoch": 1.2253548895899053, + "grad_norm": 0.5133915738969325, + "learning_rate": 1.573195462267173e-05, + "loss": 0.3354, + "step": 6216 + }, + { + "epoch": 1.225552050473186, + "grad_norm": 0.4976376614042412, + "learning_rate": 1.573068443358848e-05, + "loss": 0.3261, + "step": 6217 + }, + { + "epoch": 1.225749211356467, + "grad_norm": 0.5204545809416458, + "learning_rate": 1.572941410682092e-05, + "loss": 0.3152, + "step": 6218 + }, + { + "epoch": 1.2259463722397477, + "grad_norm": 0.46008522086558107, + "learning_rate": 1.572814364239958e-05, + "loss": 0.3049, + "step": 6219 + }, + { + "epoch": 1.2261435331230284, + "grad_norm": 0.5085240798538043, + "learning_rate": 1.572687304035497e-05, + "loss": 0.3392, + "step": 6220 + }, + { + "epoch": 1.226340694006309, + "grad_norm": 0.49075420527666574, + "learning_rate": 1.5725602300717628e-05, + "loss": 0.3238, + "step": 6221 + }, + { + "epoch": 1.22653785488959, + "grad_norm": 0.5009588499072523, + "learning_rate": 1.5724331423518076e-05, + "loss": 0.3588, + "step": 6222 + }, + { + "epoch": 1.2267350157728707, + "grad_norm": 0.5004985108490898, + "learning_rate": 1.572306040878685e-05, + "loss": 0.3413, + "step": 6223 + }, + { + "epoch": 1.2269321766561514, + "grad_norm": 0.5036837263941046, + "learning_rate": 1.5721789256554495e-05, + "loss": 0.3472, + "step": 6224 + }, + { + "epoch": 1.227129337539432, + "grad_norm": 0.7266521472928784, + "learning_rate": 1.5720517966851544e-05, + "loss": 0.3553, + "step": 6225 + }, + { + "epoch": 1.227326498422713, + "grad_norm": 0.5013043340465552, + "learning_rate": 1.5719246539708536e-05, + "loss": 0.3436, + "step": 6226 + }, + { + "epoch": 1.2275236593059937, + "grad_norm": 0.4940987339117502, + "learning_rate": 1.571797497515603e-05, + "loss": 0.3127, + "step": 6227 + }, + { + "epoch": 1.2277208201892744, + "grad_norm": 0.5237341159455894, + "learning_rate": 1.5716703273224568e-05, + "loss": 0.3173, + "step": 6228 + }, + { + "epoch": 1.2279179810725551, + "grad_norm": 0.5284887213486391, + "learning_rate": 1.5715431433944706e-05, + "loss": 0.3326, + "step": 6229 + }, + { + "epoch": 1.228115141955836, + "grad_norm": 0.5224757623724461, + "learning_rate": 1.5714159457347007e-05, + "loss": 0.3254, + "step": 6230 + }, + { + "epoch": 1.2283123028391167, + "grad_norm": 0.49876895032485197, + "learning_rate": 1.571288734346202e-05, + "loss": 0.3423, + "step": 6231 + }, + { + "epoch": 1.2285094637223974, + "grad_norm": 0.512553101690978, + "learning_rate": 1.5711615092320315e-05, + "loss": 0.3414, + "step": 6232 + }, + { + "epoch": 1.2287066246056781, + "grad_norm": 0.5014118944270054, + "learning_rate": 1.571034270395246e-05, + "loss": 0.3183, + "step": 6233 + }, + { + "epoch": 1.228903785488959, + "grad_norm": 1.4423480845194543, + "learning_rate": 1.570907017838902e-05, + "loss": 0.3328, + "step": 6234 + }, + { + "epoch": 1.2291009463722398, + "grad_norm": 0.4737177745880879, + "learning_rate": 1.5707797515660574e-05, + "loss": 0.3193, + "step": 6235 + }, + { + "epoch": 1.2292981072555205, + "grad_norm": 0.5036458325808413, + "learning_rate": 1.5706524715797693e-05, + "loss": 0.3438, + "step": 6236 + }, + { + "epoch": 1.2294952681388012, + "grad_norm": 0.5174813077151698, + "learning_rate": 1.5705251778830962e-05, + "loss": 0.3515, + "step": 6237 + }, + { + "epoch": 1.229692429022082, + "grad_norm": 0.4799244634840848, + "learning_rate": 1.5703978704790962e-05, + "loss": 0.3149, + "step": 6238 + }, + { + "epoch": 1.2298895899053628, + "grad_norm": 0.5581993797302612, + "learning_rate": 1.5702705493708283e-05, + "loss": 0.3553, + "step": 6239 + }, + { + "epoch": 1.2300867507886435, + "grad_norm": 0.5064827009129059, + "learning_rate": 1.5701432145613508e-05, + "loss": 0.3266, + "step": 6240 + }, + { + "epoch": 1.2302839116719242, + "grad_norm": 0.6670134919739141, + "learning_rate": 1.5700158660537235e-05, + "loss": 0.3515, + "step": 6241 + }, + { + "epoch": 1.2304810725552051, + "grad_norm": 0.510108854937946, + "learning_rate": 1.569888503851006e-05, + "loss": 0.3321, + "step": 6242 + }, + { + "epoch": 1.2306782334384858, + "grad_norm": 0.5046667999599455, + "learning_rate": 1.5697611279562584e-05, + "loss": 0.3339, + "step": 6243 + }, + { + "epoch": 1.2308753943217665, + "grad_norm": 0.49224188852519396, + "learning_rate": 1.5696337383725412e-05, + "loss": 0.3119, + "step": 6244 + }, + { + "epoch": 1.2310725552050474, + "grad_norm": 0.5172616601262686, + "learning_rate": 1.569506335102914e-05, + "loss": 0.3443, + "step": 6245 + }, + { + "epoch": 1.2312697160883281, + "grad_norm": 0.5191142722710178, + "learning_rate": 1.569378918150439e-05, + "loss": 0.356, + "step": 6246 + }, + { + "epoch": 1.2314668769716088, + "grad_norm": 0.5412878379897871, + "learning_rate": 1.5692514875181767e-05, + "loss": 0.3716, + "step": 6247 + }, + { + "epoch": 1.2316640378548895, + "grad_norm": 0.5465050490209235, + "learning_rate": 1.5691240432091892e-05, + "loss": 0.3478, + "step": 6248 + }, + { + "epoch": 1.2318611987381702, + "grad_norm": 0.4939130381768039, + "learning_rate": 1.5689965852265383e-05, + "loss": 0.334, + "step": 6249 + }, + { + "epoch": 1.2320583596214512, + "grad_norm": 0.5058870629567691, + "learning_rate": 1.568869113573286e-05, + "loss": 0.336, + "step": 6250 + }, + { + "epoch": 1.2322555205047319, + "grad_norm": 0.48700312555624475, + "learning_rate": 1.568741628252495e-05, + "loss": 0.329, + "step": 6251 + }, + { + "epoch": 1.2324526813880126, + "grad_norm": 0.5093770682043642, + "learning_rate": 1.5686141292672287e-05, + "loss": 0.3498, + "step": 6252 + }, + { + "epoch": 1.2326498422712935, + "grad_norm": 0.49687858919236133, + "learning_rate": 1.56848661662055e-05, + "loss": 0.3263, + "step": 6253 + }, + { + "epoch": 1.2328470031545742, + "grad_norm": 0.5077643833980352, + "learning_rate": 1.5683590903155222e-05, + "loss": 0.3385, + "step": 6254 + }, + { + "epoch": 1.2330441640378549, + "grad_norm": 0.5185258515611184, + "learning_rate": 1.56823155035521e-05, + "loss": 0.3456, + "step": 6255 + }, + { + "epoch": 1.2332413249211356, + "grad_norm": 0.5278963932263402, + "learning_rate": 1.5681039967426773e-05, + "loss": 0.3555, + "step": 6256 + }, + { + "epoch": 1.2334384858044163, + "grad_norm": 0.4941475259778851, + "learning_rate": 1.5679764294809882e-05, + "loss": 0.3451, + "step": 6257 + }, + { + "epoch": 1.2336356466876972, + "grad_norm": 0.4917220521203785, + "learning_rate": 1.567848848573208e-05, + "loss": 0.3115, + "step": 6258 + }, + { + "epoch": 1.233832807570978, + "grad_norm": 0.4949766275099023, + "learning_rate": 1.567721254022402e-05, + "loss": 0.3181, + "step": 6259 + }, + { + "epoch": 1.2340299684542586, + "grad_norm": 0.5273568098480352, + "learning_rate": 1.5675936458316357e-05, + "loss": 0.3399, + "step": 6260 + }, + { + "epoch": 1.2342271293375395, + "grad_norm": 0.4886485870140973, + "learning_rate": 1.567466024003975e-05, + "loss": 0.3564, + "step": 6261 + }, + { + "epoch": 1.2344242902208202, + "grad_norm": 0.5030459426717067, + "learning_rate": 1.567338388542486e-05, + "loss": 0.3335, + "step": 6262 + }, + { + "epoch": 1.234621451104101, + "grad_norm": 0.4845019722082114, + "learning_rate": 1.5672107394502357e-05, + "loss": 0.3083, + "step": 6263 + }, + { + "epoch": 1.2348186119873816, + "grad_norm": 0.49034401686565704, + "learning_rate": 1.56708307673029e-05, + "loss": 0.3406, + "step": 6264 + }, + { + "epoch": 1.2350157728706626, + "grad_norm": 0.8790381187259672, + "learning_rate": 1.5669554003857172e-05, + "loss": 0.3443, + "step": 6265 + }, + { + "epoch": 1.2352129337539433, + "grad_norm": 0.4998894425749621, + "learning_rate": 1.566827710419584e-05, + "loss": 0.335, + "step": 6266 + }, + { + "epoch": 1.235410094637224, + "grad_norm": 0.532021790282657, + "learning_rate": 1.566700006834959e-05, + "loss": 0.3623, + "step": 6267 + }, + { + "epoch": 1.2356072555205047, + "grad_norm": 0.5388159940953764, + "learning_rate": 1.5665722896349098e-05, + "loss": 0.3463, + "step": 6268 + }, + { + "epoch": 1.2358044164037856, + "grad_norm": 0.5157145162260288, + "learning_rate": 1.566444558822505e-05, + "loss": 0.3421, + "step": 6269 + }, + { + "epoch": 1.2360015772870663, + "grad_norm": 0.5605990079950705, + "learning_rate": 1.5663168144008136e-05, + "loss": 0.3561, + "step": 6270 + }, + { + "epoch": 1.236198738170347, + "grad_norm": 0.4715933715273965, + "learning_rate": 1.5661890563729045e-05, + "loss": 0.3115, + "step": 6271 + }, + { + "epoch": 1.2363958990536277, + "grad_norm": 0.5133180282336455, + "learning_rate": 1.5660612847418476e-05, + "loss": 0.3462, + "step": 6272 + }, + { + "epoch": 1.2365930599369086, + "grad_norm": 0.5181709969632247, + "learning_rate": 1.5659334995107124e-05, + "loss": 0.349, + "step": 6273 + }, + { + "epoch": 1.2367902208201893, + "grad_norm": 0.5196223017058347, + "learning_rate": 1.565805700682569e-05, + "loss": 0.3526, + "step": 6274 + }, + { + "epoch": 1.23698738170347, + "grad_norm": 0.5375416524081914, + "learning_rate": 1.565677888260488e-05, + "loss": 0.3509, + "step": 6275 + }, + { + "epoch": 1.2371845425867507, + "grad_norm": 0.4661403315888071, + "learning_rate": 1.5655500622475405e-05, + "loss": 0.2998, + "step": 6276 + }, + { + "epoch": 1.2373817034700316, + "grad_norm": 0.5026678272938364, + "learning_rate": 1.565422222646797e-05, + "loss": 0.316, + "step": 6277 + }, + { + "epoch": 1.2375788643533123, + "grad_norm": 0.5370007180074455, + "learning_rate": 1.5652943694613293e-05, + "loss": 0.3638, + "step": 6278 + }, + { + "epoch": 1.237776025236593, + "grad_norm": 0.5161124298507306, + "learning_rate": 1.5651665026942094e-05, + "loss": 0.3346, + "step": 6279 + }, + { + "epoch": 1.2379731861198737, + "grad_norm": 0.5652741095921482, + "learning_rate": 1.565038622348509e-05, + "loss": 0.3288, + "step": 6280 + }, + { + "epoch": 1.2381703470031546, + "grad_norm": 0.9272850647527554, + "learning_rate": 1.5649107284273007e-05, + "loss": 0.3186, + "step": 6281 + }, + { + "epoch": 1.2383675078864353, + "grad_norm": 0.5570913267865101, + "learning_rate": 1.5647828209336572e-05, + "loss": 0.3537, + "step": 6282 + }, + { + "epoch": 1.238564668769716, + "grad_norm": 0.5082360498168839, + "learning_rate": 1.5646548998706514e-05, + "loss": 0.3099, + "step": 6283 + }, + { + "epoch": 1.2387618296529967, + "grad_norm": 0.5205218960702347, + "learning_rate": 1.5645269652413574e-05, + "loss": 0.306, + "step": 6284 + }, + { + "epoch": 1.2389589905362777, + "grad_norm": 0.504848365836123, + "learning_rate": 1.564399017048848e-05, + "loss": 0.3249, + "step": 6285 + }, + { + "epoch": 1.2391561514195584, + "grad_norm": 0.5448082615998162, + "learning_rate": 1.5642710552961982e-05, + "loss": 0.3271, + "step": 6286 + }, + { + "epoch": 1.239353312302839, + "grad_norm": 0.5213986756373774, + "learning_rate": 1.564143079986481e-05, + "loss": 0.3245, + "step": 6287 + }, + { + "epoch": 1.23955047318612, + "grad_norm": 0.5069938937165306, + "learning_rate": 1.564015091122773e-05, + "loss": 0.3336, + "step": 6288 + }, + { + "epoch": 1.2397476340694007, + "grad_norm": 0.5053084261882574, + "learning_rate": 1.5638870887081476e-05, + "loss": 0.3217, + "step": 6289 + }, + { + "epoch": 1.2399447949526814, + "grad_norm": 0.521104683492062, + "learning_rate": 1.5637590727456808e-05, + "loss": 0.3168, + "step": 6290 + }, + { + "epoch": 1.240141955835962, + "grad_norm": 0.5467959912507718, + "learning_rate": 1.5636310432384487e-05, + "loss": 0.3569, + "step": 6291 + }, + { + "epoch": 1.2403391167192428, + "grad_norm": 0.5102807325723169, + "learning_rate": 1.5635030001895267e-05, + "loss": 0.3149, + "step": 6292 + }, + { + "epoch": 1.2405362776025237, + "grad_norm": 1.3045505295401632, + "learning_rate": 1.5633749436019913e-05, + "loss": 0.3757, + "step": 6293 + }, + { + "epoch": 1.2407334384858044, + "grad_norm": 0.5444503336415735, + "learning_rate": 1.5632468734789192e-05, + "loss": 0.3235, + "step": 6294 + }, + { + "epoch": 1.2409305993690851, + "grad_norm": 0.5048607240462756, + "learning_rate": 1.563118789823387e-05, + "loss": 0.3353, + "step": 6295 + }, + { + "epoch": 1.241127760252366, + "grad_norm": 0.5667148895534923, + "learning_rate": 1.562990692638473e-05, + "loss": 0.3819, + "step": 6296 + }, + { + "epoch": 1.2413249211356467, + "grad_norm": 0.5446964393578572, + "learning_rate": 1.562862581927254e-05, + "loss": 0.3701, + "step": 6297 + }, + { + "epoch": 1.2415220820189274, + "grad_norm": 0.5257025409305759, + "learning_rate": 1.5627344576928085e-05, + "loss": 0.3344, + "step": 6298 + }, + { + "epoch": 1.2417192429022081, + "grad_norm": 0.5256923271112383, + "learning_rate": 1.5626063199382138e-05, + "loss": 0.3574, + "step": 6299 + }, + { + "epoch": 1.2419164037854888, + "grad_norm": 0.5316599865695444, + "learning_rate": 1.5624781686665498e-05, + "loss": 0.3574, + "step": 6300 + }, + { + "epoch": 1.2421135646687698, + "grad_norm": 0.5313857364398521, + "learning_rate": 1.5623500038808946e-05, + "loss": 0.3563, + "step": 6301 + }, + { + "epoch": 1.2423107255520505, + "grad_norm": 0.5039351482023608, + "learning_rate": 1.5622218255843276e-05, + "loss": 0.3031, + "step": 6302 + }, + { + "epoch": 1.2425078864353312, + "grad_norm": 0.5072016335647022, + "learning_rate": 1.5620936337799287e-05, + "loss": 0.3515, + "step": 6303 + }, + { + "epoch": 1.242705047318612, + "grad_norm": 0.5672328567093293, + "learning_rate": 1.5619654284707773e-05, + "loss": 0.314, + "step": 6304 + }, + { + "epoch": 1.2429022082018928, + "grad_norm": 0.5399431684544427, + "learning_rate": 1.5618372096599547e-05, + "loss": 0.3492, + "step": 6305 + }, + { + "epoch": 1.2430993690851735, + "grad_norm": 0.49668681766651646, + "learning_rate": 1.56170897735054e-05, + "loss": 0.3435, + "step": 6306 + }, + { + "epoch": 1.2432965299684542, + "grad_norm": 0.5134554384784643, + "learning_rate": 1.561580731545615e-05, + "loss": 0.3542, + "step": 6307 + }, + { + "epoch": 1.2434936908517351, + "grad_norm": 0.507277410037156, + "learning_rate": 1.5614524722482604e-05, + "loss": 0.3333, + "step": 6308 + }, + { + "epoch": 1.2436908517350158, + "grad_norm": 0.5301659849549519, + "learning_rate": 1.561324199461558e-05, + "loss": 0.3351, + "step": 6309 + }, + { + "epoch": 1.2438880126182965, + "grad_norm": 0.4915261347852841, + "learning_rate": 1.56119591318859e-05, + "loss": 0.3319, + "step": 6310 + }, + { + "epoch": 1.2440851735015772, + "grad_norm": 0.5361265111719739, + "learning_rate": 1.561067613432438e-05, + "loss": 0.3189, + "step": 6311 + }, + { + "epoch": 1.2442823343848581, + "grad_norm": 0.5259903970751432, + "learning_rate": 1.560939300196185e-05, + "loss": 0.3497, + "step": 6312 + }, + { + "epoch": 1.2444794952681388, + "grad_norm": 0.5066141181133125, + "learning_rate": 1.5608109734829134e-05, + "loss": 0.3335, + "step": 6313 + }, + { + "epoch": 1.2446766561514195, + "grad_norm": 0.5640167051107872, + "learning_rate": 1.5606826332957066e-05, + "loss": 0.3371, + "step": 6314 + }, + { + "epoch": 1.2448738170347002, + "grad_norm": 0.4993378182654541, + "learning_rate": 1.560554279637648e-05, + "loss": 0.3393, + "step": 6315 + }, + { + "epoch": 1.2450709779179812, + "grad_norm": 0.4864313789939513, + "learning_rate": 1.560425912511822e-05, + "loss": 0.3251, + "step": 6316 + }, + { + "epoch": 1.2452681388012619, + "grad_norm": 0.4908528558657437, + "learning_rate": 1.5602975319213115e-05, + "loss": 0.3377, + "step": 6317 + }, + { + "epoch": 1.2454652996845426, + "grad_norm": 0.5386582667468415, + "learning_rate": 1.5601691378692014e-05, + "loss": 0.3517, + "step": 6318 + }, + { + "epoch": 1.2456624605678233, + "grad_norm": 0.5109315855247596, + "learning_rate": 1.5600407303585773e-05, + "loss": 0.3195, + "step": 6319 + }, + { + "epoch": 1.2458596214511042, + "grad_norm": 0.5286420872063581, + "learning_rate": 1.559912309392523e-05, + "loss": 0.3531, + "step": 6320 + }, + { + "epoch": 1.2460567823343849, + "grad_norm": 0.5019801643055133, + "learning_rate": 1.559783874974125e-05, + "loss": 0.3223, + "step": 6321 + }, + { + "epoch": 1.2462539432176656, + "grad_norm": 0.5106613080428457, + "learning_rate": 1.559655427106468e-05, + "loss": 0.3308, + "step": 6322 + }, + { + "epoch": 1.2464511041009463, + "grad_norm": 0.5460596574557645, + "learning_rate": 1.5595269657926396e-05, + "loss": 0.3453, + "step": 6323 + }, + { + "epoch": 1.2466482649842272, + "grad_norm": 0.5260622334928508, + "learning_rate": 1.559398491035725e-05, + "loss": 0.3253, + "step": 6324 + }, + { + "epoch": 1.246845425867508, + "grad_norm": 0.4969801726967249, + "learning_rate": 1.5592700028388107e-05, + "loss": 0.3253, + "step": 6325 + }, + { + "epoch": 1.2470425867507886, + "grad_norm": 0.4980689481986874, + "learning_rate": 1.5591415012049846e-05, + "loss": 0.3236, + "step": 6326 + }, + { + "epoch": 1.2472397476340693, + "grad_norm": 0.49763176731095426, + "learning_rate": 1.5590129861373335e-05, + "loss": 0.3378, + "step": 6327 + }, + { + "epoch": 1.2474369085173502, + "grad_norm": 0.5169444490816916, + "learning_rate": 1.5588844576389454e-05, + "loss": 0.3312, + "step": 6328 + }, + { + "epoch": 1.247634069400631, + "grad_norm": 0.5109026656737135, + "learning_rate": 1.5587559157129078e-05, + "loss": 0.333, + "step": 6329 + }, + { + "epoch": 1.2478312302839116, + "grad_norm": 0.5055755741491358, + "learning_rate": 1.5586273603623098e-05, + "loss": 0.3435, + "step": 6330 + }, + { + "epoch": 1.2480283911671926, + "grad_norm": 0.5005392793433827, + "learning_rate": 1.5584987915902393e-05, + "loss": 0.3399, + "step": 6331 + }, + { + "epoch": 1.2482255520504733, + "grad_norm": 0.4977560846580881, + "learning_rate": 1.5583702093997855e-05, + "loss": 0.3299, + "step": 6332 + }, + { + "epoch": 1.248422712933754, + "grad_norm": 0.49705232631811735, + "learning_rate": 1.558241613794038e-05, + "loss": 0.3119, + "step": 6333 + }, + { + "epoch": 1.2486198738170347, + "grad_norm": 0.5240628686982806, + "learning_rate": 1.5581130047760865e-05, + "loss": 0.3357, + "step": 6334 + }, + { + "epoch": 1.2488170347003154, + "grad_norm": 0.5317216595261723, + "learning_rate": 1.55798438234902e-05, + "loss": 0.3368, + "step": 6335 + }, + { + "epoch": 1.2490141955835963, + "grad_norm": 0.5388823858020947, + "learning_rate": 1.5578557465159296e-05, + "loss": 0.3451, + "step": 6336 + }, + { + "epoch": 1.249211356466877, + "grad_norm": 0.5020216428038514, + "learning_rate": 1.5577270972799056e-05, + "loss": 0.3266, + "step": 6337 + }, + { + "epoch": 1.2494085173501577, + "grad_norm": 0.5251732240656299, + "learning_rate": 1.5575984346440393e-05, + "loss": 0.3255, + "step": 6338 + }, + { + "epoch": 1.2496056782334386, + "grad_norm": 0.5065709033953211, + "learning_rate": 1.5574697586114213e-05, + "loss": 0.3441, + "step": 6339 + }, + { + "epoch": 1.2498028391167193, + "grad_norm": 0.49243721616902913, + "learning_rate": 1.5573410691851432e-05, + "loss": 0.3347, + "step": 6340 + }, + { + "epoch": 1.2498028391167193, + "eval_loss": 0.43939557671546936, + "eval_runtime": 344.8004, + "eval_samples_per_second": 23.579, + "eval_steps_per_second": 1.476, + "step": 6340 + }, + { + "epoch": 1.25, + "grad_norm": 0.5101061503167154, + "learning_rate": 1.5572123663682975e-05, + "loss": 0.3102, + "step": 6341 + }, + { + "epoch": 1.2501971608832807, + "grad_norm": 0.4808666726594073, + "learning_rate": 1.5570836501639754e-05, + "loss": 0.3435, + "step": 6342 + }, + { + "epoch": 1.2503943217665614, + "grad_norm": 0.4999834817361975, + "learning_rate": 1.5569549205752707e-05, + "loss": 0.3381, + "step": 6343 + }, + { + "epoch": 1.2505914826498423, + "grad_norm": 0.49204094898528117, + "learning_rate": 1.556826177605275e-05, + "loss": 0.3274, + "step": 6344 + }, + { + "epoch": 1.250788643533123, + "grad_norm": 0.4949146499447072, + "learning_rate": 1.556697421257082e-05, + "loss": 0.3592, + "step": 6345 + }, + { + "epoch": 1.2509858044164037, + "grad_norm": 0.47762676196393705, + "learning_rate": 1.556568651533785e-05, + "loss": 0.322, + "step": 6346 + }, + { + "epoch": 1.2511829652996846, + "grad_norm": 0.5191724256820243, + "learning_rate": 1.5564398684384787e-05, + "loss": 0.3442, + "step": 6347 + }, + { + "epoch": 1.2513801261829653, + "grad_norm": 0.4787762214934278, + "learning_rate": 1.5563110719742558e-05, + "loss": 0.3285, + "step": 6348 + }, + { + "epoch": 1.251577287066246, + "grad_norm": 0.48201229976584514, + "learning_rate": 1.5561822621442114e-05, + "loss": 0.326, + "step": 6349 + }, + { + "epoch": 1.2517744479495267, + "grad_norm": 0.46392186774839445, + "learning_rate": 1.5560534389514407e-05, + "loss": 0.3004, + "step": 6350 + }, + { + "epoch": 1.2519716088328074, + "grad_norm": 0.4815291317559283, + "learning_rate": 1.555924602399038e-05, + "loss": 0.307, + "step": 6351 + }, + { + "epoch": 1.2521687697160884, + "grad_norm": 0.8785203455669748, + "learning_rate": 1.5557957524900993e-05, + "loss": 0.3367, + "step": 6352 + }, + { + "epoch": 1.252365930599369, + "grad_norm": 0.48203047338077637, + "learning_rate": 1.5556668892277197e-05, + "loss": 0.3262, + "step": 6353 + }, + { + "epoch": 1.2525630914826498, + "grad_norm": 0.5133633911448251, + "learning_rate": 1.555538012614996e-05, + "loss": 0.3317, + "step": 6354 + }, + { + "epoch": 1.2527602523659307, + "grad_norm": 0.5341940509642474, + "learning_rate": 1.555409122655024e-05, + "loss": 0.3572, + "step": 6355 + }, + { + "epoch": 1.2529574132492114, + "grad_norm": 0.482176622994259, + "learning_rate": 1.5552802193509003e-05, + "loss": 0.3147, + "step": 6356 + }, + { + "epoch": 1.253154574132492, + "grad_norm": 0.49374641797023383, + "learning_rate": 1.5551513027057225e-05, + "loss": 0.3481, + "step": 6357 + }, + { + "epoch": 1.2533517350157728, + "grad_norm": 0.46859298112273634, + "learning_rate": 1.5550223727225875e-05, + "loss": 0.3099, + "step": 6358 + }, + { + "epoch": 1.2535488958990535, + "grad_norm": 0.4986948734759226, + "learning_rate": 1.554893429404593e-05, + "loss": 0.3068, + "step": 6359 + }, + { + "epoch": 1.2537460567823344, + "grad_norm": 0.49945204691498807, + "learning_rate": 1.5547644727548373e-05, + "loss": 0.3256, + "step": 6360 + }, + { + "epoch": 1.2539432176656151, + "grad_norm": 0.5162525662741562, + "learning_rate": 1.554635502776418e-05, + "loss": 0.3326, + "step": 6361 + }, + { + "epoch": 1.2541403785488958, + "grad_norm": 0.534638355896996, + "learning_rate": 1.554506519472434e-05, + "loss": 0.3336, + "step": 6362 + }, + { + "epoch": 1.2543375394321767, + "grad_norm": 0.5029286598031089, + "learning_rate": 1.5543775228459846e-05, + "loss": 0.3516, + "step": 6363 + }, + { + "epoch": 1.2545347003154574, + "grad_norm": 0.5261189540493615, + "learning_rate": 1.554248512900169e-05, + "loss": 0.328, + "step": 6364 + }, + { + "epoch": 1.2547318611987381, + "grad_norm": 0.8234311010835397, + "learning_rate": 1.5541194896380863e-05, + "loss": 0.3409, + "step": 6365 + }, + { + "epoch": 1.254929022082019, + "grad_norm": 0.5154543512907336, + "learning_rate": 1.5539904530628365e-05, + "loss": 0.3431, + "step": 6366 + }, + { + "epoch": 1.2551261829652998, + "grad_norm": 0.478219086021865, + "learning_rate": 1.55386140317752e-05, + "loss": 0.3156, + "step": 6367 + }, + { + "epoch": 1.2553233438485805, + "grad_norm": 0.49818756625431365, + "learning_rate": 1.5537323399852373e-05, + "loss": 0.3157, + "step": 6368 + }, + { + "epoch": 1.2555205047318612, + "grad_norm": 0.47906698592620167, + "learning_rate": 1.5536032634890892e-05, + "loss": 0.3299, + "step": 6369 + }, + { + "epoch": 1.2557176656151419, + "grad_norm": 0.5062243330914792, + "learning_rate": 1.553474173692177e-05, + "loss": 0.3484, + "step": 6370 + }, + { + "epoch": 1.2559148264984228, + "grad_norm": 0.5307599529911612, + "learning_rate": 1.5533450705976018e-05, + "loss": 0.3385, + "step": 6371 + }, + { + "epoch": 1.2561119873817035, + "grad_norm": 0.573572643755291, + "learning_rate": 1.553215954208466e-05, + "loss": 0.357, + "step": 6372 + }, + { + "epoch": 1.2563091482649842, + "grad_norm": 0.5968608411513199, + "learning_rate": 1.5530868245278708e-05, + "loss": 0.3489, + "step": 6373 + }, + { + "epoch": 1.256506309148265, + "grad_norm": 0.58115296279192, + "learning_rate": 1.55295768155892e-05, + "loss": 0.3522, + "step": 6374 + }, + { + "epoch": 1.2567034700315458, + "grad_norm": 0.49820315714499325, + "learning_rate": 1.5528285253047153e-05, + "loss": 0.3303, + "step": 6375 + }, + { + "epoch": 1.2569006309148265, + "grad_norm": 0.5089032372495764, + "learning_rate": 1.55269935576836e-05, + "loss": 0.344, + "step": 6376 + }, + { + "epoch": 1.2570977917981072, + "grad_norm": 0.5371768444531454, + "learning_rate": 1.5525701729529578e-05, + "loss": 0.3675, + "step": 6377 + }, + { + "epoch": 1.257294952681388, + "grad_norm": 0.5002102537095656, + "learning_rate": 1.552440976861612e-05, + "loss": 0.3248, + "step": 6378 + }, + { + "epoch": 1.2574921135646688, + "grad_norm": 0.5018100903078823, + "learning_rate": 1.5523117674974267e-05, + "loss": 0.3217, + "step": 6379 + }, + { + "epoch": 1.2576892744479495, + "grad_norm": 0.5336700648969083, + "learning_rate": 1.5521825448635066e-05, + "loss": 0.3441, + "step": 6380 + }, + { + "epoch": 1.2578864353312302, + "grad_norm": 0.48938739876612136, + "learning_rate": 1.5520533089629562e-05, + "loss": 0.328, + "step": 6381 + }, + { + "epoch": 1.2580835962145112, + "grad_norm": 0.49246018639601746, + "learning_rate": 1.5519240597988806e-05, + "loss": 0.3418, + "step": 6382 + }, + { + "epoch": 1.2582807570977919, + "grad_norm": 0.49124354097385314, + "learning_rate": 1.551794797374385e-05, + "loss": 0.3294, + "step": 6383 + }, + { + "epoch": 1.2584779179810726, + "grad_norm": 0.5583036731814169, + "learning_rate": 1.5516655216925748e-05, + "loss": 0.3576, + "step": 6384 + }, + { + "epoch": 1.2586750788643533, + "grad_norm": 0.5097818791868225, + "learning_rate": 1.5515362327565564e-05, + "loss": 0.3372, + "step": 6385 + }, + { + "epoch": 1.258872239747634, + "grad_norm": 0.5176295462370966, + "learning_rate": 1.5514069305694356e-05, + "loss": 0.3537, + "step": 6386 + }, + { + "epoch": 1.2590694006309149, + "grad_norm": 0.5797974728770534, + "learning_rate": 1.5512776151343198e-05, + "loss": 0.3595, + "step": 6387 + }, + { + "epoch": 1.2592665615141956, + "grad_norm": 0.6262421558004105, + "learning_rate": 1.5511482864543147e-05, + "loss": 0.3253, + "step": 6388 + }, + { + "epoch": 1.2594637223974763, + "grad_norm": 0.5273042130470622, + "learning_rate": 1.5510189445325284e-05, + "loss": 0.3221, + "step": 6389 + }, + { + "epoch": 1.2596608832807572, + "grad_norm": 0.5732125441797801, + "learning_rate": 1.5508895893720685e-05, + "loss": 0.3532, + "step": 6390 + }, + { + "epoch": 1.259858044164038, + "grad_norm": 0.5275542533558232, + "learning_rate": 1.550760220976042e-05, + "loss": 0.356, + "step": 6391 + }, + { + "epoch": 1.2600552050473186, + "grad_norm": 0.4857065591782998, + "learning_rate": 1.5506308393475582e-05, + "loss": 0.3308, + "step": 6392 + }, + { + "epoch": 1.2602523659305993, + "grad_norm": 0.5099413329942116, + "learning_rate": 1.550501444489725e-05, + "loss": 0.3362, + "step": 6393 + }, + { + "epoch": 1.26044952681388, + "grad_norm": 0.5056333274989698, + "learning_rate": 1.5503720364056512e-05, + "loss": 0.3489, + "step": 6394 + }, + { + "epoch": 1.260646687697161, + "grad_norm": 0.4774413513695629, + "learning_rate": 1.550242615098446e-05, + "loss": 0.3304, + "step": 6395 + }, + { + "epoch": 1.2608438485804416, + "grad_norm": 0.4843542395552344, + "learning_rate": 1.5501131805712188e-05, + "loss": 0.3047, + "step": 6396 + }, + { + "epoch": 1.2610410094637223, + "grad_norm": 0.4843638404403003, + "learning_rate": 1.549983732827079e-05, + "loss": 0.319, + "step": 6397 + }, + { + "epoch": 1.2612381703470033, + "grad_norm": 0.4783957926301324, + "learning_rate": 1.5498542718691378e-05, + "loss": 0.2986, + "step": 6398 + }, + { + "epoch": 1.261435331230284, + "grad_norm": 0.5171370998231292, + "learning_rate": 1.5497247977005047e-05, + "loss": 0.3288, + "step": 6399 + }, + { + "epoch": 1.2616324921135647, + "grad_norm": 0.5587928424642727, + "learning_rate": 1.5495953103242908e-05, + "loss": 0.3409, + "step": 6400 + }, + { + "epoch": 1.2618296529968454, + "grad_norm": 0.5069107019620349, + "learning_rate": 1.549465809743607e-05, + "loss": 0.3489, + "step": 6401 + }, + { + "epoch": 1.262026813880126, + "grad_norm": 0.5048904621722605, + "learning_rate": 1.5493362959615646e-05, + "loss": 0.3363, + "step": 6402 + }, + { + "epoch": 1.262223974763407, + "grad_norm": 0.4918922172890412, + "learning_rate": 1.549206768981275e-05, + "loss": 0.3215, + "step": 6403 + }, + { + "epoch": 1.2624211356466877, + "grad_norm": 0.5373254295839985, + "learning_rate": 1.5490772288058508e-05, + "loss": 0.3351, + "step": 6404 + }, + { + "epoch": 1.2626182965299684, + "grad_norm": 0.5194441534825306, + "learning_rate": 1.5489476754384035e-05, + "loss": 0.3395, + "step": 6405 + }, + { + "epoch": 1.2628154574132493, + "grad_norm": 0.49136904473331555, + "learning_rate": 1.5488181088820468e-05, + "loss": 0.3144, + "step": 6406 + }, + { + "epoch": 1.26301261829653, + "grad_norm": 0.5815486766369652, + "learning_rate": 1.548688529139893e-05, + "loss": 0.3718, + "step": 6407 + }, + { + "epoch": 1.2632097791798107, + "grad_norm": 0.49216042299624607, + "learning_rate": 1.5485589362150552e-05, + "loss": 0.3384, + "step": 6408 + }, + { + "epoch": 1.2634069400630916, + "grad_norm": 0.49257478050805037, + "learning_rate": 1.5484293301106475e-05, + "loss": 0.3161, + "step": 6409 + }, + { + "epoch": 1.2636041009463723, + "grad_norm": 0.5111418677883345, + "learning_rate": 1.5482997108297834e-05, + "loss": 0.3159, + "step": 6410 + }, + { + "epoch": 1.263801261829653, + "grad_norm": 0.48719719419588625, + "learning_rate": 1.5481700783755772e-05, + "loss": 0.31, + "step": 6411 + }, + { + "epoch": 1.2639984227129337, + "grad_norm": 0.6071472420270517, + "learning_rate": 1.548040432751143e-05, + "loss": 0.3906, + "step": 6412 + }, + { + "epoch": 1.2641955835962144, + "grad_norm": 0.5109344409789744, + "learning_rate": 1.5479107739595967e-05, + "loss": 0.3505, + "step": 6413 + }, + { + "epoch": 1.2643927444794953, + "grad_norm": 0.5163073576362812, + "learning_rate": 1.5477811020040525e-05, + "loss": 0.3396, + "step": 6414 + }, + { + "epoch": 1.264589905362776, + "grad_norm": 0.5159856675009115, + "learning_rate": 1.5476514168876264e-05, + "loss": 0.3458, + "step": 6415 + }, + { + "epoch": 1.2647870662460567, + "grad_norm": 0.5208996581565274, + "learning_rate": 1.5475217186134335e-05, + "loss": 0.3575, + "step": 6416 + }, + { + "epoch": 1.2649842271293377, + "grad_norm": 0.6501488219759475, + "learning_rate": 1.5473920071845906e-05, + "loss": 0.3406, + "step": 6417 + }, + { + "epoch": 1.2651813880126184, + "grad_norm": 0.5045419985244192, + "learning_rate": 1.5472622826042144e-05, + "loss": 0.3216, + "step": 6418 + }, + { + "epoch": 1.265378548895899, + "grad_norm": 0.5083528685308321, + "learning_rate": 1.5471325448754207e-05, + "loss": 0.3485, + "step": 6419 + }, + { + "epoch": 1.2655757097791798, + "grad_norm": 0.4756731932644996, + "learning_rate": 1.547002794001327e-05, + "loss": 0.3249, + "step": 6420 + }, + { + "epoch": 1.2657728706624605, + "grad_norm": 0.5179143417797316, + "learning_rate": 1.546873029985051e-05, + "loss": 0.3634, + "step": 6421 + }, + { + "epoch": 1.2659700315457414, + "grad_norm": 0.5165453574064824, + "learning_rate": 1.54674325282971e-05, + "loss": 0.3135, + "step": 6422 + }, + { + "epoch": 1.266167192429022, + "grad_norm": 0.4897247296838715, + "learning_rate": 1.5466134625384216e-05, + "loss": 0.3222, + "step": 6423 + }, + { + "epoch": 1.2663643533123028, + "grad_norm": 0.5067798904225712, + "learning_rate": 1.546483659114305e-05, + "loss": 0.3574, + "step": 6424 + }, + { + "epoch": 1.2665615141955837, + "grad_norm": 0.4586524801850368, + "learning_rate": 1.5463538425604782e-05, + "loss": 0.3044, + "step": 6425 + }, + { + "epoch": 1.2667586750788644, + "grad_norm": 0.48557837904062384, + "learning_rate": 1.5462240128800604e-05, + "loss": 0.3395, + "step": 6426 + }, + { + "epoch": 1.2669558359621451, + "grad_norm": 0.4999102108308707, + "learning_rate": 1.5460941700761706e-05, + "loss": 0.3213, + "step": 6427 + }, + { + "epoch": 1.2671529968454258, + "grad_norm": 0.5132500026273736, + "learning_rate": 1.545964314151929e-05, + "loss": 0.3422, + "step": 6428 + }, + { + "epoch": 1.2673501577287065, + "grad_norm": 0.4913356576984297, + "learning_rate": 1.545834445110455e-05, + "loss": 0.3383, + "step": 6429 + }, + { + "epoch": 1.2675473186119874, + "grad_norm": 0.5025975016108518, + "learning_rate": 1.5457045629548687e-05, + "loss": 0.3397, + "step": 6430 + }, + { + "epoch": 1.2677444794952681, + "grad_norm": 0.4826877165006056, + "learning_rate": 1.545574667688291e-05, + "loss": 0.3151, + "step": 6431 + }, + { + "epoch": 1.2679416403785488, + "grad_norm": 0.4772272594998397, + "learning_rate": 1.5454447593138424e-05, + "loss": 0.3349, + "step": 6432 + }, + { + "epoch": 1.2681388012618298, + "grad_norm": 2.53849935570129, + "learning_rate": 1.5453148378346444e-05, + "loss": 0.3515, + "step": 6433 + }, + { + "epoch": 1.2683359621451105, + "grad_norm": 0.5128594643909096, + "learning_rate": 1.5451849032538185e-05, + "loss": 0.3419, + "step": 6434 + }, + { + "epoch": 1.2685331230283912, + "grad_norm": 0.493655568948996, + "learning_rate": 1.5450549555744857e-05, + "loss": 0.3182, + "step": 6435 + }, + { + "epoch": 1.2687302839116719, + "grad_norm": 0.4969357386584924, + "learning_rate": 1.5449249947997687e-05, + "loss": 0.3429, + "step": 6436 + }, + { + "epoch": 1.2689274447949526, + "grad_norm": 0.5468340570267272, + "learning_rate": 1.5447950209327905e-05, + "loss": 0.3493, + "step": 6437 + }, + { + "epoch": 1.2691246056782335, + "grad_norm": 0.5418093845032692, + "learning_rate": 1.5446650339766723e-05, + "loss": 0.3476, + "step": 6438 + }, + { + "epoch": 1.2693217665615142, + "grad_norm": 0.6775784568233131, + "learning_rate": 1.544535033934539e-05, + "loss": 0.3739, + "step": 6439 + }, + { + "epoch": 1.2695189274447949, + "grad_norm": 0.48385467083683753, + "learning_rate": 1.5444050208095124e-05, + "loss": 0.3195, + "step": 6440 + }, + { + "epoch": 1.2697160883280758, + "grad_norm": 0.493918698352294, + "learning_rate": 1.544274994604717e-05, + "loss": 0.3302, + "step": 6441 + }, + { + "epoch": 1.2699132492113565, + "grad_norm": 0.5530138835909129, + "learning_rate": 1.5441449553232764e-05, + "loss": 0.3495, + "step": 6442 + }, + { + "epoch": 1.2701104100946372, + "grad_norm": 0.49621433371123225, + "learning_rate": 1.544014902968315e-05, + "loss": 0.3333, + "step": 6443 + }, + { + "epoch": 1.270307570977918, + "grad_norm": 0.49721837630990706, + "learning_rate": 1.5438848375429576e-05, + "loss": 0.3283, + "step": 6444 + }, + { + "epoch": 1.2705047318611986, + "grad_norm": 0.5533506511752488, + "learning_rate": 1.5437547590503288e-05, + "loss": 0.385, + "step": 6445 + }, + { + "epoch": 1.2707018927444795, + "grad_norm": 0.5283600379328754, + "learning_rate": 1.5436246674935543e-05, + "loss": 0.3434, + "step": 6446 + }, + { + "epoch": 1.2708990536277602, + "grad_norm": 0.5013523816913494, + "learning_rate": 1.5434945628757595e-05, + "loss": 0.338, + "step": 6447 + }, + { + "epoch": 1.271096214511041, + "grad_norm": 0.4905233287935434, + "learning_rate": 1.54336444520007e-05, + "loss": 0.3287, + "step": 6448 + }, + { + "epoch": 1.2712933753943219, + "grad_norm": 0.5437498180898698, + "learning_rate": 1.5432343144696117e-05, + "loss": 0.3708, + "step": 6449 + }, + { + "epoch": 1.2714905362776026, + "grad_norm": 0.48421826633903103, + "learning_rate": 1.543104170687512e-05, + "loss": 0.3125, + "step": 6450 + }, + { + "epoch": 1.2716876971608833, + "grad_norm": 0.4793437101259274, + "learning_rate": 1.542974013856897e-05, + "loss": 0.3341, + "step": 6451 + }, + { + "epoch": 1.271884858044164, + "grad_norm": 19.93851850988607, + "learning_rate": 1.5428438439808942e-05, + "loss": 0.6517, + "step": 6452 + }, + { + "epoch": 1.2720820189274447, + "grad_norm": 0.5087943803471097, + "learning_rate": 1.542713661062631e-05, + "loss": 0.3086, + "step": 6453 + }, + { + "epoch": 1.2722791798107256, + "grad_norm": 0.47671813979964683, + "learning_rate": 1.542583465105235e-05, + "loss": 0.3208, + "step": 6454 + }, + { + "epoch": 1.2724763406940063, + "grad_norm": 0.5236254976524284, + "learning_rate": 1.542453256111834e-05, + "loss": 0.3477, + "step": 6455 + }, + { + "epoch": 1.272673501577287, + "grad_norm": 0.49008247823279333, + "learning_rate": 1.5423230340855572e-05, + "loss": 0.3354, + "step": 6456 + }, + { + "epoch": 1.272870662460568, + "grad_norm": 0.48169835947954537, + "learning_rate": 1.5421927990295325e-05, + "loss": 0.3369, + "step": 6457 + }, + { + "epoch": 1.2730678233438486, + "grad_norm": 0.5245321000278956, + "learning_rate": 1.5420625509468892e-05, + "loss": 0.3466, + "step": 6458 + }, + { + "epoch": 1.2732649842271293, + "grad_norm": 0.6897513827097659, + "learning_rate": 1.5419322898407562e-05, + "loss": 0.3384, + "step": 6459 + }, + { + "epoch": 1.2734621451104102, + "grad_norm": 0.5001285523422395, + "learning_rate": 1.541802015714264e-05, + "loss": 0.3512, + "step": 6460 + }, + { + "epoch": 1.273659305993691, + "grad_norm": 0.5336358617640525, + "learning_rate": 1.5416717285705417e-05, + "loss": 0.3448, + "step": 6461 + }, + { + "epoch": 1.2738564668769716, + "grad_norm": 0.5466334074547992, + "learning_rate": 1.5415414284127207e-05, + "loss": 0.3506, + "step": 6462 + }, + { + "epoch": 1.2740536277602523, + "grad_norm": 0.526166079775761, + "learning_rate": 1.54141111524393e-05, + "loss": 0.3268, + "step": 6463 + }, + { + "epoch": 1.274250788643533, + "grad_norm": 0.47399584750409807, + "learning_rate": 1.5412807890673015e-05, + "loss": 0.3079, + "step": 6464 + }, + { + "epoch": 1.274447949526814, + "grad_norm": 0.505741406230065, + "learning_rate": 1.541150449885966e-05, + "loss": 0.3296, + "step": 6465 + }, + { + "epoch": 1.2746451104100947, + "grad_norm": 0.5386050119365994, + "learning_rate": 1.5410200977030553e-05, + "loss": 0.3652, + "step": 6466 + }, + { + "epoch": 1.2748422712933754, + "grad_norm": 0.47587895585088, + "learning_rate": 1.5408897325217012e-05, + "loss": 0.3239, + "step": 6467 + }, + { + "epoch": 1.2750394321766563, + "grad_norm": 0.504126225675362, + "learning_rate": 1.5407593543450358e-05, + "loss": 0.3362, + "step": 6468 + }, + { + "epoch": 1.275236593059937, + "grad_norm": 0.485501366242255, + "learning_rate": 1.540628963176191e-05, + "loss": 0.3153, + "step": 6469 + }, + { + "epoch": 1.2754337539432177, + "grad_norm": 0.49475043227065524, + "learning_rate": 1.5404985590183e-05, + "loss": 0.3079, + "step": 6470 + }, + { + "epoch": 1.2756309148264984, + "grad_norm": 0.4753091395271322, + "learning_rate": 1.5403681418744962e-05, + "loss": 0.3128, + "step": 6471 + }, + { + "epoch": 1.275828075709779, + "grad_norm": 0.5027088286339224, + "learning_rate": 1.5402377117479127e-05, + "loss": 0.3438, + "step": 6472 + }, + { + "epoch": 1.27602523659306, + "grad_norm": 0.4918844457862931, + "learning_rate": 1.5401072686416826e-05, + "loss": 0.3433, + "step": 6473 + }, + { + "epoch": 1.2762223974763407, + "grad_norm": 0.5104026241302757, + "learning_rate": 1.539976812558941e-05, + "loss": 0.3238, + "step": 6474 + }, + { + "epoch": 1.2764195583596214, + "grad_norm": 0.5129479397583762, + "learning_rate": 1.539846343502821e-05, + "loss": 0.3521, + "step": 6475 + }, + { + "epoch": 1.2766167192429023, + "grad_norm": 0.5917550936700113, + "learning_rate": 1.5397158614764584e-05, + "loss": 0.3882, + "step": 6476 + }, + { + "epoch": 1.276813880126183, + "grad_norm": 0.5595087268305293, + "learning_rate": 1.5395853664829876e-05, + "loss": 0.3397, + "step": 6477 + }, + { + "epoch": 1.2770110410094637, + "grad_norm": 0.4600650571043877, + "learning_rate": 1.5394548585255437e-05, + "loss": 0.3025, + "step": 6478 + }, + { + "epoch": 1.2772082018927444, + "grad_norm": 0.49890829885179133, + "learning_rate": 1.5393243376072625e-05, + "loss": 0.3205, + "step": 6479 + }, + { + "epoch": 1.2774053627760251, + "grad_norm": 0.4890558758581133, + "learning_rate": 1.5391938037312795e-05, + "loss": 0.3315, + "step": 6480 + }, + { + "epoch": 1.277602523659306, + "grad_norm": 0.7150885015408796, + "learning_rate": 1.5390632569007314e-05, + "loss": 0.3116, + "step": 6481 + }, + { + "epoch": 1.2777996845425867, + "grad_norm": 0.5271705955690947, + "learning_rate": 1.5389326971187543e-05, + "loss": 0.3453, + "step": 6482 + }, + { + "epoch": 1.2779968454258674, + "grad_norm": 0.508576676488475, + "learning_rate": 1.538802124388485e-05, + "loss": 0.3424, + "step": 6483 + }, + { + "epoch": 1.2781940063091484, + "grad_norm": 0.5597504943231402, + "learning_rate": 1.538671538713061e-05, + "loss": 0.3741, + "step": 6484 + }, + { + "epoch": 1.278391167192429, + "grad_norm": 0.5144168371604326, + "learning_rate": 1.5385409400956196e-05, + "loss": 0.3434, + "step": 6485 + }, + { + "epoch": 1.2785883280757098, + "grad_norm": 0.5022134043659201, + "learning_rate": 1.538410328539298e-05, + "loss": 0.3229, + "step": 6486 + }, + { + "epoch": 1.2787854889589905, + "grad_norm": 0.5129040379585406, + "learning_rate": 1.5382797040472352e-05, + "loss": 0.326, + "step": 6487 + }, + { + "epoch": 1.2789826498422712, + "grad_norm": 0.49329203335900706, + "learning_rate": 1.538149066622569e-05, + "loss": 0.3395, + "step": 6488 + }, + { + "epoch": 1.279179810725552, + "grad_norm": 0.5055724718310393, + "learning_rate": 1.538018416268438e-05, + "loss": 0.3472, + "step": 6489 + }, + { + "epoch": 1.2793769716088328, + "grad_norm": 0.5258954355075676, + "learning_rate": 1.537887752987981e-05, + "loss": 0.3538, + "step": 6490 + }, + { + "epoch": 1.2795741324921135, + "grad_norm": 0.542150664111908, + "learning_rate": 1.5377570767843377e-05, + "loss": 0.3396, + "step": 6491 + }, + { + "epoch": 1.2797712933753944, + "grad_norm": 0.47748206124296466, + "learning_rate": 1.5376263876606475e-05, + "loss": 0.3288, + "step": 6492 + }, + { + "epoch": 1.2799684542586751, + "grad_norm": 3.411379492061442, + "learning_rate": 1.5374956856200504e-05, + "loss": 0.33, + "step": 6493 + }, + { + "epoch": 1.2801656151419558, + "grad_norm": 0.5132738135149726, + "learning_rate": 1.537364970665687e-05, + "loss": 0.3298, + "step": 6494 + }, + { + "epoch": 1.2803627760252365, + "grad_norm": 1.1343123502825927, + "learning_rate": 1.537234242800697e-05, + "loss": 0.336, + "step": 6495 + }, + { + "epoch": 1.2805599369085172, + "grad_norm": 0.518109994618307, + "learning_rate": 1.537103502028222e-05, + "loss": 0.3338, + "step": 6496 + }, + { + "epoch": 1.2807570977917981, + "grad_norm": 0.5142434953565715, + "learning_rate": 1.5369727483514026e-05, + "loss": 0.3498, + "step": 6497 + }, + { + "epoch": 1.2809542586750788, + "grad_norm": 0.4857243954146504, + "learning_rate": 1.536841981773381e-05, + "loss": 0.3212, + "step": 6498 + }, + { + "epoch": 1.2811514195583595, + "grad_norm": 0.5583875828356074, + "learning_rate": 1.5367112022972977e-05, + "loss": 0.3449, + "step": 6499 + }, + { + "epoch": 1.2813485804416405, + "grad_norm": 0.7616833629256355, + "learning_rate": 1.536580409926296e-05, + "loss": 0.3047, + "step": 6500 + }, + { + "epoch": 1.2815457413249212, + "grad_norm": 0.5422560861681756, + "learning_rate": 1.5364496046635175e-05, + "loss": 0.3336, + "step": 6501 + }, + { + "epoch": 1.2817429022082019, + "grad_norm": 0.5234629592906149, + "learning_rate": 1.5363187865121058e-05, + "loss": 0.3207, + "step": 6502 + }, + { + "epoch": 1.2819400630914828, + "grad_norm": 0.5788807821840986, + "learning_rate": 1.5361879554752027e-05, + "loss": 0.3518, + "step": 6503 + }, + { + "epoch": 1.2821372239747635, + "grad_norm": 0.560152361635895, + "learning_rate": 1.536057111555953e-05, + "loss": 0.3671, + "step": 6504 + }, + { + "epoch": 1.2823343848580442, + "grad_norm": 0.573760389051277, + "learning_rate": 1.5359262547574986e-05, + "loss": 0.3296, + "step": 6505 + }, + { + "epoch": 1.2825315457413249, + "grad_norm": 0.639852449462523, + "learning_rate": 1.535795385082985e-05, + "loss": 0.3648, + "step": 6506 + }, + { + "epoch": 1.2827287066246056, + "grad_norm": 0.5684843971461204, + "learning_rate": 1.5356645025355556e-05, + "loss": 0.3628, + "step": 6507 + }, + { + "epoch": 1.2829258675078865, + "grad_norm": 0.48085580385372134, + "learning_rate": 1.535533607118355e-05, + "loss": 0.3272, + "step": 6508 + }, + { + "epoch": 1.2831230283911672, + "grad_norm": 0.5728460211410128, + "learning_rate": 1.5354026988345284e-05, + "loss": 0.36, + "step": 6509 + }, + { + "epoch": 1.283320189274448, + "grad_norm": 0.5181581225517601, + "learning_rate": 1.5352717776872208e-05, + "loss": 0.3219, + "step": 6510 + }, + { + "epoch": 1.2835173501577288, + "grad_norm": 0.5081886354156057, + "learning_rate": 1.5351408436795777e-05, + "loss": 0.3369, + "step": 6511 + }, + { + "epoch": 1.2837145110410095, + "grad_norm": 0.5230355727747609, + "learning_rate": 1.535009896814745e-05, + "loss": 0.3244, + "step": 6512 + }, + { + "epoch": 1.2839116719242902, + "grad_norm": 0.5375516631231971, + "learning_rate": 1.5348789370958687e-05, + "loss": 0.3419, + "step": 6513 + }, + { + "epoch": 1.284108832807571, + "grad_norm": 0.5618743671777963, + "learning_rate": 1.534747964526095e-05, + "loss": 0.3321, + "step": 6514 + }, + { + "epoch": 1.2843059936908516, + "grad_norm": 0.5648952032025825, + "learning_rate": 1.5346169791085707e-05, + "loss": 0.3586, + "step": 6515 + }, + { + "epoch": 1.2845031545741326, + "grad_norm": 0.4704499047660257, + "learning_rate": 1.534485980846443e-05, + "loss": 0.2988, + "step": 6516 + }, + { + "epoch": 1.2847003154574133, + "grad_norm": 0.5115491062943802, + "learning_rate": 1.5343549697428596e-05, + "loss": 0.3401, + "step": 6517 + }, + { + "epoch": 1.284897476340694, + "grad_norm": 0.5183202372840393, + "learning_rate": 1.5342239458009675e-05, + "loss": 0.3253, + "step": 6518 + }, + { + "epoch": 1.2850946372239749, + "grad_norm": 0.48683435924195767, + "learning_rate": 1.5340929090239146e-05, + "loss": 0.3006, + "step": 6519 + }, + { + "epoch": 1.2852917981072556, + "grad_norm": 0.5138313161943242, + "learning_rate": 1.5339618594148497e-05, + "loss": 0.3063, + "step": 6520 + }, + { + "epoch": 1.2854889589905363, + "grad_norm": 0.5017471245997583, + "learning_rate": 1.533830796976921e-05, + "loss": 0.3297, + "step": 6521 + }, + { + "epoch": 1.285686119873817, + "grad_norm": 0.5123547738441006, + "learning_rate": 1.5336997217132777e-05, + "loss": 0.3525, + "step": 6522 + }, + { + "epoch": 1.2858832807570977, + "grad_norm": 0.5338495423729164, + "learning_rate": 1.533568633627069e-05, + "loss": 0.3295, + "step": 6523 + }, + { + "epoch": 1.2860804416403786, + "grad_norm": 0.5013934896990097, + "learning_rate": 1.5334375327214437e-05, + "loss": 0.3459, + "step": 6524 + }, + { + "epoch": 1.2862776025236593, + "grad_norm": 0.4948643141041335, + "learning_rate": 1.5333064189995523e-05, + "loss": 0.3553, + "step": 6525 + }, + { + "epoch": 1.28647476340694, + "grad_norm": 0.46557296850936236, + "learning_rate": 1.5331752924645448e-05, + "loss": 0.3048, + "step": 6526 + }, + { + "epoch": 1.286671924290221, + "grad_norm": 0.48584258034002087, + "learning_rate": 1.5330441531195714e-05, + "loss": 0.3127, + "step": 6527 + }, + { + "epoch": 1.2868690851735016, + "grad_norm": 0.5002880962180022, + "learning_rate": 1.532913000967783e-05, + "loss": 0.3415, + "step": 6528 + }, + { + "epoch": 1.2870662460567823, + "grad_norm": 0.5028923628602395, + "learning_rate": 1.5327818360123307e-05, + "loss": 0.3148, + "step": 6529 + }, + { + "epoch": 1.287263406940063, + "grad_norm": 0.5195591290608911, + "learning_rate": 1.532650658256366e-05, + "loss": 0.3567, + "step": 6530 + }, + { + "epoch": 1.2874605678233437, + "grad_norm": 0.5004401636204268, + "learning_rate": 1.5325194677030396e-05, + "loss": 0.3399, + "step": 6531 + }, + { + "epoch": 1.2876577287066246, + "grad_norm": 0.4867335475638141, + "learning_rate": 1.5323882643555045e-05, + "loss": 0.3228, + "step": 6532 + }, + { + "epoch": 1.2878548895899053, + "grad_norm": 0.501274068173939, + "learning_rate": 1.5322570482169127e-05, + "loss": 0.3034, + "step": 6533 + }, + { + "epoch": 1.288052050473186, + "grad_norm": 0.502185959674175, + "learning_rate": 1.5321258192904165e-05, + "loss": 0.3134, + "step": 6534 + }, + { + "epoch": 1.288249211356467, + "grad_norm": 0.51496969492382, + "learning_rate": 1.531994577579169e-05, + "loss": 0.3469, + "step": 6535 + }, + { + "epoch": 1.2884463722397477, + "grad_norm": 0.4872788476858814, + "learning_rate": 1.5318633230863237e-05, + "loss": 0.3189, + "step": 6536 + }, + { + "epoch": 1.2886435331230284, + "grad_norm": 0.5229951752622122, + "learning_rate": 1.5317320558150336e-05, + "loss": 0.3392, + "step": 6537 + }, + { + "epoch": 1.288840694006309, + "grad_norm": 0.4772007251776008, + "learning_rate": 1.5316007757684523e-05, + "loss": 0.3089, + "step": 6538 + }, + { + "epoch": 1.2890378548895898, + "grad_norm": 0.5003182201377973, + "learning_rate": 1.5314694829497344e-05, + "loss": 0.3164, + "step": 6539 + }, + { + "epoch": 1.2892350157728707, + "grad_norm": 0.5065544170909857, + "learning_rate": 1.5313381773620344e-05, + "loss": 0.3202, + "step": 6540 + }, + { + "epoch": 1.2894321766561514, + "grad_norm": 0.5077740483878833, + "learning_rate": 1.5312068590085067e-05, + "loss": 0.343, + "step": 6541 + }, + { + "epoch": 1.289629337539432, + "grad_norm": 0.5577531098567273, + "learning_rate": 1.5310755278923067e-05, + "loss": 0.3386, + "step": 6542 + }, + { + "epoch": 1.289826498422713, + "grad_norm": 10.547858415373208, + "learning_rate": 1.530944184016589e-05, + "loss": 0.3561, + "step": 6543 + }, + { + "epoch": 1.2900236593059937, + "grad_norm": 0.5345161964355895, + "learning_rate": 1.53081282738451e-05, + "loss": 0.3251, + "step": 6544 + }, + { + "epoch": 1.2902208201892744, + "grad_norm": 0.5104260494177608, + "learning_rate": 1.5306814579992254e-05, + "loss": 0.3355, + "step": 6545 + }, + { + "epoch": 1.2904179810725553, + "grad_norm": 0.46675498748982025, + "learning_rate": 1.530550075863891e-05, + "loss": 0.3111, + "step": 6546 + }, + { + "epoch": 1.290615141955836, + "grad_norm": 0.5293919854328544, + "learning_rate": 1.5304186809816644e-05, + "loss": 0.3247, + "step": 6547 + }, + { + "epoch": 1.2908123028391167, + "grad_norm": 0.4883341570443513, + "learning_rate": 1.5302872733557013e-05, + "loss": 0.316, + "step": 6548 + }, + { + "epoch": 1.2910094637223974, + "grad_norm": 0.5129224662462487, + "learning_rate": 1.53015585298916e-05, + "loss": 0.3536, + "step": 6549 + }, + { + "epoch": 1.2912066246056781, + "grad_norm": 0.4954761184679624, + "learning_rate": 1.5300244198851965e-05, + "loss": 0.3269, + "step": 6550 + }, + { + "epoch": 1.291403785488959, + "grad_norm": 0.5132713779944943, + "learning_rate": 1.5298929740469707e-05, + "loss": 0.34, + "step": 6551 + }, + { + "epoch": 1.2916009463722398, + "grad_norm": 0.5025532639181867, + "learning_rate": 1.5297615154776384e-05, + "loss": 0.3253, + "step": 6552 + }, + { + "epoch": 1.2917981072555205, + "grad_norm": 0.5058178309161923, + "learning_rate": 1.5296300441803594e-05, + "loss": 0.3464, + "step": 6553 + }, + { + "epoch": 1.2919952681388014, + "grad_norm": 0.5248801782387191, + "learning_rate": 1.5294985601582922e-05, + "loss": 0.3535, + "step": 6554 + }, + { + "epoch": 1.292192429022082, + "grad_norm": 0.4613729865477732, + "learning_rate": 1.5293670634145955e-05, + "loss": 0.2844, + "step": 6555 + }, + { + "epoch": 1.2923895899053628, + "grad_norm": 0.548846735245273, + "learning_rate": 1.529235553952429e-05, + "loss": 0.3593, + "step": 6556 + }, + { + "epoch": 1.2925867507886435, + "grad_norm": 0.48961397369557463, + "learning_rate": 1.5291040317749522e-05, + "loss": 0.3248, + "step": 6557 + }, + { + "epoch": 1.2927839116719242, + "grad_norm": 0.5242798395592415, + "learning_rate": 1.528972496885325e-05, + "loss": 0.3569, + "step": 6558 + }, + { + "epoch": 1.2929810725552051, + "grad_norm": 0.5183526269970178, + "learning_rate": 1.5288409492867075e-05, + "loss": 0.3333, + "step": 6559 + }, + { + "epoch": 1.2931782334384858, + "grad_norm": 0.6706130182709, + "learning_rate": 1.52870938898226e-05, + "loss": 0.3381, + "step": 6560 + }, + { + "epoch": 1.2933753943217665, + "grad_norm": 0.5081309223342054, + "learning_rate": 1.528577815975144e-05, + "loss": 0.3075, + "step": 6561 + }, + { + "epoch": 1.2935725552050474, + "grad_norm": 0.5513296961101606, + "learning_rate": 1.5284462302685203e-05, + "loss": 0.3199, + "step": 6562 + }, + { + "epoch": 1.2937697160883281, + "grad_norm": 0.49894267015353083, + "learning_rate": 1.52831463186555e-05, + "loss": 0.3275, + "step": 6563 + }, + { + "epoch": 1.2939668769716088, + "grad_norm": 0.5327112678308222, + "learning_rate": 1.5281830207693955e-05, + "loss": 0.3446, + "step": 6564 + }, + { + "epoch": 1.2941640378548895, + "grad_norm": 0.5119589336578062, + "learning_rate": 1.5280513969832185e-05, + "loss": 0.3238, + "step": 6565 + }, + { + "epoch": 1.2943611987381702, + "grad_norm": 0.5181686124023328, + "learning_rate": 1.5279197605101814e-05, + "loss": 0.327, + "step": 6566 + }, + { + "epoch": 1.2945583596214512, + "grad_norm": 0.5017388093095246, + "learning_rate": 1.527788111353447e-05, + "loss": 0.3306, + "step": 6567 + }, + { + "epoch": 1.2947555205047319, + "grad_norm": 0.5001792806641484, + "learning_rate": 1.5276564495161787e-05, + "loss": 0.3197, + "step": 6568 + }, + { + "epoch": 1.2949526813880126, + "grad_norm": 1.4540496805328393, + "learning_rate": 1.5275247750015383e-05, + "loss": 0.3723, + "step": 6569 + }, + { + "epoch": 1.2951498422712935, + "grad_norm": 1.8774355679525034, + "learning_rate": 1.5273930878126912e-05, + "loss": 0.3558, + "step": 6570 + }, + { + "epoch": 1.2953470031545742, + "grad_norm": 0.6034476548951655, + "learning_rate": 1.5272613879528e-05, + "loss": 0.3327, + "step": 6571 + }, + { + "epoch": 1.2955441640378549, + "grad_norm": 0.6074067971559209, + "learning_rate": 1.5271296754250296e-05, + "loss": 0.3475, + "step": 6572 + }, + { + "epoch": 1.2957413249211356, + "grad_norm": 2.176593999697865, + "learning_rate": 1.526997950232544e-05, + "loss": 0.388, + "step": 6573 + }, + { + "epoch": 1.2959384858044163, + "grad_norm": 0.6536726906942081, + "learning_rate": 1.5268662123785084e-05, + "loss": 0.3288, + "step": 6574 + }, + { + "epoch": 1.2961356466876972, + "grad_norm": 0.5254118788592566, + "learning_rate": 1.5267344618660876e-05, + "loss": 0.3427, + "step": 6575 + }, + { + "epoch": 1.296332807570978, + "grad_norm": 0.504643420107229, + "learning_rate": 1.526602698698447e-05, + "loss": 0.3277, + "step": 6576 + }, + { + "epoch": 1.2965299684542586, + "grad_norm": 0.5190602422795867, + "learning_rate": 1.5264709228787534e-05, + "loss": 0.3404, + "step": 6577 + }, + { + "epoch": 1.2967271293375395, + "grad_norm": 0.525866200522297, + "learning_rate": 1.5263391344101713e-05, + "loss": 0.351, + "step": 6578 + }, + { + "epoch": 1.2969242902208202, + "grad_norm": 0.5358394825870619, + "learning_rate": 1.5262073332958677e-05, + "loss": 0.3478, + "step": 6579 + }, + { + "epoch": 1.297121451104101, + "grad_norm": 0.5301335549732624, + "learning_rate": 1.526075519539009e-05, + "loss": 0.3253, + "step": 6580 + }, + { + "epoch": 1.2973186119873816, + "grad_norm": 0.5239831490523945, + "learning_rate": 1.5259436931427624e-05, + "loss": 0.3308, + "step": 6581 + }, + { + "epoch": 1.2975157728706623, + "grad_norm": 0.5013826442278009, + "learning_rate": 1.525811854110295e-05, + "loss": 0.3492, + "step": 6582 + }, + { + "epoch": 1.2977129337539433, + "grad_norm": 0.5146221951802675, + "learning_rate": 1.5256800024447744e-05, + "loss": 0.3288, + "step": 6583 + }, + { + "epoch": 1.297910094637224, + "grad_norm": 0.5480449934066463, + "learning_rate": 1.5255481381493686e-05, + "loss": 0.3557, + "step": 6584 + }, + { + "epoch": 1.2981072555205047, + "grad_norm": 0.5334935940934137, + "learning_rate": 1.5254162612272451e-05, + "loss": 0.3467, + "step": 6585 + }, + { + "epoch": 1.2983044164037856, + "grad_norm": 0.5358329007175838, + "learning_rate": 1.5252843716815733e-05, + "loss": 0.3814, + "step": 6586 + }, + { + "epoch": 1.2985015772870663, + "grad_norm": 0.49941905926425306, + "learning_rate": 1.5251524695155214e-05, + "loss": 0.3412, + "step": 6587 + }, + { + "epoch": 1.298698738170347, + "grad_norm": 0.5883299960109943, + "learning_rate": 1.525020554732258e-05, + "loss": 0.3469, + "step": 6588 + }, + { + "epoch": 1.2988958990536277, + "grad_norm": 0.4752126077673626, + "learning_rate": 1.5248886273349537e-05, + "loss": 0.3147, + "step": 6589 + }, + { + "epoch": 1.2990930599369084, + "grad_norm": 0.5009771354117454, + "learning_rate": 1.524756687326777e-05, + "loss": 0.3461, + "step": 6590 + }, + { + "epoch": 1.2992902208201893, + "grad_norm": 0.5678355964259746, + "learning_rate": 1.5246247347108984e-05, + "loss": 0.3774, + "step": 6591 + }, + { + "epoch": 1.29948738170347, + "grad_norm": 0.4622894199811546, + "learning_rate": 1.524492769490488e-05, + "loss": 0.3359, + "step": 6592 + }, + { + "epoch": 1.2996845425867507, + "grad_norm": 0.5169518693086831, + "learning_rate": 1.5243607916687167e-05, + "loss": 0.3513, + "step": 6593 + }, + { + "epoch": 1.2998817034700316, + "grad_norm": 0.49140954893508976, + "learning_rate": 1.524228801248755e-05, + "loss": 0.3222, + "step": 6594 + }, + { + "epoch": 1.3000788643533123, + "grad_norm": 0.5198162659011691, + "learning_rate": 1.5240967982337738e-05, + "loss": 0.3309, + "step": 6595 + }, + { + "epoch": 1.300276025236593, + "grad_norm": 0.5186960022124844, + "learning_rate": 1.5239647826269455e-05, + "loss": 0.3305, + "step": 6596 + }, + { + "epoch": 1.300473186119874, + "grad_norm": 0.5377070970944036, + "learning_rate": 1.5238327544314409e-05, + "loss": 0.3584, + "step": 6597 + }, + { + "epoch": 1.3006703470031546, + "grad_norm": 0.5046133132932712, + "learning_rate": 1.5237007136504329e-05, + "loss": 0.3403, + "step": 6598 + }, + { + "epoch": 1.3008675078864353, + "grad_norm": 0.5454800240840171, + "learning_rate": 1.5235686602870932e-05, + "loss": 0.377, + "step": 6599 + }, + { + "epoch": 1.301064668769716, + "grad_norm": 0.5025344180773049, + "learning_rate": 1.5234365943445953e-05, + "loss": 0.3432, + "step": 6600 + }, + { + "epoch": 1.3012618296529967, + "grad_norm": 0.5276225149052198, + "learning_rate": 1.523304515826111e-05, + "loss": 0.3527, + "step": 6601 + }, + { + "epoch": 1.3014589905362777, + "grad_norm": 0.47724462165996756, + "learning_rate": 1.5231724247348148e-05, + "loss": 0.3126, + "step": 6602 + }, + { + "epoch": 1.3016561514195584, + "grad_norm": 0.5567156598290902, + "learning_rate": 1.5230403210738796e-05, + "loss": 0.3633, + "step": 6603 + }, + { + "epoch": 1.301853312302839, + "grad_norm": 0.48550253059271353, + "learning_rate": 1.5229082048464796e-05, + "loss": 0.3226, + "step": 6604 + }, + { + "epoch": 1.30205047318612, + "grad_norm": 0.5343562201937075, + "learning_rate": 1.5227760760557887e-05, + "loss": 0.3421, + "step": 6605 + }, + { + "epoch": 1.3022476340694007, + "grad_norm": 0.5152095426808443, + "learning_rate": 1.522643934704982e-05, + "loss": 0.3454, + "step": 6606 + }, + { + "epoch": 1.3024447949526814, + "grad_norm": 0.5411901174169776, + "learning_rate": 1.5225117807972334e-05, + "loss": 0.3494, + "step": 6607 + }, + { + "epoch": 1.302641955835962, + "grad_norm": 0.5079018703629262, + "learning_rate": 1.5223796143357188e-05, + "loss": 0.3364, + "step": 6608 + }, + { + "epoch": 1.3028391167192428, + "grad_norm": 0.5422309368847433, + "learning_rate": 1.522247435323613e-05, + "loss": 0.332, + "step": 6609 + }, + { + "epoch": 1.3030362776025237, + "grad_norm": 1.8926296586911842, + "learning_rate": 1.5221152437640922e-05, + "loss": 0.321, + "step": 6610 + }, + { + "epoch": 1.3032334384858044, + "grad_norm": 0.5764351786305123, + "learning_rate": 1.5219830396603321e-05, + "loss": 0.3629, + "step": 6611 + }, + { + "epoch": 1.3034305993690851, + "grad_norm": 0.5013264006168145, + "learning_rate": 1.5218508230155093e-05, + "loss": 0.3306, + "step": 6612 + }, + { + "epoch": 1.303627760252366, + "grad_norm": 0.5367421957046887, + "learning_rate": 1.5217185938328003e-05, + "loss": 0.3283, + "step": 6613 + }, + { + "epoch": 1.3038249211356467, + "grad_norm": 0.6051753693199979, + "learning_rate": 1.5215863521153817e-05, + "loss": 0.3365, + "step": 6614 + }, + { + "epoch": 1.3040220820189274, + "grad_norm": 0.5121455196545641, + "learning_rate": 1.521454097866431e-05, + "loss": 0.3465, + "step": 6615 + }, + { + "epoch": 1.3042192429022081, + "grad_norm": 0.5204250557148496, + "learning_rate": 1.5213218310891256e-05, + "loss": 0.343, + "step": 6616 + }, + { + "epoch": 1.3044164037854888, + "grad_norm": 0.5312764234356772, + "learning_rate": 1.5211895517866437e-05, + "loss": 0.3386, + "step": 6617 + }, + { + "epoch": 1.3046135646687698, + "grad_norm": 0.5071761949128407, + "learning_rate": 1.5210572599621626e-05, + "loss": 0.3328, + "step": 6618 + }, + { + "epoch": 1.3048107255520505, + "grad_norm": 0.5433017323113483, + "learning_rate": 1.5209249556188619e-05, + "loss": 0.3632, + "step": 6619 + }, + { + "epoch": 1.3050078864353312, + "grad_norm": 0.568002908855061, + "learning_rate": 1.520792638759919e-05, + "loss": 0.3341, + "step": 6620 + }, + { + "epoch": 1.305205047318612, + "grad_norm": 0.4905457370725845, + "learning_rate": 1.520660309388514e-05, + "loss": 0.3361, + "step": 6621 + }, + { + "epoch": 1.3054022082018928, + "grad_norm": 0.5619377123694016, + "learning_rate": 1.5205279675078255e-05, + "loss": 0.3574, + "step": 6622 + }, + { + "epoch": 1.3055993690851735, + "grad_norm": 0.5451812237029258, + "learning_rate": 1.5203956131210333e-05, + "loss": 0.3632, + "step": 6623 + }, + { + "epoch": 1.3057965299684542, + "grad_norm": 0.541134699754437, + "learning_rate": 1.5202632462313178e-05, + "loss": 0.35, + "step": 6624 + }, + { + "epoch": 1.305993690851735, + "grad_norm": 0.5465872753368111, + "learning_rate": 1.5201308668418588e-05, + "loss": 0.3686, + "step": 6625 + }, + { + "epoch": 1.3061908517350158, + "grad_norm": 0.48991274685024205, + "learning_rate": 1.5199984749558367e-05, + "loss": 0.3211, + "step": 6626 + }, + { + "epoch": 1.3063880126182965, + "grad_norm": 0.4938660731028014, + "learning_rate": 1.5198660705764326e-05, + "loss": 0.3393, + "step": 6627 + }, + { + "epoch": 1.3065851735015772, + "grad_norm": 0.5656182414746819, + "learning_rate": 1.5197336537068275e-05, + "loss": 0.3565, + "step": 6628 + }, + { + "epoch": 1.3067823343848581, + "grad_norm": 0.48654324405735194, + "learning_rate": 1.5196012243502027e-05, + "loss": 0.3061, + "step": 6629 + }, + { + "epoch": 1.3069794952681388, + "grad_norm": 0.5696072738061521, + "learning_rate": 1.5194687825097401e-05, + "loss": 0.3624, + "step": 6630 + }, + { + "epoch": 1.3071766561514195, + "grad_norm": 0.4992301751223368, + "learning_rate": 1.5193363281886217e-05, + "loss": 0.3332, + "step": 6631 + }, + { + "epoch": 1.3073738170347002, + "grad_norm": 0.47480549774075814, + "learning_rate": 1.5192038613900297e-05, + "loss": 0.33, + "step": 6632 + }, + { + "epoch": 1.307570977917981, + "grad_norm": 0.621679280561631, + "learning_rate": 1.519071382117147e-05, + "loss": 0.3277, + "step": 6633 + }, + { + "epoch": 1.3077681388012619, + "grad_norm": 0.5098237493871496, + "learning_rate": 1.5189388903731562e-05, + "loss": 0.347, + "step": 6634 + }, + { + "epoch": 1.3079652996845426, + "grad_norm": 0.4863659684564988, + "learning_rate": 1.5188063861612405e-05, + "loss": 0.3305, + "step": 6635 + }, + { + "epoch": 1.3081624605678233, + "grad_norm": 0.5124706812541354, + "learning_rate": 1.518673869484584e-05, + "loss": 0.3275, + "step": 6636 + }, + { + "epoch": 1.3083596214511042, + "grad_norm": 0.5190003537835063, + "learning_rate": 1.5185413403463698e-05, + "loss": 0.3355, + "step": 6637 + }, + { + "epoch": 1.3085567823343849, + "grad_norm": 1.1096517702373996, + "learning_rate": 1.5184087987497824e-05, + "loss": 0.3525, + "step": 6638 + }, + { + "epoch": 1.3087539432176656, + "grad_norm": 0.531753230713042, + "learning_rate": 1.5182762446980061e-05, + "loss": 0.3487, + "step": 6639 + }, + { + "epoch": 1.3089511041009465, + "grad_norm": 0.5226931181499541, + "learning_rate": 1.5181436781942258e-05, + "loss": 0.336, + "step": 6640 + }, + { + "epoch": 1.3091482649842272, + "grad_norm": 0.49468338224537034, + "learning_rate": 1.5180110992416262e-05, + "loss": 0.3089, + "step": 6641 + }, + { + "epoch": 1.309345425867508, + "grad_norm": 0.47660945364337853, + "learning_rate": 1.5178785078433928e-05, + "loss": 0.3207, + "step": 6642 + }, + { + "epoch": 1.3095425867507886, + "grad_norm": 0.5172184894772541, + "learning_rate": 1.5177459040027114e-05, + "loss": 0.3366, + "step": 6643 + }, + { + "epoch": 1.3097397476340693, + "grad_norm": 0.570608404247494, + "learning_rate": 1.5176132877227674e-05, + "loss": 0.3571, + "step": 6644 + }, + { + "epoch": 1.3099369085173502, + "grad_norm": 0.5231899305828399, + "learning_rate": 1.5174806590067475e-05, + "loss": 0.3496, + "step": 6645 + }, + { + "epoch": 1.310134069400631, + "grad_norm": 0.499514906187556, + "learning_rate": 1.517348017857838e-05, + "loss": 0.3125, + "step": 6646 + }, + { + "epoch": 1.3103312302839116, + "grad_norm": 0.49103990204237496, + "learning_rate": 1.517215364279226e-05, + "loss": 0.3127, + "step": 6647 + }, + { + "epoch": 1.3105283911671926, + "grad_norm": 0.47212331175692046, + "learning_rate": 1.517082698274098e-05, + "loss": 0.3166, + "step": 6648 + }, + { + "epoch": 1.3107255520504733, + "grad_norm": 0.5661729386565737, + "learning_rate": 1.5169500198456417e-05, + "loss": 0.376, + "step": 6649 + }, + { + "epoch": 1.310922712933754, + "grad_norm": 0.5266195177462345, + "learning_rate": 1.5168173289970453e-05, + "loss": 0.3619, + "step": 6650 + }, + { + "epoch": 1.3111198738170347, + "grad_norm": 0.4968849869857472, + "learning_rate": 1.5166846257314961e-05, + "loss": 0.3527, + "step": 6651 + }, + { + "epoch": 1.3113170347003154, + "grad_norm": 0.5036423318919169, + "learning_rate": 1.5165519100521828e-05, + "loss": 0.339, + "step": 6652 + }, + { + "epoch": 1.3115141955835963, + "grad_norm": 0.8033202698895086, + "learning_rate": 1.5164191819622937e-05, + "loss": 0.3572, + "step": 6653 + }, + { + "epoch": 1.311711356466877, + "grad_norm": 0.5025110332228436, + "learning_rate": 1.516286441465018e-05, + "loss": 0.335, + "step": 6654 + }, + { + "epoch": 1.3119085173501577, + "grad_norm": 0.5067929824788047, + "learning_rate": 1.5161536885635451e-05, + "loss": 0.3255, + "step": 6655 + }, + { + "epoch": 1.3121056782334386, + "grad_norm": 0.6200278707751725, + "learning_rate": 1.5160209232610637e-05, + "loss": 0.3556, + "step": 6656 + }, + { + "epoch": 1.3123028391167193, + "grad_norm": 0.5316032164983252, + "learning_rate": 1.5158881455607643e-05, + "loss": 0.3387, + "step": 6657 + }, + { + "epoch": 1.3125, + "grad_norm": 0.5213969466823882, + "learning_rate": 1.5157553554658367e-05, + "loss": 0.3442, + "step": 6658 + }, + { + "epoch": 1.3126971608832807, + "grad_norm": 0.5401001295546158, + "learning_rate": 1.5156225529794713e-05, + "loss": 0.3456, + "step": 6659 + }, + { + "epoch": 1.3128943217665614, + "grad_norm": 0.5668921110797275, + "learning_rate": 1.5154897381048588e-05, + "loss": 0.3473, + "step": 6660 + }, + { + "epoch": 1.3130914826498423, + "grad_norm": 0.5275183994588266, + "learning_rate": 1.5153569108451905e-05, + "loss": 0.3245, + "step": 6661 + }, + { + "epoch": 1.313288643533123, + "grad_norm": 0.5126597475316212, + "learning_rate": 1.5152240712036573e-05, + "loss": 0.3375, + "step": 6662 + }, + { + "epoch": 1.3134858044164037, + "grad_norm": 0.49094805836830735, + "learning_rate": 1.5150912191834504e-05, + "loss": 0.3237, + "step": 6663 + }, + { + "epoch": 1.3136829652996846, + "grad_norm": 0.5276645393537767, + "learning_rate": 1.5149583547877629e-05, + "loss": 0.3586, + "step": 6664 + }, + { + "epoch": 1.3138801261829653, + "grad_norm": 0.5293512949651924, + "learning_rate": 1.5148254780197856e-05, + "loss": 0.3064, + "step": 6665 + }, + { + "epoch": 1.314077287066246, + "grad_norm": 0.5109488720261187, + "learning_rate": 1.514692588882712e-05, + "loss": 0.3164, + "step": 6666 + }, + { + "epoch": 1.3142744479495267, + "grad_norm": 0.49826567908107544, + "learning_rate": 1.5145596873797342e-05, + "loss": 0.3158, + "step": 6667 + }, + { + "epoch": 1.3144716088328074, + "grad_norm": 0.5250546099243462, + "learning_rate": 1.5144267735140459e-05, + "loss": 0.3415, + "step": 6668 + }, + { + "epoch": 1.3146687697160884, + "grad_norm": 0.4974244732092123, + "learning_rate": 1.5142938472888395e-05, + "loss": 0.3153, + "step": 6669 + }, + { + "epoch": 1.314865930599369, + "grad_norm": 0.5030333271298699, + "learning_rate": 1.5141609087073099e-05, + "loss": 0.3212, + "step": 6670 + }, + { + "epoch": 1.3150630914826498, + "grad_norm": 0.49516039944032675, + "learning_rate": 1.51402795777265e-05, + "loss": 0.3373, + "step": 6671 + }, + { + "epoch": 1.3152602523659307, + "grad_norm": 0.5251022630682348, + "learning_rate": 1.5138949944880547e-05, + "loss": 0.3594, + "step": 6672 + }, + { + "epoch": 1.3154574132492114, + "grad_norm": 0.5080937428606618, + "learning_rate": 1.5137620188567183e-05, + "loss": 0.2982, + "step": 6673 + }, + { + "epoch": 1.315654574132492, + "grad_norm": 0.5762031753271423, + "learning_rate": 1.5136290308818355e-05, + "loss": 0.365, + "step": 6674 + }, + { + "epoch": 1.3158517350157728, + "grad_norm": 0.492333953850378, + "learning_rate": 1.5134960305666017e-05, + "loss": 0.3197, + "step": 6675 + }, + { + "epoch": 1.3160488958990535, + "grad_norm": 0.5353009146536675, + "learning_rate": 1.5133630179142124e-05, + "loss": 0.3206, + "step": 6676 + }, + { + "epoch": 1.3162460567823344, + "grad_norm": 0.5193275884525621, + "learning_rate": 1.5132299929278631e-05, + "loss": 0.3569, + "step": 6677 + }, + { + "epoch": 1.3164432176656151, + "grad_norm": 0.5170594306368025, + "learning_rate": 1.5130969556107498e-05, + "loss": 0.3609, + "step": 6678 + }, + { + "epoch": 1.3166403785488958, + "grad_norm": 0.5376643454719803, + "learning_rate": 1.512963905966069e-05, + "loss": 0.332, + "step": 6679 + }, + { + "epoch": 1.3168375394321767, + "grad_norm": 0.5133927845944496, + "learning_rate": 1.5128308439970174e-05, + "loss": 0.3372, + "step": 6680 + }, + { + "epoch": 1.3170347003154574, + "grad_norm": 0.5399002043459604, + "learning_rate": 1.5126977697067915e-05, + "loss": 0.363, + "step": 6681 + }, + { + "epoch": 1.3172318611987381, + "grad_norm": 0.49610768530525307, + "learning_rate": 1.5125646830985892e-05, + "loss": 0.3136, + "step": 6682 + }, + { + "epoch": 1.317429022082019, + "grad_norm": 0.5473955010074304, + "learning_rate": 1.5124315841756072e-05, + "loss": 0.355, + "step": 6683 + }, + { + "epoch": 1.3176261829652998, + "grad_norm": 0.578461631477163, + "learning_rate": 1.5122984729410437e-05, + "loss": 0.3719, + "step": 6684 + }, + { + "epoch": 1.3178233438485805, + "grad_norm": 0.5463357652437992, + "learning_rate": 1.5121653493980973e-05, + "loss": 0.3432, + "step": 6685 + }, + { + "epoch": 1.3180205047318612, + "grad_norm": 0.4772451928962519, + "learning_rate": 1.5120322135499654e-05, + "loss": 0.3071, + "step": 6686 + }, + { + "epoch": 1.3182176656151419, + "grad_norm": 0.5767867603367968, + "learning_rate": 1.511899065399848e-05, + "loss": 0.3512, + "step": 6687 + }, + { + "epoch": 1.3184148264984228, + "grad_norm": 0.5041565865933484, + "learning_rate": 1.5117659049509425e-05, + "loss": 0.3437, + "step": 6688 + }, + { + "epoch": 1.3186119873817035, + "grad_norm": 0.5042883160416222, + "learning_rate": 1.5116327322064497e-05, + "loss": 0.3271, + "step": 6689 + }, + { + "epoch": 1.3188091482649842, + "grad_norm": 0.4844327881363672, + "learning_rate": 1.5114995471695679e-05, + "loss": 0.2951, + "step": 6690 + }, + { + "epoch": 1.319006309148265, + "grad_norm": 0.5160775384323053, + "learning_rate": 1.5113663498434979e-05, + "loss": 0.3308, + "step": 6691 + }, + { + "epoch": 1.3192034700315458, + "grad_norm": 0.5182153228216186, + "learning_rate": 1.5112331402314393e-05, + "loss": 0.3311, + "step": 6692 + }, + { + "epoch": 1.3194006309148265, + "grad_norm": 0.5509813748947842, + "learning_rate": 1.5110999183365933e-05, + "loss": 0.3603, + "step": 6693 + }, + { + "epoch": 1.3195977917981072, + "grad_norm": 0.4984381584603614, + "learning_rate": 1.5109666841621597e-05, + "loss": 0.3316, + "step": 6694 + }, + { + "epoch": 1.319794952681388, + "grad_norm": 0.5204935976874747, + "learning_rate": 1.51083343771134e-05, + "loss": 0.3161, + "step": 6695 + }, + { + "epoch": 1.3199921135646688, + "grad_norm": 0.536865132939174, + "learning_rate": 1.510700178987336e-05, + "loss": 0.3332, + "step": 6696 + }, + { + "epoch": 1.3201892744479495, + "grad_norm": 0.520088029730267, + "learning_rate": 1.5105669079933486e-05, + "loss": 0.336, + "step": 6697 + }, + { + "epoch": 1.3203864353312302, + "grad_norm": 0.898006104693384, + "learning_rate": 1.5104336247325803e-05, + "loss": 0.3888, + "step": 6698 + }, + { + "epoch": 1.3205835962145112, + "grad_norm": 0.49085502589272256, + "learning_rate": 1.510300329208233e-05, + "loss": 0.3267, + "step": 6699 + }, + { + "epoch": 1.3207807570977919, + "grad_norm": 0.5621876865017514, + "learning_rate": 1.5101670214235094e-05, + "loss": 0.3831, + "step": 6700 + }, + { + "epoch": 1.3209779179810726, + "grad_norm": 0.5512732326331895, + "learning_rate": 1.5100337013816122e-05, + "loss": 0.3478, + "step": 6701 + }, + { + "epoch": 1.3211750788643533, + "grad_norm": 0.7789605507183492, + "learning_rate": 1.5099003690857448e-05, + "loss": 0.3209, + "step": 6702 + }, + { + "epoch": 1.321372239747634, + "grad_norm": 0.5419622274274699, + "learning_rate": 1.50976702453911e-05, + "loss": 0.3443, + "step": 6703 + }, + { + "epoch": 1.3215694006309149, + "grad_norm": 0.5100055997526229, + "learning_rate": 1.5096336677449125e-05, + "loss": 0.3405, + "step": 6704 + }, + { + "epoch": 1.3217665615141956, + "grad_norm": 0.4976601575607139, + "learning_rate": 1.5095002987063549e-05, + "loss": 0.3209, + "step": 6705 + }, + { + "epoch": 1.3219637223974763, + "grad_norm": 0.5566156732462322, + "learning_rate": 1.509366917426643e-05, + "loss": 0.3416, + "step": 6706 + }, + { + "epoch": 1.3221608832807572, + "grad_norm": 0.5069545882513634, + "learning_rate": 1.5092335239089803e-05, + "loss": 0.3394, + "step": 6707 + }, + { + "epoch": 1.322358044164038, + "grad_norm": 0.5033024334874232, + "learning_rate": 1.5091001181565725e-05, + "loss": 0.3424, + "step": 6708 + }, + { + "epoch": 1.3225552050473186, + "grad_norm": 0.49579326452908207, + "learning_rate": 1.5089667001726243e-05, + "loss": 0.3147, + "step": 6709 + }, + { + "epoch": 1.3227523659305993, + "grad_norm": 0.5271145089705676, + "learning_rate": 1.5088332699603412e-05, + "loss": 0.3593, + "step": 6710 + }, + { + "epoch": 1.32294952681388, + "grad_norm": 0.5320580774064494, + "learning_rate": 1.508699827522929e-05, + "loss": 0.3364, + "step": 6711 + }, + { + "epoch": 1.323146687697161, + "grad_norm": 0.5104256370446169, + "learning_rate": 1.5085663728635935e-05, + "loss": 0.333, + "step": 6712 + }, + { + "epoch": 1.3233438485804416, + "grad_norm": 0.49891079466541427, + "learning_rate": 1.5084329059855419e-05, + "loss": 0.3334, + "step": 6713 + }, + { + "epoch": 1.3235410094637223, + "grad_norm": 0.5733949567223368, + "learning_rate": 1.5082994268919798e-05, + "loss": 0.3157, + "step": 6714 + }, + { + "epoch": 1.3237381703470033, + "grad_norm": 0.5435038605202684, + "learning_rate": 1.508165935586115e-05, + "loss": 0.3233, + "step": 6715 + }, + { + "epoch": 1.323935331230284, + "grad_norm": 0.5673725835719423, + "learning_rate": 1.5080324320711542e-05, + "loss": 0.3399, + "step": 6716 + }, + { + "epoch": 1.3241324921135647, + "grad_norm": 0.46434225808628016, + "learning_rate": 1.507898916350305e-05, + "loss": 0.315, + "step": 6717 + }, + { + "epoch": 1.3243296529968454, + "grad_norm": 0.5335227211463376, + "learning_rate": 1.5077653884267753e-05, + "loss": 0.3509, + "step": 6718 + }, + { + "epoch": 1.324526813880126, + "grad_norm": 0.5194316243511954, + "learning_rate": 1.5076318483037736e-05, + "loss": 0.3524, + "step": 6719 + }, + { + "epoch": 1.324723974763407, + "grad_norm": 0.5237285016193873, + "learning_rate": 1.5074982959845077e-05, + "loss": 0.3842, + "step": 6720 + }, + { + "epoch": 1.3249211356466877, + "grad_norm": 0.512608848459545, + "learning_rate": 1.5073647314721867e-05, + "loss": 0.3554, + "step": 6721 + }, + { + "epoch": 1.3251182965299684, + "grad_norm": 22.837779278990155, + "learning_rate": 1.5072311547700194e-05, + "loss": 0.3395, + "step": 6722 + }, + { + "epoch": 1.3253154574132493, + "grad_norm": 0.5584335319441984, + "learning_rate": 1.507097565881215e-05, + "loss": 0.3746, + "step": 6723 + }, + { + "epoch": 1.32551261829653, + "grad_norm": 0.4725105964351633, + "learning_rate": 1.5069639648089833e-05, + "loss": 0.3079, + "step": 6724 + }, + { + "epoch": 1.3257097791798107, + "grad_norm": 0.48649112789508087, + "learning_rate": 1.506830351556534e-05, + "loss": 0.3243, + "step": 6725 + }, + { + "epoch": 1.3259069400630916, + "grad_norm": 0.5047703578036561, + "learning_rate": 1.5066967261270775e-05, + "loss": 0.352, + "step": 6726 + }, + { + "epoch": 1.3261041009463723, + "grad_norm": 0.5240622470462101, + "learning_rate": 1.506563088523824e-05, + "loss": 0.3676, + "step": 6727 + }, + { + "epoch": 1.326301261829653, + "grad_norm": 0.5168279521689652, + "learning_rate": 1.5064294387499844e-05, + "loss": 0.3542, + "step": 6728 + }, + { + "epoch": 1.3264984227129337, + "grad_norm": 0.4816304700981381, + "learning_rate": 1.5062957768087698e-05, + "loss": 0.3053, + "step": 6729 + }, + { + "epoch": 1.3266955835962144, + "grad_norm": 0.606385343673032, + "learning_rate": 1.5061621027033914e-05, + "loss": 0.3315, + "step": 6730 + }, + { + "epoch": 1.3268927444794953, + "grad_norm": 0.49813137918087436, + "learning_rate": 1.5060284164370606e-05, + "loss": 0.3501, + "step": 6731 + }, + { + "epoch": 1.327089905362776, + "grad_norm": 0.5003849761342135, + "learning_rate": 1.5058947180129902e-05, + "loss": 0.3308, + "step": 6732 + }, + { + "epoch": 1.3272870662460567, + "grad_norm": 0.5092842617270525, + "learning_rate": 1.5057610074343911e-05, + "loss": 0.3215, + "step": 6733 + }, + { + "epoch": 1.3274842271293377, + "grad_norm": 0.5426283767618892, + "learning_rate": 1.505627284704477e-05, + "loss": 0.3455, + "step": 6734 + }, + { + "epoch": 1.3276813880126184, + "grad_norm": 0.5093218204427572, + "learning_rate": 1.50549354982646e-05, + "loss": 0.341, + "step": 6735 + }, + { + "epoch": 1.327878548895899, + "grad_norm": 0.47254370160551584, + "learning_rate": 1.5053598028035534e-05, + "loss": 0.3253, + "step": 6736 + }, + { + "epoch": 1.3280757097791798, + "grad_norm": 0.5063195573320013, + "learning_rate": 1.5052260436389708e-05, + "loss": 0.3158, + "step": 6737 + }, + { + "epoch": 1.3282728706624605, + "grad_norm": 0.5253016137313395, + "learning_rate": 1.5050922723359254e-05, + "loss": 0.3387, + "step": 6738 + }, + { + "epoch": 1.3284700315457414, + "grad_norm": 0.4894300336691798, + "learning_rate": 1.5049584888976311e-05, + "loss": 0.3385, + "step": 6739 + }, + { + "epoch": 1.328667192429022, + "grad_norm": 0.5590119104561938, + "learning_rate": 1.504824693327303e-05, + "loss": 0.3397, + "step": 6740 + }, + { + "epoch": 1.3288643533123028, + "grad_norm": 0.5350527773958544, + "learning_rate": 1.504690885628155e-05, + "loss": 0.3326, + "step": 6741 + }, + { + "epoch": 1.3290615141955837, + "grad_norm": 0.5138921218572929, + "learning_rate": 1.5045570658034022e-05, + "loss": 0.3447, + "step": 6742 + }, + { + "epoch": 1.3292586750788644, + "grad_norm": 1.730892234036529, + "learning_rate": 1.504423233856259e-05, + "loss": 0.373, + "step": 6743 + }, + { + "epoch": 1.3294558359621451, + "grad_norm": 0.5491132539355397, + "learning_rate": 1.5042893897899417e-05, + "loss": 0.3453, + "step": 6744 + }, + { + "epoch": 1.3296529968454258, + "grad_norm": 0.539322312311082, + "learning_rate": 1.5041555336076661e-05, + "loss": 0.3468, + "step": 6745 + }, + { + "epoch": 1.3298501577287065, + "grad_norm": 0.5125742645152108, + "learning_rate": 1.5040216653126471e-05, + "loss": 0.3266, + "step": 6746 + }, + { + "epoch": 1.3300473186119874, + "grad_norm": 0.5010768816584499, + "learning_rate": 1.5038877849081023e-05, + "loss": 0.325, + "step": 6747 + }, + { + "epoch": 1.3302444794952681, + "grad_norm": 0.5237464923768348, + "learning_rate": 1.5037538923972474e-05, + "loss": 0.3392, + "step": 6748 + }, + { + "epoch": 1.3304416403785488, + "grad_norm": 0.5259667674407368, + "learning_rate": 1.5036199877832997e-05, + "loss": 0.3405, + "step": 6749 + }, + { + "epoch": 1.3306388012618298, + "grad_norm": 0.5653193705799677, + "learning_rate": 1.503486071069476e-05, + "loss": 0.3687, + "step": 6750 + }, + { + "epoch": 1.3308359621451105, + "grad_norm": 0.5180820594611166, + "learning_rate": 1.5033521422589943e-05, + "loss": 0.3785, + "step": 6751 + }, + { + "epoch": 1.3310331230283912, + "grad_norm": 0.5264455832857787, + "learning_rate": 1.5032182013550719e-05, + "loss": 0.349, + "step": 6752 + }, + { + "epoch": 1.3312302839116719, + "grad_norm": 0.5131681170313048, + "learning_rate": 1.5030842483609268e-05, + "loss": 0.3309, + "step": 6753 + }, + { + "epoch": 1.3314274447949526, + "grad_norm": 0.5006781906744829, + "learning_rate": 1.5029502832797775e-05, + "loss": 0.3106, + "step": 6754 + }, + { + "epoch": 1.3316246056782335, + "grad_norm": 0.4732747904769726, + "learning_rate": 1.5028163061148432e-05, + "loss": 0.3355, + "step": 6755 + }, + { + "epoch": 1.3318217665615142, + "grad_norm": 0.4995838593733347, + "learning_rate": 1.5026823168693414e-05, + "loss": 0.3322, + "step": 6756 + }, + { + "epoch": 1.3320189274447949, + "grad_norm": 0.478174941952485, + "learning_rate": 1.5025483155464926e-05, + "loss": 0.3073, + "step": 6757 + }, + { + "epoch": 1.3322160883280758, + "grad_norm": 0.484122961464807, + "learning_rate": 1.5024143021495157e-05, + "loss": 0.3413, + "step": 6758 + }, + { + "epoch": 1.3324132492113565, + "grad_norm": 0.5951585544576415, + "learning_rate": 1.5022802766816306e-05, + "loss": 0.3635, + "step": 6759 + }, + { + "epoch": 1.3326104100946372, + "grad_norm": 0.5340216464990926, + "learning_rate": 1.5021462391460576e-05, + "loss": 0.3552, + "step": 6760 + }, + { + "epoch": 1.332807570977918, + "grad_norm": 0.5645843694433802, + "learning_rate": 1.5020121895460165e-05, + "loss": 0.3593, + "step": 6761 + }, + { + "epoch": 1.3330047318611986, + "grad_norm": 0.5728974673232404, + "learning_rate": 1.5018781278847286e-05, + "loss": 0.3529, + "step": 6762 + }, + { + "epoch": 1.3332018927444795, + "grad_norm": 0.5256692890594539, + "learning_rate": 1.501744054165414e-05, + "loss": 0.3387, + "step": 6763 + }, + { + "epoch": 1.3333990536277602, + "grad_norm": 0.5674819000439734, + "learning_rate": 1.501609968391295e-05, + "loss": 0.3509, + "step": 6764 + }, + { + "epoch": 1.333596214511041, + "grad_norm": 0.4986446571718649, + "learning_rate": 1.5014758705655922e-05, + "loss": 0.3239, + "step": 6765 + }, + { + "epoch": 1.3337933753943219, + "grad_norm": 0.4896630219969831, + "learning_rate": 1.5013417606915279e-05, + "loss": 0.3204, + "step": 6766 + }, + { + "epoch": 1.3339905362776026, + "grad_norm": 0.5177912836591282, + "learning_rate": 1.501207638772324e-05, + "loss": 0.3386, + "step": 6767 + }, + { + "epoch": 1.3341876971608833, + "grad_norm": 0.48153562676597106, + "learning_rate": 1.5010735048112031e-05, + "loss": 0.3056, + "step": 6768 + }, + { + "epoch": 1.334384858044164, + "grad_norm": 0.499695810551327, + "learning_rate": 1.5009393588113876e-05, + "loss": 0.2962, + "step": 6769 + }, + { + "epoch": 1.3345820189274447, + "grad_norm": 0.48456873034066944, + "learning_rate": 1.5008052007761009e-05, + "loss": 0.3056, + "step": 6770 + }, + { + "epoch": 1.3347791798107256, + "grad_norm": 0.5138152037859967, + "learning_rate": 1.5006710307085656e-05, + "loss": 0.3473, + "step": 6771 + }, + { + "epoch": 1.3349763406940063, + "grad_norm": 0.4989689701123164, + "learning_rate": 1.5005368486120058e-05, + "loss": 0.3527, + "step": 6772 + }, + { + "epoch": 1.335173501577287, + "grad_norm": 0.5240843932016397, + "learning_rate": 1.5004026544896448e-05, + "loss": 0.3289, + "step": 6773 + }, + { + "epoch": 1.335370662460568, + "grad_norm": 0.51061194084439, + "learning_rate": 1.5002684483447074e-05, + "loss": 0.3582, + "step": 6774 + }, + { + "epoch": 1.3355678233438486, + "grad_norm": 0.5005446884388874, + "learning_rate": 1.5001342301804176e-05, + "loss": 0.3458, + "step": 6775 + }, + { + "epoch": 1.3357649842271293, + "grad_norm": 0.5140258358304517, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.3606, + "step": 6776 + }, + { + "epoch": 1.3359621451104102, + "grad_norm": 0.49447960825964604, + "learning_rate": 1.49986575780668e-05, + "loss": 0.3337, + "step": 6777 + }, + { + "epoch": 1.336159305993691, + "grad_norm": 0.4734352743858005, + "learning_rate": 1.4997315036036826e-05, + "loss": 0.3288, + "step": 6778 + }, + { + "epoch": 1.3363564668769716, + "grad_norm": 0.5153787348270493, + "learning_rate": 1.4995972373942334e-05, + "loss": 0.3512, + "step": 6779 + }, + { + "epoch": 1.3365536277602523, + "grad_norm": 0.5614500712850554, + "learning_rate": 1.4994629591815579e-05, + "loss": 0.3581, + "step": 6780 + }, + { + "epoch": 1.336750788643533, + "grad_norm": 0.5086243455682851, + "learning_rate": 1.4993286689688831e-05, + "loss": 0.3368, + "step": 6781 + }, + { + "epoch": 1.336947949526814, + "grad_norm": 0.5109131318818986, + "learning_rate": 1.4991943667594344e-05, + "loss": 0.3473, + "step": 6782 + }, + { + "epoch": 1.3371451104100947, + "grad_norm": 0.48373369562864504, + "learning_rate": 1.4990600525564394e-05, + "loss": 0.3171, + "step": 6783 + }, + { + "epoch": 1.3373422712933754, + "grad_norm": 0.48077824595792773, + "learning_rate": 1.4989257263631246e-05, + "loss": 0.3112, + "step": 6784 + }, + { + "epoch": 1.3375394321766563, + "grad_norm": 0.5403123400327008, + "learning_rate": 1.4987913881827177e-05, + "loss": 0.3509, + "step": 6785 + }, + { + "epoch": 1.337736593059937, + "grad_norm": 0.5283987542462503, + "learning_rate": 1.4986570380184454e-05, + "loss": 0.3369, + "step": 6786 + }, + { + "epoch": 1.3379337539432177, + "grad_norm": 0.5135808090920458, + "learning_rate": 1.4985226758735368e-05, + "loss": 0.3436, + "step": 6787 + }, + { + "epoch": 1.3381309148264984, + "grad_norm": 0.5241808952900016, + "learning_rate": 1.498388301751219e-05, + "loss": 0.329, + "step": 6788 + }, + { + "epoch": 1.338328075709779, + "grad_norm": 0.539221498149948, + "learning_rate": 1.4982539156547214e-05, + "loss": 0.3565, + "step": 6789 + }, + { + "epoch": 1.33852523659306, + "grad_norm": 0.5312950545419894, + "learning_rate": 1.4981195175872718e-05, + "loss": 0.3392, + "step": 6790 + }, + { + "epoch": 1.3387223974763407, + "grad_norm": 0.5526638195715327, + "learning_rate": 1.4979851075521e-05, + "loss": 0.3518, + "step": 6791 + }, + { + "epoch": 1.3389195583596214, + "grad_norm": 0.5416794319715723, + "learning_rate": 1.4978506855524348e-05, + "loss": 0.3632, + "step": 6792 + }, + { + "epoch": 1.3391167192429023, + "grad_norm": 0.5235504870365736, + "learning_rate": 1.497716251591506e-05, + "loss": 0.316, + "step": 6793 + }, + { + "epoch": 1.339313880126183, + "grad_norm": 0.5043048732799276, + "learning_rate": 1.4975818056725433e-05, + "loss": 0.3434, + "step": 6794 + }, + { + "epoch": 1.3395110410094637, + "grad_norm": 0.49798925055448356, + "learning_rate": 1.4974473477987771e-05, + "loss": 0.3184, + "step": 6795 + }, + { + "epoch": 1.3397082018927444, + "grad_norm": 0.4979629803712915, + "learning_rate": 1.4973128779734381e-05, + "loss": 0.321, + "step": 6796 + }, + { + "epoch": 1.3399053627760251, + "grad_norm": 0.5040873484662636, + "learning_rate": 1.4971783961997561e-05, + "loss": 0.3426, + "step": 6797 + }, + { + "epoch": 1.340102523659306, + "grad_norm": 0.4974631337369193, + "learning_rate": 1.4970439024809634e-05, + "loss": 0.3271, + "step": 6798 + }, + { + "epoch": 1.3402996845425867, + "grad_norm": 0.5323248192377603, + "learning_rate": 1.49690939682029e-05, + "loss": 0.3537, + "step": 6799 + }, + { + "epoch": 1.3404968454258674, + "grad_norm": 0.48171766055599236, + "learning_rate": 1.4967748792209689e-05, + "loss": 0.3173, + "step": 6800 + }, + { + "epoch": 1.3406940063091484, + "grad_norm": 0.510686613417193, + "learning_rate": 1.4966403496862304e-05, + "loss": 0.3195, + "step": 6801 + }, + { + "epoch": 1.340891167192429, + "grad_norm": 0.48429649890064824, + "learning_rate": 1.4965058082193084e-05, + "loss": 0.3141, + "step": 6802 + }, + { + "epoch": 1.3410883280757098, + "grad_norm": 0.49594807930521567, + "learning_rate": 1.496371254823434e-05, + "loss": 0.321, + "step": 6803 + }, + { + "epoch": 1.3412854889589905, + "grad_norm": 0.526673612495566, + "learning_rate": 1.496236689501841e-05, + "loss": 0.3415, + "step": 6804 + }, + { + "epoch": 1.3414826498422712, + "grad_norm": 0.45404096769055313, + "learning_rate": 1.4961021122577613e-05, + "loss": 0.2986, + "step": 6805 + }, + { + "epoch": 1.341679810725552, + "grad_norm": 0.5806061745177113, + "learning_rate": 1.495967523094429e-05, + "loss": 0.3928, + "step": 6806 + }, + { + "epoch": 1.3418769716088328, + "grad_norm": 0.470337630001986, + "learning_rate": 1.4958329220150778e-05, + "loss": 0.3148, + "step": 6807 + }, + { + "epoch": 1.3420741324921135, + "grad_norm": 0.47772699425478293, + "learning_rate": 1.4956983090229413e-05, + "loss": 0.3203, + "step": 6808 + }, + { + "epoch": 1.3422712933753944, + "grad_norm": 0.4886302060088961, + "learning_rate": 1.4955636841212538e-05, + "loss": 0.3518, + "step": 6809 + }, + { + "epoch": 1.3424684542586751, + "grad_norm": 0.48102229230186133, + "learning_rate": 1.4954290473132495e-05, + "loss": 0.3198, + "step": 6810 + }, + { + "epoch": 1.3426656151419558, + "grad_norm": 0.48930648987210024, + "learning_rate": 1.4952943986021633e-05, + "loss": 0.3305, + "step": 6811 + }, + { + "epoch": 1.3428627760252365, + "grad_norm": 0.540876771967004, + "learning_rate": 1.4951597379912306e-05, + "loss": 0.3504, + "step": 6812 + }, + { + "epoch": 1.3430599369085172, + "grad_norm": 0.5164837203011601, + "learning_rate": 1.4950250654836862e-05, + "loss": 0.3657, + "step": 6813 + }, + { + "epoch": 1.3432570977917981, + "grad_norm": 0.4476636874195238, + "learning_rate": 1.4948903810827662e-05, + "loss": 0.295, + "step": 6814 + }, + { + "epoch": 1.3434542586750788, + "grad_norm": 0.5015700179009981, + "learning_rate": 1.4947556847917062e-05, + "loss": 0.3495, + "step": 6815 + }, + { + "epoch": 1.3436514195583595, + "grad_norm": 0.4883735796558397, + "learning_rate": 1.4946209766137422e-05, + "loss": 0.3262, + "step": 6816 + }, + { + "epoch": 1.3438485804416405, + "grad_norm": 0.4960996354859811, + "learning_rate": 1.4944862565521113e-05, + "loss": 0.3266, + "step": 6817 + }, + { + "epoch": 1.3440457413249212, + "grad_norm": 0.5111574948370762, + "learning_rate": 1.4943515246100498e-05, + "loss": 0.3534, + "step": 6818 + }, + { + "epoch": 1.3442429022082019, + "grad_norm": 0.4806601835647626, + "learning_rate": 1.4942167807907945e-05, + "loss": 0.3156, + "step": 6819 + }, + { + "epoch": 1.3444400630914828, + "grad_norm": 0.4771804770811557, + "learning_rate": 1.494082025097583e-05, + "loss": 0.3411, + "step": 6820 + }, + { + "epoch": 1.3446372239747635, + "grad_norm": 0.5010047370612712, + "learning_rate": 1.4939472575336535e-05, + "loss": 0.3332, + "step": 6821 + }, + { + "epoch": 1.3448343848580442, + "grad_norm": 0.5128977303636474, + "learning_rate": 1.4938124781022429e-05, + "loss": 0.3502, + "step": 6822 + }, + { + "epoch": 1.3450315457413249, + "grad_norm": 0.5115752389432223, + "learning_rate": 1.4936776868065904e-05, + "loss": 0.3251, + "step": 6823 + }, + { + "epoch": 1.3452287066246056, + "grad_norm": 0.5115575705230232, + "learning_rate": 1.4935428836499333e-05, + "loss": 0.3251, + "step": 6824 + }, + { + "epoch": 1.3454258675078865, + "grad_norm": 0.45496251393031, + "learning_rate": 1.4934080686355112e-05, + "loss": 0.3212, + "step": 6825 + }, + { + "epoch": 1.3456230283911672, + "grad_norm": 0.4683704737332005, + "learning_rate": 1.4932732417665627e-05, + "loss": 0.314, + "step": 6826 + }, + { + "epoch": 1.345820189274448, + "grad_norm": 0.5058243078911916, + "learning_rate": 1.4931384030463276e-05, + "loss": 0.3547, + "step": 6827 + }, + { + "epoch": 1.3460173501577288, + "grad_norm": 0.49135394576091385, + "learning_rate": 1.4930035524780455e-05, + "loss": 0.3351, + "step": 6828 + }, + { + "epoch": 1.3462145110410095, + "grad_norm": 0.49176472785254427, + "learning_rate": 1.4928686900649557e-05, + "loss": 0.3467, + "step": 6829 + }, + { + "epoch": 1.3464116719242902, + "grad_norm": 0.4877950490063829, + "learning_rate": 1.4927338158102988e-05, + "loss": 0.32, + "step": 6830 + }, + { + "epoch": 1.346608832807571, + "grad_norm": 0.5644001631439873, + "learning_rate": 1.4925989297173148e-05, + "loss": 0.3492, + "step": 6831 + }, + { + "epoch": 1.3468059936908516, + "grad_norm": 0.5182639423714361, + "learning_rate": 1.4924640317892457e-05, + "loss": 0.3616, + "step": 6832 + }, + { + "epoch": 1.3470031545741326, + "grad_norm": 0.4911569508385157, + "learning_rate": 1.4923291220293307e-05, + "loss": 0.3187, + "step": 6833 + }, + { + "epoch": 1.3472003154574133, + "grad_norm": 0.4939068383078795, + "learning_rate": 1.4921942004408126e-05, + "loss": 0.3352, + "step": 6834 + }, + { + "epoch": 1.347397476340694, + "grad_norm": 0.48064147010076375, + "learning_rate": 1.4920592670269323e-05, + "loss": 0.3334, + "step": 6835 + }, + { + "epoch": 1.3475946372239749, + "grad_norm": 0.5150465597535828, + "learning_rate": 1.4919243217909318e-05, + "loss": 0.3525, + "step": 6836 + }, + { + "epoch": 1.3477917981072556, + "grad_norm": 0.48198241690638255, + "learning_rate": 1.4917893647360538e-05, + "loss": 0.3215, + "step": 6837 + }, + { + "epoch": 1.3479889589905363, + "grad_norm": 0.49192236847221166, + "learning_rate": 1.4916543958655396e-05, + "loss": 0.3446, + "step": 6838 + }, + { + "epoch": 1.348186119873817, + "grad_norm": 0.5380105418206355, + "learning_rate": 1.491519415182633e-05, + "loss": 0.3455, + "step": 6839 + }, + { + "epoch": 1.3483832807570977, + "grad_norm": 0.5360035265768491, + "learning_rate": 1.4913844226905767e-05, + "loss": 0.3578, + "step": 6840 + }, + { + "epoch": 1.3485804416403786, + "grad_norm": 0.6845647174250372, + "learning_rate": 1.4912494183926139e-05, + "loss": 0.3396, + "step": 6841 + }, + { + "epoch": 1.3487776025236593, + "grad_norm": 0.5269738365614332, + "learning_rate": 1.4911144022919879e-05, + "loss": 0.3559, + "step": 6842 + }, + { + "epoch": 1.34897476340694, + "grad_norm": 0.4813494392754745, + "learning_rate": 1.4909793743919432e-05, + "loss": 0.3228, + "step": 6843 + }, + { + "epoch": 1.349171924290221, + "grad_norm": 0.4816948637640496, + "learning_rate": 1.4908443346957235e-05, + "loss": 0.3154, + "step": 6844 + }, + { + "epoch": 1.3493690851735016, + "grad_norm": 0.45620872496423787, + "learning_rate": 1.4907092832065734e-05, + "loss": 0.3105, + "step": 6845 + }, + { + "epoch": 1.3495662460567823, + "grad_norm": 0.5146288021545241, + "learning_rate": 1.4905742199277376e-05, + "loss": 0.3563, + "step": 6846 + }, + { + "epoch": 1.349763406940063, + "grad_norm": 0.508730043660222, + "learning_rate": 1.4904391448624612e-05, + "loss": 0.3473, + "step": 6847 + }, + { + "epoch": 1.3499605678233437, + "grad_norm": 0.5656947915688717, + "learning_rate": 1.4903040580139891e-05, + "loss": 0.3603, + "step": 6848 + }, + { + "epoch": 1.3501577287066246, + "grad_norm": 0.48859833009143055, + "learning_rate": 1.4901689593855677e-05, + "loss": 0.3243, + "step": 6849 + }, + { + "epoch": 1.3503548895899053, + "grad_norm": 0.5710007684390376, + "learning_rate": 1.4900338489804418e-05, + "loss": 0.3194, + "step": 6850 + }, + { + "epoch": 1.350552050473186, + "grad_norm": 0.5249828286967833, + "learning_rate": 1.4898987268018586e-05, + "loss": 0.3633, + "step": 6851 + }, + { + "epoch": 1.350749211356467, + "grad_norm": 0.5414339406348158, + "learning_rate": 1.4897635928530634e-05, + "loss": 0.3512, + "step": 6852 + }, + { + "epoch": 1.3509463722397477, + "grad_norm": 0.44873869097621316, + "learning_rate": 1.4896284471373038e-05, + "loss": 0.3052, + "step": 6853 + }, + { + "epoch": 1.3511435331230284, + "grad_norm": 0.4672888541955871, + "learning_rate": 1.4894932896578262e-05, + "loss": 0.3198, + "step": 6854 + }, + { + "epoch": 1.351340694006309, + "grad_norm": 0.5477287513630034, + "learning_rate": 1.4893581204178785e-05, + "loss": 0.3337, + "step": 6855 + }, + { + "epoch": 1.3515378548895898, + "grad_norm": 0.47013217105100197, + "learning_rate": 1.4892229394207076e-05, + "loss": 0.3484, + "step": 6856 + }, + { + "epoch": 1.3517350157728707, + "grad_norm": 0.47570179900580484, + "learning_rate": 1.4890877466695617e-05, + "loss": 0.3142, + "step": 6857 + }, + { + "epoch": 1.3519321766561514, + "grad_norm": 0.5040215090763913, + "learning_rate": 1.488952542167689e-05, + "loss": 0.3213, + "step": 6858 + }, + { + "epoch": 1.352129337539432, + "grad_norm": 0.5374781827924892, + "learning_rate": 1.4888173259183375e-05, + "loss": 0.3559, + "step": 6859 + }, + { + "epoch": 1.352326498422713, + "grad_norm": 0.4893306035975573, + "learning_rate": 1.4886820979247561e-05, + "loss": 0.3199, + "step": 6860 + }, + { + "epoch": 1.3525236593059937, + "grad_norm": 0.47307143181191913, + "learning_rate": 1.4885468581901939e-05, + "loss": 0.3081, + "step": 6861 + }, + { + "epoch": 1.3527208201892744, + "grad_norm": 0.49226139486267406, + "learning_rate": 1.4884116067178997e-05, + "loss": 0.3359, + "step": 6862 + }, + { + "epoch": 1.3529179810725553, + "grad_norm": 2.9054703966108226, + "learning_rate": 1.4882763435111236e-05, + "loss": 0.3751, + "step": 6863 + }, + { + "epoch": 1.353115141955836, + "grad_norm": 0.5783559792239383, + "learning_rate": 1.4881410685731152e-05, + "loss": 0.341, + "step": 6864 + }, + { + "epoch": 1.3533123028391167, + "grad_norm": 0.5284375900313468, + "learning_rate": 1.4880057819071244e-05, + "loss": 0.3455, + "step": 6865 + }, + { + "epoch": 1.3535094637223974, + "grad_norm": 0.47608340152473533, + "learning_rate": 1.4878704835164018e-05, + "loss": 0.2983, + "step": 6866 + }, + { + "epoch": 1.3537066246056781, + "grad_norm": 0.506845850224768, + "learning_rate": 1.4877351734041976e-05, + "loss": 0.3592, + "step": 6867 + }, + { + "epoch": 1.353903785488959, + "grad_norm": 0.4935101468881354, + "learning_rate": 1.4875998515737635e-05, + "loss": 0.3066, + "step": 6868 + }, + { + "epoch": 1.3541009463722398, + "grad_norm": 0.5059246656166723, + "learning_rate": 1.48746451802835e-05, + "loss": 0.3418, + "step": 6869 + }, + { + "epoch": 1.3542981072555205, + "grad_norm": 0.46928991796641456, + "learning_rate": 1.4873291727712094e-05, + "loss": 0.3037, + "step": 6870 + }, + { + "epoch": 1.3544952681388014, + "grad_norm": 0.5073410603661683, + "learning_rate": 1.4871938158055926e-05, + "loss": 0.3429, + "step": 6871 + }, + { + "epoch": 1.354692429022082, + "grad_norm": 0.5676528958205695, + "learning_rate": 1.487058447134752e-05, + "loss": 0.386, + "step": 6872 + }, + { + "epoch": 1.3548895899053628, + "grad_norm": 0.5449613542540548, + "learning_rate": 1.4869230667619399e-05, + "loss": 0.3391, + "step": 6873 + }, + { + "epoch": 1.3550867507886435, + "grad_norm": 0.5309796727926993, + "learning_rate": 1.4867876746904093e-05, + "loss": 0.3191, + "step": 6874 + }, + { + "epoch": 1.3552839116719242, + "grad_norm": 0.48139917834477924, + "learning_rate": 1.4866522709234125e-05, + "loss": 0.3207, + "step": 6875 + }, + { + "epoch": 1.3554810725552051, + "grad_norm": 0.4741924876595785, + "learning_rate": 1.4865168554642033e-05, + "loss": 0.3279, + "step": 6876 + }, + { + "epoch": 1.3556782334384858, + "grad_norm": 0.48900065703542145, + "learning_rate": 1.4863814283160348e-05, + "loss": 0.3399, + "step": 6877 + }, + { + "epoch": 1.3558753943217665, + "grad_norm": 0.5735243814672437, + "learning_rate": 1.4862459894821606e-05, + "loss": 0.3556, + "step": 6878 + }, + { + "epoch": 1.3560725552050474, + "grad_norm": 0.5398399252571137, + "learning_rate": 1.486110538965835e-05, + "loss": 0.384, + "step": 6879 + }, + { + "epoch": 1.3562697160883281, + "grad_norm": 0.5117636275917532, + "learning_rate": 1.4859750767703122e-05, + "loss": 0.3438, + "step": 6880 + }, + { + "epoch": 1.3564668769716088, + "grad_norm": 0.5051003334328982, + "learning_rate": 1.4858396028988472e-05, + "loss": 0.3428, + "step": 6881 + }, + { + "epoch": 1.3566640378548895, + "grad_norm": 0.5441154566116012, + "learning_rate": 1.4857041173546941e-05, + "loss": 0.3412, + "step": 6882 + }, + { + "epoch": 1.3568611987381702, + "grad_norm": 0.4798420407498326, + "learning_rate": 1.4855686201411086e-05, + "loss": 0.3448, + "step": 6883 + }, + { + "epoch": 1.3570583596214512, + "grad_norm": 0.5141849025940272, + "learning_rate": 1.485433111261346e-05, + "loss": 0.3407, + "step": 6884 + }, + { + "epoch": 1.3572555205047319, + "grad_norm": 0.534717445086196, + "learning_rate": 1.4852975907186618e-05, + "loss": 0.3731, + "step": 6885 + }, + { + "epoch": 1.3574526813880126, + "grad_norm": 0.5435535748807395, + "learning_rate": 1.4851620585163123e-05, + "loss": 0.3712, + "step": 6886 + }, + { + "epoch": 1.3576498422712935, + "grad_norm": 0.4963966514211321, + "learning_rate": 1.4850265146575535e-05, + "loss": 0.3203, + "step": 6887 + }, + { + "epoch": 1.3578470031545742, + "grad_norm": 0.5097541899694029, + "learning_rate": 1.4848909591456421e-05, + "loss": 0.3511, + "step": 6888 + }, + { + "epoch": 1.3580441640378549, + "grad_norm": 0.5022870460738562, + "learning_rate": 1.4847553919838353e-05, + "loss": 0.3459, + "step": 6889 + }, + { + "epoch": 1.3582413249211356, + "grad_norm": 0.5010019389495004, + "learning_rate": 1.4846198131753894e-05, + "loss": 0.349, + "step": 6890 + }, + { + "epoch": 1.3584384858044163, + "grad_norm": 0.4838676445079221, + "learning_rate": 1.4844842227235628e-05, + "loss": 0.3237, + "step": 6891 + }, + { + "epoch": 1.3586356466876972, + "grad_norm": 0.49788296424059325, + "learning_rate": 1.4843486206316122e-05, + "loss": 0.3319, + "step": 6892 + }, + { + "epoch": 1.358832807570978, + "grad_norm": 0.5331292005137582, + "learning_rate": 1.4842130069027957e-05, + "loss": 0.3446, + "step": 6893 + }, + { + "epoch": 1.3590299684542586, + "grad_norm": 0.5009763516051616, + "learning_rate": 1.4840773815403722e-05, + "loss": 0.3255, + "step": 6894 + }, + { + "epoch": 1.3592271293375395, + "grad_norm": 0.5154972070799596, + "learning_rate": 1.4839417445475995e-05, + "loss": 0.3382, + "step": 6895 + }, + { + "epoch": 1.3594242902208202, + "grad_norm": 0.522383735377928, + "learning_rate": 1.483806095927737e-05, + "loss": 0.3481, + "step": 6896 + }, + { + "epoch": 1.359621451104101, + "grad_norm": 0.5040201789415579, + "learning_rate": 1.4836704356840428e-05, + "loss": 0.3329, + "step": 6897 + }, + { + "epoch": 1.3598186119873816, + "grad_norm": 0.525037747965827, + "learning_rate": 1.4835347638197777e-05, + "loss": 0.3359, + "step": 6898 + }, + { + "epoch": 1.3600157728706623, + "grad_norm": 0.5051300920201072, + "learning_rate": 1.4833990803381997e-05, + "loss": 0.3467, + "step": 6899 + }, + { + "epoch": 1.3602129337539433, + "grad_norm": 0.4999739246930199, + "learning_rate": 1.4832633852425702e-05, + "loss": 0.361, + "step": 6900 + }, + { + "epoch": 1.360410094637224, + "grad_norm": 0.5107030410105877, + "learning_rate": 1.4831276785361484e-05, + "loss": 0.3556, + "step": 6901 + }, + { + "epoch": 1.3606072555205047, + "grad_norm": 0.4974505550953817, + "learning_rate": 1.4829919602221949e-05, + "loss": 0.3282, + "step": 6902 + }, + { + "epoch": 1.3608044164037856, + "grad_norm": 0.48064179340063234, + "learning_rate": 1.4828562303039708e-05, + "loss": 0.3416, + "step": 6903 + }, + { + "epoch": 1.3610015772870663, + "grad_norm": 0.5127276668204793, + "learning_rate": 1.4827204887847369e-05, + "loss": 0.3347, + "step": 6904 + }, + { + "epoch": 1.361198738170347, + "grad_norm": 0.4780273243210127, + "learning_rate": 1.4825847356677546e-05, + "loss": 0.304, + "step": 6905 + }, + { + "epoch": 1.3613958990536277, + "grad_norm": 0.4887295391580163, + "learning_rate": 1.4824489709562854e-05, + "loss": 0.3327, + "step": 6906 + }, + { + "epoch": 1.3615930599369084, + "grad_norm": 0.48670211288915743, + "learning_rate": 1.4823131946535912e-05, + "loss": 0.3355, + "step": 6907 + }, + { + "epoch": 1.3617902208201893, + "grad_norm": 0.5294737580364534, + "learning_rate": 1.4821774067629338e-05, + "loss": 0.3631, + "step": 6908 + }, + { + "epoch": 1.36198738170347, + "grad_norm": 0.4803323354930984, + "learning_rate": 1.482041607287576e-05, + "loss": 0.3238, + "step": 6909 + }, + { + "epoch": 1.3621845425867507, + "grad_norm": 0.5172113196727169, + "learning_rate": 1.4819057962307805e-05, + "loss": 0.3276, + "step": 6910 + }, + { + "epoch": 1.3623817034700316, + "grad_norm": 0.5122817965816258, + "learning_rate": 1.4817699735958103e-05, + "loss": 0.34, + "step": 6911 + }, + { + "epoch": 1.3625788643533123, + "grad_norm": 0.500184273669358, + "learning_rate": 1.4816341393859283e-05, + "loss": 0.3579, + "step": 6912 + }, + { + "epoch": 1.362776025236593, + "grad_norm": 0.49730830810220844, + "learning_rate": 1.4814982936043984e-05, + "loss": 0.3338, + "step": 6913 + }, + { + "epoch": 1.362973186119874, + "grad_norm": 0.4523508533879372, + "learning_rate": 1.481362436254484e-05, + "loss": 0.3122, + "step": 6914 + }, + { + "epoch": 1.3631703470031546, + "grad_norm": 0.5318999450446498, + "learning_rate": 1.4812265673394496e-05, + "loss": 0.3442, + "step": 6915 + }, + { + "epoch": 1.3633675078864353, + "grad_norm": 0.4991165752969515, + "learning_rate": 1.4810906868625595e-05, + "loss": 0.3403, + "step": 6916 + }, + { + "epoch": 1.363564668769716, + "grad_norm": 0.5238776570531024, + "learning_rate": 1.4809547948270782e-05, + "loss": 0.358, + "step": 6917 + }, + { + "epoch": 1.3637618296529967, + "grad_norm": 0.4971815188764086, + "learning_rate": 1.4808188912362705e-05, + "loss": 0.3149, + "step": 6918 + }, + { + "epoch": 1.3639589905362777, + "grad_norm": 0.5124620131345615, + "learning_rate": 1.4806829760934018e-05, + "loss": 0.3232, + "step": 6919 + }, + { + "epoch": 1.3641561514195584, + "grad_norm": 0.5178627263514879, + "learning_rate": 1.4805470494017373e-05, + "loss": 0.3513, + "step": 6920 + }, + { + "epoch": 1.364353312302839, + "grad_norm": 0.52380799206788, + "learning_rate": 1.4804111111645434e-05, + "loss": 0.3487, + "step": 6921 + }, + { + "epoch": 1.36455047318612, + "grad_norm": 0.48203134434437056, + "learning_rate": 1.4802751613850853e-05, + "loss": 0.3213, + "step": 6922 + }, + { + "epoch": 1.3647476340694007, + "grad_norm": 0.5102651067337406, + "learning_rate": 1.4801392000666297e-05, + "loss": 0.3266, + "step": 6923 + }, + { + "epoch": 1.3649447949526814, + "grad_norm": 0.5414007491135933, + "learning_rate": 1.4800032272124432e-05, + "loss": 0.3818, + "step": 6924 + }, + { + "epoch": 1.365141955835962, + "grad_norm": 0.5237634060391902, + "learning_rate": 1.4798672428257928e-05, + "loss": 0.3349, + "step": 6925 + }, + { + "epoch": 1.3653391167192428, + "grad_norm": 0.5011403516477672, + "learning_rate": 1.4797312469099454e-05, + "loss": 0.3213, + "step": 6926 + }, + { + "epoch": 1.3655362776025237, + "grad_norm": 0.5571042867990981, + "learning_rate": 1.4795952394681682e-05, + "loss": 0.3691, + "step": 6927 + }, + { + "epoch": 1.3657334384858044, + "grad_norm": 0.48652049099214323, + "learning_rate": 1.4794592205037295e-05, + "loss": 0.3387, + "step": 6928 + }, + { + "epoch": 1.3659305993690851, + "grad_norm": 0.4823341354472324, + "learning_rate": 1.4793231900198968e-05, + "loss": 0.3235, + "step": 6929 + }, + { + "epoch": 1.366127760252366, + "grad_norm": 0.5152844218945443, + "learning_rate": 1.4791871480199385e-05, + "loss": 0.3405, + "step": 6930 + }, + { + "epoch": 1.3663249211356467, + "grad_norm": 0.4901566411635077, + "learning_rate": 1.479051094507123e-05, + "loss": 0.328, + "step": 6931 + }, + { + "epoch": 1.3665220820189274, + "grad_norm": 0.5243962870706206, + "learning_rate": 1.4789150294847192e-05, + "loss": 0.3436, + "step": 6932 + }, + { + "epoch": 1.3667192429022081, + "grad_norm": 0.5194816070602105, + "learning_rate": 1.4787789529559961e-05, + "loss": 0.3309, + "step": 6933 + }, + { + "epoch": 1.3669164037854888, + "grad_norm": 0.5123859611810547, + "learning_rate": 1.4786428649242232e-05, + "loss": 0.3367, + "step": 6934 + }, + { + "epoch": 1.3671135646687698, + "grad_norm": 0.4896479386075604, + "learning_rate": 1.4785067653926701e-05, + "loss": 0.3204, + "step": 6935 + }, + { + "epoch": 1.3673107255520505, + "grad_norm": 0.49790804021992413, + "learning_rate": 1.4783706543646066e-05, + "loss": 0.3368, + "step": 6936 + }, + { + "epoch": 1.3675078864353312, + "grad_norm": 0.5019017261167511, + "learning_rate": 1.4782345318433025e-05, + "loss": 0.3278, + "step": 6937 + }, + { + "epoch": 1.367705047318612, + "grad_norm": 0.4904028947701245, + "learning_rate": 1.478098397832029e-05, + "loss": 0.3129, + "step": 6938 + }, + { + "epoch": 1.3679022082018928, + "grad_norm": 0.5321594647922132, + "learning_rate": 1.4779622523340562e-05, + "loss": 0.3425, + "step": 6939 + }, + { + "epoch": 1.3680993690851735, + "grad_norm": 0.4986035776438812, + "learning_rate": 1.477826095352656e-05, + "loss": 0.3367, + "step": 6940 + }, + { + "epoch": 1.3682965299684542, + "grad_norm": 0.5246630991127984, + "learning_rate": 1.4776899268910985e-05, + "loss": 0.3542, + "step": 6941 + }, + { + "epoch": 1.368493690851735, + "grad_norm": 0.4648451353245271, + "learning_rate": 1.477553746952656e-05, + "loss": 0.3196, + "step": 6942 + }, + { + "epoch": 1.3686908517350158, + "grad_norm": 0.546723814002127, + "learning_rate": 1.4774175555406e-05, + "loss": 0.3466, + "step": 6943 + }, + { + "epoch": 1.3688880126182965, + "grad_norm": 0.5018774391569213, + "learning_rate": 1.477281352658203e-05, + "loss": 0.3239, + "step": 6944 + }, + { + "epoch": 1.3690851735015772, + "grad_norm": 0.4926341199579351, + "learning_rate": 1.4771451383087373e-05, + "loss": 0.3356, + "step": 6945 + }, + { + "epoch": 1.3692823343848581, + "grad_norm": 0.5338603435373444, + "learning_rate": 1.477008912495475e-05, + "loss": 0.3629, + "step": 6946 + }, + { + "epoch": 1.3694794952681388, + "grad_norm": 0.46986959001400663, + "learning_rate": 1.4768726752216898e-05, + "loss": 0.3107, + "step": 6947 + }, + { + "epoch": 1.3696766561514195, + "grad_norm": 0.5335668009350745, + "learning_rate": 1.4767364264906542e-05, + "loss": 0.3481, + "step": 6948 + }, + { + "epoch": 1.3698738170347002, + "grad_norm": 0.4653687936969232, + "learning_rate": 1.4766001663056422e-05, + "loss": 0.3155, + "step": 6949 + }, + { + "epoch": 1.370070977917981, + "grad_norm": 0.5286992819313316, + "learning_rate": 1.4764638946699275e-05, + "loss": 0.3449, + "step": 6950 + }, + { + "epoch": 1.3702681388012619, + "grad_norm": 0.5307937528972434, + "learning_rate": 1.476327611586784e-05, + "loss": 0.3482, + "step": 6951 + }, + { + "epoch": 1.3704652996845426, + "grad_norm": 0.47920732199670424, + "learning_rate": 1.4761913170594859e-05, + "loss": 0.3334, + "step": 6952 + }, + { + "epoch": 1.3706624605678233, + "grad_norm": 0.4986071058412949, + "learning_rate": 1.4760550110913081e-05, + "loss": 0.3575, + "step": 6953 + }, + { + "epoch": 1.3708596214511042, + "grad_norm": 0.48580007339999265, + "learning_rate": 1.4759186936855253e-05, + "loss": 0.3166, + "step": 6954 + }, + { + "epoch": 1.3710567823343849, + "grad_norm": 0.4847134680546892, + "learning_rate": 1.4757823648454124e-05, + "loss": 0.3355, + "step": 6955 + }, + { + "epoch": 1.3712539432176656, + "grad_norm": 0.49892520436181714, + "learning_rate": 1.475646024574245e-05, + "loss": 0.3437, + "step": 6956 + }, + { + "epoch": 1.3714511041009465, + "grad_norm": 0.46579178489145134, + "learning_rate": 1.4755096728752992e-05, + "loss": 0.3251, + "step": 6957 + }, + { + "epoch": 1.3716482649842272, + "grad_norm": 0.4854313491900763, + "learning_rate": 1.4753733097518503e-05, + "loss": 0.3197, + "step": 6958 + }, + { + "epoch": 1.371845425867508, + "grad_norm": 0.5084781508505444, + "learning_rate": 1.475236935207175e-05, + "loss": 0.3447, + "step": 6959 + }, + { + "epoch": 1.3720425867507886, + "grad_norm": 0.5154369206311532, + "learning_rate": 1.4751005492445496e-05, + "loss": 0.3022, + "step": 6960 + }, + { + "epoch": 1.3722397476340693, + "grad_norm": 0.5072733874919334, + "learning_rate": 1.4749641518672508e-05, + "loss": 0.3478, + "step": 6961 + }, + { + "epoch": 1.3724369085173502, + "grad_norm": 0.5276500649999918, + "learning_rate": 1.4748277430785557e-05, + "loss": 0.3433, + "step": 6962 + }, + { + "epoch": 1.372634069400631, + "grad_norm": 0.5068275820907489, + "learning_rate": 1.4746913228817416e-05, + "loss": 0.3324, + "step": 6963 + }, + { + "epoch": 1.3728312302839116, + "grad_norm": 0.5588764584629632, + "learning_rate": 1.4745548912800867e-05, + "loss": 0.381, + "step": 6964 + }, + { + "epoch": 1.3730283911671926, + "grad_norm": 0.48397923052902175, + "learning_rate": 1.4744184482768678e-05, + "loss": 0.3241, + "step": 6965 + }, + { + "epoch": 1.3732255520504733, + "grad_norm": 0.5009095018317993, + "learning_rate": 1.4742819938753641e-05, + "loss": 0.3374, + "step": 6966 + }, + { + "epoch": 1.373422712933754, + "grad_norm": 0.5214836401414918, + "learning_rate": 1.4741455280788533e-05, + "loss": 0.3338, + "step": 6967 + }, + { + "epoch": 1.3736198738170347, + "grad_norm": 0.4969774753340322, + "learning_rate": 1.4740090508906147e-05, + "loss": 0.3243, + "step": 6968 + }, + { + "epoch": 1.3738170347003154, + "grad_norm": 0.5559922169108119, + "learning_rate": 1.4738725623139263e-05, + "loss": 0.3567, + "step": 6969 + }, + { + "epoch": 1.3740141955835963, + "grad_norm": 0.5165083267867453, + "learning_rate": 1.4737360623520684e-05, + "loss": 0.3324, + "step": 6970 + }, + { + "epoch": 1.374211356466877, + "grad_norm": 0.5186856386291233, + "learning_rate": 1.47359955100832e-05, + "loss": 0.3333, + "step": 6971 + }, + { + "epoch": 1.3744085173501577, + "grad_norm": 0.5238837322487031, + "learning_rate": 1.473463028285961e-05, + "loss": 0.3484, + "step": 6972 + }, + { + "epoch": 1.3746056782334386, + "grad_norm": 2.292582007349146, + "learning_rate": 1.4733264941882714e-05, + "loss": 0.3815, + "step": 6973 + }, + { + "epoch": 1.3748028391167193, + "grad_norm": 0.5643806541463167, + "learning_rate": 1.4731899487185319e-05, + "loss": 0.3442, + "step": 6974 + }, + { + "epoch": 1.375, + "grad_norm": 0.47507958786208415, + "learning_rate": 1.4730533918800227e-05, + "loss": 0.3075, + "step": 6975 + }, + { + "epoch": 1.3751971608832807, + "grad_norm": 0.4912700593505576, + "learning_rate": 1.4729168236760248e-05, + "loss": 0.3229, + "step": 6976 + }, + { + "epoch": 1.3753943217665614, + "grad_norm": 0.5116118374339484, + "learning_rate": 1.4727802441098193e-05, + "loss": 0.3345, + "step": 6977 + }, + { + "epoch": 1.3755914826498423, + "grad_norm": 0.5865259693172717, + "learning_rate": 1.4726436531846877e-05, + "loss": 0.3333, + "step": 6978 + }, + { + "epoch": 1.375788643533123, + "grad_norm": 0.49366649497140386, + "learning_rate": 1.4725070509039117e-05, + "loss": 0.3337, + "step": 6979 + }, + { + "epoch": 1.3759858044164037, + "grad_norm": 0.5372885703885392, + "learning_rate": 1.4723704372707734e-05, + "loss": 0.3287, + "step": 6980 + }, + { + "epoch": 1.3761829652996846, + "grad_norm": 0.5043319157124423, + "learning_rate": 1.4722338122885548e-05, + "loss": 0.3396, + "step": 6981 + }, + { + "epoch": 1.3763801261829653, + "grad_norm": 0.6345501977793165, + "learning_rate": 1.4720971759605387e-05, + "loss": 0.3289, + "step": 6982 + }, + { + "epoch": 1.376577287066246, + "grad_norm": 0.5375836546232604, + "learning_rate": 1.4719605282900077e-05, + "loss": 0.3757, + "step": 6983 + }, + { + "epoch": 1.3767744479495267, + "grad_norm": 0.5178077248278057, + "learning_rate": 1.4718238692802449e-05, + "loss": 0.3594, + "step": 6984 + }, + { + "epoch": 1.3769716088328074, + "grad_norm": 0.5147912712127123, + "learning_rate": 1.4716871989345338e-05, + "loss": 0.3474, + "step": 6985 + }, + { + "epoch": 1.3771687697160884, + "grad_norm": 0.5189874060272969, + "learning_rate": 1.4715505172561577e-05, + "loss": 0.3247, + "step": 6986 + }, + { + "epoch": 1.377365930599369, + "grad_norm": 0.5281430394013332, + "learning_rate": 1.471413824248401e-05, + "loss": 0.3568, + "step": 6987 + }, + { + "epoch": 1.3775630914826498, + "grad_norm": 0.5147078126304419, + "learning_rate": 1.4712771199145472e-05, + "loss": 0.3559, + "step": 6988 + }, + { + "epoch": 1.3777602523659307, + "grad_norm": 0.49074990134353796, + "learning_rate": 1.4711404042578814e-05, + "loss": 0.3416, + "step": 6989 + }, + { + "epoch": 1.3779574132492114, + "grad_norm": 0.4841551234965781, + "learning_rate": 1.4710036772816877e-05, + "loss": 0.3476, + "step": 6990 + }, + { + "epoch": 1.378154574132492, + "grad_norm": 0.5076124986937911, + "learning_rate": 1.4708669389892514e-05, + "loss": 0.3451, + "step": 6991 + }, + { + "epoch": 1.3783517350157728, + "grad_norm": 0.5101395115062465, + "learning_rate": 1.4707301893838578e-05, + "loss": 0.337, + "step": 6992 + }, + { + "epoch": 1.3785488958990535, + "grad_norm": 0.47965592126549544, + "learning_rate": 1.4705934284687923e-05, + "loss": 0.3242, + "step": 6993 + }, + { + "epoch": 1.3787460567823344, + "grad_norm": 0.5219389338261724, + "learning_rate": 1.4704566562473408e-05, + "loss": 0.3683, + "step": 6994 + }, + { + "epoch": 1.3789432176656151, + "grad_norm": 0.5205227382874077, + "learning_rate": 1.4703198727227892e-05, + "loss": 0.3318, + "step": 6995 + }, + { + "epoch": 1.3791403785488958, + "grad_norm": 0.48222107531876607, + "learning_rate": 1.4701830778984239e-05, + "loss": 0.3253, + "step": 6996 + }, + { + "epoch": 1.3793375394321767, + "grad_norm": 0.49353311997235216, + "learning_rate": 1.4700462717775317e-05, + "loss": 0.3219, + "step": 6997 + }, + { + "epoch": 1.3795347003154574, + "grad_norm": 0.5345425712355901, + "learning_rate": 1.4699094543633989e-05, + "loss": 0.3624, + "step": 6998 + }, + { + "epoch": 1.3797318611987381, + "grad_norm": 0.5050606461841181, + "learning_rate": 1.4697726256593132e-05, + "loss": 0.3441, + "step": 6999 + }, + { + "epoch": 1.379929022082019, + "grad_norm": 0.47478110398234236, + "learning_rate": 1.469635785668562e-05, + "loss": 0.3204, + "step": 7000 + }, + { + "epoch": 1.3801261829652998, + "grad_norm": 0.503768054032645, + "learning_rate": 1.4694989343944327e-05, + "loss": 0.3508, + "step": 7001 + }, + { + "epoch": 1.3803233438485805, + "grad_norm": 0.6492703807376875, + "learning_rate": 1.4693620718402137e-05, + "loss": 0.3212, + "step": 7002 + }, + { + "epoch": 1.3805205047318612, + "grad_norm": 0.4805328225116543, + "learning_rate": 1.4692251980091927e-05, + "loss": 0.3254, + "step": 7003 + }, + { + "epoch": 1.3807176656151419, + "grad_norm": 0.4842656353572685, + "learning_rate": 1.4690883129046585e-05, + "loss": 0.3372, + "step": 7004 + }, + { + "epoch": 1.3809148264984228, + "grad_norm": 0.5211693690974576, + "learning_rate": 1.4689514165298995e-05, + "loss": 0.3491, + "step": 7005 + }, + { + "epoch": 1.3811119873817035, + "grad_norm": 0.5885371549295725, + "learning_rate": 1.4688145088882056e-05, + "loss": 0.3444, + "step": 7006 + }, + { + "epoch": 1.3813091482649842, + "grad_norm": 0.4754832288654815, + "learning_rate": 1.4686775899828651e-05, + "loss": 0.3224, + "step": 7007 + }, + { + "epoch": 1.381506309148265, + "grad_norm": 0.4780150559237149, + "learning_rate": 1.4685406598171686e-05, + "loss": 0.3052, + "step": 7008 + }, + { + "epoch": 1.3817034700315458, + "grad_norm": 0.49765867952072046, + "learning_rate": 1.4684037183944051e-05, + "loss": 0.3434, + "step": 7009 + }, + { + "epoch": 1.3819006309148265, + "grad_norm": 0.47835652909606524, + "learning_rate": 1.4682667657178653e-05, + "loss": 0.3396, + "step": 7010 + }, + { + "epoch": 1.3820977917981072, + "grad_norm": 0.46329597154012175, + "learning_rate": 1.4681298017908391e-05, + "loss": 0.3226, + "step": 7011 + }, + { + "epoch": 1.382294952681388, + "grad_norm": 0.49817719336350075, + "learning_rate": 1.4679928266166175e-05, + "loss": 0.3357, + "step": 7012 + }, + { + "epoch": 1.3824921135646688, + "grad_norm": 0.8663152836331668, + "learning_rate": 1.4678558401984915e-05, + "loss": 0.3431, + "step": 7013 + }, + { + "epoch": 1.3826892744479495, + "grad_norm": 0.5197376334684173, + "learning_rate": 1.467718842539752e-05, + "loss": 0.3367, + "step": 7014 + }, + { + "epoch": 1.3828864353312302, + "grad_norm": 0.5621592048068038, + "learning_rate": 1.467581833643691e-05, + "loss": 0.3152, + "step": 7015 + }, + { + "epoch": 1.3830835962145112, + "grad_norm": 0.5392174642427607, + "learning_rate": 1.4674448135135993e-05, + "loss": 0.348, + "step": 7016 + }, + { + "epoch": 1.3832807570977919, + "grad_norm": 0.4767809373705618, + "learning_rate": 1.46730778215277e-05, + "loss": 0.3133, + "step": 7017 + }, + { + "epoch": 1.3834779179810726, + "grad_norm": 1.1165470822976469, + "learning_rate": 1.4671707395644946e-05, + "loss": 0.3479, + "step": 7018 + }, + { + "epoch": 1.3836750788643533, + "grad_norm": 0.5264241691242134, + "learning_rate": 1.4670336857520661e-05, + "loss": 0.3528, + "step": 7019 + }, + { + "epoch": 1.383872239747634, + "grad_norm": 0.4677220107524557, + "learning_rate": 1.4668966207187774e-05, + "loss": 0.3322, + "step": 7020 + }, + { + "epoch": 1.3840694006309149, + "grad_norm": 0.4729142635567184, + "learning_rate": 1.4667595444679212e-05, + "loss": 0.3265, + "step": 7021 + }, + { + "epoch": 1.3842665615141956, + "grad_norm": 0.5851446109658627, + "learning_rate": 1.466622457002791e-05, + "loss": 0.3491, + "step": 7022 + }, + { + "epoch": 1.3844637223974763, + "grad_norm": 0.5195972584107764, + "learning_rate": 1.4664853583266807e-05, + "loss": 0.3408, + "step": 7023 + }, + { + "epoch": 1.3846608832807572, + "grad_norm": 0.48331307744775936, + "learning_rate": 1.4663482484428839e-05, + "loss": 0.3438, + "step": 7024 + }, + { + "epoch": 1.384858044164038, + "grad_norm": 0.4872542154414702, + "learning_rate": 1.4662111273546949e-05, + "loss": 0.3116, + "step": 7025 + }, + { + "epoch": 1.3850552050473186, + "grad_norm": 0.5209465468531163, + "learning_rate": 1.4660739950654081e-05, + "loss": 0.3713, + "step": 7026 + }, + { + "epoch": 1.3852523659305993, + "grad_norm": 0.4946017796024822, + "learning_rate": 1.4659368515783183e-05, + "loss": 0.3238, + "step": 7027 + }, + { + "epoch": 1.38544952681388, + "grad_norm": 0.5092592005569798, + "learning_rate": 1.4657996968967202e-05, + "loss": 0.3456, + "step": 7028 + }, + { + "epoch": 1.385646687697161, + "grad_norm": 0.47093221898089077, + "learning_rate": 1.4656625310239095e-05, + "loss": 0.3068, + "step": 7029 + }, + { + "epoch": 1.3858438485804416, + "grad_norm": 0.5092749341911277, + "learning_rate": 1.4655253539631816e-05, + "loss": 0.3501, + "step": 7030 + }, + { + "epoch": 1.3860410094637223, + "grad_norm": 0.49688273138146666, + "learning_rate": 1.4653881657178317e-05, + "loss": 0.329, + "step": 7031 + }, + { + "epoch": 1.3862381703470033, + "grad_norm": 0.5210667444751383, + "learning_rate": 1.465250966291157e-05, + "loss": 0.3308, + "step": 7032 + }, + { + "epoch": 1.386435331230284, + "grad_norm": 0.5051826303648079, + "learning_rate": 1.4651137556864526e-05, + "loss": 0.3246, + "step": 7033 + }, + { + "epoch": 1.3866324921135647, + "grad_norm": 0.5131056295583875, + "learning_rate": 1.4649765339070161e-05, + "loss": 0.3472, + "step": 7034 + }, + { + "epoch": 1.3868296529968454, + "grad_norm": 0.5045828989240048, + "learning_rate": 1.4648393009561434e-05, + "loss": 0.35, + "step": 7035 + }, + { + "epoch": 1.387026813880126, + "grad_norm": 2.3607682081415566, + "learning_rate": 1.4647020568371329e-05, + "loss": 0.3819, + "step": 7036 + }, + { + "epoch": 1.387223974763407, + "grad_norm": 0.7063638677947639, + "learning_rate": 1.4645648015532806e-05, + "loss": 0.3035, + "step": 7037 + }, + { + "epoch": 1.3874211356466877, + "grad_norm": 0.5347651370289318, + "learning_rate": 1.464427535107885e-05, + "loss": 0.3415, + "step": 7038 + }, + { + "epoch": 1.3876182965299684, + "grad_norm": 0.5063799018311032, + "learning_rate": 1.4642902575042439e-05, + "loss": 0.3682, + "step": 7039 + }, + { + "epoch": 1.3878154574132493, + "grad_norm": 0.5418753232993281, + "learning_rate": 1.4641529687456558e-05, + "loss": 0.3801, + "step": 7040 + }, + { + "epoch": 1.38801261829653, + "grad_norm": 0.510285067684664, + "learning_rate": 1.4640156688354183e-05, + "loss": 0.3417, + "step": 7041 + }, + { + "epoch": 1.3882097791798107, + "grad_norm": 0.5074567362081557, + "learning_rate": 1.4638783577768312e-05, + "loss": 0.3426, + "step": 7042 + }, + { + "epoch": 1.3884069400630916, + "grad_norm": 0.4986711709526979, + "learning_rate": 1.4637410355731927e-05, + "loss": 0.3517, + "step": 7043 + }, + { + "epoch": 1.3886041009463723, + "grad_norm": 0.4834879827214969, + "learning_rate": 1.4636037022278022e-05, + "loss": 0.325, + "step": 7044 + }, + { + "epoch": 1.388801261829653, + "grad_norm": 0.4725810540995025, + "learning_rate": 1.4634663577439598e-05, + "loss": 0.3329, + "step": 7045 + }, + { + "epoch": 1.3889984227129337, + "grad_norm": 0.499687944712635, + "learning_rate": 1.4633290021249646e-05, + "loss": 0.3261, + "step": 7046 + }, + { + "epoch": 1.3891955835962144, + "grad_norm": 0.49600002260141485, + "learning_rate": 1.4631916353741174e-05, + "loss": 0.3625, + "step": 7047 + }, + { + "epoch": 1.3893927444794953, + "grad_norm": 0.5362771118840135, + "learning_rate": 1.4630542574947177e-05, + "loss": 0.3511, + "step": 7048 + }, + { + "epoch": 1.389589905362776, + "grad_norm": 0.49021230005905664, + "learning_rate": 1.462916868490067e-05, + "loss": 0.3459, + "step": 7049 + }, + { + "epoch": 1.3897870662460567, + "grad_norm": 0.5226203596978446, + "learning_rate": 1.4627794683634655e-05, + "loss": 0.3359, + "step": 7050 + }, + { + "epoch": 1.3899842271293377, + "grad_norm": 0.4892949036036693, + "learning_rate": 1.4626420571182146e-05, + "loss": 0.3354, + "step": 7051 + }, + { + "epoch": 1.3901813880126184, + "grad_norm": 0.49319072511760836, + "learning_rate": 1.4625046347576155e-05, + "loss": 0.3213, + "step": 7052 + }, + { + "epoch": 1.390378548895899, + "grad_norm": 0.48026160014473557, + "learning_rate": 1.4623672012849705e-05, + "loss": 0.3328, + "step": 7053 + }, + { + "epoch": 1.3905757097791798, + "grad_norm": 0.519599979648436, + "learning_rate": 1.462229756703581e-05, + "loss": 0.3768, + "step": 7054 + }, + { + "epoch": 1.3907728706624605, + "grad_norm": 0.5158052836985166, + "learning_rate": 1.4620923010167496e-05, + "loss": 0.3472, + "step": 7055 + }, + { + "epoch": 1.3909700315457414, + "grad_norm": 0.49003976941752064, + "learning_rate": 1.461954834227778e-05, + "loss": 0.3197, + "step": 7056 + }, + { + "epoch": 1.391167192429022, + "grad_norm": 0.47468028680465263, + "learning_rate": 1.46181735633997e-05, + "loss": 0.3163, + "step": 7057 + }, + { + "epoch": 1.3913643533123028, + "grad_norm": 0.4951911330562906, + "learning_rate": 1.4616798673566276e-05, + "loss": 0.3479, + "step": 7058 + }, + { + "epoch": 1.3915615141955837, + "grad_norm": 0.4785035909764805, + "learning_rate": 1.4615423672810549e-05, + "loss": 0.3244, + "step": 7059 + }, + { + "epoch": 1.3917586750788644, + "grad_norm": 0.46972733617436485, + "learning_rate": 1.4614048561165552e-05, + "loss": 0.3264, + "step": 7060 + }, + { + "epoch": 1.3919558359621451, + "grad_norm": 0.5045776545128406, + "learning_rate": 1.4612673338664322e-05, + "loss": 0.3555, + "step": 7061 + }, + { + "epoch": 1.3921529968454258, + "grad_norm": 0.4873523714181044, + "learning_rate": 1.46112980053399e-05, + "loss": 0.3357, + "step": 7062 + }, + { + "epoch": 1.3923501577287065, + "grad_norm": 0.5568338246839404, + "learning_rate": 1.460992256122533e-05, + "loss": 0.3606, + "step": 7063 + }, + { + "epoch": 1.3925473186119874, + "grad_norm": 0.5056504428860361, + "learning_rate": 1.4608547006353661e-05, + "loss": 0.3515, + "step": 7064 + }, + { + "epoch": 1.3927444794952681, + "grad_norm": 0.5637743705182227, + "learning_rate": 1.4607171340757935e-05, + "loss": 0.387, + "step": 7065 + }, + { + "epoch": 1.3929416403785488, + "grad_norm": 0.5012198726529813, + "learning_rate": 1.460579556447121e-05, + "loss": 0.3339, + "step": 7066 + }, + { + "epoch": 1.3931388012618298, + "grad_norm": 0.5255783486737358, + "learning_rate": 1.4604419677526536e-05, + "loss": 0.34, + "step": 7067 + }, + { + "epoch": 1.3933359621451105, + "grad_norm": 0.49793007788388866, + "learning_rate": 1.4603043679956972e-05, + "loss": 0.3364, + "step": 7068 + }, + { + "epoch": 1.3935331230283912, + "grad_norm": 0.5065921955646553, + "learning_rate": 1.4601667571795577e-05, + "loss": 0.3439, + "step": 7069 + }, + { + "epoch": 1.3937302839116719, + "grad_norm": 0.46887161071080763, + "learning_rate": 1.4600291353075413e-05, + "loss": 0.3174, + "step": 7070 + }, + { + "epoch": 1.3939274447949526, + "grad_norm": 0.5473282371408335, + "learning_rate": 1.4598915023829543e-05, + "loss": 0.3446, + "step": 7071 + }, + { + "epoch": 1.3941246056782335, + "grad_norm": 0.5483532718507378, + "learning_rate": 1.4597538584091038e-05, + "loss": 0.356, + "step": 7072 + }, + { + "epoch": 1.3943217665615142, + "grad_norm": 0.5316150064778802, + "learning_rate": 1.4596162033892962e-05, + "loss": 0.3371, + "step": 7073 + }, + { + "epoch": 1.3945189274447949, + "grad_norm": 0.5159110622307619, + "learning_rate": 1.4594785373268399e-05, + "loss": 0.3488, + "step": 7074 + }, + { + "epoch": 1.3947160883280758, + "grad_norm": 0.48598790594632707, + "learning_rate": 1.4593408602250412e-05, + "loss": 0.3235, + "step": 7075 + }, + { + "epoch": 1.3949132492113565, + "grad_norm": 0.4724557712538457, + "learning_rate": 1.4592031720872086e-05, + "loss": 0.3286, + "step": 7076 + }, + { + "epoch": 1.3951104100946372, + "grad_norm": 0.48240302693285425, + "learning_rate": 1.45906547291665e-05, + "loss": 0.3269, + "step": 7077 + }, + { + "epoch": 1.395307570977918, + "grad_norm": 0.5090930573384124, + "learning_rate": 1.4589277627166738e-05, + "loss": 0.3376, + "step": 7078 + }, + { + "epoch": 1.3955047318611986, + "grad_norm": 0.49297802997817497, + "learning_rate": 1.4587900414905884e-05, + "loss": 0.341, + "step": 7079 + }, + { + "epoch": 1.3957018927444795, + "grad_norm": 0.4997166046095957, + "learning_rate": 1.4586523092417023e-05, + "loss": 0.3496, + "step": 7080 + }, + { + "epoch": 1.3958990536277602, + "grad_norm": 0.5090637102230009, + "learning_rate": 1.4585145659733261e-05, + "loss": 0.3348, + "step": 7081 + }, + { + "epoch": 1.396096214511041, + "grad_norm": 0.5273133534088523, + "learning_rate": 1.4583768116887675e-05, + "loss": 0.3552, + "step": 7082 + }, + { + "epoch": 1.3962933753943219, + "grad_norm": 0.49036343465671867, + "learning_rate": 1.4582390463913374e-05, + "loss": 0.3291, + "step": 7083 + }, + { + "epoch": 1.3964905362776026, + "grad_norm": 0.5038344983388285, + "learning_rate": 1.4581012700843447e-05, + "loss": 0.333, + "step": 7084 + }, + { + "epoch": 1.3966876971608833, + "grad_norm": 0.4900487095066304, + "learning_rate": 1.4579634827711004e-05, + "loss": 0.3483, + "step": 7085 + }, + { + "epoch": 1.396884858044164, + "grad_norm": 0.5031078803878702, + "learning_rate": 1.4578256844549144e-05, + "loss": 0.3157, + "step": 7086 + }, + { + "epoch": 1.3970820189274447, + "grad_norm": 0.6475198113556349, + "learning_rate": 1.4576878751390977e-05, + "loss": 0.3807, + "step": 7087 + }, + { + "epoch": 1.3972791798107256, + "grad_norm": 0.5187308668664504, + "learning_rate": 1.4575500548269612e-05, + "loss": 0.3417, + "step": 7088 + }, + { + "epoch": 1.3974763406940063, + "grad_norm": 0.4702071372526483, + "learning_rate": 1.4574122235218165e-05, + "loss": 0.3175, + "step": 7089 + }, + { + "epoch": 1.397673501577287, + "grad_norm": 0.5261987476073774, + "learning_rate": 1.4572743812269742e-05, + "loss": 0.3641, + "step": 7090 + }, + { + "epoch": 1.397870662460568, + "grad_norm": 0.5232677422093927, + "learning_rate": 1.457136527945747e-05, + "loss": 0.3366, + "step": 7091 + }, + { + "epoch": 1.3980678233438486, + "grad_norm": 0.5413611690474402, + "learning_rate": 1.4569986636814467e-05, + "loss": 0.3387, + "step": 7092 + }, + { + "epoch": 1.3982649842271293, + "grad_norm": 0.504814762397322, + "learning_rate": 1.4568607884373853e-05, + "loss": 0.3559, + "step": 7093 + }, + { + "epoch": 1.3984621451104102, + "grad_norm": 0.5127986043906394, + "learning_rate": 1.4567229022168756e-05, + "loss": 0.3346, + "step": 7094 + }, + { + "epoch": 1.398659305993691, + "grad_norm": 0.5194678556274392, + "learning_rate": 1.4565850050232303e-05, + "loss": 0.3264, + "step": 7095 + }, + { + "epoch": 1.3988564668769716, + "grad_norm": 1.36315390724496, + "learning_rate": 1.4564470968597629e-05, + "loss": 0.3374, + "step": 7096 + }, + { + "epoch": 1.3990536277602523, + "grad_norm": 0.5377737743898511, + "learning_rate": 1.456309177729786e-05, + "loss": 0.3548, + "step": 7097 + }, + { + "epoch": 1.399250788643533, + "grad_norm": 0.5141750880497071, + "learning_rate": 1.4561712476366138e-05, + "loss": 0.3328, + "step": 7098 + }, + { + "epoch": 1.399447949526814, + "grad_norm": 0.48903622056427165, + "learning_rate": 1.4560333065835597e-05, + "loss": 0.3118, + "step": 7099 + }, + { + "epoch": 1.3996451104100947, + "grad_norm": 0.5462273751987472, + "learning_rate": 1.4558953545739386e-05, + "loss": 0.3234, + "step": 7100 + }, + { + "epoch": 1.3998422712933754, + "grad_norm": 0.4955309077143716, + "learning_rate": 1.4557573916110643e-05, + "loss": 0.3108, + "step": 7101 + }, + { + "epoch": 1.4000394321766563, + "grad_norm": 0.5069308732041122, + "learning_rate": 1.4556194176982521e-05, + "loss": 0.3231, + "step": 7102 + }, + { + "epoch": 1.400236593059937, + "grad_norm": 0.48081025706983266, + "learning_rate": 1.4554814328388158e-05, + "loss": 0.311, + "step": 7103 + }, + { + "epoch": 1.4004337539432177, + "grad_norm": 0.5215098010308659, + "learning_rate": 1.4553434370360718e-05, + "loss": 0.3516, + "step": 7104 + }, + { + "epoch": 1.4006309148264984, + "grad_norm": 0.4849700152957339, + "learning_rate": 1.4552054302933344e-05, + "loss": 0.3482, + "step": 7105 + }, + { + "epoch": 1.400828075709779, + "grad_norm": 0.5043874074673499, + "learning_rate": 1.4550674126139206e-05, + "loss": 0.3562, + "step": 7106 + }, + { + "epoch": 1.40102523659306, + "grad_norm": 0.4972388280378365, + "learning_rate": 1.4549293840011453e-05, + "loss": 0.3371, + "step": 7107 + }, + { + "epoch": 1.4012223974763407, + "grad_norm": 0.501745399798661, + "learning_rate": 1.4547913444583254e-05, + "loss": 0.3435, + "step": 7108 + }, + { + "epoch": 1.4014195583596214, + "grad_norm": 0.5121863899401545, + "learning_rate": 1.4546532939887775e-05, + "loss": 0.3126, + "step": 7109 + }, + { + "epoch": 1.4016167192429023, + "grad_norm": 0.5122597557427135, + "learning_rate": 1.4545152325958176e-05, + "loss": 0.3241, + "step": 7110 + }, + { + "epoch": 1.401813880126183, + "grad_norm": 0.5356092124198039, + "learning_rate": 1.4543771602827635e-05, + "loss": 0.3352, + "step": 7111 + }, + { + "epoch": 1.4020110410094637, + "grad_norm": 0.5046731418348064, + "learning_rate": 1.454239077052932e-05, + "loss": 0.3255, + "step": 7112 + }, + { + "epoch": 1.4022082018927444, + "grad_norm": 0.5329137134815505, + "learning_rate": 1.4541009829096411e-05, + "loss": 0.3668, + "step": 7113 + }, + { + "epoch": 1.4024053627760251, + "grad_norm": 0.5089935811054527, + "learning_rate": 1.4539628778562082e-05, + "loss": 0.3246, + "step": 7114 + }, + { + "epoch": 1.402602523659306, + "grad_norm": 0.5703923513720676, + "learning_rate": 1.4538247618959519e-05, + "loss": 0.3691, + "step": 7115 + }, + { + "epoch": 1.4027996845425867, + "grad_norm": 0.48847066057622457, + "learning_rate": 1.4536866350321899e-05, + "loss": 0.3268, + "step": 7116 + }, + { + "epoch": 1.4029968454258674, + "grad_norm": 0.5073745379809114, + "learning_rate": 1.4535484972682412e-05, + "loss": 0.3149, + "step": 7117 + }, + { + "epoch": 1.4031940063091484, + "grad_norm": 0.6462283290424533, + "learning_rate": 1.4534103486074246e-05, + "loss": 0.3336, + "step": 7118 + }, + { + "epoch": 1.403391167192429, + "grad_norm": 0.515565073309167, + "learning_rate": 1.4532721890530594e-05, + "loss": 0.3456, + "step": 7119 + }, + { + "epoch": 1.4035883280757098, + "grad_norm": 0.4990119046287184, + "learning_rate": 1.4531340186084647e-05, + "loss": 0.3122, + "step": 7120 + }, + { + "epoch": 1.4037854889589905, + "grad_norm": 0.510752738342485, + "learning_rate": 1.4529958372769603e-05, + "loss": 0.3478, + "step": 7121 + }, + { + "epoch": 1.4039826498422712, + "grad_norm": 0.533223751953392, + "learning_rate": 1.4528576450618661e-05, + "loss": 0.3166, + "step": 7122 + }, + { + "epoch": 1.404179810725552, + "grad_norm": 0.5250407115618655, + "learning_rate": 1.4527194419665027e-05, + "loss": 0.3405, + "step": 7123 + }, + { + "epoch": 1.4043769716088328, + "grad_norm": 0.5492621619572747, + "learning_rate": 1.4525812279941896e-05, + "loss": 0.3307, + "step": 7124 + }, + { + "epoch": 1.4045741324921135, + "grad_norm": 0.5808645619860187, + "learning_rate": 1.4524430031482483e-05, + "loss": 0.3236, + "step": 7125 + }, + { + "epoch": 1.4047712933753944, + "grad_norm": 0.516938354436358, + "learning_rate": 1.4523047674319992e-05, + "loss": 0.3397, + "step": 7126 + }, + { + "epoch": 1.4049684542586751, + "grad_norm": 0.5446571797503498, + "learning_rate": 1.452166520848764e-05, + "loss": 0.3739, + "step": 7127 + }, + { + "epoch": 1.4051656151419558, + "grad_norm": 0.5247398532256848, + "learning_rate": 1.4520282634018642e-05, + "loss": 0.33, + "step": 7128 + }, + { + "epoch": 1.4053627760252365, + "grad_norm": 0.5174783252477193, + "learning_rate": 1.451889995094621e-05, + "loss": 0.3291, + "step": 7129 + }, + { + "epoch": 1.4055599369085172, + "grad_norm": 0.45618206010019524, + "learning_rate": 1.4517517159303573e-05, + "loss": 0.3154, + "step": 7130 + }, + { + "epoch": 1.4057570977917981, + "grad_norm": 0.5302269877465452, + "learning_rate": 1.4516134259123944e-05, + "loss": 0.3305, + "step": 7131 + }, + { + "epoch": 1.4059542586750788, + "grad_norm": 0.49691950071685087, + "learning_rate": 1.4514751250440556e-05, + "loss": 0.3465, + "step": 7132 + }, + { + "epoch": 1.4061514195583595, + "grad_norm": 0.5204660479559051, + "learning_rate": 1.4513368133286628e-05, + "loss": 0.3484, + "step": 7133 + }, + { + "epoch": 1.4063485804416405, + "grad_norm": 0.5209911663466499, + "learning_rate": 1.45119849076954e-05, + "loss": 0.3324, + "step": 7134 + }, + { + "epoch": 1.4065457413249212, + "grad_norm": 0.52870920201794, + "learning_rate": 1.4510601573700098e-05, + "loss": 0.3464, + "step": 7135 + }, + { + "epoch": 1.4067429022082019, + "grad_norm": 0.5337821985731359, + "learning_rate": 1.4509218131333964e-05, + "loss": 0.3547, + "step": 7136 + }, + { + "epoch": 1.4069400630914828, + "grad_norm": 0.4726297001749275, + "learning_rate": 1.4507834580630231e-05, + "loss": 0.3287, + "step": 7137 + }, + { + "epoch": 1.4071372239747635, + "grad_norm": 0.5411170706465964, + "learning_rate": 1.4506450921622144e-05, + "loss": 0.3249, + "step": 7138 + }, + { + "epoch": 1.4073343848580442, + "grad_norm": 0.5280250037967184, + "learning_rate": 1.4505067154342944e-05, + "loss": 0.3481, + "step": 7139 + }, + { + "epoch": 1.4075315457413249, + "grad_norm": 0.5254200953659547, + "learning_rate": 1.4503683278825877e-05, + "loss": 0.3102, + "step": 7140 + }, + { + "epoch": 1.4077287066246056, + "grad_norm": 0.5167455740268537, + "learning_rate": 1.4502299295104194e-05, + "loss": 0.3626, + "step": 7141 + }, + { + "epoch": 1.4079258675078865, + "grad_norm": 0.5231802604371326, + "learning_rate": 1.4500915203211144e-05, + "loss": 0.3298, + "step": 7142 + }, + { + "epoch": 1.4081230283911672, + "grad_norm": 0.5273278352996336, + "learning_rate": 1.449953100317998e-05, + "loss": 0.3262, + "step": 7143 + }, + { + "epoch": 1.408320189274448, + "grad_norm": 0.467342150805326, + "learning_rate": 1.4498146695043963e-05, + "loss": 0.3136, + "step": 7144 + }, + { + "epoch": 1.4085173501577288, + "grad_norm": 0.6192865687389703, + "learning_rate": 1.4496762278836347e-05, + "loss": 0.3684, + "step": 7145 + }, + { + "epoch": 1.4087145110410095, + "grad_norm": 0.5257953374768632, + "learning_rate": 1.4495377754590396e-05, + "loss": 0.3528, + "step": 7146 + }, + { + "epoch": 1.4089116719242902, + "grad_norm": 0.5430432195240142, + "learning_rate": 1.4493993122339375e-05, + "loss": 0.3398, + "step": 7147 + }, + { + "epoch": 1.409108832807571, + "grad_norm": 0.5071243838122111, + "learning_rate": 1.4492608382116548e-05, + "loss": 0.3306, + "step": 7148 + }, + { + "epoch": 1.4093059936908516, + "grad_norm": 0.591861854929956, + "learning_rate": 1.4491223533955191e-05, + "loss": 0.3736, + "step": 7149 + }, + { + "epoch": 1.4095031545741326, + "grad_norm": 0.5388065248530421, + "learning_rate": 1.4489838577888569e-05, + "loss": 0.3513, + "step": 7150 + }, + { + "epoch": 1.4097003154574133, + "grad_norm": 0.517788510937729, + "learning_rate": 1.4488453513949963e-05, + "loss": 0.3479, + "step": 7151 + }, + { + "epoch": 1.409897476340694, + "grad_norm": 0.5610229696228063, + "learning_rate": 1.4487068342172642e-05, + "loss": 0.3469, + "step": 7152 + }, + { + "epoch": 1.4100946372239749, + "grad_norm": 0.4712793595470132, + "learning_rate": 1.4485683062589895e-05, + "loss": 0.3182, + "step": 7153 + }, + { + "epoch": 1.4102917981072556, + "grad_norm": 0.5668593758772468, + "learning_rate": 1.4484297675234995e-05, + "loss": 0.3809, + "step": 7154 + }, + { + "epoch": 1.4104889589905363, + "grad_norm": 0.4925454763252612, + "learning_rate": 1.4482912180141236e-05, + "loss": 0.3475, + "step": 7155 + }, + { + "epoch": 1.410686119873817, + "grad_norm": 0.5510156258206566, + "learning_rate": 1.44815265773419e-05, + "loss": 0.3579, + "step": 7156 + }, + { + "epoch": 1.4108832807570977, + "grad_norm": 0.5405436736712104, + "learning_rate": 1.4480140866870281e-05, + "loss": 0.3426, + "step": 7157 + }, + { + "epoch": 1.4110804416403786, + "grad_norm": 0.5237178960169243, + "learning_rate": 1.4478755048759668e-05, + "loss": 0.3632, + "step": 7158 + }, + { + "epoch": 1.4112776025236593, + "grad_norm": 0.5175312490148417, + "learning_rate": 1.4477369123043358e-05, + "loss": 0.3236, + "step": 7159 + }, + { + "epoch": 1.41147476340694, + "grad_norm": 0.4913933173819947, + "learning_rate": 1.447598308975465e-05, + "loss": 0.324, + "step": 7160 + }, + { + "epoch": 1.411671924290221, + "grad_norm": 0.4826321503807621, + "learning_rate": 1.4474596948926844e-05, + "loss": 0.326, + "step": 7161 + }, + { + "epoch": 1.4118690851735016, + "grad_norm": 0.4915727914760078, + "learning_rate": 1.4473210700593242e-05, + "loss": 0.33, + "step": 7162 + }, + { + "epoch": 1.4120662460567823, + "grad_norm": 0.5060788214654445, + "learning_rate": 1.4471824344787153e-05, + "loss": 0.314, + "step": 7163 + }, + { + "epoch": 1.412263406940063, + "grad_norm": 0.491241758641821, + "learning_rate": 1.4470437881541882e-05, + "loss": 0.3421, + "step": 7164 + }, + { + "epoch": 1.4124605678233437, + "grad_norm": 0.4942260983754986, + "learning_rate": 1.446905131089074e-05, + "loss": 0.3491, + "step": 7165 + }, + { + "epoch": 1.4126577287066246, + "grad_norm": 0.48529396730532887, + "learning_rate": 1.4467664632867042e-05, + "loss": 0.332, + "step": 7166 + }, + { + "epoch": 1.4128548895899053, + "grad_norm": 0.5474445100151097, + "learning_rate": 1.44662778475041e-05, + "loss": 0.3503, + "step": 7167 + }, + { + "epoch": 1.413052050473186, + "grad_norm": 0.49293254269828907, + "learning_rate": 1.4464890954835242e-05, + "loss": 0.326, + "step": 7168 + }, + { + "epoch": 1.413249211356467, + "grad_norm": 0.5017859566166725, + "learning_rate": 1.4463503954893778e-05, + "loss": 0.3244, + "step": 7169 + }, + { + "epoch": 1.4134463722397477, + "grad_norm": 0.5240485239593689, + "learning_rate": 1.446211684771304e-05, + "loss": 0.3708, + "step": 7170 + }, + { + "epoch": 1.4136435331230284, + "grad_norm": 0.4870947916252577, + "learning_rate": 1.4460729633326351e-05, + "loss": 0.3162, + "step": 7171 + }, + { + "epoch": 1.413840694006309, + "grad_norm": 0.4593378993420768, + "learning_rate": 1.4459342311767041e-05, + "loss": 0.3221, + "step": 7172 + }, + { + "epoch": 1.4140378548895898, + "grad_norm": 0.48340606797854385, + "learning_rate": 1.445795488306844e-05, + "loss": 0.3228, + "step": 7173 + }, + { + "epoch": 1.4142350157728707, + "grad_norm": 0.4713202093530444, + "learning_rate": 1.445656734726388e-05, + "loss": 0.3172, + "step": 7174 + }, + { + "epoch": 1.4144321766561514, + "grad_norm": 0.4780513871592405, + "learning_rate": 1.4455179704386706e-05, + "loss": 0.3551, + "step": 7175 + }, + { + "epoch": 1.414629337539432, + "grad_norm": 0.48058366057305824, + "learning_rate": 1.4453791954470248e-05, + "loss": 0.3257, + "step": 7176 + }, + { + "epoch": 1.414826498422713, + "grad_norm": 0.49793083984186, + "learning_rate": 1.4452404097547855e-05, + "loss": 0.3592, + "step": 7177 + }, + { + "epoch": 1.4150236593059937, + "grad_norm": 0.472865401917883, + "learning_rate": 1.4451016133652864e-05, + "loss": 0.3218, + "step": 7178 + }, + { + "epoch": 1.4152208201892744, + "grad_norm": 0.46575544454257445, + "learning_rate": 1.4449628062818628e-05, + "loss": 0.3077, + "step": 7179 + }, + { + "epoch": 1.4154179810725553, + "grad_norm": 0.48121486650008277, + "learning_rate": 1.4448239885078494e-05, + "loss": 0.3326, + "step": 7180 + }, + { + "epoch": 1.415615141955836, + "grad_norm": 0.473468751152917, + "learning_rate": 1.4446851600465817e-05, + "loss": 0.3376, + "step": 7181 + }, + { + "epoch": 1.4158123028391167, + "grad_norm": 0.5137837266489765, + "learning_rate": 1.4445463209013948e-05, + "loss": 0.3401, + "step": 7182 + }, + { + "epoch": 1.4160094637223974, + "grad_norm": 0.497315989831655, + "learning_rate": 1.4444074710756244e-05, + "loss": 0.3391, + "step": 7183 + }, + { + "epoch": 1.4162066246056781, + "grad_norm": 0.5170085322910686, + "learning_rate": 1.4442686105726066e-05, + "loss": 0.3612, + "step": 7184 + }, + { + "epoch": 1.416403785488959, + "grad_norm": 0.5055898381412588, + "learning_rate": 1.4441297393956779e-05, + "loss": 0.3368, + "step": 7185 + }, + { + "epoch": 1.4166009463722398, + "grad_norm": 0.4588445495464938, + "learning_rate": 1.4439908575481744e-05, + "loss": 0.3215, + "step": 7186 + }, + { + "epoch": 1.4167981072555205, + "grad_norm": 0.4940514211342652, + "learning_rate": 1.443851965033433e-05, + "loss": 0.3229, + "step": 7187 + }, + { + "epoch": 1.4169952681388014, + "grad_norm": 0.5237805317857336, + "learning_rate": 1.4437130618547905e-05, + "loss": 0.3746, + "step": 7188 + }, + { + "epoch": 1.417192429022082, + "grad_norm": 0.46802246011483617, + "learning_rate": 1.443574148015585e-05, + "loss": 0.3247, + "step": 7189 + }, + { + "epoch": 1.4173895899053628, + "grad_norm": 0.4760962059204285, + "learning_rate": 1.4434352235191526e-05, + "loss": 0.3145, + "step": 7190 + }, + { + "epoch": 1.4175867507886435, + "grad_norm": 0.46609718235500464, + "learning_rate": 1.4432962883688327e-05, + "loss": 0.3309, + "step": 7191 + }, + { + "epoch": 1.4177839116719242, + "grad_norm": 0.5063549571401547, + "learning_rate": 1.443157342567962e-05, + "loss": 0.3656, + "step": 7192 + }, + { + "epoch": 1.4179810725552051, + "grad_norm": 0.5161414428496777, + "learning_rate": 1.4430183861198792e-05, + "loss": 0.3811, + "step": 7193 + }, + { + "epoch": 1.4181782334384858, + "grad_norm": 0.46919572179206825, + "learning_rate": 1.4428794190279231e-05, + "loss": 0.3344, + "step": 7194 + }, + { + "epoch": 1.4183753943217665, + "grad_norm": 0.5020263012424411, + "learning_rate": 1.442740441295432e-05, + "loss": 0.3387, + "step": 7195 + }, + { + "epoch": 1.4185725552050474, + "grad_norm": 0.4593456839363708, + "learning_rate": 1.4426014529257457e-05, + "loss": 0.3161, + "step": 7196 + }, + { + "epoch": 1.4187697160883281, + "grad_norm": 0.50798569973899, + "learning_rate": 1.4424624539222028e-05, + "loss": 0.3444, + "step": 7197 + }, + { + "epoch": 1.4189668769716088, + "grad_norm": 0.4790042993417883, + "learning_rate": 1.4423234442881433e-05, + "loss": 0.3375, + "step": 7198 + }, + { + "epoch": 1.4191640378548895, + "grad_norm": 0.5051516627231682, + "learning_rate": 1.4421844240269064e-05, + "loss": 0.3502, + "step": 7199 + }, + { + "epoch": 1.4193611987381702, + "grad_norm": 0.49347178341054193, + "learning_rate": 1.4420453931418332e-05, + "loss": 0.3503, + "step": 7200 + }, + { + "epoch": 1.4195583596214512, + "grad_norm": 0.4764569323870533, + "learning_rate": 1.4419063516362633e-05, + "loss": 0.3142, + "step": 7201 + }, + { + "epoch": 1.4197555205047319, + "grad_norm": 0.48966644810736204, + "learning_rate": 1.4417672995135372e-05, + "loss": 0.3348, + "step": 7202 + }, + { + "epoch": 1.4199526813880126, + "grad_norm": 0.4897579050663058, + "learning_rate": 1.4416282367769961e-05, + "loss": 0.3284, + "step": 7203 + }, + { + "epoch": 1.4201498422712935, + "grad_norm": 0.4709690024262115, + "learning_rate": 1.441489163429981e-05, + "loss": 0.3158, + "step": 7204 + }, + { + "epoch": 1.4203470031545742, + "grad_norm": 0.5083654874998398, + "learning_rate": 1.4413500794758333e-05, + "loss": 0.3569, + "step": 7205 + }, + { + "epoch": 1.4205441640378549, + "grad_norm": 0.4977170017599856, + "learning_rate": 1.4412109849178944e-05, + "loss": 0.3207, + "step": 7206 + }, + { + "epoch": 1.4207413249211356, + "grad_norm": 0.49785411016808323, + "learning_rate": 1.4410718797595063e-05, + "loss": 0.3385, + "step": 7207 + }, + { + "epoch": 1.4209384858044163, + "grad_norm": 0.4985331494542473, + "learning_rate": 1.4409327640040111e-05, + "loss": 0.3256, + "step": 7208 + }, + { + "epoch": 1.4211356466876972, + "grad_norm": 12.637841434778313, + "learning_rate": 1.440793637654751e-05, + "loss": 0.4233, + "step": 7209 + }, + { + "epoch": 1.421332807570978, + "grad_norm": 0.5318624726951542, + "learning_rate": 1.4406545007150693e-05, + "loss": 0.3293, + "step": 7210 + }, + { + "epoch": 1.4215299684542586, + "grad_norm": 0.5031296805718974, + "learning_rate": 1.440515353188308e-05, + "loss": 0.3301, + "step": 7211 + }, + { + "epoch": 1.4217271293375395, + "grad_norm": 0.5445707070467908, + "learning_rate": 1.4403761950778106e-05, + "loss": 0.3248, + "step": 7212 + }, + { + "epoch": 1.4219242902208202, + "grad_norm": 0.4659455732252377, + "learning_rate": 1.4402370263869205e-05, + "loss": 0.3216, + "step": 7213 + }, + { + "epoch": 1.422121451104101, + "grad_norm": 0.5288243979500802, + "learning_rate": 1.4400978471189812e-05, + "loss": 0.3693, + "step": 7214 + }, + { + "epoch": 1.4223186119873816, + "grad_norm": 0.528449022273337, + "learning_rate": 1.439958657277337e-05, + "loss": 0.3402, + "step": 7215 + }, + { + "epoch": 1.4225157728706623, + "grad_norm": 0.5119944243955784, + "learning_rate": 1.4398194568653313e-05, + "loss": 0.3494, + "step": 7216 + }, + { + "epoch": 1.4227129337539433, + "grad_norm": 0.4916670845158821, + "learning_rate": 1.4396802458863095e-05, + "loss": 0.3217, + "step": 7217 + }, + { + "epoch": 1.422910094637224, + "grad_norm": 0.48113900294298484, + "learning_rate": 1.4395410243436153e-05, + "loss": 0.3314, + "step": 7218 + }, + { + "epoch": 1.4231072555205047, + "grad_norm": 0.47058896620118573, + "learning_rate": 1.4394017922405943e-05, + "loss": 0.3159, + "step": 7219 + }, + { + "epoch": 1.4233044164037856, + "grad_norm": 0.4849453680724847, + "learning_rate": 1.4392625495805913e-05, + "loss": 0.3395, + "step": 7220 + }, + { + "epoch": 1.4235015772870663, + "grad_norm": 0.5373072375771156, + "learning_rate": 1.439123296366952e-05, + "loss": 0.3288, + "step": 7221 + }, + { + "epoch": 1.423698738170347, + "grad_norm": 0.4693872279830995, + "learning_rate": 1.4389840326030213e-05, + "loss": 0.3302, + "step": 7222 + }, + { + "epoch": 1.4238958990536277, + "grad_norm": 0.5319368606240615, + "learning_rate": 1.438844758292146e-05, + "loss": 0.3312, + "step": 7223 + }, + { + "epoch": 1.4240930599369084, + "grad_norm": 0.5817496986395633, + "learning_rate": 1.4387054734376722e-05, + "loss": 0.3249, + "step": 7224 + }, + { + "epoch": 1.4242902208201893, + "grad_norm": 0.48934565333610924, + "learning_rate": 1.4385661780429461e-05, + "loss": 0.334, + "step": 7225 + }, + { + "epoch": 1.42448738170347, + "grad_norm": 0.49779432157118675, + "learning_rate": 1.438426872111314e-05, + "loss": 0.3428, + "step": 7226 + }, + { + "epoch": 1.4246845425867507, + "grad_norm": 0.4940338187493709, + "learning_rate": 1.4382875556461238e-05, + "loss": 0.3252, + "step": 7227 + }, + { + "epoch": 1.4248817034700316, + "grad_norm": 0.4815774455413699, + "learning_rate": 1.4381482286507216e-05, + "loss": 0.3158, + "step": 7228 + }, + { + "epoch": 1.4250788643533123, + "grad_norm": 0.4826791184071724, + "learning_rate": 1.4380088911284557e-05, + "loss": 0.3388, + "step": 7229 + }, + { + "epoch": 1.425276025236593, + "grad_norm": 0.5434358688212808, + "learning_rate": 1.4378695430826732e-05, + "loss": 0.3457, + "step": 7230 + }, + { + "epoch": 1.425473186119874, + "grad_norm": 0.496948379843039, + "learning_rate": 1.4377301845167227e-05, + "loss": 0.3431, + "step": 7231 + }, + { + "epoch": 1.4256703470031546, + "grad_norm": 0.4980057220909191, + "learning_rate": 1.4375908154339517e-05, + "loss": 0.3411, + "step": 7232 + }, + { + "epoch": 1.4258675078864353, + "grad_norm": 0.5381831341243695, + "learning_rate": 1.437451435837709e-05, + "loss": 0.3364, + "step": 7233 + }, + { + "epoch": 1.426064668769716, + "grad_norm": 0.5286642410721416, + "learning_rate": 1.4373120457313435e-05, + "loss": 0.3471, + "step": 7234 + }, + { + "epoch": 1.4262618296529967, + "grad_norm": 0.7724157082129258, + "learning_rate": 1.4371726451182038e-05, + "loss": 0.349, + "step": 7235 + }, + { + "epoch": 1.4264589905362777, + "grad_norm": 0.5265279436116093, + "learning_rate": 1.437033234001639e-05, + "loss": 0.3557, + "step": 7236 + }, + { + "epoch": 1.4266561514195584, + "grad_norm": 1.479067454447673, + "learning_rate": 1.436893812384999e-05, + "loss": 0.3668, + "step": 7237 + }, + { + "epoch": 1.426853312302839, + "grad_norm": 0.5212610233545351, + "learning_rate": 1.4367543802716334e-05, + "loss": 0.3513, + "step": 7238 + }, + { + "epoch": 1.42705047318612, + "grad_norm": 0.5107655933355794, + "learning_rate": 1.436614937664892e-05, + "loss": 0.3375, + "step": 7239 + }, + { + "epoch": 1.4272476340694007, + "grad_norm": 0.5255564965953446, + "learning_rate": 1.4364754845681253e-05, + "loss": 0.344, + "step": 7240 + }, + { + "epoch": 1.4274447949526814, + "grad_norm": 0.5736229739284229, + "learning_rate": 1.4363360209846833e-05, + "loss": 0.357, + "step": 7241 + }, + { + "epoch": 1.427641955835962, + "grad_norm": 0.500621320395277, + "learning_rate": 1.4361965469179173e-05, + "loss": 0.3204, + "step": 7242 + }, + { + "epoch": 1.4278391167192428, + "grad_norm": 0.49627772986305124, + "learning_rate": 1.4360570623711778e-05, + "loss": 0.3196, + "step": 7243 + }, + { + "epoch": 1.4280362776025237, + "grad_norm": 0.5137643214444078, + "learning_rate": 1.4359175673478163e-05, + "loss": 0.3455, + "step": 7244 + }, + { + "epoch": 1.4282334384858044, + "grad_norm": 0.5139351480323677, + "learning_rate": 1.435778061851184e-05, + "loss": 0.3494, + "step": 7245 + }, + { + "epoch": 1.4284305993690851, + "grad_norm": 0.5154850389726875, + "learning_rate": 1.435638545884633e-05, + "loss": 0.3538, + "step": 7246 + }, + { + "epoch": 1.428627760252366, + "grad_norm": 0.49555806148409276, + "learning_rate": 1.4354990194515155e-05, + "loss": 0.345, + "step": 7247 + }, + { + "epoch": 1.4288249211356467, + "grad_norm": 0.5168230858439605, + "learning_rate": 1.4353594825551827e-05, + "loss": 0.3459, + "step": 7248 + }, + { + "epoch": 1.4290220820189274, + "grad_norm": 0.5471309658606096, + "learning_rate": 1.4352199351989881e-05, + "loss": 0.3419, + "step": 7249 + }, + { + "epoch": 1.4292192429022081, + "grad_norm": 0.47810122148847417, + "learning_rate": 1.4350803773862841e-05, + "loss": 0.3093, + "step": 7250 + }, + { + "epoch": 1.4294164037854888, + "grad_norm": 0.4924167515842062, + "learning_rate": 1.4349408091204234e-05, + "loss": 0.3169, + "step": 7251 + }, + { + "epoch": 1.4296135646687698, + "grad_norm": 0.48173871551750974, + "learning_rate": 1.4348012304047596e-05, + "loss": 0.3336, + "step": 7252 + }, + { + "epoch": 1.4298107255520505, + "grad_norm": 0.49482226426234466, + "learning_rate": 1.4346616412426464e-05, + "loss": 0.3356, + "step": 7253 + }, + { + "epoch": 1.4300078864353312, + "grad_norm": 0.505868389409233, + "learning_rate": 1.434522041637437e-05, + "loss": 0.3555, + "step": 7254 + }, + { + "epoch": 1.430205047318612, + "grad_norm": 0.46830331430343086, + "learning_rate": 1.4343824315924855e-05, + "loss": 0.3163, + "step": 7255 + }, + { + "epoch": 1.4304022082018928, + "grad_norm": 0.4854035751744796, + "learning_rate": 1.4342428111111461e-05, + "loss": 0.3245, + "step": 7256 + }, + { + "epoch": 1.4305993690851735, + "grad_norm": 0.4948071078265386, + "learning_rate": 1.4341031801967742e-05, + "loss": 0.3319, + "step": 7257 + }, + { + "epoch": 1.4307965299684542, + "grad_norm": 0.5325267099042197, + "learning_rate": 1.4339635388527231e-05, + "loss": 0.3412, + "step": 7258 + }, + { + "epoch": 1.430993690851735, + "grad_norm": 0.6445122208735229, + "learning_rate": 1.433823887082349e-05, + "loss": 0.3409, + "step": 7259 + }, + { + "epoch": 1.4311908517350158, + "grad_norm": 0.5057312476219762, + "learning_rate": 1.4336842248890065e-05, + "loss": 0.3433, + "step": 7260 + }, + { + "epoch": 1.4313880126182965, + "grad_norm": 0.5013879369726504, + "learning_rate": 1.4335445522760512e-05, + "loss": 0.3373, + "step": 7261 + }, + { + "epoch": 1.4315851735015772, + "grad_norm": 0.5361669119631592, + "learning_rate": 1.433404869246839e-05, + "loss": 0.3599, + "step": 7262 + }, + { + "epoch": 1.4317823343848581, + "grad_norm": 0.5096029349380119, + "learning_rate": 1.4332651758047254e-05, + "loss": 0.3446, + "step": 7263 + }, + { + "epoch": 1.4319794952681388, + "grad_norm": 0.5265991854311413, + "learning_rate": 1.4331254719530676e-05, + "loss": 0.3717, + "step": 7264 + }, + { + "epoch": 1.4321766561514195, + "grad_norm": 0.4825505433718403, + "learning_rate": 1.4329857576952212e-05, + "loss": 0.3206, + "step": 7265 + }, + { + "epoch": 1.4323738170347002, + "grad_norm": 0.509655525792269, + "learning_rate": 1.4328460330345434e-05, + "loss": 0.3277, + "step": 7266 + }, + { + "epoch": 1.432570977917981, + "grad_norm": 0.4686753391448895, + "learning_rate": 1.432706297974391e-05, + "loss": 0.3028, + "step": 7267 + }, + { + "epoch": 1.4327681388012619, + "grad_norm": 0.5074945700987945, + "learning_rate": 1.4325665525181213e-05, + "loss": 0.3524, + "step": 7268 + }, + { + "epoch": 1.4329652996845426, + "grad_norm": 0.4994918673135275, + "learning_rate": 1.4324267966690919e-05, + "loss": 0.3373, + "step": 7269 + }, + { + "epoch": 1.4331624605678233, + "grad_norm": 0.4840679030796876, + "learning_rate": 1.4322870304306604e-05, + "loss": 0.3397, + "step": 7270 + }, + { + "epoch": 1.4333596214511042, + "grad_norm": 0.4620784380959485, + "learning_rate": 1.4321472538061852e-05, + "loss": 0.3314, + "step": 7271 + }, + { + "epoch": 1.4335567823343849, + "grad_norm": 0.4551298101484555, + "learning_rate": 1.4320074667990237e-05, + "loss": 0.3147, + "step": 7272 + }, + { + "epoch": 1.4337539432176656, + "grad_norm": 0.48354743821185125, + "learning_rate": 1.4318676694125353e-05, + "loss": 0.3441, + "step": 7273 + }, + { + "epoch": 1.4339511041009465, + "grad_norm": 0.44671416102776385, + "learning_rate": 1.4317278616500785e-05, + "loss": 0.3095, + "step": 7274 + }, + { + "epoch": 1.4341482649842272, + "grad_norm": 0.48747299202730604, + "learning_rate": 1.4315880435150119e-05, + "loss": 0.3416, + "step": 7275 + }, + { + "epoch": 1.434345425867508, + "grad_norm": 0.4931741101936371, + "learning_rate": 1.431448215010695e-05, + "loss": 0.3516, + "step": 7276 + }, + { + "epoch": 1.4345425867507886, + "grad_norm": 0.49240050671361324, + "learning_rate": 1.4313083761404874e-05, + "loss": 0.3476, + "step": 7277 + }, + { + "epoch": 1.4347397476340693, + "grad_norm": 0.4827511773398998, + "learning_rate": 1.4311685269077484e-05, + "loss": 0.3185, + "step": 7278 + }, + { + "epoch": 1.4349369085173502, + "grad_norm": 0.4971900913026981, + "learning_rate": 1.4310286673158387e-05, + "loss": 0.3614, + "step": 7279 + }, + { + "epoch": 1.435134069400631, + "grad_norm": 0.4857373079412297, + "learning_rate": 1.430888797368118e-05, + "loss": 0.3184, + "step": 7280 + }, + { + "epoch": 1.4353312302839116, + "grad_norm": 0.5759392585568717, + "learning_rate": 1.430748917067947e-05, + "loss": 0.3404, + "step": 7281 + }, + { + "epoch": 1.4355283911671926, + "grad_norm": 0.48435593637452573, + "learning_rate": 1.4306090264186863e-05, + "loss": 0.3346, + "step": 7282 + }, + { + "epoch": 1.4357255520504733, + "grad_norm": 0.5183943537528527, + "learning_rate": 1.430469125423697e-05, + "loss": 0.3635, + "step": 7283 + }, + { + "epoch": 1.435922712933754, + "grad_norm": 0.4874012749572438, + "learning_rate": 1.4303292140863402e-05, + "loss": 0.3417, + "step": 7284 + }, + { + "epoch": 1.4361198738170347, + "grad_norm": 0.47674893494615467, + "learning_rate": 1.4301892924099778e-05, + "loss": 0.326, + "step": 7285 + }, + { + "epoch": 1.4363170347003154, + "grad_norm": 0.4975791293304716, + "learning_rate": 1.430049360397971e-05, + "loss": 0.3426, + "step": 7286 + }, + { + "epoch": 1.4365141955835963, + "grad_norm": 0.47977676406998915, + "learning_rate": 1.4299094180536821e-05, + "loss": 0.3338, + "step": 7287 + }, + { + "epoch": 1.436711356466877, + "grad_norm": 0.4849615911471788, + "learning_rate": 1.429769465380473e-05, + "loss": 0.3365, + "step": 7288 + }, + { + "epoch": 1.4369085173501577, + "grad_norm": 0.50892477699892, + "learning_rate": 1.4296295023817068e-05, + "loss": 0.3194, + "step": 7289 + }, + { + "epoch": 1.4371056782334386, + "grad_norm": 0.5145007377570271, + "learning_rate": 1.4294895290607454e-05, + "loss": 0.3497, + "step": 7290 + }, + { + "epoch": 1.4373028391167193, + "grad_norm": 0.5513464913968162, + "learning_rate": 1.4293495454209525e-05, + "loss": 0.3693, + "step": 7291 + }, + { + "epoch": 1.4375, + "grad_norm": 0.5079800533958371, + "learning_rate": 1.4292095514656907e-05, + "loss": 0.3527, + "step": 7292 + }, + { + "epoch": 1.4376971608832807, + "grad_norm": 0.48977789204818567, + "learning_rate": 1.4290695471983243e-05, + "loss": 0.3207, + "step": 7293 + }, + { + "epoch": 1.4378943217665614, + "grad_norm": 0.6229406915573323, + "learning_rate": 1.4289295326222161e-05, + "loss": 0.3519, + "step": 7294 + }, + { + "epoch": 1.4380914826498423, + "grad_norm": 0.5705871966971622, + "learning_rate": 1.4287895077407306e-05, + "loss": 0.3602, + "step": 7295 + }, + { + "epoch": 1.438288643533123, + "grad_norm": 0.47501369073246813, + "learning_rate": 1.4286494725572317e-05, + "loss": 0.3289, + "step": 7296 + }, + { + "epoch": 1.4384858044164037, + "grad_norm": 0.4981376094438177, + "learning_rate": 1.4285094270750843e-05, + "loss": 0.3381, + "step": 7297 + }, + { + "epoch": 1.4386829652996846, + "grad_norm": 0.5229391942882657, + "learning_rate": 1.4283693712976527e-05, + "loss": 0.3332, + "step": 7298 + }, + { + "epoch": 1.4388801261829653, + "grad_norm": 0.5054756208664721, + "learning_rate": 1.4282293052283019e-05, + "loss": 0.352, + "step": 7299 + }, + { + "epoch": 1.439077287066246, + "grad_norm": 0.4951109600486945, + "learning_rate": 1.4280892288703974e-05, + "loss": 0.331, + "step": 7300 + }, + { + "epoch": 1.4392744479495267, + "grad_norm": 0.46364365237898975, + "learning_rate": 1.4279491422273043e-05, + "loss": 0.318, + "step": 7301 + }, + { + "epoch": 1.4394716088328074, + "grad_norm": 0.4947030225121084, + "learning_rate": 1.4278090453023885e-05, + "loss": 0.3423, + "step": 7302 + }, + { + "epoch": 1.4396687697160884, + "grad_norm": 0.4751068461401763, + "learning_rate": 1.4276689380990156e-05, + "loss": 0.3291, + "step": 7303 + }, + { + "epoch": 1.439865930599369, + "grad_norm": 0.5071969159877269, + "learning_rate": 1.4275288206205525e-05, + "loss": 0.3428, + "step": 7304 + }, + { + "epoch": 1.4400630914826498, + "grad_norm": 0.4964103377634554, + "learning_rate": 1.4273886928703648e-05, + "loss": 0.3302, + "step": 7305 + }, + { + "epoch": 1.4402602523659307, + "grad_norm": 0.4786434613936568, + "learning_rate": 1.4272485548518198e-05, + "loss": 0.3171, + "step": 7306 + }, + { + "epoch": 1.4404574132492114, + "grad_norm": 0.4821496603078477, + "learning_rate": 1.427108406568284e-05, + "loss": 0.3286, + "step": 7307 + }, + { + "epoch": 1.440654574132492, + "grad_norm": 0.4899274116908069, + "learning_rate": 1.4269682480231253e-05, + "loss": 0.3384, + "step": 7308 + }, + { + "epoch": 1.4408517350157728, + "grad_norm": 0.496373158989974, + "learning_rate": 1.42682807921971e-05, + "loss": 0.3355, + "step": 7309 + }, + { + "epoch": 1.4410488958990535, + "grad_norm": 0.509985991828358, + "learning_rate": 1.4266879001614067e-05, + "loss": 0.3311, + "step": 7310 + }, + { + "epoch": 1.4412460567823344, + "grad_norm": 0.4956371703782259, + "learning_rate": 1.4265477108515828e-05, + "loss": 0.3319, + "step": 7311 + }, + { + "epoch": 1.4414432176656151, + "grad_norm": 0.490193768475847, + "learning_rate": 1.426407511293607e-05, + "loss": 0.3395, + "step": 7312 + }, + { + "epoch": 1.4416403785488958, + "grad_norm": 0.4955718459701483, + "learning_rate": 1.4262673014908472e-05, + "loss": 0.3417, + "step": 7313 + }, + { + "epoch": 1.4418375394321767, + "grad_norm": 0.5089810792820509, + "learning_rate": 1.4261270814466719e-05, + "loss": 0.3719, + "step": 7314 + }, + { + "epoch": 1.4420347003154574, + "grad_norm": 0.5373269993400187, + "learning_rate": 1.4259868511644508e-05, + "loss": 0.3366, + "step": 7315 + }, + { + "epoch": 1.4422318611987381, + "grad_norm": 0.5191745083925651, + "learning_rate": 1.4258466106475522e-05, + "loss": 0.3412, + "step": 7316 + }, + { + "epoch": 1.442429022082019, + "grad_norm": 0.49462707282721535, + "learning_rate": 1.4257063598993458e-05, + "loss": 0.3463, + "step": 7317 + }, + { + "epoch": 1.4426261829652998, + "grad_norm": 0.4818656214076428, + "learning_rate": 1.4255660989232014e-05, + "loss": 0.321, + "step": 7318 + }, + { + "epoch": 1.4428233438485805, + "grad_norm": 0.5247053748007937, + "learning_rate": 1.4254258277224888e-05, + "loss": 0.3496, + "step": 7319 + }, + { + "epoch": 1.4430205047318612, + "grad_norm": 0.4934106851982142, + "learning_rate": 1.4252855463005782e-05, + "loss": 0.3463, + "step": 7320 + }, + { + "epoch": 1.4432176656151419, + "grad_norm": 0.4941318734425298, + "learning_rate": 1.4251452546608397e-05, + "loss": 0.3258, + "step": 7321 + }, + { + "epoch": 1.4434148264984228, + "grad_norm": 0.4740782133591454, + "learning_rate": 1.4250049528066441e-05, + "loss": 0.3002, + "step": 7322 + }, + { + "epoch": 1.4436119873817035, + "grad_norm": 0.528973383583317, + "learning_rate": 1.4248646407413622e-05, + "loss": 0.3539, + "step": 7323 + }, + { + "epoch": 1.4438091482649842, + "grad_norm": 0.49936440814980976, + "learning_rate": 1.424724318468365e-05, + "loss": 0.3563, + "step": 7324 + }, + { + "epoch": 1.444006309148265, + "grad_norm": 0.5037879267205015, + "learning_rate": 1.4245839859910247e-05, + "loss": 0.3528, + "step": 7325 + }, + { + "epoch": 1.4442034700315458, + "grad_norm": 0.5022670283442432, + "learning_rate": 1.4244436433127118e-05, + "loss": 0.3321, + "step": 7326 + }, + { + "epoch": 1.4444006309148265, + "grad_norm": 0.49067613703910734, + "learning_rate": 1.4243032904367984e-05, + "loss": 0.3502, + "step": 7327 + }, + { + "epoch": 1.4445977917981072, + "grad_norm": 0.49245160744368666, + "learning_rate": 1.424162927366657e-05, + "loss": 0.3459, + "step": 7328 + }, + { + "epoch": 1.444794952681388, + "grad_norm": 0.4882492569918211, + "learning_rate": 1.4240225541056596e-05, + "loss": 0.3348, + "step": 7329 + }, + { + "epoch": 1.4449921135646688, + "grad_norm": 0.5009716845668267, + "learning_rate": 1.423882170657179e-05, + "loss": 0.3276, + "step": 7330 + }, + { + "epoch": 1.4451892744479495, + "grad_norm": 0.46911913556435136, + "learning_rate": 1.4237417770245877e-05, + "loss": 0.3144, + "step": 7331 + }, + { + "epoch": 1.4453864353312302, + "grad_norm": 0.5206696861698887, + "learning_rate": 1.423601373211259e-05, + "loss": 0.3623, + "step": 7332 + }, + { + "epoch": 1.4455835962145112, + "grad_norm": 0.4906485716791266, + "learning_rate": 1.4234609592205662e-05, + "loss": 0.3442, + "step": 7333 + }, + { + "epoch": 1.4457807570977919, + "grad_norm": 0.5090573234966967, + "learning_rate": 1.423320535055883e-05, + "loss": 0.3448, + "step": 7334 + }, + { + "epoch": 1.4459779179810726, + "grad_norm": 0.4712940869922216, + "learning_rate": 1.4231801007205827e-05, + "loss": 0.3334, + "step": 7335 + }, + { + "epoch": 1.4461750788643533, + "grad_norm": 0.5092882547025841, + "learning_rate": 1.4230396562180401e-05, + "loss": 0.3217, + "step": 7336 + }, + { + "epoch": 1.446372239747634, + "grad_norm": 0.498188825420168, + "learning_rate": 1.4228992015516287e-05, + "loss": 0.3587, + "step": 7337 + }, + { + "epoch": 1.4465694006309149, + "grad_norm": 0.49312249691916427, + "learning_rate": 1.4227587367247238e-05, + "loss": 0.3465, + "step": 7338 + }, + { + "epoch": 1.4467665615141956, + "grad_norm": 0.5293802799153761, + "learning_rate": 1.4226182617406996e-05, + "loss": 0.3549, + "step": 7339 + }, + { + "epoch": 1.4469637223974763, + "grad_norm": 0.45293571133841104, + "learning_rate": 1.4224777766029311e-05, + "loss": 0.3161, + "step": 7340 + }, + { + "epoch": 1.4471608832807572, + "grad_norm": 0.46222311219300893, + "learning_rate": 1.4223372813147942e-05, + "loss": 0.3056, + "step": 7341 + }, + { + "epoch": 1.447358044164038, + "grad_norm": 0.5057284253651947, + "learning_rate": 1.422196775879664e-05, + "loss": 0.3432, + "step": 7342 + }, + { + "epoch": 1.4475552050473186, + "grad_norm": 0.48672729425679273, + "learning_rate": 1.422056260300916e-05, + "loss": 0.3328, + "step": 7343 + }, + { + "epoch": 1.4477523659305993, + "grad_norm": 0.4625625209082356, + "learning_rate": 1.4219157345819268e-05, + "loss": 0.3003, + "step": 7344 + }, + { + "epoch": 1.44794952681388, + "grad_norm": 0.5101487287795781, + "learning_rate": 1.421775198726072e-05, + "loss": 0.3384, + "step": 7345 + }, + { + "epoch": 1.448146687697161, + "grad_norm": 0.4940574773463017, + "learning_rate": 1.4216346527367284e-05, + "loss": 0.3219, + "step": 7346 + }, + { + "epoch": 1.4483438485804416, + "grad_norm": 0.4876148749981708, + "learning_rate": 1.421494096617273e-05, + "loss": 0.3428, + "step": 7347 + }, + { + "epoch": 1.4485410094637223, + "grad_norm": 0.47062050672221883, + "learning_rate": 1.4213535303710822e-05, + "loss": 0.3042, + "step": 7348 + }, + { + "epoch": 1.4487381703470033, + "grad_norm": 0.48615991071089154, + "learning_rate": 1.4212129540015339e-05, + "loss": 0.3421, + "step": 7349 + }, + { + "epoch": 1.448935331230284, + "grad_norm": 0.45576427389164237, + "learning_rate": 1.4210723675120049e-05, + "loss": 0.3115, + "step": 7350 + }, + { + "epoch": 1.4491324921135647, + "grad_norm": 0.5118194739469882, + "learning_rate": 1.420931770905873e-05, + "loss": 0.3674, + "step": 7351 + }, + { + "epoch": 1.4493296529968454, + "grad_norm": 0.5135213363178154, + "learning_rate": 1.4207911641865164e-05, + "loss": 0.349, + "step": 7352 + }, + { + "epoch": 1.449526813880126, + "grad_norm": 0.5010570752976321, + "learning_rate": 1.4206505473573135e-05, + "loss": 0.3526, + "step": 7353 + }, + { + "epoch": 1.449723974763407, + "grad_norm": 0.48809906237733963, + "learning_rate": 1.4205099204216421e-05, + "loss": 0.3339, + "step": 7354 + }, + { + "epoch": 1.4499211356466877, + "grad_norm": 0.48260002387765977, + "learning_rate": 1.4203692833828817e-05, + "loss": 0.315, + "step": 7355 + }, + { + "epoch": 1.4501182965299684, + "grad_norm": 0.4752614271985504, + "learning_rate": 1.4202286362444105e-05, + "loss": 0.3108, + "step": 7356 + }, + { + "epoch": 1.4503154574132493, + "grad_norm": 0.48050012591845676, + "learning_rate": 1.4200879790096078e-05, + "loss": 0.3118, + "step": 7357 + }, + { + "epoch": 1.45051261829653, + "grad_norm": 0.49612980028187825, + "learning_rate": 1.419947311681853e-05, + "loss": 0.3452, + "step": 7358 + }, + { + "epoch": 1.4507097791798107, + "grad_norm": 6.6692877091280165, + "learning_rate": 1.4198066342645262e-05, + "loss": 0.3363, + "step": 7359 + }, + { + "epoch": 1.4509069400630916, + "grad_norm": 0.5452461743710111, + "learning_rate": 1.4196659467610068e-05, + "loss": 0.3699, + "step": 7360 + }, + { + "epoch": 1.4511041009463723, + "grad_norm": 0.5175861598786076, + "learning_rate": 1.419525249174675e-05, + "loss": 0.3358, + "step": 7361 + }, + { + "epoch": 1.451301261829653, + "grad_norm": 0.5004638206871369, + "learning_rate": 1.4193845415089113e-05, + "loss": 0.3465, + "step": 7362 + }, + { + "epoch": 1.4514984227129337, + "grad_norm": 0.5018482035564551, + "learning_rate": 1.4192438237670962e-05, + "loss": 0.3553, + "step": 7363 + }, + { + "epoch": 1.4516955835962144, + "grad_norm": 0.6472551081778795, + "learning_rate": 1.4191030959526106e-05, + "loss": 0.3311, + "step": 7364 + }, + { + "epoch": 1.4518927444794953, + "grad_norm": 0.48615491438153513, + "learning_rate": 1.4189623580688358e-05, + "loss": 0.3209, + "step": 7365 + }, + { + "epoch": 1.452089905362776, + "grad_norm": 0.4996419475157942, + "learning_rate": 1.418821610119153e-05, + "loss": 0.3374, + "step": 7366 + }, + { + "epoch": 1.4522870662460567, + "grad_norm": 0.503614653074225, + "learning_rate": 1.4186808521069436e-05, + "loss": 0.3366, + "step": 7367 + }, + { + "epoch": 1.4524842271293377, + "grad_norm": 0.6183156108882925, + "learning_rate": 1.4185400840355895e-05, + "loss": 0.3291, + "step": 7368 + }, + { + "epoch": 1.4526813880126184, + "grad_norm": 0.4913587909312157, + "learning_rate": 1.4183993059084728e-05, + "loss": 0.3561, + "step": 7369 + }, + { + "epoch": 1.452878548895899, + "grad_norm": 0.5001590541017276, + "learning_rate": 1.418258517728976e-05, + "loss": 0.3561, + "step": 7370 + }, + { + "epoch": 1.4530757097791798, + "grad_norm": 0.4642898097782909, + "learning_rate": 1.4181177195004814e-05, + "loss": 0.3192, + "step": 7371 + }, + { + "epoch": 1.4532728706624605, + "grad_norm": 0.5378165455257576, + "learning_rate": 1.4179769112263719e-05, + "loss": 0.3453, + "step": 7372 + }, + { + "epoch": 1.4534700315457414, + "grad_norm": 0.4847498894723453, + "learning_rate": 1.4178360929100303e-05, + "loss": 0.3151, + "step": 7373 + }, + { + "epoch": 1.453667192429022, + "grad_norm": 0.5067406612634322, + "learning_rate": 1.4176952645548406e-05, + "loss": 0.3362, + "step": 7374 + }, + { + "epoch": 1.4538643533123028, + "grad_norm": 0.4918446350110439, + "learning_rate": 1.4175544261641854e-05, + "loss": 0.3319, + "step": 7375 + }, + { + "epoch": 1.4540615141955837, + "grad_norm": 0.49347879868509636, + "learning_rate": 1.417413577741449e-05, + "loss": 0.342, + "step": 7376 + }, + { + "epoch": 1.4542586750788644, + "grad_norm": 0.4783914406259114, + "learning_rate": 1.417272719290015e-05, + "loss": 0.3255, + "step": 7377 + }, + { + "epoch": 1.4544558359621451, + "grad_norm": 0.4941216512113406, + "learning_rate": 1.4171318508132683e-05, + "loss": 0.3196, + "step": 7378 + }, + { + "epoch": 1.4546529968454258, + "grad_norm": 0.49485434742875883, + "learning_rate": 1.416990972314593e-05, + "loss": 0.3293, + "step": 7379 + }, + { + "epoch": 1.4548501577287065, + "grad_norm": 0.4788373125318391, + "learning_rate": 1.4168500837973733e-05, + "loss": 0.3059, + "step": 7380 + }, + { + "epoch": 1.4550473186119874, + "grad_norm": 0.4834085395567045, + "learning_rate": 1.416709185264995e-05, + "loss": 0.3363, + "step": 7381 + }, + { + "epoch": 1.4552444794952681, + "grad_norm": 0.5134730730417215, + "learning_rate": 1.4165682767208426e-05, + "loss": 0.3416, + "step": 7382 + }, + { + "epoch": 1.4554416403785488, + "grad_norm": 0.5044382136084319, + "learning_rate": 1.4164273581683023e-05, + "loss": 0.3297, + "step": 7383 + }, + { + "epoch": 1.4556388012618298, + "grad_norm": 0.5714557060481008, + "learning_rate": 1.4162864296107593e-05, + "loss": 0.3529, + "step": 7384 + }, + { + "epoch": 1.4558359621451105, + "grad_norm": 0.5054705611777254, + "learning_rate": 1.4161454910515997e-05, + "loss": 0.3306, + "step": 7385 + }, + { + "epoch": 1.4560331230283912, + "grad_norm": 0.47824473710019705, + "learning_rate": 1.416004542494209e-05, + "loss": 0.3287, + "step": 7386 + }, + { + "epoch": 1.4562302839116719, + "grad_norm": 0.4835230550928211, + "learning_rate": 1.4158635839419745e-05, + "loss": 0.3193, + "step": 7387 + }, + { + "epoch": 1.4564274447949526, + "grad_norm": 0.5966821280918011, + "learning_rate": 1.4157226153982826e-05, + "loss": 0.3512, + "step": 7388 + }, + { + "epoch": 1.4566246056782335, + "grad_norm": 0.487708074097899, + "learning_rate": 1.4155816368665201e-05, + "loss": 0.3356, + "step": 7389 + }, + { + "epoch": 1.4568217665615142, + "grad_norm": 0.47577454166544947, + "learning_rate": 1.415440648350074e-05, + "loss": 0.3373, + "step": 7390 + }, + { + "epoch": 1.4570189274447949, + "grad_norm": 0.4872308941864148, + "learning_rate": 1.4152996498523317e-05, + "loss": 0.3309, + "step": 7391 + }, + { + "epoch": 1.4572160883280758, + "grad_norm": 0.5225407802789235, + "learning_rate": 1.4151586413766811e-05, + "loss": 0.3419, + "step": 7392 + }, + { + "epoch": 1.4574132492113565, + "grad_norm": 0.4998933279734873, + "learning_rate": 1.4150176229265096e-05, + "loss": 0.3668, + "step": 7393 + }, + { + "epoch": 1.4576104100946372, + "grad_norm": 0.5050373414529986, + "learning_rate": 1.4148765945052056e-05, + "loss": 0.3389, + "step": 7394 + }, + { + "epoch": 1.457807570977918, + "grad_norm": 0.5064131922922323, + "learning_rate": 1.4147355561161574e-05, + "loss": 0.3561, + "step": 7395 + }, + { + "epoch": 1.4580047318611986, + "grad_norm": 0.4948700452474314, + "learning_rate": 1.4145945077627531e-05, + "loss": 0.3395, + "step": 7396 + }, + { + "epoch": 1.4582018927444795, + "grad_norm": 0.5239164208524786, + "learning_rate": 1.4144534494483824e-05, + "loss": 0.3643, + "step": 7397 + }, + { + "epoch": 1.4583990536277602, + "grad_norm": 0.5345022125344085, + "learning_rate": 1.4143123811764335e-05, + "loss": 0.3509, + "step": 7398 + }, + { + "epoch": 1.458596214511041, + "grad_norm": 0.49581376523712395, + "learning_rate": 1.414171302950296e-05, + "loss": 0.3298, + "step": 7399 + }, + { + "epoch": 1.4587933753943219, + "grad_norm": 0.4921239829922169, + "learning_rate": 1.4140302147733596e-05, + "loss": 0.3322, + "step": 7400 + }, + { + "epoch": 1.4589905362776026, + "grad_norm": 0.5592026245468484, + "learning_rate": 1.4138891166490135e-05, + "loss": 0.3293, + "step": 7401 + }, + { + "epoch": 1.4591876971608833, + "grad_norm": 0.5442208251383976, + "learning_rate": 1.4137480085806486e-05, + "loss": 0.3361, + "step": 7402 + }, + { + "epoch": 1.459384858044164, + "grad_norm": 0.5170050108654181, + "learning_rate": 1.413606890571654e-05, + "loss": 0.3538, + "step": 7403 + }, + { + "epoch": 1.4595820189274447, + "grad_norm": 0.510647100212481, + "learning_rate": 1.4134657626254214e-05, + "loss": 0.337, + "step": 7404 + }, + { + "epoch": 1.4597791798107256, + "grad_norm": 0.5306321326123213, + "learning_rate": 1.4133246247453403e-05, + "loss": 0.3282, + "step": 7405 + }, + { + "epoch": 1.4599763406940063, + "grad_norm": 0.5133341234040529, + "learning_rate": 1.4131834769348026e-05, + "loss": 0.3366, + "step": 7406 + }, + { + "epoch": 1.460173501577287, + "grad_norm": 0.5395277590006632, + "learning_rate": 1.4130423191971992e-05, + "loss": 0.3333, + "step": 7407 + }, + { + "epoch": 1.460370662460568, + "grad_norm": 0.47995198018193463, + "learning_rate": 1.4129011515359212e-05, + "loss": 0.3362, + "step": 7408 + }, + { + "epoch": 1.4605678233438486, + "grad_norm": 0.4987772919977686, + "learning_rate": 1.4127599739543606e-05, + "loss": 0.3347, + "step": 7409 + }, + { + "epoch": 1.4607649842271293, + "grad_norm": 0.5453925939490976, + "learning_rate": 1.4126187864559094e-05, + "loss": 0.3689, + "step": 7410 + }, + { + "epoch": 1.4609621451104102, + "grad_norm": 0.4959830516006325, + "learning_rate": 1.4124775890439595e-05, + "loss": 0.3154, + "step": 7411 + }, + { + "epoch": 1.461159305993691, + "grad_norm": 0.5124401018607515, + "learning_rate": 1.4123363817219034e-05, + "loss": 0.321, + "step": 7412 + }, + { + "epoch": 1.4613564668769716, + "grad_norm": 1.2270349005748566, + "learning_rate": 1.4121951644931336e-05, + "loss": 0.3614, + "step": 7413 + }, + { + "epoch": 1.4615536277602523, + "grad_norm": 0.5138791690617409, + "learning_rate": 1.4120539373610429e-05, + "loss": 0.3237, + "step": 7414 + }, + { + "epoch": 1.461750788643533, + "grad_norm": 0.4521273911127816, + "learning_rate": 1.4119127003290248e-05, + "loss": 0.3186, + "step": 7415 + }, + { + "epoch": 1.461947949526814, + "grad_norm": 0.49874325861714064, + "learning_rate": 1.411771453400472e-05, + "loss": 0.3281, + "step": 7416 + }, + { + "epoch": 1.4621451104100947, + "grad_norm": 0.5021452602032279, + "learning_rate": 1.4116301965787786e-05, + "loss": 0.3404, + "step": 7417 + }, + { + "epoch": 1.4623422712933754, + "grad_norm": 0.4973741088361569, + "learning_rate": 1.4114889298673383e-05, + "loss": 0.3309, + "step": 7418 + }, + { + "epoch": 1.4625394321766563, + "grad_norm": 0.516601612052012, + "learning_rate": 1.4113476532695452e-05, + "loss": 0.3076, + "step": 7419 + }, + { + "epoch": 1.462736593059937, + "grad_norm": 0.5526911269279787, + "learning_rate": 1.4112063667887932e-05, + "loss": 0.3539, + "step": 7420 + }, + { + "epoch": 1.4629337539432177, + "grad_norm": 0.5562535752274906, + "learning_rate": 1.4110650704284773e-05, + "loss": 0.3569, + "step": 7421 + }, + { + "epoch": 1.4631309148264984, + "grad_norm": 0.5488213791263362, + "learning_rate": 1.410923764191992e-05, + "loss": 0.357, + "step": 7422 + }, + { + "epoch": 1.463328075709779, + "grad_norm": 0.5107788895192845, + "learning_rate": 1.4107824480827324e-05, + "loss": 0.3328, + "step": 7423 + }, + { + "epoch": 1.46352523659306, + "grad_norm": 0.5171838627063462, + "learning_rate": 1.4106411221040935e-05, + "loss": 0.3217, + "step": 7424 + }, + { + "epoch": 1.4637223974763407, + "grad_norm": 0.4922378526729013, + "learning_rate": 1.4104997862594711e-05, + "loss": 0.3343, + "step": 7425 + }, + { + "epoch": 1.4639195583596214, + "grad_norm": 0.5037026945262372, + "learning_rate": 1.4103584405522605e-05, + "loss": 0.3284, + "step": 7426 + }, + { + "epoch": 1.4641167192429023, + "grad_norm": 0.49326551249285355, + "learning_rate": 1.4102170849858583e-05, + "loss": 0.3443, + "step": 7427 + }, + { + "epoch": 1.464313880126183, + "grad_norm": 0.546820130602552, + "learning_rate": 1.41007571956366e-05, + "loss": 0.3602, + "step": 7428 + }, + { + "epoch": 1.4645110410094637, + "grad_norm": 0.4837797980660489, + "learning_rate": 1.4099343442890624e-05, + "loss": 0.3333, + "step": 7429 + }, + { + "epoch": 1.4647082018927444, + "grad_norm": 0.5126857949066812, + "learning_rate": 1.4097929591654621e-05, + "loss": 0.3383, + "step": 7430 + }, + { + "epoch": 1.4649053627760251, + "grad_norm": 0.46279281738729217, + "learning_rate": 1.4096515641962563e-05, + "loss": 0.3435, + "step": 7431 + }, + { + "epoch": 1.465102523659306, + "grad_norm": 2.09692903793275, + "learning_rate": 1.4095101593848415e-05, + "loss": 0.3672, + "step": 7432 + }, + { + "epoch": 1.4652996845425867, + "grad_norm": 0.5501540341610509, + "learning_rate": 1.4093687447346151e-05, + "loss": 0.3876, + "step": 7433 + }, + { + "epoch": 1.4654968454258674, + "grad_norm": 0.47609912782149055, + "learning_rate": 1.409227320248975e-05, + "loss": 0.3153, + "step": 7434 + }, + { + "epoch": 1.4656940063091484, + "grad_norm": 0.525537823437008, + "learning_rate": 1.4090858859313193e-05, + "loss": 0.3046, + "step": 7435 + }, + { + "epoch": 1.465891167192429, + "grad_norm": 0.5028179572182535, + "learning_rate": 1.4089444417850455e-05, + "loss": 0.3421, + "step": 7436 + }, + { + "epoch": 1.4660883280757098, + "grad_norm": 0.5163272751032175, + "learning_rate": 1.408802987813552e-05, + "loss": 0.3294, + "step": 7437 + }, + { + "epoch": 1.4662854889589905, + "grad_norm": 0.5132796172325643, + "learning_rate": 1.408661524020238e-05, + "loss": 0.3264, + "step": 7438 + }, + { + "epoch": 1.4664826498422712, + "grad_norm": 0.5108505262340117, + "learning_rate": 1.4085200504085013e-05, + "loss": 0.3491, + "step": 7439 + }, + { + "epoch": 1.466679810725552, + "grad_norm": 0.48158384519210207, + "learning_rate": 1.4083785669817417e-05, + "loss": 0.3258, + "step": 7440 + }, + { + "epoch": 1.4668769716088328, + "grad_norm": 0.6121936078703637, + "learning_rate": 1.408237073743358e-05, + "loss": 0.3299, + "step": 7441 + }, + { + "epoch": 1.4670741324921135, + "grad_norm": 0.5452188941885944, + "learning_rate": 1.4080955706967501e-05, + "loss": 0.3629, + "step": 7442 + }, + { + "epoch": 1.4672712933753944, + "grad_norm": 0.467809566032474, + "learning_rate": 1.407954057845317e-05, + "loss": 0.3111, + "step": 7443 + }, + { + "epoch": 1.4674684542586751, + "grad_norm": 0.5136940118705474, + "learning_rate": 1.4078125351924597e-05, + "loss": 0.3638, + "step": 7444 + }, + { + "epoch": 1.4676656151419558, + "grad_norm": 0.511820180671828, + "learning_rate": 1.4076710027415776e-05, + "loss": 0.357, + "step": 7445 + }, + { + "epoch": 1.4678627760252365, + "grad_norm": 0.490168596701873, + "learning_rate": 1.4075294604960715e-05, + "loss": 0.3198, + "step": 7446 + }, + { + "epoch": 1.4680599369085172, + "grad_norm": 0.5058740263507188, + "learning_rate": 1.4073879084593416e-05, + "loss": 0.3195, + "step": 7447 + }, + { + "epoch": 1.4682570977917981, + "grad_norm": 0.4674312373171209, + "learning_rate": 1.4072463466347892e-05, + "loss": 0.3201, + "step": 7448 + }, + { + "epoch": 1.4684542586750788, + "grad_norm": 2.1568178975708383, + "learning_rate": 1.4071047750258156e-05, + "loss": 0.3337, + "step": 7449 + }, + { + "epoch": 1.4686514195583595, + "grad_norm": 0.5546196457316546, + "learning_rate": 1.4069631936358214e-05, + "loss": 0.3781, + "step": 7450 + }, + { + "epoch": 1.4688485804416405, + "grad_norm": 0.5357099924800648, + "learning_rate": 1.4068216024682095e-05, + "loss": 0.3631, + "step": 7451 + }, + { + "epoch": 1.4690457413249212, + "grad_norm": 0.507066065309645, + "learning_rate": 1.4066800015263807e-05, + "loss": 0.3288, + "step": 7452 + }, + { + "epoch": 1.4692429022082019, + "grad_norm": 0.4874279119555861, + "learning_rate": 1.4065383908137373e-05, + "loss": 0.3218, + "step": 7453 + }, + { + "epoch": 1.4694400630914828, + "grad_norm": 0.4762728451473061, + "learning_rate": 1.4063967703336814e-05, + "loss": 0.3198, + "step": 7454 + }, + { + "epoch": 1.4696372239747635, + "grad_norm": 0.4854069150886455, + "learning_rate": 1.4062551400896163e-05, + "loss": 0.3287, + "step": 7455 + }, + { + "epoch": 1.4698343848580442, + "grad_norm": 0.4715608643695716, + "learning_rate": 1.406113500084944e-05, + "loss": 0.2892, + "step": 7456 + }, + { + "epoch": 1.4700315457413249, + "grad_norm": 0.5002411384785533, + "learning_rate": 1.405971850323068e-05, + "loss": 0.3071, + "step": 7457 + }, + { + "epoch": 1.4702287066246056, + "grad_norm": 0.5388064136656453, + "learning_rate": 1.4058301908073912e-05, + "loss": 0.3851, + "step": 7458 + }, + { + "epoch": 1.4704258675078865, + "grad_norm": 0.49296401636550136, + "learning_rate": 1.4056885215413174e-05, + "loss": 0.3298, + "step": 7459 + }, + { + "epoch": 1.4706230283911672, + "grad_norm": 0.5355211557498206, + "learning_rate": 1.4055468425282502e-05, + "loss": 0.338, + "step": 7460 + }, + { + "epoch": 1.470820189274448, + "grad_norm": 0.48977540245567913, + "learning_rate": 1.4054051537715933e-05, + "loss": 0.317, + "step": 7461 + }, + { + "epoch": 1.4710173501577288, + "grad_norm": 0.5326327177590915, + "learning_rate": 1.4052634552747512e-05, + "loss": 0.346, + "step": 7462 + }, + { + "epoch": 1.4712145110410095, + "grad_norm": 0.526993033584732, + "learning_rate": 1.4051217470411284e-05, + "loss": 0.3519, + "step": 7463 + }, + { + "epoch": 1.4714116719242902, + "grad_norm": 0.5258328026865123, + "learning_rate": 1.4049800290741293e-05, + "loss": 0.3475, + "step": 7464 + }, + { + "epoch": 1.471608832807571, + "grad_norm": 0.5203038982752369, + "learning_rate": 1.4048383013771588e-05, + "loss": 0.3321, + "step": 7465 + }, + { + "epoch": 1.4718059936908516, + "grad_norm": 0.5113074927167442, + "learning_rate": 1.404696563953622e-05, + "loss": 0.3354, + "step": 7466 + }, + { + "epoch": 1.4720031545741326, + "grad_norm": 0.49919882528097426, + "learning_rate": 1.4045548168069246e-05, + "loss": 0.3539, + "step": 7467 + }, + { + "epoch": 1.4722003154574133, + "grad_norm": 0.5016200068739187, + "learning_rate": 1.4044130599404717e-05, + "loss": 0.325, + "step": 7468 + }, + { + "epoch": 1.472397476340694, + "grad_norm": 0.5564339720636347, + "learning_rate": 1.4042712933576694e-05, + "loss": 0.3908, + "step": 7469 + }, + { + "epoch": 1.4725946372239749, + "grad_norm": 0.47101819777928333, + "learning_rate": 1.4041295170619241e-05, + "loss": 0.3266, + "step": 7470 + }, + { + "epoch": 1.4727917981072556, + "grad_norm": 0.4968101062786291, + "learning_rate": 1.403987731056641e-05, + "loss": 0.3045, + "step": 7471 + }, + { + "epoch": 1.4729889589905363, + "grad_norm": 0.5063663744372873, + "learning_rate": 1.403845935345228e-05, + "loss": 0.323, + "step": 7472 + }, + { + "epoch": 1.473186119873817, + "grad_norm": 0.4892904205195065, + "learning_rate": 1.4037041299310908e-05, + "loss": 0.3245, + "step": 7473 + }, + { + "epoch": 1.4733832807570977, + "grad_norm": 0.5159500037402428, + "learning_rate": 1.4035623148176369e-05, + "loss": 0.3428, + "step": 7474 + }, + { + "epoch": 1.4735804416403786, + "grad_norm": 0.48891170305663467, + "learning_rate": 1.4034204900082734e-05, + "loss": 0.3305, + "step": 7475 + }, + { + "epoch": 1.4737776025236593, + "grad_norm": 0.557799558291397, + "learning_rate": 1.4032786555064077e-05, + "loss": 0.3915, + "step": 7476 + }, + { + "epoch": 1.47397476340694, + "grad_norm": 0.49527625272519055, + "learning_rate": 1.4031368113154478e-05, + "loss": 0.323, + "step": 7477 + }, + { + "epoch": 1.474171924290221, + "grad_norm": 0.5316789203889967, + "learning_rate": 1.4029949574388009e-05, + "loss": 0.3531, + "step": 7478 + }, + { + "epoch": 1.4743690851735016, + "grad_norm": 0.47098548601018697, + "learning_rate": 1.4028530938798759e-05, + "loss": 0.3212, + "step": 7479 + }, + { + "epoch": 1.4745662460567823, + "grad_norm": 0.490959684927149, + "learning_rate": 1.402711220642081e-05, + "loss": 0.328, + "step": 7480 + }, + { + "epoch": 1.474763406940063, + "grad_norm": 0.5168859380738673, + "learning_rate": 1.4025693377288246e-05, + "loss": 0.3489, + "step": 7481 + }, + { + "epoch": 1.4749605678233437, + "grad_norm": 0.49571446645437645, + "learning_rate": 1.4024274451435157e-05, + "loss": 0.3375, + "step": 7482 + }, + { + "epoch": 1.4751577287066246, + "grad_norm": 0.5197521991945561, + "learning_rate": 1.4022855428895632e-05, + "loss": 0.3343, + "step": 7483 + }, + { + "epoch": 1.4753548895899053, + "grad_norm": 0.48589615662818103, + "learning_rate": 1.4021436309703766e-05, + "loss": 0.3201, + "step": 7484 + }, + { + "epoch": 1.475552050473186, + "grad_norm": 0.4995993722216193, + "learning_rate": 1.4020017093893656e-05, + "loss": 0.3371, + "step": 7485 + }, + { + "epoch": 1.475749211356467, + "grad_norm": 0.5036745093017024, + "learning_rate": 1.4018597781499399e-05, + "loss": 0.3505, + "step": 7486 + }, + { + "epoch": 1.4759463722397477, + "grad_norm": 0.4878243600702588, + "learning_rate": 1.4017178372555092e-05, + "loss": 0.3085, + "step": 7487 + }, + { + "epoch": 1.4761435331230284, + "grad_norm": 0.508433396912967, + "learning_rate": 1.4015758867094837e-05, + "loss": 0.3382, + "step": 7488 + }, + { + "epoch": 1.476340694006309, + "grad_norm": 0.5127101875396283, + "learning_rate": 1.4014339265152748e-05, + "loss": 0.3524, + "step": 7489 + }, + { + "epoch": 1.4765378548895898, + "grad_norm": 0.4724562924935433, + "learning_rate": 1.401291956676292e-05, + "loss": 0.3308, + "step": 7490 + }, + { + "epoch": 1.4767350157728707, + "grad_norm": 0.49909821615758254, + "learning_rate": 1.4011499771959469e-05, + "loss": 0.3278, + "step": 7491 + }, + { + "epoch": 1.4769321766561514, + "grad_norm": 0.500929277106934, + "learning_rate": 1.4010079880776505e-05, + "loss": 0.3428, + "step": 7492 + }, + { + "epoch": 1.477129337539432, + "grad_norm": 0.4578591298461616, + "learning_rate": 1.4008659893248147e-05, + "loss": 0.3263, + "step": 7493 + }, + { + "epoch": 1.477326498422713, + "grad_norm": 0.4829431282504663, + "learning_rate": 1.40072398094085e-05, + "loss": 0.3338, + "step": 7494 + }, + { + "epoch": 1.4775236593059937, + "grad_norm": 0.48641913532636866, + "learning_rate": 1.4005819629291692e-05, + "loss": 0.3367, + "step": 7495 + }, + { + "epoch": 1.4777208201892744, + "grad_norm": 0.5087064304556296, + "learning_rate": 1.4004399352931846e-05, + "loss": 0.3482, + "step": 7496 + }, + { + "epoch": 1.4779179810725553, + "grad_norm": 0.4930581875396616, + "learning_rate": 1.4002978980363075e-05, + "loss": 0.3192, + "step": 7497 + }, + { + "epoch": 1.478115141955836, + "grad_norm": 0.4957220120205278, + "learning_rate": 1.4001558511619515e-05, + "loss": 0.3341, + "step": 7498 + }, + { + "epoch": 1.4783123028391167, + "grad_norm": 0.514560592615134, + "learning_rate": 1.4000137946735284e-05, + "loss": 0.3032, + "step": 7499 + }, + { + "epoch": 1.4785094637223974, + "grad_norm": 0.5043148636879016, + "learning_rate": 1.3998717285744524e-05, + "loss": 0.3387, + "step": 7500 + }, + { + "epoch": 1.4787066246056781, + "grad_norm": 0.5094476950875343, + "learning_rate": 1.3997296528681355e-05, + "loss": 0.3536, + "step": 7501 + }, + { + "epoch": 1.478903785488959, + "grad_norm": 0.509271590468565, + "learning_rate": 1.3995875675579922e-05, + "loss": 0.3335, + "step": 7502 + }, + { + "epoch": 1.4791009463722398, + "grad_norm": 0.475098165612926, + "learning_rate": 1.3994454726474355e-05, + "loss": 0.3232, + "step": 7503 + }, + { + "epoch": 1.4792981072555205, + "grad_norm": 0.49409393273104263, + "learning_rate": 1.3993033681398797e-05, + "loss": 0.336, + "step": 7504 + }, + { + "epoch": 1.4794952681388014, + "grad_norm": 0.5025889163503731, + "learning_rate": 1.399161254038739e-05, + "loss": 0.3479, + "step": 7505 + }, + { + "epoch": 1.479692429022082, + "grad_norm": 0.5213687127491038, + "learning_rate": 1.399019130347428e-05, + "loss": 0.3591, + "step": 7506 + }, + { + "epoch": 1.4798895899053628, + "grad_norm": 0.49494600597377825, + "learning_rate": 1.3988769970693607e-05, + "loss": 0.3501, + "step": 7507 + }, + { + "epoch": 1.4800867507886435, + "grad_norm": 0.5047526802460451, + "learning_rate": 1.3987348542079526e-05, + "loss": 0.355, + "step": 7508 + }, + { + "epoch": 1.4802839116719242, + "grad_norm": 0.48534628796917784, + "learning_rate": 1.3985927017666183e-05, + "loss": 0.3306, + "step": 7509 + }, + { + "epoch": 1.4804810725552051, + "grad_norm": 0.5163215036754177, + "learning_rate": 1.3984505397487736e-05, + "loss": 0.3277, + "step": 7510 + }, + { + "epoch": 1.4806782334384858, + "grad_norm": 0.48771109504055543, + "learning_rate": 1.3983083681578336e-05, + "loss": 0.3142, + "step": 7511 + }, + { + "epoch": 1.4808753943217665, + "grad_norm": 0.49100954381314643, + "learning_rate": 1.3981661869972143e-05, + "loss": 0.3508, + "step": 7512 + }, + { + "epoch": 1.4810725552050474, + "grad_norm": 0.4964769898882689, + "learning_rate": 1.3980239962703316e-05, + "loss": 0.3195, + "step": 7513 + }, + { + "epoch": 1.4812697160883281, + "grad_norm": 0.4933646357348641, + "learning_rate": 1.3978817959806022e-05, + "loss": 0.3346, + "step": 7514 + }, + { + "epoch": 1.4814668769716088, + "grad_norm": 0.5237222580406612, + "learning_rate": 1.397739586131442e-05, + "loss": 0.3534, + "step": 7515 + }, + { + "epoch": 1.4816640378548895, + "grad_norm": 0.5499841344032339, + "learning_rate": 1.3975973667262678e-05, + "loss": 0.3752, + "step": 7516 + }, + { + "epoch": 1.4818611987381702, + "grad_norm": 0.5281225252481041, + "learning_rate": 1.397455137768497e-05, + "loss": 0.3612, + "step": 7517 + }, + { + "epoch": 1.4820583596214512, + "grad_norm": 0.4754396639001578, + "learning_rate": 1.3973128992615461e-05, + "loss": 0.3294, + "step": 7518 + }, + { + "epoch": 1.4822555205047319, + "grad_norm": 0.5106352743028668, + "learning_rate": 1.3971706512088334e-05, + "loss": 0.3489, + "step": 7519 + }, + { + "epoch": 1.4824526813880126, + "grad_norm": 0.4899386431725224, + "learning_rate": 1.3970283936137755e-05, + "loss": 0.326, + "step": 7520 + }, + { + "epoch": 1.4826498422712935, + "grad_norm": 0.49818208559529764, + "learning_rate": 1.3968861264797911e-05, + "loss": 0.3261, + "step": 7521 + }, + { + "epoch": 1.4828470031545742, + "grad_norm": 0.49436959103979455, + "learning_rate": 1.3967438498102971e-05, + "loss": 0.3511, + "step": 7522 + }, + { + "epoch": 1.4830441640378549, + "grad_norm": 0.5033269127969884, + "learning_rate": 1.3966015636087133e-05, + "loss": 0.3574, + "step": 7523 + }, + { + "epoch": 1.4832413249211356, + "grad_norm": 0.4906719889384929, + "learning_rate": 1.3964592678784574e-05, + "loss": 0.3317, + "step": 7524 + }, + { + "epoch": 1.4834384858044163, + "grad_norm": 0.516805390516583, + "learning_rate": 1.3963169626229485e-05, + "loss": 0.3533, + "step": 7525 + }, + { + "epoch": 1.4836356466876972, + "grad_norm": 0.4704857874190068, + "learning_rate": 1.396174647845605e-05, + "loss": 0.3244, + "step": 7526 + }, + { + "epoch": 1.483832807570978, + "grad_norm": 0.8886639274014726, + "learning_rate": 1.396032323549847e-05, + "loss": 0.36, + "step": 7527 + }, + { + "epoch": 1.4840299684542586, + "grad_norm": 0.5070409296139642, + "learning_rate": 1.3958899897390935e-05, + "loss": 0.3479, + "step": 7528 + }, + { + "epoch": 1.4842271293375395, + "grad_norm": 0.46156188122605435, + "learning_rate": 1.3957476464167639e-05, + "loss": 0.317, + "step": 7529 + }, + { + "epoch": 1.4844242902208202, + "grad_norm": 0.4798500007246751, + "learning_rate": 1.3956052935862782e-05, + "loss": 0.3307, + "step": 7530 + }, + { + "epoch": 1.484621451104101, + "grad_norm": 0.4699403853866003, + "learning_rate": 1.3954629312510573e-05, + "loss": 0.3395, + "step": 7531 + }, + { + "epoch": 1.4848186119873816, + "grad_norm": 0.5019467243741862, + "learning_rate": 1.3953205594145207e-05, + "loss": 0.3489, + "step": 7532 + }, + { + "epoch": 1.4850157728706623, + "grad_norm": 0.49998290248687316, + "learning_rate": 1.3951781780800892e-05, + "loss": 0.3493, + "step": 7533 + }, + { + "epoch": 1.4852129337539433, + "grad_norm": 0.4952129809254002, + "learning_rate": 1.3950357872511839e-05, + "loss": 0.3442, + "step": 7534 + }, + { + "epoch": 1.485410094637224, + "grad_norm": 0.5444354708755205, + "learning_rate": 1.3948933869312258e-05, + "loss": 0.3442, + "step": 7535 + }, + { + "epoch": 1.4856072555205047, + "grad_norm": 0.4968530516895542, + "learning_rate": 1.3947509771236361e-05, + "loss": 0.3205, + "step": 7536 + }, + { + "epoch": 1.4858044164037856, + "grad_norm": 0.6219558232573202, + "learning_rate": 1.3946085578318358e-05, + "loss": 0.3735, + "step": 7537 + }, + { + "epoch": 1.4860015772870663, + "grad_norm": 0.48319973377602, + "learning_rate": 1.3944661290592476e-05, + "loss": 0.3647, + "step": 7538 + }, + { + "epoch": 1.486198738170347, + "grad_norm": 0.4717457826676787, + "learning_rate": 1.3943236908092926e-05, + "loss": 0.3193, + "step": 7539 + }, + { + "epoch": 1.4863958990536277, + "grad_norm": 0.544058207471016, + "learning_rate": 1.3941812430853938e-05, + "loss": 0.3496, + "step": 7540 + }, + { + "epoch": 1.4865930599369084, + "grad_norm": 0.5298496126711097, + "learning_rate": 1.394038785890973e-05, + "loss": 0.3244, + "step": 7541 + }, + { + "epoch": 1.4867902208201893, + "grad_norm": 0.4946581847477271, + "learning_rate": 1.3938963192294533e-05, + "loss": 0.355, + "step": 7542 + }, + { + "epoch": 1.48698738170347, + "grad_norm": 0.46001185450442234, + "learning_rate": 1.3937538431042567e-05, + "loss": 0.3039, + "step": 7543 + }, + { + "epoch": 1.4871845425867507, + "grad_norm": 0.6003857261401706, + "learning_rate": 1.3936113575188074e-05, + "loss": 0.3356, + "step": 7544 + }, + { + "epoch": 1.4873817034700316, + "grad_norm": 0.48039873611738615, + "learning_rate": 1.3934688624765282e-05, + "loss": 0.3188, + "step": 7545 + }, + { + "epoch": 1.4875788643533123, + "grad_norm": 0.458905469054827, + "learning_rate": 1.3933263579808426e-05, + "loss": 0.3186, + "step": 7546 + }, + { + "epoch": 1.487776025236593, + "grad_norm": 0.5281694152795835, + "learning_rate": 1.3931838440351748e-05, + "loss": 0.3689, + "step": 7547 + }, + { + "epoch": 1.487973186119874, + "grad_norm": 0.48164249983072344, + "learning_rate": 1.3930413206429483e-05, + "loss": 0.3264, + "step": 7548 + }, + { + "epoch": 1.4881703470031546, + "grad_norm": 0.4698429076871976, + "learning_rate": 1.3928987878075874e-05, + "loss": 0.3312, + "step": 7549 + }, + { + "epoch": 1.4883675078864353, + "grad_norm": 0.4930590803919952, + "learning_rate": 1.392756245532517e-05, + "loss": 0.3312, + "step": 7550 + }, + { + "epoch": 1.488564668769716, + "grad_norm": 0.4898819634829872, + "learning_rate": 1.3926136938211615e-05, + "loss": 0.3485, + "step": 7551 + }, + { + "epoch": 1.4887618296529967, + "grad_norm": 0.48401145600289563, + "learning_rate": 1.3924711326769457e-05, + "loss": 0.3347, + "step": 7552 + }, + { + "epoch": 1.4889589905362777, + "grad_norm": 0.47592844367724896, + "learning_rate": 1.392328562103295e-05, + "loss": 0.3189, + "step": 7553 + }, + { + "epoch": 1.4891561514195584, + "grad_norm": 0.47313537818185825, + "learning_rate": 1.3921859821036345e-05, + "loss": 0.3195, + "step": 7554 + }, + { + "epoch": 1.489353312302839, + "grad_norm": 0.4828381063664307, + "learning_rate": 1.3920433926813901e-05, + "loss": 0.348, + "step": 7555 + }, + { + "epoch": 1.48955047318612, + "grad_norm": 0.5075800480469936, + "learning_rate": 1.3919007938399873e-05, + "loss": 0.353, + "step": 7556 + }, + { + "epoch": 1.4897476340694007, + "grad_norm": 0.4605077583110866, + "learning_rate": 1.3917581855828526e-05, + "loss": 0.3158, + "step": 7557 + }, + { + "epoch": 1.4899447949526814, + "grad_norm": 0.523185615476558, + "learning_rate": 1.3916155679134118e-05, + "loss": 0.3629, + "step": 7558 + }, + { + "epoch": 1.490141955835962, + "grad_norm": 4.614531591833265, + "learning_rate": 1.3914729408350918e-05, + "loss": 0.3198, + "step": 7559 + }, + { + "epoch": 1.4903391167192428, + "grad_norm": 0.6690308387896126, + "learning_rate": 1.3913303043513188e-05, + "loss": 0.3414, + "step": 7560 + }, + { + "epoch": 1.4905362776025237, + "grad_norm": 0.497169553741172, + "learning_rate": 1.3911876584655206e-05, + "loss": 0.3556, + "step": 7561 + }, + { + "epoch": 1.4907334384858044, + "grad_norm": 0.47652650301196414, + "learning_rate": 1.3910450031811235e-05, + "loss": 0.3258, + "step": 7562 + }, + { + "epoch": 1.4909305993690851, + "grad_norm": 0.500693022752151, + "learning_rate": 1.3909023385015551e-05, + "loss": 0.3462, + "step": 7563 + }, + { + "epoch": 1.491127760252366, + "grad_norm": 0.5068560385443583, + "learning_rate": 1.390759664430244e-05, + "loss": 0.3506, + "step": 7564 + }, + { + "epoch": 1.4913249211356467, + "grad_norm": 0.5227101584388434, + "learning_rate": 1.3906169809706165e-05, + "loss": 0.3384, + "step": 7565 + }, + { + "epoch": 1.4915220820189274, + "grad_norm": 0.49830512467585986, + "learning_rate": 1.390474288126102e-05, + "loss": 0.3357, + "step": 7566 + }, + { + "epoch": 1.4917192429022081, + "grad_norm": 0.5135981865878078, + "learning_rate": 1.3903315859001278e-05, + "loss": 0.3515, + "step": 7567 + }, + { + "epoch": 1.4919164037854888, + "grad_norm": 0.5072032920741137, + "learning_rate": 1.3901888742961233e-05, + "loss": 0.3636, + "step": 7568 + }, + { + "epoch": 1.4921135646687698, + "grad_norm": 0.46652743790054163, + "learning_rate": 1.3900461533175167e-05, + "loss": 0.3255, + "step": 7569 + }, + { + "epoch": 1.4923107255520505, + "grad_norm": 0.4559665210107085, + "learning_rate": 1.3899034229677373e-05, + "loss": 0.3187, + "step": 7570 + }, + { + "epoch": 1.4925078864353312, + "grad_norm": 0.4955869955690288, + "learning_rate": 1.389760683250214e-05, + "loss": 0.3473, + "step": 7571 + }, + { + "epoch": 1.492705047318612, + "grad_norm": 0.532343767493093, + "learning_rate": 1.3896179341683763e-05, + "loss": 0.3666, + "step": 7572 + }, + { + "epoch": 1.4929022082018928, + "grad_norm": 0.49706921835508877, + "learning_rate": 1.3894751757256544e-05, + "loss": 0.3403, + "step": 7573 + }, + { + "epoch": 1.4930993690851735, + "grad_norm": 0.5079181565121896, + "learning_rate": 1.3893324079254776e-05, + "loss": 0.3687, + "step": 7574 + }, + { + "epoch": 1.4932965299684542, + "grad_norm": 0.4908701711814409, + "learning_rate": 1.389189630771276e-05, + "loss": 0.3468, + "step": 7575 + }, + { + "epoch": 1.493493690851735, + "grad_norm": 0.4992855170423717, + "learning_rate": 1.3890468442664801e-05, + "loss": 0.3304, + "step": 7576 + }, + { + "epoch": 1.4936908517350158, + "grad_norm": 0.5074458565945166, + "learning_rate": 1.3889040484145206e-05, + "loss": 0.349, + "step": 7577 + }, + { + "epoch": 1.4938880126182965, + "grad_norm": 0.4776291728277284, + "learning_rate": 1.3887612432188282e-05, + "loss": 0.3391, + "step": 7578 + }, + { + "epoch": 1.4940851735015772, + "grad_norm": 0.47517474086457523, + "learning_rate": 1.388618428682834e-05, + "loss": 0.3163, + "step": 7579 + }, + { + "epoch": 1.4942823343848581, + "grad_norm": 0.4757824327904514, + "learning_rate": 1.3884756048099688e-05, + "loss": 0.3221, + "step": 7580 + }, + { + "epoch": 1.4944794952681388, + "grad_norm": 0.5268640892228607, + "learning_rate": 1.3883327716036643e-05, + "loss": 0.3628, + "step": 7581 + }, + { + "epoch": 1.4946766561514195, + "grad_norm": 0.4902773690445549, + "learning_rate": 1.3881899290673526e-05, + "loss": 0.3641, + "step": 7582 + }, + { + "epoch": 1.4948738170347002, + "grad_norm": 0.4598374139054406, + "learning_rate": 1.388047077204465e-05, + "loss": 0.3223, + "step": 7583 + }, + { + "epoch": 1.495070977917981, + "grad_norm": 0.4847424035881873, + "learning_rate": 1.3879042160184337e-05, + "loss": 0.3355, + "step": 7584 + }, + { + "epoch": 1.4952681388012619, + "grad_norm": 0.5130377501847655, + "learning_rate": 1.3877613455126918e-05, + "loss": 0.3306, + "step": 7585 + }, + { + "epoch": 1.4954652996845426, + "grad_norm": 0.4886195265619356, + "learning_rate": 1.3876184656906706e-05, + "loss": 0.3358, + "step": 7586 + }, + { + "epoch": 1.4956624605678233, + "grad_norm": 0.48902748634085175, + "learning_rate": 1.387475576555804e-05, + "loss": 0.339, + "step": 7587 + }, + { + "epoch": 1.4958596214511042, + "grad_norm": 0.47601101485961045, + "learning_rate": 1.3873326781115247e-05, + "loss": 0.3206, + "step": 7588 + }, + { + "epoch": 1.4960567823343849, + "grad_norm": 0.4942695337736247, + "learning_rate": 1.3871897703612658e-05, + "loss": 0.3439, + "step": 7589 + }, + { + "epoch": 1.4962539432176656, + "grad_norm": 0.4695829997055428, + "learning_rate": 1.3870468533084606e-05, + "loss": 0.3288, + "step": 7590 + }, + { + "epoch": 1.4964511041009465, + "grad_norm": 0.5104474370022767, + "learning_rate": 1.3869039269565434e-05, + "loss": 0.3388, + "step": 7591 + }, + { + "epoch": 1.4966482649842272, + "grad_norm": 0.508559710861152, + "learning_rate": 1.3867609913089476e-05, + "loss": 0.3353, + "step": 7592 + }, + { + "epoch": 1.496845425867508, + "grad_norm": 0.5039315719219875, + "learning_rate": 1.3866180463691077e-05, + "loss": 0.3208, + "step": 7593 + }, + { + "epoch": 1.4970425867507886, + "grad_norm": 0.5191040517201374, + "learning_rate": 1.386475092140458e-05, + "loss": 0.3468, + "step": 7594 + }, + { + "epoch": 1.4972397476340693, + "grad_norm": 0.4731795958829586, + "learning_rate": 1.3863321286264326e-05, + "loss": 0.2784, + "step": 7595 + }, + { + "epoch": 1.4974369085173502, + "grad_norm": 0.5740007403730863, + "learning_rate": 1.386189155830467e-05, + "loss": 0.3461, + "step": 7596 + }, + { + "epoch": 1.497634069400631, + "grad_norm": 0.5117098865886438, + "learning_rate": 1.3860461737559958e-05, + "loss": 0.3335, + "step": 7597 + }, + { + "epoch": 1.4978312302839116, + "grad_norm": 0.4963354358560534, + "learning_rate": 1.3859031824064543e-05, + "loss": 0.3213, + "step": 7598 + }, + { + "epoch": 1.4980283911671926, + "grad_norm": 0.5362349288534428, + "learning_rate": 1.3857601817852785e-05, + "loss": 0.3503, + "step": 7599 + }, + { + "epoch": 1.4982255520504733, + "grad_norm": 0.46854667444421344, + "learning_rate": 1.3856171718959033e-05, + "loss": 0.3113, + "step": 7600 + }, + { + "epoch": 1.498422712933754, + "grad_norm": 0.4976815417371681, + "learning_rate": 1.385474152741765e-05, + "loss": 0.3528, + "step": 7601 + }, + { + "epoch": 1.4986198738170347, + "grad_norm": 0.5129716538025957, + "learning_rate": 1.3853311243262999e-05, + "loss": 0.3332, + "step": 7602 + }, + { + "epoch": 1.4988170347003154, + "grad_norm": 0.4604531892193082, + "learning_rate": 1.3851880866529444e-05, + "loss": 0.3217, + "step": 7603 + }, + { + "epoch": 1.4990141955835963, + "grad_norm": 0.6867111001620063, + "learning_rate": 1.3850450397251344e-05, + "loss": 0.3352, + "step": 7604 + }, + { + "epoch": 1.499211356466877, + "grad_norm": 0.49556889703029594, + "learning_rate": 1.3849019835463076e-05, + "loss": 0.3441, + "step": 7605 + }, + { + "epoch": 1.4994085173501577, + "grad_norm": 0.6455148770924305, + "learning_rate": 1.3847589181199009e-05, + "loss": 0.3688, + "step": 7606 + }, + { + "epoch": 1.4996056782334386, + "grad_norm": 0.49503800910765083, + "learning_rate": 1.3846158434493507e-05, + "loss": 0.3615, + "step": 7607 + }, + { + "epoch": 1.4998028391167193, + "grad_norm": 0.5347889365402132, + "learning_rate": 1.3844727595380958e-05, + "loss": 0.3554, + "step": 7608 + }, + { + "epoch": 1.4998028391167193, + "eval_loss": 0.43272438645362854, + "eval_runtime": 344.4401, + "eval_samples_per_second": 23.604, + "eval_steps_per_second": 1.478, + "step": 7608 + }, + { + "epoch": 1.5, + "grad_norm": 11.409221739880106, + "learning_rate": 1.3843296663895726e-05, + "loss": 0.421, + "step": 7609 + }, + { + "epoch": 1.500197160883281, + "grad_norm": 0.5293429847721727, + "learning_rate": 1.3841865640072203e-05, + "loss": 0.3249, + "step": 7610 + }, + { + "epoch": 1.5003943217665614, + "grad_norm": 0.5879361449704711, + "learning_rate": 1.3840434523944759e-05, + "loss": 0.3574, + "step": 7611 + }, + { + "epoch": 1.5005914826498423, + "grad_norm": 0.507948237489469, + "learning_rate": 1.3839003315547785e-05, + "loss": 0.3477, + "step": 7612 + }, + { + "epoch": 1.500788643533123, + "grad_norm": 0.5174406330344764, + "learning_rate": 1.3837572014915669e-05, + "loss": 0.3464, + "step": 7613 + }, + { + "epoch": 1.5009858044164037, + "grad_norm": 0.507543096114278, + "learning_rate": 1.3836140622082788e-05, + "loss": 0.3351, + "step": 7614 + }, + { + "epoch": 1.5011829652996846, + "grad_norm": 0.5397406725670891, + "learning_rate": 1.3834709137083544e-05, + "loss": 0.3506, + "step": 7615 + }, + { + "epoch": 1.5013801261829653, + "grad_norm": 0.5232479139909186, + "learning_rate": 1.3833277559952323e-05, + "loss": 0.3521, + "step": 7616 + }, + { + "epoch": 1.501577287066246, + "grad_norm": 0.4913998494274104, + "learning_rate": 1.3831845890723523e-05, + "loss": 0.351, + "step": 7617 + }, + { + "epoch": 1.501774447949527, + "grad_norm": 0.5928180766843834, + "learning_rate": 1.3830414129431538e-05, + "loss": 0.3554, + "step": 7618 + }, + { + "epoch": 1.5019716088328074, + "grad_norm": 0.47768284134792766, + "learning_rate": 1.3828982276110767e-05, + "loss": 0.323, + "step": 7619 + }, + { + "epoch": 1.5021687697160884, + "grad_norm": 0.4784865791461987, + "learning_rate": 1.3827550330795618e-05, + "loss": 0.2985, + "step": 7620 + }, + { + "epoch": 1.502365930599369, + "grad_norm": 0.5109879323464163, + "learning_rate": 1.3826118293520488e-05, + "loss": 0.3387, + "step": 7621 + }, + { + "epoch": 1.5025630914826498, + "grad_norm": 0.4942414542354045, + "learning_rate": 1.3824686164319782e-05, + "loss": 0.3306, + "step": 7622 + }, + { + "epoch": 1.5027602523659307, + "grad_norm": 0.5035209979496461, + "learning_rate": 1.3823253943227916e-05, + "loss": 0.3313, + "step": 7623 + }, + { + "epoch": 1.5029574132492114, + "grad_norm": 0.5438482479157711, + "learning_rate": 1.382182163027929e-05, + "loss": 0.3251, + "step": 7624 + }, + { + "epoch": 1.503154574132492, + "grad_norm": 0.48477404769438337, + "learning_rate": 1.3820389225508327e-05, + "loss": 0.3358, + "step": 7625 + }, + { + "epoch": 1.503351735015773, + "grad_norm": 0.49598419627564255, + "learning_rate": 1.3818956728949432e-05, + "loss": 0.3176, + "step": 7626 + }, + { + "epoch": 1.5035488958990535, + "grad_norm": 0.6022310575721551, + "learning_rate": 1.3817524140637029e-05, + "loss": 0.3363, + "step": 7627 + }, + { + "epoch": 1.5037460567823344, + "grad_norm": 0.4892461050892043, + "learning_rate": 1.3816091460605534e-05, + "loss": 0.3255, + "step": 7628 + }, + { + "epoch": 1.5039432176656151, + "grad_norm": 0.552459885881935, + "learning_rate": 1.3814658688889369e-05, + "loss": 0.3272, + "step": 7629 + }, + { + "epoch": 1.5041403785488958, + "grad_norm": 0.5011577635235227, + "learning_rate": 1.3813225825522954e-05, + "loss": 0.3263, + "step": 7630 + }, + { + "epoch": 1.5043375394321767, + "grad_norm": 0.5071339854073511, + "learning_rate": 1.3811792870540717e-05, + "loss": 0.3306, + "step": 7631 + }, + { + "epoch": 1.5045347003154574, + "grad_norm": 0.4690112256108807, + "learning_rate": 1.3810359823977094e-05, + "loss": 0.3142, + "step": 7632 + }, + { + "epoch": 1.5047318611987381, + "grad_norm": 0.507389006042168, + "learning_rate": 1.38089266858665e-05, + "loss": 0.3595, + "step": 7633 + }, + { + "epoch": 1.504929022082019, + "grad_norm": 0.5136892463140303, + "learning_rate": 1.380749345624338e-05, + "loss": 0.3552, + "step": 7634 + }, + { + "epoch": 1.5051261829652995, + "grad_norm": 0.5230467446626003, + "learning_rate": 1.3806060135142159e-05, + "loss": 0.3361, + "step": 7635 + }, + { + "epoch": 1.5053233438485805, + "grad_norm": 0.47365615277402534, + "learning_rate": 1.3804626722597283e-05, + "loss": 0.3181, + "step": 7636 + }, + { + "epoch": 1.5055205047318612, + "grad_norm": 0.5263364289856198, + "learning_rate": 1.3803193218643181e-05, + "loss": 0.3324, + "step": 7637 + }, + { + "epoch": 1.5057176656151419, + "grad_norm": 0.5134844859050957, + "learning_rate": 1.3801759623314302e-05, + "loss": 0.3429, + "step": 7638 + }, + { + "epoch": 1.5059148264984228, + "grad_norm": 0.4801245661280436, + "learning_rate": 1.3800325936645087e-05, + "loss": 0.2968, + "step": 7639 + }, + { + "epoch": 1.5061119873817035, + "grad_norm": 0.4700963225645677, + "learning_rate": 1.379889215866998e-05, + "loss": 0.3203, + "step": 7640 + }, + { + "epoch": 1.5063091482649842, + "grad_norm": 0.4710374994497814, + "learning_rate": 1.3797458289423431e-05, + "loss": 0.3241, + "step": 7641 + }, + { + "epoch": 1.506506309148265, + "grad_norm": 0.4957756081340281, + "learning_rate": 1.3796024328939887e-05, + "loss": 0.3512, + "step": 7642 + }, + { + "epoch": 1.5067034700315456, + "grad_norm": 0.4761451630851939, + "learning_rate": 1.3794590277253803e-05, + "loss": 0.334, + "step": 7643 + }, + { + "epoch": 1.5069006309148265, + "grad_norm": 0.4814926896127318, + "learning_rate": 1.3793156134399633e-05, + "loss": 0.3246, + "step": 7644 + }, + { + "epoch": 1.5070977917981072, + "grad_norm": 0.4828137836425216, + "learning_rate": 1.379172190041183e-05, + "loss": 0.3414, + "step": 7645 + }, + { + "epoch": 1.507294952681388, + "grad_norm": 0.5194210689710018, + "learning_rate": 1.3790287575324854e-05, + "loss": 0.3533, + "step": 7646 + }, + { + "epoch": 1.5074921135646688, + "grad_norm": 0.4962183001455974, + "learning_rate": 1.3788853159173169e-05, + "loss": 0.3336, + "step": 7647 + }, + { + "epoch": 1.5076892744479495, + "grad_norm": 0.4950050985316144, + "learning_rate": 1.3787418651991233e-05, + "loss": 0.3565, + "step": 7648 + }, + { + "epoch": 1.5078864353312302, + "grad_norm": 0.49021770978943213, + "learning_rate": 1.3785984053813517e-05, + "loss": 0.3356, + "step": 7649 + }, + { + "epoch": 1.5080835962145112, + "grad_norm": 0.47294234808211555, + "learning_rate": 1.3784549364674485e-05, + "loss": 0.3198, + "step": 7650 + }, + { + "epoch": 1.5082807570977916, + "grad_norm": 0.5189806281206693, + "learning_rate": 1.3783114584608605e-05, + "loss": 0.3294, + "step": 7651 + }, + { + "epoch": 1.5084779179810726, + "grad_norm": 0.5185822245810467, + "learning_rate": 1.3781679713650349e-05, + "loss": 0.3446, + "step": 7652 + }, + { + "epoch": 1.5086750788643533, + "grad_norm": 0.4825839494350987, + "learning_rate": 1.3780244751834197e-05, + "loss": 0.3422, + "step": 7653 + }, + { + "epoch": 1.508872239747634, + "grad_norm": 0.4796252236159735, + "learning_rate": 1.3778809699194616e-05, + "loss": 0.3431, + "step": 7654 + }, + { + "epoch": 1.5090694006309149, + "grad_norm": 0.5021963528108637, + "learning_rate": 1.3777374555766093e-05, + "loss": 0.3401, + "step": 7655 + }, + { + "epoch": 1.5092665615141956, + "grad_norm": 0.5751823831914286, + "learning_rate": 1.37759393215831e-05, + "loss": 0.3532, + "step": 7656 + }, + { + "epoch": 1.5094637223974763, + "grad_norm": 0.4777026387116764, + "learning_rate": 1.3774503996680128e-05, + "loss": 0.3461, + "step": 7657 + }, + { + "epoch": 1.5096608832807572, + "grad_norm": 0.47046319672734455, + "learning_rate": 1.3773068581091655e-05, + "loss": 0.3252, + "step": 7658 + }, + { + "epoch": 1.509858044164038, + "grad_norm": 0.5047250594337946, + "learning_rate": 1.3771633074852173e-05, + "loss": 0.3539, + "step": 7659 + }, + { + "epoch": 1.5100552050473186, + "grad_norm": 0.5141111962164151, + "learning_rate": 1.3770197477996168e-05, + "loss": 0.3272, + "step": 7660 + }, + { + "epoch": 1.5102523659305995, + "grad_norm": 0.4811169395000833, + "learning_rate": 1.3768761790558134e-05, + "loss": 0.3245, + "step": 7661 + }, + { + "epoch": 1.51044952681388, + "grad_norm": 0.5244771681249779, + "learning_rate": 1.3767326012572561e-05, + "loss": 0.3566, + "step": 7662 + }, + { + "epoch": 1.510646687697161, + "grad_norm": 0.4704331253968268, + "learning_rate": 1.376589014407395e-05, + "loss": 0.3154, + "step": 7663 + }, + { + "epoch": 1.5108438485804416, + "grad_norm": 0.49390026411452725, + "learning_rate": 1.3764454185096792e-05, + "loss": 0.3377, + "step": 7664 + }, + { + "epoch": 1.5110410094637223, + "grad_norm": 0.45874452901953416, + "learning_rate": 1.3763018135675592e-05, + "loss": 0.3388, + "step": 7665 + }, + { + "epoch": 1.5112381703470033, + "grad_norm": 0.4768940389299864, + "learning_rate": 1.3761581995844852e-05, + "loss": 0.3355, + "step": 7666 + }, + { + "epoch": 1.511435331230284, + "grad_norm": 0.511936222268746, + "learning_rate": 1.3760145765639075e-05, + "loss": 0.3519, + "step": 7667 + }, + { + "epoch": 1.5116324921135647, + "grad_norm": 0.7261444629938607, + "learning_rate": 1.3758709445092767e-05, + "loss": 0.3503, + "step": 7668 + }, + { + "epoch": 1.5118296529968456, + "grad_norm": 0.47879946943440466, + "learning_rate": 1.3757273034240437e-05, + "loss": 0.3403, + "step": 7669 + }, + { + "epoch": 1.512026813880126, + "grad_norm": 0.5050574898314102, + "learning_rate": 1.3755836533116597e-05, + "loss": 0.3364, + "step": 7670 + }, + { + "epoch": 1.512223974763407, + "grad_norm": 0.49140249592667906, + "learning_rate": 1.3754399941755763e-05, + "loss": 0.3237, + "step": 7671 + }, + { + "epoch": 1.5124211356466877, + "grad_norm": 0.5364739642991352, + "learning_rate": 1.3752963260192442e-05, + "loss": 0.3594, + "step": 7672 + }, + { + "epoch": 1.5126182965299684, + "grad_norm": 0.5184206685342432, + "learning_rate": 1.3751526488461158e-05, + "loss": 0.3346, + "step": 7673 + }, + { + "epoch": 1.5128154574132493, + "grad_norm": 0.5054659832613505, + "learning_rate": 1.375008962659643e-05, + "loss": 0.3309, + "step": 7674 + }, + { + "epoch": 1.51301261829653, + "grad_norm": 0.5555312582768909, + "learning_rate": 1.3748652674632779e-05, + "loss": 0.359, + "step": 7675 + }, + { + "epoch": 1.5132097791798107, + "grad_norm": 0.5258696149826486, + "learning_rate": 1.374721563260473e-05, + "loss": 0.3485, + "step": 7676 + }, + { + "epoch": 1.5134069400630916, + "grad_norm": 7.007449812708166, + "learning_rate": 1.3745778500546805e-05, + "loss": 0.326, + "step": 7677 + }, + { + "epoch": 1.513604100946372, + "grad_norm": 0.5503065900084199, + "learning_rate": 1.3744341278493535e-05, + "loss": 0.333, + "step": 7678 + }, + { + "epoch": 1.513801261829653, + "grad_norm": 0.46540872566930663, + "learning_rate": 1.374290396647945e-05, + "loss": 0.308, + "step": 7679 + }, + { + "epoch": 1.5139984227129337, + "grad_norm": 0.48222335793406707, + "learning_rate": 1.3741466564539085e-05, + "loss": 0.3374, + "step": 7680 + }, + { + "epoch": 1.5141955835962144, + "grad_norm": 0.5340584301274428, + "learning_rate": 1.3740029072706975e-05, + "loss": 0.3339, + "step": 7681 + }, + { + "epoch": 1.5143927444794953, + "grad_norm": 0.4798870462171238, + "learning_rate": 1.373859149101765e-05, + "loss": 0.327, + "step": 7682 + }, + { + "epoch": 1.514589905362776, + "grad_norm": 0.5198175974996748, + "learning_rate": 1.3737153819505658e-05, + "loss": 0.3557, + "step": 7683 + }, + { + "epoch": 1.5147870662460567, + "grad_norm": 0.5400239741407015, + "learning_rate": 1.3735716058205533e-05, + "loss": 0.3415, + "step": 7684 + }, + { + "epoch": 1.5149842271293377, + "grad_norm": 0.740156822222281, + "learning_rate": 1.3734278207151824e-05, + "loss": 0.353, + "step": 7685 + }, + { + "epoch": 1.5151813880126181, + "grad_norm": 0.5022300864303025, + "learning_rate": 1.3732840266379071e-05, + "loss": 0.3161, + "step": 7686 + }, + { + "epoch": 1.515378548895899, + "grad_norm": 0.4932017068142421, + "learning_rate": 1.3731402235921824e-05, + "loss": 0.3324, + "step": 7687 + }, + { + "epoch": 1.5155757097791798, + "grad_norm": 0.5065348898290931, + "learning_rate": 1.3729964115814636e-05, + "loss": 0.3333, + "step": 7688 + }, + { + "epoch": 1.5157728706624605, + "grad_norm": 0.5417643838253485, + "learning_rate": 1.3728525906092056e-05, + "loss": 0.3521, + "step": 7689 + }, + { + "epoch": 1.5159700315457414, + "grad_norm": 0.4907116291133275, + "learning_rate": 1.3727087606788639e-05, + "loss": 0.3293, + "step": 7690 + }, + { + "epoch": 1.516167192429022, + "grad_norm": 0.47069234491091616, + "learning_rate": 1.3725649217938938e-05, + "loss": 0.328, + "step": 7691 + }, + { + "epoch": 1.5163643533123028, + "grad_norm": 0.6967452358004222, + "learning_rate": 1.3724210739577516e-05, + "loss": 0.3358, + "step": 7692 + }, + { + "epoch": 1.5165615141955837, + "grad_norm": 0.5071542817692918, + "learning_rate": 1.3722772171738932e-05, + "loss": 0.3593, + "step": 7693 + }, + { + "epoch": 1.5167586750788642, + "grad_norm": 0.48669388997244545, + "learning_rate": 1.3721333514457748e-05, + "loss": 0.3457, + "step": 7694 + }, + { + "epoch": 1.5169558359621451, + "grad_norm": 0.49829646210372097, + "learning_rate": 1.3719894767768532e-05, + "loss": 0.3418, + "step": 7695 + }, + { + "epoch": 1.5171529968454258, + "grad_norm": 0.5104164374213708, + "learning_rate": 1.3718455931705845e-05, + "loss": 0.3544, + "step": 7696 + }, + { + "epoch": 1.5173501577287065, + "grad_norm": 0.5017546986684652, + "learning_rate": 1.371701700630426e-05, + "loss": 0.355, + "step": 7697 + }, + { + "epoch": 1.5175473186119874, + "grad_norm": 0.5926037563656988, + "learning_rate": 1.3715577991598352e-05, + "loss": 0.3789, + "step": 7698 + }, + { + "epoch": 1.5177444794952681, + "grad_norm": 0.4963131869513678, + "learning_rate": 1.3714138887622685e-05, + "loss": 0.3448, + "step": 7699 + }, + { + "epoch": 1.5179416403785488, + "grad_norm": 0.5147685242406805, + "learning_rate": 1.3712699694411846e-05, + "loss": 0.3254, + "step": 7700 + }, + { + "epoch": 1.5181388012618298, + "grad_norm": 0.5075260322768886, + "learning_rate": 1.3711260412000403e-05, + "loss": 0.3396, + "step": 7701 + }, + { + "epoch": 1.5183359621451105, + "grad_norm": 0.4938057819218849, + "learning_rate": 1.3709821040422944e-05, + "loss": 0.3434, + "step": 7702 + }, + { + "epoch": 1.5185331230283912, + "grad_norm": 0.48016800863389886, + "learning_rate": 1.370838157971404e-05, + "loss": 0.3161, + "step": 7703 + }, + { + "epoch": 1.518730283911672, + "grad_norm": 0.4947174895321279, + "learning_rate": 1.370694202990829e-05, + "loss": 0.3636, + "step": 7704 + }, + { + "epoch": 1.5189274447949526, + "grad_norm": 0.4617872910169582, + "learning_rate": 1.3705502391040266e-05, + "loss": 0.3193, + "step": 7705 + }, + { + "epoch": 1.5191246056782335, + "grad_norm": 0.49049250891250884, + "learning_rate": 1.3704062663144569e-05, + "loss": 0.3336, + "step": 7706 + }, + { + "epoch": 1.5193217665615142, + "grad_norm": 0.4457461204930738, + "learning_rate": 1.370262284625578e-05, + "loss": 0.2909, + "step": 7707 + }, + { + "epoch": 1.5195189274447949, + "grad_norm": 0.5403694722432415, + "learning_rate": 1.3701182940408495e-05, + "loss": 0.3567, + "step": 7708 + }, + { + "epoch": 1.5197160883280758, + "grad_norm": 0.4816431559603944, + "learning_rate": 1.3699742945637312e-05, + "loss": 0.3163, + "step": 7709 + }, + { + "epoch": 1.5199132492113565, + "grad_norm": 0.5071851732106253, + "learning_rate": 1.3698302861976822e-05, + "loss": 0.3197, + "step": 7710 + }, + { + "epoch": 1.5201104100946372, + "grad_norm": 0.5036103402901476, + "learning_rate": 1.369686268946163e-05, + "loss": 0.3594, + "step": 7711 + }, + { + "epoch": 1.5203075709779181, + "grad_norm": 0.5082597299605005, + "learning_rate": 1.3695422428126335e-05, + "loss": 0.3452, + "step": 7712 + }, + { + "epoch": 1.5205047318611986, + "grad_norm": 0.5034774870521724, + "learning_rate": 1.3693982078005538e-05, + "loss": 0.3415, + "step": 7713 + }, + { + "epoch": 1.5207018927444795, + "grad_norm": 0.49017821736880307, + "learning_rate": 1.3692541639133849e-05, + "loss": 0.3503, + "step": 7714 + }, + { + "epoch": 1.5208990536277602, + "grad_norm": 0.5033328297669176, + "learning_rate": 1.3691101111545873e-05, + "loss": 0.3306, + "step": 7715 + }, + { + "epoch": 1.521096214511041, + "grad_norm": 0.5089744822677265, + "learning_rate": 1.368966049527622e-05, + "loss": 0.3579, + "step": 7716 + }, + { + "epoch": 1.5212933753943219, + "grad_norm": 0.5390487095974562, + "learning_rate": 1.3688219790359503e-05, + "loss": 0.369, + "step": 7717 + }, + { + "epoch": 1.5214905362776026, + "grad_norm": 0.4996281857585761, + "learning_rate": 1.3686778996830335e-05, + "loss": 0.317, + "step": 7718 + }, + { + "epoch": 1.5216876971608833, + "grad_norm": 0.5360343337780741, + "learning_rate": 1.3685338114723331e-05, + "loss": 0.3536, + "step": 7719 + }, + { + "epoch": 1.5218848580441642, + "grad_norm": 0.48695308863803854, + "learning_rate": 1.3683897144073111e-05, + "loss": 0.3319, + "step": 7720 + }, + { + "epoch": 1.5220820189274447, + "grad_norm": 0.5111947489161915, + "learning_rate": 1.36824560849143e-05, + "loss": 0.3506, + "step": 7721 + }, + { + "epoch": 1.5222791798107256, + "grad_norm": 0.515635778087092, + "learning_rate": 1.3681014937281509e-05, + "loss": 0.2918, + "step": 7722 + }, + { + "epoch": 1.5224763406940063, + "grad_norm": 0.7712050067029159, + "learning_rate": 1.3679573701209376e-05, + "loss": 0.3375, + "step": 7723 + }, + { + "epoch": 1.522673501577287, + "grad_norm": 0.5100802873571525, + "learning_rate": 1.3678132376732518e-05, + "loss": 0.3686, + "step": 7724 + }, + { + "epoch": 1.522870662460568, + "grad_norm": 0.4995731991498597, + "learning_rate": 1.367669096388557e-05, + "loss": 0.3459, + "step": 7725 + }, + { + "epoch": 1.5230678233438486, + "grad_norm": 0.49021205630646913, + "learning_rate": 1.367524946270316e-05, + "loss": 0.3568, + "step": 7726 + }, + { + "epoch": 1.5232649842271293, + "grad_norm": 0.5069952766234989, + "learning_rate": 1.3673807873219921e-05, + "loss": 0.3652, + "step": 7727 + }, + { + "epoch": 1.5234621451104102, + "grad_norm": 0.5044160352391255, + "learning_rate": 1.367236619547049e-05, + "loss": 0.3359, + "step": 7728 + }, + { + "epoch": 1.5236593059936907, + "grad_norm": 0.5003901713256061, + "learning_rate": 1.3670924429489505e-05, + "loss": 0.345, + "step": 7729 + }, + { + "epoch": 1.5238564668769716, + "grad_norm": 0.4700435713643861, + "learning_rate": 1.3669482575311604e-05, + "loss": 0.3138, + "step": 7730 + }, + { + "epoch": 1.5240536277602523, + "grad_norm": 0.489703666578115, + "learning_rate": 1.366804063297143e-05, + "loss": 0.3327, + "step": 7731 + }, + { + "epoch": 1.524250788643533, + "grad_norm": 0.5862569684725426, + "learning_rate": 1.3666598602503622e-05, + "loss": 0.3267, + "step": 7732 + }, + { + "epoch": 1.524447949526814, + "grad_norm": 0.4842290548690208, + "learning_rate": 1.3665156483942834e-05, + "loss": 0.3538, + "step": 7733 + }, + { + "epoch": 1.5246451104100947, + "grad_norm": 0.5540354619028578, + "learning_rate": 1.3663714277323707e-05, + "loss": 0.373, + "step": 7734 + }, + { + "epoch": 1.5248422712933754, + "grad_norm": 0.49143503772872, + "learning_rate": 1.3662271982680895e-05, + "loss": 0.3312, + "step": 7735 + }, + { + "epoch": 1.5250394321766563, + "grad_norm": 0.52098501675389, + "learning_rate": 1.366082960004905e-05, + "loss": 0.3509, + "step": 7736 + }, + { + "epoch": 1.5252365930599368, + "grad_norm": 0.4553592686652839, + "learning_rate": 1.3659387129462826e-05, + "loss": 0.3139, + "step": 7737 + }, + { + "epoch": 1.5254337539432177, + "grad_norm": 0.519004995842705, + "learning_rate": 1.365794457095688e-05, + "loss": 0.3671, + "step": 7738 + }, + { + "epoch": 1.5256309148264984, + "grad_norm": 0.5279828914728227, + "learning_rate": 1.3656501924565867e-05, + "loss": 0.3654, + "step": 7739 + }, + { + "epoch": 1.525828075709779, + "grad_norm": 0.47358808531757013, + "learning_rate": 1.3655059190324453e-05, + "loss": 0.2977, + "step": 7740 + }, + { + "epoch": 1.52602523659306, + "grad_norm": 0.5047664002702137, + "learning_rate": 1.3653616368267297e-05, + "loss": 0.3231, + "step": 7741 + }, + { + "epoch": 1.5262223974763407, + "grad_norm": 0.49780512154971746, + "learning_rate": 1.3652173458429068e-05, + "loss": 0.3339, + "step": 7742 + }, + { + "epoch": 1.5264195583596214, + "grad_norm": 0.49842257988288025, + "learning_rate": 1.3650730460844428e-05, + "loss": 0.3231, + "step": 7743 + }, + { + "epoch": 1.5266167192429023, + "grad_norm": 0.5112605303580525, + "learning_rate": 1.3649287375548052e-05, + "loss": 0.3416, + "step": 7744 + }, + { + "epoch": 1.526813880126183, + "grad_norm": 0.49859797442001647, + "learning_rate": 1.3647844202574603e-05, + "loss": 0.3594, + "step": 7745 + }, + { + "epoch": 1.5270110410094637, + "grad_norm": 0.5055274577950344, + "learning_rate": 1.3646400941958766e-05, + "loss": 0.3519, + "step": 7746 + }, + { + "epoch": 1.5272082018927446, + "grad_norm": 0.5512290570751095, + "learning_rate": 1.3644957593735206e-05, + "loss": 0.3529, + "step": 7747 + }, + { + "epoch": 1.5274053627760251, + "grad_norm": 0.47973030402608713, + "learning_rate": 1.3643514157938603e-05, + "loss": 0.3028, + "step": 7748 + }, + { + "epoch": 1.527602523659306, + "grad_norm": 0.5198514112156248, + "learning_rate": 1.364207063460364e-05, + "loss": 0.3197, + "step": 7749 + }, + { + "epoch": 1.5277996845425867, + "grad_norm": 0.720977186938447, + "learning_rate": 1.3640627023764998e-05, + "loss": 0.3224, + "step": 7750 + }, + { + "epoch": 1.5279968454258674, + "grad_norm": 0.4966494106627751, + "learning_rate": 1.363918332545736e-05, + "loss": 0.3673, + "step": 7751 + }, + { + "epoch": 1.5281940063091484, + "grad_norm": 0.4806489488052334, + "learning_rate": 1.363773953971541e-05, + "loss": 0.3246, + "step": 7752 + }, + { + "epoch": 1.528391167192429, + "grad_norm": 0.5090026005116808, + "learning_rate": 1.3636295666573841e-05, + "loss": 0.3489, + "step": 7753 + }, + { + "epoch": 1.5285883280757098, + "grad_norm": 0.5410349262919377, + "learning_rate": 1.3634851706067335e-05, + "loss": 0.3381, + "step": 7754 + }, + { + "epoch": 1.5287854889589907, + "grad_norm": 0.49058400059922475, + "learning_rate": 1.3633407658230596e-05, + "loss": 0.3363, + "step": 7755 + }, + { + "epoch": 1.5289826498422712, + "grad_norm": 0.45828128853910793, + "learning_rate": 1.3631963523098308e-05, + "loss": 0.334, + "step": 7756 + }, + { + "epoch": 1.529179810725552, + "grad_norm": 0.4813482445211936, + "learning_rate": 1.3630519300705171e-05, + "loss": 0.3347, + "step": 7757 + }, + { + "epoch": 1.5293769716088328, + "grad_norm": 0.484990899943496, + "learning_rate": 1.3629074991085886e-05, + "loss": 0.3298, + "step": 7758 + }, + { + "epoch": 1.5295741324921135, + "grad_norm": 0.49602894149932886, + "learning_rate": 1.3627630594275151e-05, + "loss": 0.3363, + "step": 7759 + }, + { + "epoch": 1.5297712933753944, + "grad_norm": 0.46335979722560355, + "learning_rate": 1.3626186110307673e-05, + "loss": 0.3197, + "step": 7760 + }, + { + "epoch": 1.5299684542586751, + "grad_norm": 0.4907689809200294, + "learning_rate": 1.3624741539218151e-05, + "loss": 0.3183, + "step": 7761 + }, + { + "epoch": 1.5301656151419558, + "grad_norm": 0.48534404933301745, + "learning_rate": 1.3623296881041294e-05, + "loss": 0.3298, + "step": 7762 + }, + { + "epoch": 1.5303627760252367, + "grad_norm": 0.44875294362215107, + "learning_rate": 1.3621852135811812e-05, + "loss": 0.2968, + "step": 7763 + }, + { + "epoch": 1.5305599369085172, + "grad_norm": 0.5797019681078132, + "learning_rate": 1.3620407303564416e-05, + "loss": 0.353, + "step": 7764 + }, + { + "epoch": 1.5307570977917981, + "grad_norm": 0.5028625687192622, + "learning_rate": 1.3618962384333818e-05, + "loss": 0.319, + "step": 7765 + }, + { + "epoch": 1.5309542586750788, + "grad_norm": 0.5099435059525019, + "learning_rate": 1.3617517378154737e-05, + "loss": 0.3495, + "step": 7766 + }, + { + "epoch": 1.5311514195583595, + "grad_norm": 0.502597011605929, + "learning_rate": 1.3616072285061886e-05, + "loss": 0.3425, + "step": 7767 + }, + { + "epoch": 1.5313485804416405, + "grad_norm": 0.48907353787315166, + "learning_rate": 1.3614627105089986e-05, + "loss": 0.3266, + "step": 7768 + }, + { + "epoch": 1.5315457413249212, + "grad_norm": 0.519646837650501, + "learning_rate": 1.3613181838273758e-05, + "loss": 0.3502, + "step": 7769 + }, + { + "epoch": 1.5317429022082019, + "grad_norm": 0.51805181382411, + "learning_rate": 1.3611736484647928e-05, + "loss": 0.3447, + "step": 7770 + }, + { + "epoch": 1.5319400630914828, + "grad_norm": 0.5093499251255805, + "learning_rate": 1.3610291044247218e-05, + "loss": 0.3474, + "step": 7771 + }, + { + "epoch": 1.5321372239747633, + "grad_norm": 0.5281367810739265, + "learning_rate": 1.3608845517106364e-05, + "loss": 0.3572, + "step": 7772 + }, + { + "epoch": 1.5323343848580442, + "grad_norm": 0.4742137497487593, + "learning_rate": 1.3607399903260085e-05, + "loss": 0.3142, + "step": 7773 + }, + { + "epoch": 1.5325315457413249, + "grad_norm": 0.506813178653113, + "learning_rate": 1.3605954202743118e-05, + "loss": 0.3509, + "step": 7774 + }, + { + "epoch": 1.5327287066246056, + "grad_norm": 0.4904880273353688, + "learning_rate": 1.36045084155902e-05, + "loss": 0.3103, + "step": 7775 + }, + { + "epoch": 1.5329258675078865, + "grad_norm": 0.49044062116786236, + "learning_rate": 1.3603062541836068e-05, + "loss": 0.3517, + "step": 7776 + }, + { + "epoch": 1.5331230283911672, + "grad_norm": 0.4634871291754897, + "learning_rate": 1.3601616581515451e-05, + "loss": 0.328, + "step": 7777 + }, + { + "epoch": 1.533320189274448, + "grad_norm": 0.47077009351413396, + "learning_rate": 1.3600170534663097e-05, + "loss": 0.3169, + "step": 7778 + }, + { + "epoch": 1.5335173501577288, + "grad_norm": 0.4899373883327769, + "learning_rate": 1.3598724401313748e-05, + "loss": 0.3151, + "step": 7779 + }, + { + "epoch": 1.5337145110410093, + "grad_norm": 0.519208902268232, + "learning_rate": 1.3597278181502146e-05, + "loss": 0.3416, + "step": 7780 + }, + { + "epoch": 1.5339116719242902, + "grad_norm": 0.504751981494511, + "learning_rate": 1.3595831875263038e-05, + "loss": 0.354, + "step": 7781 + }, + { + "epoch": 1.534108832807571, + "grad_norm": 0.5020543827284227, + "learning_rate": 1.3594385482631176e-05, + "loss": 0.3447, + "step": 7782 + }, + { + "epoch": 1.5343059936908516, + "grad_norm": 0.47429364811655356, + "learning_rate": 1.3592939003641308e-05, + "loss": 0.3392, + "step": 7783 + }, + { + "epoch": 1.5345031545741326, + "grad_norm": 6.77906927343203, + "learning_rate": 1.3591492438328185e-05, + "loss": 0.3508, + "step": 7784 + }, + { + "epoch": 1.5347003154574133, + "grad_norm": 0.5933469489722475, + "learning_rate": 1.3590045786726565e-05, + "loss": 0.3581, + "step": 7785 + }, + { + "epoch": 1.534897476340694, + "grad_norm": 0.4826236402550896, + "learning_rate": 1.3588599048871202e-05, + "loss": 0.3339, + "step": 7786 + }, + { + "epoch": 1.5350946372239749, + "grad_norm": 0.4830828768348468, + "learning_rate": 1.358715222479686e-05, + "loss": 0.3166, + "step": 7787 + }, + { + "epoch": 1.5352917981072554, + "grad_norm": 0.5336248468657963, + "learning_rate": 1.3585705314538293e-05, + "loss": 0.3423, + "step": 7788 + }, + { + "epoch": 1.5354889589905363, + "grad_norm": 0.5287306065268992, + "learning_rate": 1.3584258318130274e-05, + "loss": 0.3569, + "step": 7789 + }, + { + "epoch": 1.535686119873817, + "grad_norm": 0.5579283549576503, + "learning_rate": 1.3582811235607559e-05, + "loss": 0.3587, + "step": 7790 + }, + { + "epoch": 1.5358832807570977, + "grad_norm": 0.5054259962581725, + "learning_rate": 1.358136406700492e-05, + "loss": 0.3343, + "step": 7791 + }, + { + "epoch": 1.5360804416403786, + "grad_norm": 0.49852211546975295, + "learning_rate": 1.3579916812357123e-05, + "loss": 0.3143, + "step": 7792 + }, + { + "epoch": 1.5362776025236593, + "grad_norm": 0.4885451610000915, + "learning_rate": 1.3578469471698946e-05, + "loss": 0.3347, + "step": 7793 + }, + { + "epoch": 1.53647476340694, + "grad_norm": 1.097411098465949, + "learning_rate": 1.3577022045065154e-05, + "loss": 0.3622, + "step": 7794 + }, + { + "epoch": 1.536671924290221, + "grad_norm": 0.48941610871328045, + "learning_rate": 1.3575574532490528e-05, + "loss": 0.3244, + "step": 7795 + }, + { + "epoch": 1.5368690851735016, + "grad_norm": 0.5230416955206131, + "learning_rate": 1.3574126934009843e-05, + "loss": 0.3488, + "step": 7796 + }, + { + "epoch": 1.5370662460567823, + "grad_norm": 0.47356891013201374, + "learning_rate": 1.3572679249657883e-05, + "loss": 0.3262, + "step": 7797 + }, + { + "epoch": 1.5372634069400632, + "grad_norm": 0.5633856591758994, + "learning_rate": 1.3571231479469428e-05, + "loss": 0.3206, + "step": 7798 + }, + { + "epoch": 1.5374605678233437, + "grad_norm": 0.48005492038383385, + "learning_rate": 1.3569783623479259e-05, + "loss": 0.3251, + "step": 7799 + }, + { + "epoch": 1.5376577287066246, + "grad_norm": 0.44407837297296465, + "learning_rate": 1.3568335681722165e-05, + "loss": 0.3067, + "step": 7800 + }, + { + "epoch": 1.5378548895899053, + "grad_norm": 0.49462114423942827, + "learning_rate": 1.3566887654232927e-05, + "loss": 0.3368, + "step": 7801 + }, + { + "epoch": 1.538052050473186, + "grad_norm": 0.5323311830491543, + "learning_rate": 1.3565439541046346e-05, + "loss": 0.3671, + "step": 7802 + }, + { + "epoch": 1.538249211356467, + "grad_norm": 0.4877716449979154, + "learning_rate": 1.3563991342197207e-05, + "loss": 0.3255, + "step": 7803 + }, + { + "epoch": 1.5384463722397477, + "grad_norm": 0.5199206793938796, + "learning_rate": 1.3562543057720308e-05, + "loss": 0.3473, + "step": 7804 + }, + { + "epoch": 1.5386435331230284, + "grad_norm": 0.48858101161536593, + "learning_rate": 1.356109468765044e-05, + "loss": 0.3512, + "step": 7805 + }, + { + "epoch": 1.5388406940063093, + "grad_norm": 0.6302130832925654, + "learning_rate": 1.3559646232022408e-05, + "loss": 0.3233, + "step": 7806 + }, + { + "epoch": 1.5390378548895898, + "grad_norm": 0.5264654250829359, + "learning_rate": 1.3558197690871004e-05, + "loss": 0.3619, + "step": 7807 + }, + { + "epoch": 1.5392350157728707, + "grad_norm": 1.142333806645509, + "learning_rate": 1.3556749064231038e-05, + "loss": 0.3357, + "step": 7808 + }, + { + "epoch": 1.5394321766561514, + "grad_norm": 0.5172634182082058, + "learning_rate": 1.3555300352137311e-05, + "loss": 0.3492, + "step": 7809 + }, + { + "epoch": 1.539629337539432, + "grad_norm": 0.521433933106068, + "learning_rate": 1.3553851554624631e-05, + "loss": 0.374, + "step": 7810 + }, + { + "epoch": 1.539826498422713, + "grad_norm": 0.46871626887951895, + "learning_rate": 1.3552402671727805e-05, + "loss": 0.325, + "step": 7811 + }, + { + "epoch": 1.5400236593059937, + "grad_norm": 0.46267385384546883, + "learning_rate": 1.3550953703481645e-05, + "loss": 0.307, + "step": 7812 + }, + { + "epoch": 1.5402208201892744, + "grad_norm": 0.5082065142815442, + "learning_rate": 1.3549504649920961e-05, + "loss": 0.3179, + "step": 7813 + }, + { + "epoch": 1.5404179810725553, + "grad_norm": 0.4835751507338688, + "learning_rate": 1.3548055511080568e-05, + "loss": 0.3464, + "step": 7814 + }, + { + "epoch": 1.5406151419558358, + "grad_norm": 0.5054295099711004, + "learning_rate": 1.3546606286995288e-05, + "loss": 0.3411, + "step": 7815 + }, + { + "epoch": 1.5408123028391167, + "grad_norm": 0.5124576121953441, + "learning_rate": 1.3545156977699931e-05, + "loss": 0.3484, + "step": 7816 + }, + { + "epoch": 1.5410094637223974, + "grad_norm": 0.5937412421888819, + "learning_rate": 1.3543707583229328e-05, + "loss": 0.347, + "step": 7817 + }, + { + "epoch": 1.5412066246056781, + "grad_norm": 0.47414570200167167, + "learning_rate": 1.3542258103618293e-05, + "loss": 0.333, + "step": 7818 + }, + { + "epoch": 1.541403785488959, + "grad_norm": 0.4826001472020254, + "learning_rate": 1.3540808538901658e-05, + "loss": 0.3439, + "step": 7819 + }, + { + "epoch": 1.5416009463722398, + "grad_norm": 0.46449722652318304, + "learning_rate": 1.353935888911424e-05, + "loss": 0.3237, + "step": 7820 + }, + { + "epoch": 1.5417981072555205, + "grad_norm": 0.48229434160473766, + "learning_rate": 1.3537909154290883e-05, + "loss": 0.3343, + "step": 7821 + }, + { + "epoch": 1.5419952681388014, + "grad_norm": 0.48229421460382765, + "learning_rate": 1.3536459334466403e-05, + "loss": 0.3375, + "step": 7822 + }, + { + "epoch": 1.5421924290220819, + "grad_norm": 0.5116523420598664, + "learning_rate": 1.3535009429675641e-05, + "loss": 0.3473, + "step": 7823 + }, + { + "epoch": 1.5423895899053628, + "grad_norm": 0.5776036825506357, + "learning_rate": 1.3533559439953429e-05, + "loss": 0.3598, + "step": 7824 + }, + { + "epoch": 1.5425867507886435, + "grad_norm": 0.4835560536957904, + "learning_rate": 1.3532109365334609e-05, + "loss": 0.3359, + "step": 7825 + }, + { + "epoch": 1.5427839116719242, + "grad_norm": 0.4967123924164706, + "learning_rate": 1.3530659205854018e-05, + "loss": 0.3343, + "step": 7826 + }, + { + "epoch": 1.5429810725552051, + "grad_norm": 0.5275219762814375, + "learning_rate": 1.3529208961546494e-05, + "loss": 0.357, + "step": 7827 + }, + { + "epoch": 1.5431782334384858, + "grad_norm": 0.5090525455291536, + "learning_rate": 1.3527758632446884e-05, + "loss": 0.3525, + "step": 7828 + }, + { + "epoch": 1.5433753943217665, + "grad_norm": 0.4942938895394185, + "learning_rate": 1.3526308218590032e-05, + "loss": 0.3466, + "step": 7829 + }, + { + "epoch": 1.5435725552050474, + "grad_norm": 0.485652733290798, + "learning_rate": 1.3524857720010784e-05, + "loss": 0.3549, + "step": 7830 + }, + { + "epoch": 1.543769716088328, + "grad_norm": 0.49030437167938445, + "learning_rate": 1.3523407136743992e-05, + "loss": 0.3342, + "step": 7831 + }, + { + "epoch": 1.5439668769716088, + "grad_norm": 0.48804412538773867, + "learning_rate": 1.3521956468824505e-05, + "loss": 0.3553, + "step": 7832 + }, + { + "epoch": 1.5441640378548895, + "grad_norm": 0.48837222737277775, + "learning_rate": 1.3520505716287178e-05, + "loss": 0.3262, + "step": 7833 + }, + { + "epoch": 1.5443611987381702, + "grad_norm": 0.47270681354383004, + "learning_rate": 1.3519054879166867e-05, + "loss": 0.3444, + "step": 7834 + }, + { + "epoch": 1.5445583596214512, + "grad_norm": 0.4926982400357956, + "learning_rate": 1.3517603957498426e-05, + "loss": 0.3336, + "step": 7835 + }, + { + "epoch": 1.5447555205047319, + "grad_norm": 0.49014739600256885, + "learning_rate": 1.351615295131672e-05, + "loss": 0.3392, + "step": 7836 + }, + { + "epoch": 1.5449526813880126, + "grad_norm": 0.4966773713684002, + "learning_rate": 1.3514701860656605e-05, + "loss": 0.3234, + "step": 7837 + }, + { + "epoch": 1.5451498422712935, + "grad_norm": 0.49585084596816736, + "learning_rate": 1.351325068555295e-05, + "loss": 0.3287, + "step": 7838 + }, + { + "epoch": 1.5453470031545742, + "grad_norm": 0.4772798404368484, + "learning_rate": 1.3511799426040617e-05, + "loss": 0.3295, + "step": 7839 + }, + { + "epoch": 1.5455441640378549, + "grad_norm": 0.49778402900715435, + "learning_rate": 1.3510348082154476e-05, + "loss": 0.3277, + "step": 7840 + }, + { + "epoch": 1.5457413249211358, + "grad_norm": 0.4984788297997575, + "learning_rate": 1.3508896653929392e-05, + "loss": 0.3276, + "step": 7841 + }, + { + "epoch": 1.5459384858044163, + "grad_norm": 1.3775125906338332, + "learning_rate": 1.3507445141400247e-05, + "loss": 0.3507, + "step": 7842 + }, + { + "epoch": 1.5461356466876972, + "grad_norm": 0.4962221209215611, + "learning_rate": 1.35059935446019e-05, + "loss": 0.363, + "step": 7843 + }, + { + "epoch": 1.546332807570978, + "grad_norm": 0.5016263606305518, + "learning_rate": 1.3504541863569237e-05, + "loss": 0.3119, + "step": 7844 + }, + { + "epoch": 1.5465299684542586, + "grad_norm": 0.5307167375631622, + "learning_rate": 1.3503090098337138e-05, + "loss": 0.332, + "step": 7845 + }, + { + "epoch": 1.5467271293375395, + "grad_norm": 0.46696657884783277, + "learning_rate": 1.3501638248940475e-05, + "loss": 0.3343, + "step": 7846 + }, + { + "epoch": 1.5469242902208202, + "grad_norm": 0.5225512410564163, + "learning_rate": 1.3500186315414133e-05, + "loss": 0.3373, + "step": 7847 + }, + { + "epoch": 1.547121451104101, + "grad_norm": 0.487864098453998, + "learning_rate": 1.3498734297792994e-05, + "loss": 0.3357, + "step": 7848 + }, + { + "epoch": 1.5473186119873819, + "grad_norm": 0.4768617570887559, + "learning_rate": 1.3497282196111949e-05, + "loss": 0.352, + "step": 7849 + }, + { + "epoch": 1.5475157728706623, + "grad_norm": 0.5331400342710803, + "learning_rate": 1.3495830010405884e-05, + "loss": 0.3285, + "step": 7850 + }, + { + "epoch": 1.5477129337539433, + "grad_norm": 0.48686256381769405, + "learning_rate": 1.3494377740709685e-05, + "loss": 0.3352, + "step": 7851 + }, + { + "epoch": 1.547910094637224, + "grad_norm": 0.4919653626792826, + "learning_rate": 1.3492925387058249e-05, + "loss": 0.349, + "step": 7852 + }, + { + "epoch": 1.5481072555205047, + "grad_norm": 0.5069611129563922, + "learning_rate": 1.3491472949486466e-05, + "loss": 0.3184, + "step": 7853 + }, + { + "epoch": 1.5483044164037856, + "grad_norm": 0.4979053390062899, + "learning_rate": 1.3490020428029236e-05, + "loss": 0.324, + "step": 7854 + }, + { + "epoch": 1.5485015772870663, + "grad_norm": 0.5968503457298566, + "learning_rate": 1.3488567822721453e-05, + "loss": 0.3675, + "step": 7855 + }, + { + "epoch": 1.548698738170347, + "grad_norm": 0.4798040440294239, + "learning_rate": 1.3487115133598017e-05, + "loss": 0.332, + "step": 7856 + }, + { + "epoch": 1.548895899053628, + "grad_norm": 0.9676067949318208, + "learning_rate": 1.3485662360693834e-05, + "loss": 0.3271, + "step": 7857 + }, + { + "epoch": 1.5490930599369084, + "grad_norm": 0.5290026297802422, + "learning_rate": 1.3484209504043804e-05, + "loss": 0.3601, + "step": 7858 + }, + { + "epoch": 1.5492902208201893, + "grad_norm": 0.5572104887123693, + "learning_rate": 1.3482756563682837e-05, + "loss": 0.3474, + "step": 7859 + }, + { + "epoch": 1.54948738170347, + "grad_norm": 0.48952981915700133, + "learning_rate": 1.3481303539645838e-05, + "loss": 0.349, + "step": 7860 + }, + { + "epoch": 1.5496845425867507, + "grad_norm": 0.49539223483029654, + "learning_rate": 1.347985043196772e-05, + "loss": 0.3374, + "step": 7861 + }, + { + "epoch": 1.5498817034700316, + "grad_norm": 0.4935516593268971, + "learning_rate": 1.3478397240683387e-05, + "loss": 0.3269, + "step": 7862 + }, + { + "epoch": 1.5500788643533123, + "grad_norm": 0.5036803822601013, + "learning_rate": 1.3476943965827765e-05, + "loss": 0.3285, + "step": 7863 + }, + { + "epoch": 1.550276025236593, + "grad_norm": 0.5028733026075429, + "learning_rate": 1.3475490607435764e-05, + "loss": 0.3494, + "step": 7864 + }, + { + "epoch": 1.550473186119874, + "grad_norm": 0.4764330154599795, + "learning_rate": 1.34740371655423e-05, + "loss": 0.3291, + "step": 7865 + }, + { + "epoch": 1.5506703470031544, + "grad_norm": 2.4174378790452895, + "learning_rate": 1.3472583640182298e-05, + "loss": 0.3636, + "step": 7866 + }, + { + "epoch": 1.5508675078864353, + "grad_norm": 0.5015765985791284, + "learning_rate": 1.3471130031390673e-05, + "loss": 0.332, + "step": 7867 + }, + { + "epoch": 1.551064668769716, + "grad_norm": 0.5309675714334715, + "learning_rate": 1.346967633920236e-05, + "loss": 0.3314, + "step": 7868 + }, + { + "epoch": 1.5512618296529967, + "grad_norm": 0.4868835503536096, + "learning_rate": 1.3468222563652274e-05, + "loss": 0.3377, + "step": 7869 + }, + { + "epoch": 1.5514589905362777, + "grad_norm": 0.5127927157453637, + "learning_rate": 1.3466768704775348e-05, + "loss": 0.3672, + "step": 7870 + }, + { + "epoch": 1.5516561514195584, + "grad_norm": 0.4827580717241643, + "learning_rate": 1.3465314762606513e-05, + "loss": 0.3218, + "step": 7871 + }, + { + "epoch": 1.551853312302839, + "grad_norm": 0.5736851092892326, + "learning_rate": 1.3463860737180703e-05, + "loss": 0.366, + "step": 7872 + }, + { + "epoch": 1.55205047318612, + "grad_norm": 0.5318567397637377, + "learning_rate": 1.3462406628532846e-05, + "loss": 0.3378, + "step": 7873 + }, + { + "epoch": 1.5522476340694005, + "grad_norm": 0.5155275111603366, + "learning_rate": 1.3460952436697883e-05, + "loss": 0.3438, + "step": 7874 + }, + { + "epoch": 1.5524447949526814, + "grad_norm": 0.4775879907549808, + "learning_rate": 1.345949816171075e-05, + "loss": 0.3427, + "step": 7875 + }, + { + "epoch": 1.552641955835962, + "grad_norm": 0.5168763642660567, + "learning_rate": 1.3458043803606386e-05, + "loss": 0.3597, + "step": 7876 + }, + { + "epoch": 1.5528391167192428, + "grad_norm": 0.5349163438973762, + "learning_rate": 1.3456589362419739e-05, + "loss": 0.3805, + "step": 7877 + }, + { + "epoch": 1.5530362776025237, + "grad_norm": 0.5844522900838138, + "learning_rate": 1.3455134838185746e-05, + "loss": 0.3263, + "step": 7878 + }, + { + "epoch": 1.5532334384858044, + "grad_norm": 0.4829639630596844, + "learning_rate": 1.3453680230939357e-05, + "loss": 0.3256, + "step": 7879 + }, + { + "epoch": 1.5534305993690851, + "grad_norm": 0.5218800494399819, + "learning_rate": 1.345222554071552e-05, + "loss": 0.3303, + "step": 7880 + }, + { + "epoch": 1.553627760252366, + "grad_norm": 0.521156173705382, + "learning_rate": 1.3450770767549181e-05, + "loss": 0.3395, + "step": 7881 + }, + { + "epoch": 1.5538249211356467, + "grad_norm": 0.4981289635139336, + "learning_rate": 1.34493159114753e-05, + "loss": 0.3327, + "step": 7882 + }, + { + "epoch": 1.5540220820189274, + "grad_norm": 0.5448350587172559, + "learning_rate": 1.3447860972528823e-05, + "loss": 0.3581, + "step": 7883 + }, + { + "epoch": 1.5542192429022084, + "grad_norm": 0.5056691581563505, + "learning_rate": 1.3446405950744709e-05, + "loss": 0.3432, + "step": 7884 + }, + { + "epoch": 1.5544164037854888, + "grad_norm": 0.4953282778846118, + "learning_rate": 1.344495084615792e-05, + "loss": 0.3137, + "step": 7885 + }, + { + "epoch": 1.5546135646687698, + "grad_norm": 0.4751287087730977, + "learning_rate": 1.344349565880341e-05, + "loss": 0.3207, + "step": 7886 + }, + { + "epoch": 1.5548107255520505, + "grad_norm": 0.5085147826900555, + "learning_rate": 1.3442040388716146e-05, + "loss": 0.3499, + "step": 7887 + }, + { + "epoch": 1.5550078864353312, + "grad_norm": 0.47206999350998413, + "learning_rate": 1.3440585035931089e-05, + "loss": 0.327, + "step": 7888 + }, + { + "epoch": 1.555205047318612, + "grad_norm": 0.4488347810163907, + "learning_rate": 1.3439129600483207e-05, + "loss": 0.318, + "step": 7889 + }, + { + "epoch": 1.5554022082018928, + "grad_norm": 0.5051353448066457, + "learning_rate": 1.3437674082407463e-05, + "loss": 0.3564, + "step": 7890 + }, + { + "epoch": 1.5555993690851735, + "grad_norm": 0.49268581152861246, + "learning_rate": 1.3436218481738834e-05, + "loss": 0.3415, + "step": 7891 + }, + { + "epoch": 1.5557965299684544, + "grad_norm": 0.45508654653177255, + "learning_rate": 1.343476279851229e-05, + "loss": 0.3255, + "step": 7892 + }, + { + "epoch": 1.555993690851735, + "grad_norm": 0.5114403490145619, + "learning_rate": 1.3433307032762799e-05, + "loss": 0.3453, + "step": 7893 + }, + { + "epoch": 1.5561908517350158, + "grad_norm": 0.5594515489170677, + "learning_rate": 1.3431851184525343e-05, + "loss": 0.3382, + "step": 7894 + }, + { + "epoch": 1.5563880126182965, + "grad_norm": 0.5360994056754633, + "learning_rate": 1.3430395253834902e-05, + "loss": 0.359, + "step": 7895 + }, + { + "epoch": 1.5565851735015772, + "grad_norm": 0.5154345101589657, + "learning_rate": 1.3428939240726451e-05, + "loss": 0.3317, + "step": 7896 + }, + { + "epoch": 1.5567823343848581, + "grad_norm": 0.5196905638444523, + "learning_rate": 1.3427483145234974e-05, + "loss": 0.3562, + "step": 7897 + }, + { + "epoch": 1.5569794952681388, + "grad_norm": 0.6914979302075561, + "learning_rate": 1.342602696739545e-05, + "loss": 0.3364, + "step": 7898 + }, + { + "epoch": 1.5571766561514195, + "grad_norm": 0.4739027137576714, + "learning_rate": 1.3424570707242875e-05, + "loss": 0.3069, + "step": 7899 + }, + { + "epoch": 1.5573738170347005, + "grad_norm": 0.4986733883254391, + "learning_rate": 1.3423114364812229e-05, + "loss": 0.3468, + "step": 7900 + }, + { + "epoch": 1.557570977917981, + "grad_norm": 0.5096555609944065, + "learning_rate": 1.3421657940138504e-05, + "loss": 0.3436, + "step": 7901 + }, + { + "epoch": 1.5577681388012619, + "grad_norm": 0.5170708802611125, + "learning_rate": 1.342020143325669e-05, + "loss": 0.3461, + "step": 7902 + }, + { + "epoch": 1.5579652996845426, + "grad_norm": 0.5134132452327762, + "learning_rate": 1.3418744844201783e-05, + "loss": 0.3173, + "step": 7903 + }, + { + "epoch": 1.5581624605678233, + "grad_norm": 0.48373651440601273, + "learning_rate": 1.3417288173008778e-05, + "loss": 0.3306, + "step": 7904 + }, + { + "epoch": 1.5583596214511042, + "grad_norm": 0.5305843575820182, + "learning_rate": 1.341583141971267e-05, + "loss": 0.359, + "step": 7905 + }, + { + "epoch": 1.5585567823343849, + "grad_norm": 0.4992440462111545, + "learning_rate": 1.3414374584348466e-05, + "loss": 0.3436, + "step": 7906 + }, + { + "epoch": 1.5587539432176656, + "grad_norm": 0.4815491827718251, + "learning_rate": 1.3412917666951159e-05, + "loss": 0.3462, + "step": 7907 + }, + { + "epoch": 1.5589511041009465, + "grad_norm": 0.4686319393529876, + "learning_rate": 1.3411460667555762e-05, + "loss": 0.3345, + "step": 7908 + }, + { + "epoch": 1.559148264984227, + "grad_norm": 0.4699483011052357, + "learning_rate": 1.341000358619727e-05, + "loss": 0.3093, + "step": 7909 + }, + { + "epoch": 1.559345425867508, + "grad_norm": 0.464632109167414, + "learning_rate": 1.34085464229107e-05, + "loss": 0.3376, + "step": 7910 + }, + { + "epoch": 1.5595425867507886, + "grad_norm": 0.4823143160131754, + "learning_rate": 1.3407089177731052e-05, + "loss": 0.3385, + "step": 7911 + }, + { + "epoch": 1.5597397476340693, + "grad_norm": 0.5218932690257738, + "learning_rate": 1.3405631850693347e-05, + "loss": 0.3565, + "step": 7912 + }, + { + "epoch": 1.5599369085173502, + "grad_norm": 0.45189898716468596, + "learning_rate": 1.3404174441832592e-05, + "loss": 0.3183, + "step": 7913 + }, + { + "epoch": 1.560134069400631, + "grad_norm": 0.5167028440182481, + "learning_rate": 1.3402716951183807e-05, + "loss": 0.3643, + "step": 7914 + }, + { + "epoch": 1.5603312302839116, + "grad_norm": 0.48376625167026677, + "learning_rate": 1.3401259378782005e-05, + "loss": 0.3401, + "step": 7915 + }, + { + "epoch": 1.5605283911671926, + "grad_norm": 0.4773795454869428, + "learning_rate": 1.3399801724662209e-05, + "loss": 0.3224, + "step": 7916 + }, + { + "epoch": 1.560725552050473, + "grad_norm": 0.4945502868783542, + "learning_rate": 1.3398343988859439e-05, + "loss": 0.3307, + "step": 7917 + }, + { + "epoch": 1.560922712933754, + "grad_norm": 0.5019400523084708, + "learning_rate": 1.3396886171408717e-05, + "loss": 0.3409, + "step": 7918 + }, + { + "epoch": 1.5611198738170347, + "grad_norm": 0.4691324729142486, + "learning_rate": 1.3395428272345067e-05, + "loss": 0.325, + "step": 7919 + }, + { + "epoch": 1.5613170347003154, + "grad_norm": 0.4672956168760278, + "learning_rate": 1.3393970291703523e-05, + "loss": 0.3032, + "step": 7920 + }, + { + "epoch": 1.5615141955835963, + "grad_norm": 0.6166235750122606, + "learning_rate": 1.3392512229519105e-05, + "loss": 0.3498, + "step": 7921 + }, + { + "epoch": 1.561711356466877, + "grad_norm": 0.4806514171065023, + "learning_rate": 1.339105408582685e-05, + "loss": 0.3212, + "step": 7922 + }, + { + "epoch": 1.5619085173501577, + "grad_norm": 0.48674489059156, + "learning_rate": 1.3389595860661793e-05, + "loss": 0.3374, + "step": 7923 + }, + { + "epoch": 1.5621056782334386, + "grad_norm": 0.502066062621961, + "learning_rate": 1.3388137554058961e-05, + "loss": 0.3328, + "step": 7924 + }, + { + "epoch": 1.562302839116719, + "grad_norm": 0.479052777119119, + "learning_rate": 1.33866791660534e-05, + "loss": 0.3363, + "step": 7925 + }, + { + "epoch": 1.5625, + "grad_norm": 0.512932795649453, + "learning_rate": 1.3385220696680142e-05, + "loss": 0.3632, + "step": 7926 + }, + { + "epoch": 1.562697160883281, + "grad_norm": 0.5291035392129729, + "learning_rate": 1.3383762145974233e-05, + "loss": 0.3619, + "step": 7927 + }, + { + "epoch": 1.5628943217665614, + "grad_norm": 0.49465140055838475, + "learning_rate": 1.338230351397071e-05, + "loss": 0.3417, + "step": 7928 + }, + { + "epoch": 1.5630914826498423, + "grad_norm": 0.4967349452813738, + "learning_rate": 1.3380844800704624e-05, + "loss": 0.3243, + "step": 7929 + }, + { + "epoch": 1.563288643533123, + "grad_norm": 0.47414238203469705, + "learning_rate": 1.3379386006211021e-05, + "loss": 0.3145, + "step": 7930 + }, + { + "epoch": 1.5634858044164037, + "grad_norm": 0.5112298596770326, + "learning_rate": 1.3377927130524943e-05, + "loss": 0.3532, + "step": 7931 + }, + { + "epoch": 1.5636829652996846, + "grad_norm": 0.49210192456304214, + "learning_rate": 1.337646817368145e-05, + "loss": 0.3367, + "step": 7932 + }, + { + "epoch": 1.5638801261829653, + "grad_norm": 0.5028321558489999, + "learning_rate": 1.3375009135715584e-05, + "loss": 0.3532, + "step": 7933 + }, + { + "epoch": 1.564077287066246, + "grad_norm": 0.45880232523040504, + "learning_rate": 1.3373550016662414e-05, + "loss": 0.3176, + "step": 7934 + }, + { + "epoch": 1.564274447949527, + "grad_norm": 0.5003629680329704, + "learning_rate": 1.337209081655698e-05, + "loss": 0.3133, + "step": 7935 + }, + { + "epoch": 1.5644716088328074, + "grad_norm": 0.5046115931642703, + "learning_rate": 1.3370631535434356e-05, + "loss": 0.3378, + "step": 7936 + }, + { + "epoch": 1.5646687697160884, + "grad_norm": 0.47177267681957447, + "learning_rate": 1.3369172173329588e-05, + "loss": 0.322, + "step": 7937 + }, + { + "epoch": 1.564865930599369, + "grad_norm": 0.5082813052750841, + "learning_rate": 1.3367712730277748e-05, + "loss": 0.3357, + "step": 7938 + }, + { + "epoch": 1.5650630914826498, + "grad_norm": 0.509547876262465, + "learning_rate": 1.33662532063139e-05, + "loss": 0.3505, + "step": 7939 + }, + { + "epoch": 1.5652602523659307, + "grad_norm": 0.4783658190779783, + "learning_rate": 1.3364793601473105e-05, + "loss": 0.3284, + "step": 7940 + }, + { + "epoch": 1.5654574132492114, + "grad_norm": 0.5056684800626197, + "learning_rate": 1.3363333915790435e-05, + "loss": 0.3331, + "step": 7941 + }, + { + "epoch": 1.565654574132492, + "grad_norm": 0.5243617531078861, + "learning_rate": 1.336187414930096e-05, + "loss": 0.3303, + "step": 7942 + }, + { + "epoch": 1.565851735015773, + "grad_norm": 0.49836190761234994, + "learning_rate": 1.336041430203975e-05, + "loss": 0.3452, + "step": 7943 + }, + { + "epoch": 1.5660488958990535, + "grad_norm": 0.4968929524695866, + "learning_rate": 1.3358954374041882e-05, + "loss": 0.3172, + "step": 7944 + }, + { + "epoch": 1.5662460567823344, + "grad_norm": 0.4356287567333736, + "learning_rate": 1.335749436534243e-05, + "loss": 0.2964, + "step": 7945 + }, + { + "epoch": 1.5664432176656151, + "grad_norm": 0.4984101635486525, + "learning_rate": 1.335603427597647e-05, + "loss": 0.355, + "step": 7946 + }, + { + "epoch": 1.5666403785488958, + "grad_norm": 0.47682247093029434, + "learning_rate": 1.3354574105979085e-05, + "loss": 0.3365, + "step": 7947 + }, + { + "epoch": 1.5668375394321767, + "grad_norm": 0.4845515350210312, + "learning_rate": 1.3353113855385356e-05, + "loss": 0.3208, + "step": 7948 + }, + { + "epoch": 1.5670347003154574, + "grad_norm": 0.5275550760728646, + "learning_rate": 1.3351653524230366e-05, + "loss": 0.3523, + "step": 7949 + }, + { + "epoch": 1.5672318611987381, + "grad_norm": 0.4735343037228871, + "learning_rate": 1.3350193112549202e-05, + "loss": 0.3279, + "step": 7950 + }, + { + "epoch": 1.567429022082019, + "grad_norm": 0.46954231120457435, + "learning_rate": 1.334873262037695e-05, + "loss": 0.3417, + "step": 7951 + }, + { + "epoch": 1.5676261829652995, + "grad_norm": 0.9705396445603504, + "learning_rate": 1.3347272047748696e-05, + "loss": 0.3745, + "step": 7952 + }, + { + "epoch": 1.5678233438485805, + "grad_norm": 0.513132943966501, + "learning_rate": 1.3345811394699542e-05, + "loss": 0.3437, + "step": 7953 + }, + { + "epoch": 1.5680205047318612, + "grad_norm": 0.4859610432771261, + "learning_rate": 1.3344350661264568e-05, + "loss": 0.3525, + "step": 7954 + }, + { + "epoch": 1.5682176656151419, + "grad_norm": 0.5006180525289092, + "learning_rate": 1.3342889847478884e-05, + "loss": 0.3054, + "step": 7955 + }, + { + "epoch": 1.5684148264984228, + "grad_norm": 0.49462467277505867, + "learning_rate": 1.3341428953377574e-05, + "loss": 0.3457, + "step": 7956 + }, + { + "epoch": 1.5686119873817035, + "grad_norm": 0.4960059003422695, + "learning_rate": 1.3339967978995746e-05, + "loss": 0.3262, + "step": 7957 + }, + { + "epoch": 1.5688091482649842, + "grad_norm": 0.5065110926656363, + "learning_rate": 1.3338506924368494e-05, + "loss": 0.3368, + "step": 7958 + }, + { + "epoch": 1.569006309148265, + "grad_norm": 0.49924087045412563, + "learning_rate": 1.3337045789530927e-05, + "loss": 0.3544, + "step": 7959 + }, + { + "epoch": 1.5692034700315456, + "grad_norm": 0.5450061812449806, + "learning_rate": 1.3335584574518148e-05, + "loss": 0.3455, + "step": 7960 + }, + { + "epoch": 1.5694006309148265, + "grad_norm": 0.5370940243152774, + "learning_rate": 1.333412327936526e-05, + "loss": 0.3212, + "step": 7961 + }, + { + "epoch": 1.5695977917981072, + "grad_norm": 0.48962467971522494, + "learning_rate": 1.333266190410738e-05, + "loss": 0.327, + "step": 7962 + }, + { + "epoch": 1.569794952681388, + "grad_norm": 0.4967444956543661, + "learning_rate": 1.3331200448779611e-05, + "loss": 0.3412, + "step": 7963 + }, + { + "epoch": 1.5699921135646688, + "grad_norm": 1.5137118599387835, + "learning_rate": 1.332973891341707e-05, + "loss": 0.3315, + "step": 7964 + }, + { + "epoch": 1.5701892744479495, + "grad_norm": 0.5024737388313264, + "learning_rate": 1.332827729805487e-05, + "loss": 0.3528, + "step": 7965 + }, + { + "epoch": 1.5703864353312302, + "grad_norm": 0.5259290240369583, + "learning_rate": 1.3326815602728127e-05, + "loss": 0.3713, + "step": 7966 + }, + { + "epoch": 1.5705835962145112, + "grad_norm": 0.5279028181427355, + "learning_rate": 1.332535382747196e-05, + "loss": 0.3557, + "step": 7967 + }, + { + "epoch": 1.5707807570977916, + "grad_norm": 0.7224209367198574, + "learning_rate": 1.332389197232149e-05, + "loss": 0.3688, + "step": 7968 + }, + { + "epoch": 1.5709779179810726, + "grad_norm": 0.48099183427409264, + "learning_rate": 1.3322430037311837e-05, + "loss": 0.3116, + "step": 7969 + }, + { + "epoch": 1.5711750788643533, + "grad_norm": 0.508998063080973, + "learning_rate": 1.332096802247813e-05, + "loss": 0.3365, + "step": 7970 + }, + { + "epoch": 1.571372239747634, + "grad_norm": 0.49220793045424055, + "learning_rate": 1.331950592785549e-05, + "loss": 0.3522, + "step": 7971 + }, + { + "epoch": 1.5715694006309149, + "grad_norm": 0.5332391139433758, + "learning_rate": 1.3318043753479047e-05, + "loss": 0.3892, + "step": 7972 + }, + { + "epoch": 1.5717665615141956, + "grad_norm": 0.4951350369532224, + "learning_rate": 1.3316581499383929e-05, + "loss": 0.3295, + "step": 7973 + }, + { + "epoch": 1.5719637223974763, + "grad_norm": 0.5140523975794388, + "learning_rate": 1.3315119165605273e-05, + "loss": 0.3443, + "step": 7974 + }, + { + "epoch": 1.5721608832807572, + "grad_norm": 0.5191099345240461, + "learning_rate": 1.3313656752178205e-05, + "loss": 0.3456, + "step": 7975 + }, + { + "epoch": 1.572358044164038, + "grad_norm": 0.5150428182108653, + "learning_rate": 1.331219425913787e-05, + "loss": 0.3262, + "step": 7976 + }, + { + "epoch": 1.5725552050473186, + "grad_norm": 0.4909428641563583, + "learning_rate": 1.3310731686519397e-05, + "loss": 0.3172, + "step": 7977 + }, + { + "epoch": 1.5727523659305995, + "grad_norm": 0.4961393599175189, + "learning_rate": 1.3309269034357931e-05, + "loss": 0.3097, + "step": 7978 + }, + { + "epoch": 1.57294952681388, + "grad_norm": 0.5318028453318919, + "learning_rate": 1.330780630268861e-05, + "loss": 0.3635, + "step": 7979 + }, + { + "epoch": 1.573146687697161, + "grad_norm": 0.5996323717685671, + "learning_rate": 1.3306343491546581e-05, + "loss": 0.3703, + "step": 7980 + }, + { + "epoch": 1.5733438485804416, + "grad_norm": 1.5419005107960566, + "learning_rate": 1.3304880600966985e-05, + "loss": 0.3559, + "step": 7981 + }, + { + "epoch": 1.5735410094637223, + "grad_norm": 0.4648861254663534, + "learning_rate": 1.3303417630984972e-05, + "loss": 0.31, + "step": 7982 + }, + { + "epoch": 1.5737381703470033, + "grad_norm": 0.5014405931273911, + "learning_rate": 1.3301954581635692e-05, + "loss": 0.3542, + "step": 7983 + }, + { + "epoch": 1.573935331230284, + "grad_norm": 0.47324297334032567, + "learning_rate": 1.3300491452954292e-05, + "loss": 0.3439, + "step": 7984 + }, + { + "epoch": 1.5741324921135647, + "grad_norm": 0.5098324583372497, + "learning_rate": 1.3299028244975929e-05, + "loss": 0.3528, + "step": 7985 + }, + { + "epoch": 1.5743296529968456, + "grad_norm": 0.48462830498298404, + "learning_rate": 1.3297564957735752e-05, + "loss": 0.3318, + "step": 7986 + }, + { + "epoch": 1.574526813880126, + "grad_norm": 0.5391632212698709, + "learning_rate": 1.3296101591268924e-05, + "loss": 0.3632, + "step": 7987 + }, + { + "epoch": 1.574723974763407, + "grad_norm": 0.48926713072275946, + "learning_rate": 1.3294638145610598e-05, + "loss": 0.314, + "step": 7988 + }, + { + "epoch": 1.5749211356466877, + "grad_norm": 0.5220045491815719, + "learning_rate": 1.3293174620795942e-05, + "loss": 0.3469, + "step": 7989 + }, + { + "epoch": 1.5751182965299684, + "grad_norm": 0.5103365539849146, + "learning_rate": 1.329171101686011e-05, + "loss": 0.3431, + "step": 7990 + }, + { + "epoch": 1.5753154574132493, + "grad_norm": 0.49515948764836015, + "learning_rate": 1.329024733383827e-05, + "loss": 0.312, + "step": 7991 + }, + { + "epoch": 1.57551261829653, + "grad_norm": 0.47764308725452986, + "learning_rate": 1.328878357176559e-05, + "loss": 0.3156, + "step": 7992 + }, + { + "epoch": 1.5757097791798107, + "grad_norm": 0.4913518129002428, + "learning_rate": 1.3287319730677237e-05, + "loss": 0.3449, + "step": 7993 + }, + { + "epoch": 1.5759069400630916, + "grad_norm": 0.5199989452514197, + "learning_rate": 1.3285855810608377e-05, + "loss": 0.3572, + "step": 7994 + }, + { + "epoch": 1.576104100946372, + "grad_norm": 0.46619302997703116, + "learning_rate": 1.3284391811594191e-05, + "loss": 0.3115, + "step": 7995 + }, + { + "epoch": 1.576301261829653, + "grad_norm": 0.532011015620942, + "learning_rate": 1.3282927733669842e-05, + "loss": 0.3377, + "step": 7996 + }, + { + "epoch": 1.5764984227129337, + "grad_norm": 1.440253937845809, + "learning_rate": 1.328146357687051e-05, + "loss": 0.3688, + "step": 7997 + }, + { + "epoch": 1.5766955835962144, + "grad_norm": 1.0850935873776184, + "learning_rate": 1.3279999341231375e-05, + "loss": 0.338, + "step": 7998 + }, + { + "epoch": 1.5768927444794953, + "grad_norm": 0.5168104366063799, + "learning_rate": 1.3278535026787614e-05, + "loss": 0.3501, + "step": 7999 + }, + { + "epoch": 1.577089905362776, + "grad_norm": 0.5139066874038835, + "learning_rate": 1.3277070633574409e-05, + "loss": 0.3414, + "step": 8000 + }, + { + "epoch": 1.5772870662460567, + "grad_norm": 0.48443540557771675, + "learning_rate": 1.3275606161626941e-05, + "loss": 0.3271, + "step": 8001 + }, + { + "epoch": 1.5774842271293377, + "grad_norm": 0.5239644881242183, + "learning_rate": 1.32741416109804e-05, + "loss": 0.3445, + "step": 8002 + }, + { + "epoch": 1.5776813880126181, + "grad_norm": 0.46561515482433635, + "learning_rate": 1.3272676981669968e-05, + "loss": 0.3232, + "step": 8003 + }, + { + "epoch": 1.577878548895899, + "grad_norm": 0.5355952021642951, + "learning_rate": 1.327121227373084e-05, + "loss": 0.3552, + "step": 8004 + }, + { + "epoch": 1.5780757097791798, + "grad_norm": 0.511194087803704, + "learning_rate": 1.3269747487198197e-05, + "loss": 0.3651, + "step": 8005 + }, + { + "epoch": 1.5782728706624605, + "grad_norm": 0.463393152472201, + "learning_rate": 1.326828262210724e-05, + "loss": 0.3282, + "step": 8006 + }, + { + "epoch": 1.5784700315457414, + "grad_norm": 0.49704658525209167, + "learning_rate": 1.326681767849316e-05, + "loss": 0.3174, + "step": 8007 + }, + { + "epoch": 1.578667192429022, + "grad_norm": 0.5412028232341971, + "learning_rate": 1.3265352656391158e-05, + "loss": 0.349, + "step": 8008 + }, + { + "epoch": 1.5788643533123028, + "grad_norm": 0.4698860322570366, + "learning_rate": 1.3263887555836425e-05, + "loss": 0.3106, + "step": 8009 + }, + { + "epoch": 1.5790615141955837, + "grad_norm": 0.4982618521435573, + "learning_rate": 1.3262422376864168e-05, + "loss": 0.3262, + "step": 8010 + }, + { + "epoch": 1.5792586750788642, + "grad_norm": 0.539325905160115, + "learning_rate": 1.3260957119509586e-05, + "loss": 0.3502, + "step": 8011 + }, + { + "epoch": 1.5794558359621451, + "grad_norm": 0.5288545896046527, + "learning_rate": 1.325949178380788e-05, + "loss": 0.3692, + "step": 8012 + }, + { + "epoch": 1.5796529968454258, + "grad_norm": 0.4758545656821426, + "learning_rate": 1.3258026369794261e-05, + "loss": 0.3283, + "step": 8013 + }, + { + "epoch": 1.5798501577287065, + "grad_norm": 0.5218405213561202, + "learning_rate": 1.3256560877503936e-05, + "loss": 0.3415, + "step": 8014 + }, + { + "epoch": 1.5800473186119874, + "grad_norm": 0.4894784063780892, + "learning_rate": 1.3255095306972112e-05, + "loss": 0.3269, + "step": 8015 + }, + { + "epoch": 1.5802444794952681, + "grad_norm": 0.5110496055324127, + "learning_rate": 1.3253629658234002e-05, + "loss": 0.3542, + "step": 8016 + }, + { + "epoch": 1.5804416403785488, + "grad_norm": 0.49174003178088704, + "learning_rate": 1.325216393132482e-05, + "loss": 0.3394, + "step": 8017 + }, + { + "epoch": 1.5806388012618298, + "grad_norm": 0.47299800973515416, + "learning_rate": 1.3250698126279781e-05, + "loss": 0.3419, + "step": 8018 + }, + { + "epoch": 1.5808359621451105, + "grad_norm": 0.4822158106803239, + "learning_rate": 1.32492322431341e-05, + "loss": 0.3539, + "step": 8019 + }, + { + "epoch": 1.5810331230283912, + "grad_norm": 0.47952679544937005, + "learning_rate": 1.3247766281922998e-05, + "loss": 0.3468, + "step": 8020 + }, + { + "epoch": 1.581230283911672, + "grad_norm": 0.46714001850575293, + "learning_rate": 1.3246300242681698e-05, + "loss": 0.3468, + "step": 8021 + }, + { + "epoch": 1.5814274447949526, + "grad_norm": 0.5770765745101369, + "learning_rate": 1.3244834125445415e-05, + "loss": 0.3658, + "step": 8022 + }, + { + "epoch": 1.5816246056782335, + "grad_norm": 0.466026202641457, + "learning_rate": 1.3243367930249386e-05, + "loss": 0.3342, + "step": 8023 + }, + { + "epoch": 1.5818217665615142, + "grad_norm": 0.5082981481974369, + "learning_rate": 1.3241901657128827e-05, + "loss": 0.3635, + "step": 8024 + }, + { + "epoch": 1.5820189274447949, + "grad_norm": 0.48542475906619664, + "learning_rate": 1.3240435306118973e-05, + "loss": 0.3553, + "step": 8025 + }, + { + "epoch": 1.5822160883280758, + "grad_norm": 0.4787474049672265, + "learning_rate": 1.3238968877255044e-05, + "loss": 0.3357, + "step": 8026 + }, + { + "epoch": 1.5824132492113565, + "grad_norm": 0.5075996681809781, + "learning_rate": 1.3237502370572287e-05, + "loss": 0.3479, + "step": 8027 + }, + { + "epoch": 1.5826104100946372, + "grad_norm": 0.5129452355663832, + "learning_rate": 1.3236035786105922e-05, + "loss": 0.3598, + "step": 8028 + }, + { + "epoch": 1.5828075709779181, + "grad_norm": 0.45167882333931725, + "learning_rate": 1.3234569123891197e-05, + "loss": 0.3199, + "step": 8029 + }, + { + "epoch": 1.5830047318611986, + "grad_norm": 0.4891940893562189, + "learning_rate": 1.3233102383963341e-05, + "loss": 0.3588, + "step": 8030 + }, + { + "epoch": 1.5832018927444795, + "grad_norm": 0.4924263041994256, + "learning_rate": 1.3231635566357599e-05, + "loss": 0.3558, + "step": 8031 + }, + { + "epoch": 1.5833990536277602, + "grad_norm": 0.5691351727091879, + "learning_rate": 1.3230168671109207e-05, + "loss": 0.3215, + "step": 8032 + }, + { + "epoch": 1.583596214511041, + "grad_norm": 0.5454873000853145, + "learning_rate": 1.3228701698253415e-05, + "loss": 0.3561, + "step": 8033 + }, + { + "epoch": 1.5837933753943219, + "grad_norm": 4.541252989056811, + "learning_rate": 1.3227234647825463e-05, + "loss": 0.329, + "step": 8034 + }, + { + "epoch": 1.5839905362776026, + "grad_norm": 0.5120743849675218, + "learning_rate": 1.3225767519860597e-05, + "loss": 0.342, + "step": 8035 + }, + { + "epoch": 1.5841876971608833, + "grad_norm": 0.5241749900315733, + "learning_rate": 1.3224300314394073e-05, + "loss": 0.3847, + "step": 8036 + }, + { + "epoch": 1.5843848580441642, + "grad_norm": 0.5165462915069103, + "learning_rate": 1.3222833031461133e-05, + "loss": 0.3414, + "step": 8037 + }, + { + "epoch": 1.5845820189274447, + "grad_norm": 0.5189206607460046, + "learning_rate": 1.3221365671097038e-05, + "loss": 0.3291, + "step": 8038 + }, + { + "epoch": 1.5847791798107256, + "grad_norm": 0.5059501509473748, + "learning_rate": 1.3219898233337036e-05, + "loss": 0.3249, + "step": 8039 + }, + { + "epoch": 1.5849763406940063, + "grad_norm": 0.4904431912501636, + "learning_rate": 1.321843071821639e-05, + "loss": 0.3435, + "step": 8040 + }, + { + "epoch": 1.585173501577287, + "grad_norm": 0.45852925927612104, + "learning_rate": 1.3216963125770345e-05, + "loss": 0.3055, + "step": 8041 + }, + { + "epoch": 1.585370662460568, + "grad_norm": 0.5589366409498242, + "learning_rate": 1.3215495456034179e-05, + "loss": 0.3253, + "step": 8042 + }, + { + "epoch": 1.5855678233438486, + "grad_norm": 0.49532146559960627, + "learning_rate": 1.3214027709043142e-05, + "loss": 0.3442, + "step": 8043 + }, + { + "epoch": 1.5857649842271293, + "grad_norm": 0.49902148390151746, + "learning_rate": 1.3212559884832503e-05, + "loss": 0.3499, + "step": 8044 + }, + { + "epoch": 1.5859621451104102, + "grad_norm": 0.5117292560455523, + "learning_rate": 1.3211091983437524e-05, + "loss": 0.3502, + "step": 8045 + }, + { + "epoch": 1.5861593059936907, + "grad_norm": 0.4829964496583425, + "learning_rate": 1.3209624004893476e-05, + "loss": 0.321, + "step": 8046 + }, + { + "epoch": 1.5863564668769716, + "grad_norm": 0.5776878181785877, + "learning_rate": 1.3208155949235621e-05, + "loss": 0.3194, + "step": 8047 + }, + { + "epoch": 1.5865536277602523, + "grad_norm": 0.44848690865415186, + "learning_rate": 1.3206687816499242e-05, + "loss": 0.3077, + "step": 8048 + }, + { + "epoch": 1.586750788643533, + "grad_norm": 0.4881456482953011, + "learning_rate": 1.3205219606719606e-05, + "loss": 0.3246, + "step": 8049 + }, + { + "epoch": 1.586947949526814, + "grad_norm": 0.5011767838202971, + "learning_rate": 1.3203751319931983e-05, + "loss": 0.355, + "step": 8050 + }, + { + "epoch": 1.5871451104100947, + "grad_norm": 0.5664107192411584, + "learning_rate": 1.320228295617166e-05, + "loss": 0.3579, + "step": 8051 + }, + { + "epoch": 1.5873422712933754, + "grad_norm": 3.4551884902309777, + "learning_rate": 1.3200814515473905e-05, + "loss": 0.3417, + "step": 8052 + }, + { + "epoch": 1.5875394321766563, + "grad_norm": 0.5286194871009565, + "learning_rate": 1.3199345997874007e-05, + "loss": 0.3351, + "step": 8053 + }, + { + "epoch": 1.5877365930599368, + "grad_norm": 0.4625651878543429, + "learning_rate": 1.3197877403407242e-05, + "loss": 0.3198, + "step": 8054 + }, + { + "epoch": 1.5879337539432177, + "grad_norm": 0.6352734118743283, + "learning_rate": 1.31964087321089e-05, + "loss": 0.3448, + "step": 8055 + }, + { + "epoch": 1.5881309148264984, + "grad_norm": 0.503782340045354, + "learning_rate": 1.3194939984014263e-05, + "loss": 0.354, + "step": 8056 + }, + { + "epoch": 1.588328075709779, + "grad_norm": 0.5794061421651925, + "learning_rate": 1.3193471159158621e-05, + "loss": 0.3621, + "step": 8057 + }, + { + "epoch": 1.58852523659306, + "grad_norm": 0.49275969055339675, + "learning_rate": 1.3192002257577263e-05, + "loss": 0.3333, + "step": 8058 + }, + { + "epoch": 1.5887223974763407, + "grad_norm": 0.4815640944601471, + "learning_rate": 1.319053327930548e-05, + "loss": 0.341, + "step": 8059 + }, + { + "epoch": 1.5889195583596214, + "grad_norm": 0.4613014999066501, + "learning_rate": 1.3189064224378562e-05, + "loss": 0.3101, + "step": 8060 + }, + { + "epoch": 1.5891167192429023, + "grad_norm": 0.48626447035569825, + "learning_rate": 1.3187595092831813e-05, + "loss": 0.3146, + "step": 8061 + }, + { + "epoch": 1.589313880126183, + "grad_norm": 0.4577848208645168, + "learning_rate": 1.3186125884700522e-05, + "loss": 0.3175, + "step": 8062 + }, + { + "epoch": 1.5895110410094637, + "grad_norm": 0.49457076312235915, + "learning_rate": 1.3184656600019992e-05, + "loss": 0.353, + "step": 8063 + }, + { + "epoch": 1.5897082018927446, + "grad_norm": 0.47999535138775207, + "learning_rate": 1.318318723882552e-05, + "loss": 0.3392, + "step": 8064 + }, + { + "epoch": 1.5899053627760251, + "grad_norm": 0.5418783986590079, + "learning_rate": 1.3181717801152414e-05, + "loss": 0.3415, + "step": 8065 + }, + { + "epoch": 1.590102523659306, + "grad_norm": 0.5096260948506632, + "learning_rate": 1.3180248287035977e-05, + "loss": 0.3262, + "step": 8066 + }, + { + "epoch": 1.5902996845425867, + "grad_norm": 0.4906247366527042, + "learning_rate": 1.3178778696511511e-05, + "loss": 0.3384, + "step": 8067 + }, + { + "epoch": 1.5904968454258674, + "grad_norm": 0.470211162945986, + "learning_rate": 1.317730902961433e-05, + "loss": 0.3244, + "step": 8068 + }, + { + "epoch": 1.5906940063091484, + "grad_norm": 0.49115378811432053, + "learning_rate": 1.3175839286379734e-05, + "loss": 0.3422, + "step": 8069 + }, + { + "epoch": 1.590891167192429, + "grad_norm": 0.5013805949559059, + "learning_rate": 1.3174369466843048e-05, + "loss": 0.3441, + "step": 8070 + }, + { + "epoch": 1.5910883280757098, + "grad_norm": 0.4876708210398381, + "learning_rate": 1.3172899571039577e-05, + "loss": 0.3444, + "step": 8071 + }, + { + "epoch": 1.5912854889589907, + "grad_norm": 0.45242714845948756, + "learning_rate": 1.3171429599004641e-05, + "loss": 0.3162, + "step": 8072 + }, + { + "epoch": 1.5914826498422712, + "grad_norm": 0.47227360673852276, + "learning_rate": 1.316995955077355e-05, + "loss": 0.3263, + "step": 8073 + }, + { + "epoch": 1.591679810725552, + "grad_norm": 0.4875936546980924, + "learning_rate": 1.3168489426381635e-05, + "loss": 0.3398, + "step": 8074 + }, + { + "epoch": 1.5918769716088328, + "grad_norm": 0.483433090727928, + "learning_rate": 1.3167019225864203e-05, + "loss": 0.3262, + "step": 8075 + }, + { + "epoch": 1.5920741324921135, + "grad_norm": 0.4983273920225542, + "learning_rate": 1.3165548949256586e-05, + "loss": 0.3528, + "step": 8076 + }, + { + "epoch": 1.5922712933753944, + "grad_norm": 0.48450953137363295, + "learning_rate": 1.3164078596594107e-05, + "loss": 0.3452, + "step": 8077 + }, + { + "epoch": 1.5924684542586751, + "grad_norm": 0.4623023529103843, + "learning_rate": 1.3162608167912091e-05, + "loss": 0.3421, + "step": 8078 + }, + { + "epoch": 1.5926656151419558, + "grad_norm": 1.1891945757617244, + "learning_rate": 1.3161137663245869e-05, + "loss": 0.335, + "step": 8079 + }, + { + "epoch": 1.5928627760252367, + "grad_norm": 0.46625342878642473, + "learning_rate": 1.3159667082630768e-05, + "loss": 0.3017, + "step": 8080 + }, + { + "epoch": 1.5930599369085172, + "grad_norm": 0.4885959593564677, + "learning_rate": 1.3158196426102121e-05, + "loss": 0.3207, + "step": 8081 + }, + { + "epoch": 1.5932570977917981, + "grad_norm": 0.5374079371161415, + "learning_rate": 1.315672569369526e-05, + "loss": 0.3292, + "step": 8082 + }, + { + "epoch": 1.5934542586750788, + "grad_norm": 0.47767332144300273, + "learning_rate": 1.3155254885445526e-05, + "loss": 0.3134, + "step": 8083 + }, + { + "epoch": 1.5936514195583595, + "grad_norm": 0.4563667095812108, + "learning_rate": 1.3153784001388249e-05, + "loss": 0.3181, + "step": 8084 + }, + { + "epoch": 1.5938485804416405, + "grad_norm": 0.4891329550595928, + "learning_rate": 1.315231304155877e-05, + "loss": 0.352, + "step": 8085 + }, + { + "epoch": 1.5940457413249212, + "grad_norm": 0.4746370559092902, + "learning_rate": 1.3150842005992434e-05, + "loss": 0.3107, + "step": 8086 + }, + { + "epoch": 1.5942429022082019, + "grad_norm": 0.5821961651457958, + "learning_rate": 1.3149370894724583e-05, + "loss": 0.3215, + "step": 8087 + }, + { + "epoch": 1.5944400630914828, + "grad_norm": 0.4692821907783642, + "learning_rate": 1.3147899707790557e-05, + "loss": 0.3383, + "step": 8088 + }, + { + "epoch": 1.5946372239747633, + "grad_norm": 0.6369717589355536, + "learning_rate": 1.3146428445225708e-05, + "loss": 0.3516, + "step": 8089 + }, + { + "epoch": 1.5948343848580442, + "grad_norm": 0.5091082227275152, + "learning_rate": 1.3144957107065379e-05, + "loss": 0.3598, + "step": 8090 + }, + { + "epoch": 1.5950315457413249, + "grad_norm": 0.4761782875190844, + "learning_rate": 1.3143485693344925e-05, + "loss": 0.3409, + "step": 8091 + }, + { + "epoch": 1.5952287066246056, + "grad_norm": 0.46602014777166734, + "learning_rate": 1.3142014204099696e-05, + "loss": 0.3135, + "step": 8092 + }, + { + "epoch": 1.5954258675078865, + "grad_norm": 0.4514528986309173, + "learning_rate": 1.3140542639365047e-05, + "loss": 0.3134, + "step": 8093 + }, + { + "epoch": 1.5956230283911672, + "grad_norm": 0.5297263361088499, + "learning_rate": 1.3139070999176326e-05, + "loss": 0.3419, + "step": 8094 + }, + { + "epoch": 1.595820189274448, + "grad_norm": 0.5469443697151118, + "learning_rate": 1.3137599283568902e-05, + "loss": 0.3563, + "step": 8095 + }, + { + "epoch": 1.5960173501577288, + "grad_norm": 0.4957278314999418, + "learning_rate": 1.3136127492578126e-05, + "loss": 0.355, + "step": 8096 + }, + { + "epoch": 1.5962145110410093, + "grad_norm": 0.5166791104046938, + "learning_rate": 1.3134655626239363e-05, + "loss": 0.3296, + "step": 8097 + }, + { + "epoch": 1.5964116719242902, + "grad_norm": 0.5016830354531797, + "learning_rate": 1.3133183684587974e-05, + "loss": 0.3412, + "step": 8098 + }, + { + "epoch": 1.596608832807571, + "grad_norm": 0.4893337636428677, + "learning_rate": 1.3131711667659323e-05, + "loss": 0.339, + "step": 8099 + }, + { + "epoch": 1.5968059936908516, + "grad_norm": 0.467330301371642, + "learning_rate": 1.3130239575488777e-05, + "loss": 0.2984, + "step": 8100 + }, + { + "epoch": 1.5970031545741326, + "grad_norm": 3.5270122225585343, + "learning_rate": 1.3128767408111704e-05, + "loss": 0.3816, + "step": 8101 + }, + { + "epoch": 1.5972003154574133, + "grad_norm": 0.46859234609245476, + "learning_rate": 1.3127295165563476e-05, + "loss": 0.3137, + "step": 8102 + }, + { + "epoch": 1.597397476340694, + "grad_norm": 0.5086912187536752, + "learning_rate": 1.3125822847879464e-05, + "loss": 0.364, + "step": 8103 + }, + { + "epoch": 1.5975946372239749, + "grad_norm": 0.46400065943464436, + "learning_rate": 1.312435045509504e-05, + "loss": 0.3094, + "step": 8104 + }, + { + "epoch": 1.5977917981072554, + "grad_norm": 0.5054767816862595, + "learning_rate": 1.3122877987245579e-05, + "loss": 0.3287, + "step": 8105 + }, + { + "epoch": 1.5979889589905363, + "grad_norm": 0.49607387723822316, + "learning_rate": 1.3121405444366459e-05, + "loss": 0.3377, + "step": 8106 + }, + { + "epoch": 1.598186119873817, + "grad_norm": 0.4921049056516874, + "learning_rate": 1.3119932826493063e-05, + "loss": 0.3322, + "step": 8107 + }, + { + "epoch": 1.5983832807570977, + "grad_norm": 0.4850703791480183, + "learning_rate": 1.3118460133660766e-05, + "loss": 0.317, + "step": 8108 + }, + { + "epoch": 1.5985804416403786, + "grad_norm": 0.4917675892451221, + "learning_rate": 1.3116987365904951e-05, + "loss": 0.3443, + "step": 8109 + }, + { + "epoch": 1.5987776025236593, + "grad_norm": 0.5527143499695473, + "learning_rate": 1.3115514523261008e-05, + "loss": 0.317, + "step": 8110 + }, + { + "epoch": 1.59897476340694, + "grad_norm": 0.4889167905883699, + "learning_rate": 1.3114041605764319e-05, + "loss": 0.3288, + "step": 8111 + }, + { + "epoch": 1.599171924290221, + "grad_norm": 0.4604739649545089, + "learning_rate": 1.3112568613450271e-05, + "loss": 0.3237, + "step": 8112 + }, + { + "epoch": 1.5993690851735016, + "grad_norm": 0.4826891223593189, + "learning_rate": 1.3111095546354257e-05, + "loss": 0.3372, + "step": 8113 + }, + { + "epoch": 1.5995662460567823, + "grad_norm": 0.4627352710214227, + "learning_rate": 1.3109622404511669e-05, + "loss": 0.3274, + "step": 8114 + }, + { + "epoch": 1.5997634069400632, + "grad_norm": 0.5183424926340384, + "learning_rate": 1.3108149187957895e-05, + "loss": 0.3668, + "step": 8115 + }, + { + "epoch": 1.5999605678233437, + "grad_norm": 0.4890735359860389, + "learning_rate": 1.3106675896728334e-05, + "loss": 0.3281, + "step": 8116 + }, + { + "epoch": 1.6001577287066246, + "grad_norm": 0.4663459951593309, + "learning_rate": 1.3105202530858386e-05, + "loss": 0.3157, + "step": 8117 + }, + { + "epoch": 1.6003548895899053, + "grad_norm": 0.4642219805501338, + "learning_rate": 1.310372909038344e-05, + "loss": 0.322, + "step": 8118 + }, + { + "epoch": 1.600552050473186, + "grad_norm": 0.4883535083604666, + "learning_rate": 1.3102255575338912e-05, + "loss": 0.3544, + "step": 8119 + }, + { + "epoch": 1.600749211356467, + "grad_norm": 0.4938965248583283, + "learning_rate": 1.3100781985760188e-05, + "loss": 0.3415, + "step": 8120 + }, + { + "epoch": 1.6009463722397477, + "grad_norm": 0.508009918172584, + "learning_rate": 1.3099308321682685e-05, + "loss": 0.3645, + "step": 8121 + }, + { + "epoch": 1.6011435331230284, + "grad_norm": 0.4977394454270282, + "learning_rate": 1.30978345831418e-05, + "loss": 0.37, + "step": 8122 + }, + { + "epoch": 1.6013406940063093, + "grad_norm": 0.4827810334398335, + "learning_rate": 1.3096360770172947e-05, + "loss": 0.3373, + "step": 8123 + }, + { + "epoch": 1.6015378548895898, + "grad_norm": 0.5238029508010671, + "learning_rate": 1.309488688281153e-05, + "loss": 0.3907, + "step": 8124 + }, + { + "epoch": 1.6017350157728707, + "grad_norm": 0.4897188747786206, + "learning_rate": 1.3093412921092967e-05, + "loss": 0.3295, + "step": 8125 + }, + { + "epoch": 1.6019321766561514, + "grad_norm": 0.4957516780778494, + "learning_rate": 1.3091938885052665e-05, + "loss": 0.35, + "step": 8126 + }, + { + "epoch": 1.602129337539432, + "grad_norm": 0.5216427832225966, + "learning_rate": 1.3090464774726042e-05, + "loss": 0.3486, + "step": 8127 + }, + { + "epoch": 1.602326498422713, + "grad_norm": 0.5159704540850552, + "learning_rate": 1.3088990590148516e-05, + "loss": 0.355, + "step": 8128 + }, + { + "epoch": 1.6025236593059937, + "grad_norm": 0.49842376144042355, + "learning_rate": 1.3087516331355501e-05, + "loss": 0.3439, + "step": 8129 + }, + { + "epoch": 1.6027208201892744, + "grad_norm": 0.4666686177899867, + "learning_rate": 1.3086041998382419e-05, + "loss": 0.3322, + "step": 8130 + }, + { + "epoch": 1.6029179810725553, + "grad_norm": 0.6474512146454534, + "learning_rate": 1.3084567591264694e-05, + "loss": 0.3618, + "step": 8131 + }, + { + "epoch": 1.6031151419558358, + "grad_norm": 0.5026757411329433, + "learning_rate": 1.308309311003775e-05, + "loss": 0.3374, + "step": 8132 + }, + { + "epoch": 1.6033123028391167, + "grad_norm": 0.4966462917789033, + "learning_rate": 1.308161855473701e-05, + "loss": 0.326, + "step": 8133 + }, + { + "epoch": 1.6035094637223974, + "grad_norm": 0.48096431260184735, + "learning_rate": 1.3080143925397904e-05, + "loss": 0.3368, + "step": 8134 + }, + { + "epoch": 1.6037066246056781, + "grad_norm": 0.48619845262827444, + "learning_rate": 1.3078669222055858e-05, + "loss": 0.3617, + "step": 8135 + }, + { + "epoch": 1.603903785488959, + "grad_norm": 1.8367614071058735, + "learning_rate": 1.3077194444746307e-05, + "loss": 0.3497, + "step": 8136 + }, + { + "epoch": 1.6041009463722398, + "grad_norm": 0.512344448724785, + "learning_rate": 1.3075719593504674e-05, + "loss": 0.3413, + "step": 8137 + }, + { + "epoch": 1.6042981072555205, + "grad_norm": 0.5211149611982421, + "learning_rate": 1.3074244668366412e-05, + "loss": 0.3447, + "step": 8138 + }, + { + "epoch": 1.6044952681388014, + "grad_norm": 0.48739207546058483, + "learning_rate": 1.3072769669366938e-05, + "loss": 0.3083, + "step": 8139 + }, + { + "epoch": 1.6046924290220819, + "grad_norm": 0.49212067031603235, + "learning_rate": 1.3071294596541701e-05, + "loss": 0.3196, + "step": 8140 + }, + { + "epoch": 1.6048895899053628, + "grad_norm": 3.7626457727782956, + "learning_rate": 1.3069819449926136e-05, + "loss": 0.4303, + "step": 8141 + }, + { + "epoch": 1.6050867507886435, + "grad_norm": 0.5830689224709581, + "learning_rate": 1.3068344229555692e-05, + "loss": 0.3308, + "step": 8142 + }, + { + "epoch": 1.6052839116719242, + "grad_norm": 0.49521045756115595, + "learning_rate": 1.30668689354658e-05, + "loss": 0.3391, + "step": 8143 + }, + { + "epoch": 1.6054810725552051, + "grad_norm": 0.5145784616836976, + "learning_rate": 1.3065393567691914e-05, + "loss": 0.3544, + "step": 8144 + }, + { + "epoch": 1.6056782334384858, + "grad_norm": 0.5028725718766839, + "learning_rate": 1.3063918126269483e-05, + "loss": 0.3444, + "step": 8145 + }, + { + "epoch": 1.6058753943217665, + "grad_norm": 0.5769436395830195, + "learning_rate": 1.3062442611233949e-05, + "loss": 0.3543, + "step": 8146 + }, + { + "epoch": 1.6060725552050474, + "grad_norm": 0.5022646683039049, + "learning_rate": 1.3060967022620766e-05, + "loss": 0.3165, + "step": 8147 + }, + { + "epoch": 1.606269716088328, + "grad_norm": 0.4827891396259431, + "learning_rate": 1.3059491360465384e-05, + "loss": 0.3357, + "step": 8148 + }, + { + "epoch": 1.6064668769716088, + "grad_norm": 0.49635418716653323, + "learning_rate": 1.305801562480326e-05, + "loss": 0.3121, + "step": 8149 + }, + { + "epoch": 1.6066640378548895, + "grad_norm": 0.4857542068874944, + "learning_rate": 1.3056539815669846e-05, + "loss": 0.3357, + "step": 8150 + }, + { + "epoch": 1.6068611987381702, + "grad_norm": 0.45147908050559366, + "learning_rate": 1.3055063933100602e-05, + "loss": 0.2883, + "step": 8151 + }, + { + "epoch": 1.6070583596214512, + "grad_norm": 0.46546726459791526, + "learning_rate": 1.3053587977130988e-05, + "loss": 0.3118, + "step": 8152 + }, + { + "epoch": 1.6072555205047319, + "grad_norm": 0.4993862309452187, + "learning_rate": 1.3052111947796463e-05, + "loss": 0.3594, + "step": 8153 + }, + { + "epoch": 1.6074526813880126, + "grad_norm": 0.4879439935041709, + "learning_rate": 1.305063584513249e-05, + "loss": 0.3151, + "step": 8154 + }, + { + "epoch": 1.6076498422712935, + "grad_norm": 0.48554820642690266, + "learning_rate": 1.3049159669174534e-05, + "loss": 0.3031, + "step": 8155 + }, + { + "epoch": 1.6078470031545742, + "grad_norm": 0.500184176222356, + "learning_rate": 1.3047683419958062e-05, + "loss": 0.3406, + "step": 8156 + }, + { + "epoch": 1.6080441640378549, + "grad_norm": 0.5063923589890583, + "learning_rate": 1.3046207097518542e-05, + "loss": 0.3499, + "step": 8157 + }, + { + "epoch": 1.6082413249211358, + "grad_norm": 0.49816615378885987, + "learning_rate": 1.3044730701891442e-05, + "loss": 0.3237, + "step": 8158 + }, + { + "epoch": 1.6084384858044163, + "grad_norm": 0.4925753536085062, + "learning_rate": 1.3043254233112237e-05, + "loss": 0.3368, + "step": 8159 + }, + { + "epoch": 1.6086356466876972, + "grad_norm": 0.5310015500392122, + "learning_rate": 1.3041777691216395e-05, + "loss": 0.3725, + "step": 8160 + }, + { + "epoch": 1.608832807570978, + "grad_norm": 0.49537678955838665, + "learning_rate": 1.3040301076239398e-05, + "loss": 0.3287, + "step": 8161 + }, + { + "epoch": 1.6090299684542586, + "grad_norm": 0.5527761110782208, + "learning_rate": 1.3038824388216718e-05, + "loss": 0.3726, + "step": 8162 + }, + { + "epoch": 1.6092271293375395, + "grad_norm": 0.447484220574138, + "learning_rate": 1.3037347627183835e-05, + "loss": 0.3021, + "step": 8163 + }, + { + "epoch": 1.6094242902208202, + "grad_norm": 0.5179693625577074, + "learning_rate": 1.3035870793176229e-05, + "loss": 0.3551, + "step": 8164 + }, + { + "epoch": 1.609621451104101, + "grad_norm": 0.46927906569820993, + "learning_rate": 1.3034393886229381e-05, + "loss": 0.3183, + "step": 8165 + }, + { + "epoch": 1.6098186119873819, + "grad_norm": 0.498692120573005, + "learning_rate": 1.3032916906378782e-05, + "loss": 0.3039, + "step": 8166 + }, + { + "epoch": 1.6100157728706623, + "grad_norm": 0.501698354263306, + "learning_rate": 1.3031439853659906e-05, + "loss": 0.3314, + "step": 8167 + }, + { + "epoch": 1.6102129337539433, + "grad_norm": 0.8075569329846617, + "learning_rate": 1.302996272810825e-05, + "loss": 0.3544, + "step": 8168 + }, + { + "epoch": 1.610410094637224, + "grad_norm": 0.4790951853547528, + "learning_rate": 1.3028485529759296e-05, + "loss": 0.3349, + "step": 8169 + }, + { + "epoch": 1.6106072555205047, + "grad_norm": 0.48026262439420764, + "learning_rate": 1.3027008258648538e-05, + "loss": 0.3649, + "step": 8170 + }, + { + "epoch": 1.6108044164037856, + "grad_norm": 0.4922180547792553, + "learning_rate": 1.3025530914811473e-05, + "loss": 0.3143, + "step": 8171 + }, + { + "epoch": 1.6110015772870663, + "grad_norm": 0.5198306988127153, + "learning_rate": 1.3024053498283588e-05, + "loss": 0.3275, + "step": 8172 + }, + { + "epoch": 1.611198738170347, + "grad_norm": 0.4742201615022806, + "learning_rate": 1.3022576009100382e-05, + "loss": 0.3558, + "step": 8173 + }, + { + "epoch": 1.611395899053628, + "grad_norm": 0.5014496532783586, + "learning_rate": 1.3021098447297358e-05, + "loss": 0.3412, + "step": 8174 + }, + { + "epoch": 1.6115930599369084, + "grad_norm": 0.49858321941500644, + "learning_rate": 1.3019620812910008e-05, + "loss": 0.3349, + "step": 8175 + }, + { + "epoch": 1.6117902208201893, + "grad_norm": 0.4876528140518932, + "learning_rate": 1.3018143105973835e-05, + "loss": 0.3295, + "step": 8176 + }, + { + "epoch": 1.61198738170347, + "grad_norm": 0.5014129411656993, + "learning_rate": 1.3016665326524343e-05, + "loss": 0.3169, + "step": 8177 + }, + { + "epoch": 1.6121845425867507, + "grad_norm": 0.4709155858213729, + "learning_rate": 1.301518747459704e-05, + "loss": 0.3033, + "step": 8178 + }, + { + "epoch": 1.6123817034700316, + "grad_norm": 0.569369280596523, + "learning_rate": 1.3013709550227429e-05, + "loss": 0.3567, + "step": 8179 + }, + { + "epoch": 1.6125788643533123, + "grad_norm": 0.49624380854953204, + "learning_rate": 1.3012231553451018e-05, + "loss": 0.3198, + "step": 8180 + }, + { + "epoch": 1.612776025236593, + "grad_norm": 0.4572480950811159, + "learning_rate": 1.301075348430332e-05, + "loss": 0.3102, + "step": 8181 + }, + { + "epoch": 1.612973186119874, + "grad_norm": 0.5168349496318023, + "learning_rate": 1.3009275342819842e-05, + "loss": 0.3265, + "step": 8182 + }, + { + "epoch": 1.6131703470031544, + "grad_norm": 0.6022754654599433, + "learning_rate": 1.3007797129036104e-05, + "loss": 0.3463, + "step": 8183 + }, + { + "epoch": 1.6133675078864353, + "grad_norm": 0.49921945706957105, + "learning_rate": 1.3006318842987615e-05, + "loss": 0.3402, + "step": 8184 + }, + { + "epoch": 1.613564668769716, + "grad_norm": 0.4786378011895483, + "learning_rate": 1.3004840484709897e-05, + "loss": 0.3151, + "step": 8185 + }, + { + "epoch": 1.6137618296529967, + "grad_norm": 0.5091965495386273, + "learning_rate": 1.3003362054238465e-05, + "loss": 0.3439, + "step": 8186 + }, + { + "epoch": 1.6139589905362777, + "grad_norm": 0.5075959256110124, + "learning_rate": 1.3001883551608843e-05, + "loss": 0.3539, + "step": 8187 + }, + { + "epoch": 1.6141561514195584, + "grad_norm": 0.51624022913439, + "learning_rate": 1.3000404976856546e-05, + "loss": 0.362, + "step": 8188 + }, + { + "epoch": 1.614353312302839, + "grad_norm": 0.5353531514337778, + "learning_rate": 1.2998926330017109e-05, + "loss": 0.3716, + "step": 8189 + }, + { + "epoch": 1.61455047318612, + "grad_norm": 0.48775573515644133, + "learning_rate": 1.2997447611126049e-05, + "loss": 0.3223, + "step": 8190 + }, + { + "epoch": 1.6147476340694005, + "grad_norm": 0.5058365439971066, + "learning_rate": 1.2995968820218896e-05, + "loss": 0.3351, + "step": 8191 + }, + { + "epoch": 1.6149447949526814, + "grad_norm": 0.4873527967441035, + "learning_rate": 1.2994489957331183e-05, + "loss": 0.3134, + "step": 8192 + }, + { + "epoch": 1.615141955835962, + "grad_norm": 0.5142383938117662, + "learning_rate": 1.2993011022498434e-05, + "loss": 0.365, + "step": 8193 + }, + { + "epoch": 1.6153391167192428, + "grad_norm": 0.5405740913312057, + "learning_rate": 1.2991532015756185e-05, + "loss": 0.3483, + "step": 8194 + }, + { + "epoch": 1.6155362776025237, + "grad_norm": 0.5168750664395806, + "learning_rate": 1.2990052937139972e-05, + "loss": 0.343, + "step": 8195 + }, + { + "epoch": 1.6157334384858044, + "grad_norm": 0.49456698674100874, + "learning_rate": 1.298857378668533e-05, + "loss": 0.3316, + "step": 8196 + }, + { + "epoch": 1.6159305993690851, + "grad_norm": 0.5197649456360487, + "learning_rate": 1.2987094564427794e-05, + "loss": 0.3375, + "step": 8197 + }, + { + "epoch": 1.616127760252366, + "grad_norm": 0.49653557262255904, + "learning_rate": 1.2985615270402904e-05, + "loss": 0.3166, + "step": 8198 + }, + { + "epoch": 1.6163249211356467, + "grad_norm": 0.5287246948423061, + "learning_rate": 1.2984135904646206e-05, + "loss": 0.3684, + "step": 8199 + }, + { + "epoch": 1.6165220820189274, + "grad_norm": 0.5612379255016995, + "learning_rate": 1.298265646719324e-05, + "loss": 0.3378, + "step": 8200 + }, + { + "epoch": 1.6167192429022084, + "grad_norm": 0.5045525899494582, + "learning_rate": 1.2981176958079549e-05, + "loss": 0.3534, + "step": 8201 + }, + { + "epoch": 1.6169164037854888, + "grad_norm": 0.5021046772968188, + "learning_rate": 1.2979697377340681e-05, + "loss": 0.3276, + "step": 8202 + }, + { + "epoch": 1.6171135646687698, + "grad_norm": 0.5041966851488991, + "learning_rate": 1.2978217725012183e-05, + "loss": 0.3378, + "step": 8203 + }, + { + "epoch": 1.6173107255520505, + "grad_norm": 0.4830420145017819, + "learning_rate": 1.2976738001129608e-05, + "loss": 0.3352, + "step": 8204 + }, + { + "epoch": 1.6175078864353312, + "grad_norm": 0.511796475168007, + "learning_rate": 1.2975258205728503e-05, + "loss": 0.3401, + "step": 8205 + }, + { + "epoch": 1.617705047318612, + "grad_norm": 0.4940584230053775, + "learning_rate": 1.2973778338844425e-05, + "loss": 0.3433, + "step": 8206 + }, + { + "epoch": 1.6179022082018928, + "grad_norm": 0.5293789678998656, + "learning_rate": 1.2972298400512926e-05, + "loss": 0.3809, + "step": 8207 + }, + { + "epoch": 1.6180993690851735, + "grad_norm": 0.49843756683541707, + "learning_rate": 1.2970818390769569e-05, + "loss": 0.3397, + "step": 8208 + }, + { + "epoch": 1.6182965299684544, + "grad_norm": 0.45831650251490347, + "learning_rate": 1.2969338309649901e-05, + "loss": 0.302, + "step": 8209 + }, + { + "epoch": 1.618493690851735, + "grad_norm": 0.4793435986752458, + "learning_rate": 1.2967858157189495e-05, + "loss": 0.3479, + "step": 8210 + }, + { + "epoch": 1.6186908517350158, + "grad_norm": 0.48039522018416736, + "learning_rate": 1.2966377933423901e-05, + "loss": 0.3395, + "step": 8211 + }, + { + "epoch": 1.6188880126182965, + "grad_norm": 0.49848149873721165, + "learning_rate": 1.2964897638388694e-05, + "loss": 0.3391, + "step": 8212 + }, + { + "epoch": 1.6190851735015772, + "grad_norm": 0.4735212283074649, + "learning_rate": 1.296341727211943e-05, + "loss": 0.3421, + "step": 8213 + }, + { + "epoch": 1.6192823343848581, + "grad_norm": 0.5087062921882801, + "learning_rate": 1.296193683465168e-05, + "loss": 0.342, + "step": 8214 + }, + { + "epoch": 1.6194794952681388, + "grad_norm": 0.5258996248721838, + "learning_rate": 1.2960456326021013e-05, + "loss": 0.354, + "step": 8215 + }, + { + "epoch": 1.6196766561514195, + "grad_norm": 0.47955351066783924, + "learning_rate": 1.2958975746263e-05, + "loss": 0.3241, + "step": 8216 + }, + { + "epoch": 1.6198738170347005, + "grad_norm": 0.49147931279397045, + "learning_rate": 1.295749509541321e-05, + "loss": 0.3606, + "step": 8217 + }, + { + "epoch": 1.620070977917981, + "grad_norm": 0.5002125077620726, + "learning_rate": 1.2956014373507219e-05, + "loss": 0.3514, + "step": 8218 + }, + { + "epoch": 1.6202681388012619, + "grad_norm": 0.5136920075059789, + "learning_rate": 1.2954533580580603e-05, + "loss": 0.3622, + "step": 8219 + }, + { + "epoch": 1.6204652996845426, + "grad_norm": 0.5015775386249709, + "learning_rate": 1.2953052716668939e-05, + "loss": 0.3395, + "step": 8220 + }, + { + "epoch": 1.6206624605678233, + "grad_norm": 0.48326937347111654, + "learning_rate": 1.2951571781807804e-05, + "loss": 0.3227, + "step": 8221 + }, + { + "epoch": 1.6208596214511042, + "grad_norm": 0.5751545472906557, + "learning_rate": 1.295009077603278e-05, + "loss": 0.3488, + "step": 8222 + }, + { + "epoch": 1.6210567823343849, + "grad_norm": 0.5320211920407559, + "learning_rate": 1.2948609699379451e-05, + "loss": 0.3413, + "step": 8223 + }, + { + "epoch": 1.6212539432176656, + "grad_norm": 0.4687064635801877, + "learning_rate": 1.2947128551883399e-05, + "loss": 0.3454, + "step": 8224 + }, + { + "epoch": 1.6214511041009465, + "grad_norm": 0.49744433210961403, + "learning_rate": 1.294564733358021e-05, + "loss": 0.3515, + "step": 8225 + }, + { + "epoch": 1.621648264984227, + "grad_norm": 0.5069074005833163, + "learning_rate": 1.2944166044505467e-05, + "loss": 0.3457, + "step": 8226 + }, + { + "epoch": 1.621845425867508, + "grad_norm": 0.47976757490607047, + "learning_rate": 1.294268468469477e-05, + "loss": 0.3453, + "step": 8227 + }, + { + "epoch": 1.6220425867507886, + "grad_norm": 0.4952595442891473, + "learning_rate": 1.29412032541837e-05, + "loss": 0.32, + "step": 8228 + }, + { + "epoch": 1.6222397476340693, + "grad_norm": 0.46297510160068384, + "learning_rate": 1.2939721753007857e-05, + "loss": 0.3111, + "step": 8229 + }, + { + "epoch": 1.6224369085173502, + "grad_norm": 0.5043310082743702, + "learning_rate": 1.2938240181202828e-05, + "loss": 0.3748, + "step": 8230 + }, + { + "epoch": 1.622634069400631, + "grad_norm": 0.5095253493670123, + "learning_rate": 1.2936758538804215e-05, + "loss": 0.3488, + "step": 8231 + }, + { + "epoch": 1.6228312302839116, + "grad_norm": 0.4503216039107211, + "learning_rate": 1.2935276825847614e-05, + "loss": 0.3134, + "step": 8232 + }, + { + "epoch": 1.6230283911671926, + "grad_norm": 0.5050115110615685, + "learning_rate": 1.293379504236862e-05, + "loss": 0.3343, + "step": 8233 + }, + { + "epoch": 1.623225552050473, + "grad_norm": 0.5141044664399913, + "learning_rate": 1.293231318840284e-05, + "loss": 0.3302, + "step": 8234 + }, + { + "epoch": 1.623422712933754, + "grad_norm": 0.4852074392315996, + "learning_rate": 1.2930831263985873e-05, + "loss": 0.3247, + "step": 8235 + }, + { + "epoch": 1.6236198738170347, + "grad_norm": 0.5009665060672993, + "learning_rate": 1.2929349269153326e-05, + "loss": 0.3452, + "step": 8236 + }, + { + "epoch": 1.6238170347003154, + "grad_norm": 0.48558116806406154, + "learning_rate": 1.29278672039408e-05, + "loss": 0.3345, + "step": 8237 + }, + { + "epoch": 1.6240141955835963, + "grad_norm": 0.45949344463355146, + "learning_rate": 1.292638506838391e-05, + "loss": 0.3232, + "step": 8238 + }, + { + "epoch": 1.624211356466877, + "grad_norm": 0.4606410064917042, + "learning_rate": 1.2924902862518262e-05, + "loss": 0.3166, + "step": 8239 + }, + { + "epoch": 1.6244085173501577, + "grad_norm": 0.47453261506907685, + "learning_rate": 1.2923420586379466e-05, + "loss": 0.3172, + "step": 8240 + }, + { + "epoch": 1.6246056782334386, + "grad_norm": 0.49860886417423067, + "learning_rate": 1.2921938240003138e-05, + "loss": 0.3468, + "step": 8241 + }, + { + "epoch": 1.624802839116719, + "grad_norm": 0.4842849858710501, + "learning_rate": 1.2920455823424892e-05, + "loss": 0.3301, + "step": 8242 + }, + { + "epoch": 1.625, + "grad_norm": 0.4715791270451386, + "learning_rate": 1.2918973336680339e-05, + "loss": 0.3352, + "step": 8243 + }, + { + "epoch": 1.625197160883281, + "grad_norm": 0.4857708781749364, + "learning_rate": 1.2917490779805105e-05, + "loss": 0.3422, + "step": 8244 + }, + { + "epoch": 1.6253943217665614, + "grad_norm": 1.2910001595707934, + "learning_rate": 1.2916008152834803e-05, + "loss": 0.3522, + "step": 8245 + }, + { + "epoch": 1.6255914826498423, + "grad_norm": 0.46922968126191145, + "learning_rate": 1.2914525455805056e-05, + "loss": 0.3209, + "step": 8246 + }, + { + "epoch": 1.625788643533123, + "grad_norm": 0.5124348833505142, + "learning_rate": 1.291304268875149e-05, + "loss": 0.3444, + "step": 8247 + }, + { + "epoch": 1.6259858044164037, + "grad_norm": 0.4617262156966585, + "learning_rate": 1.2911559851709728e-05, + "loss": 0.3357, + "step": 8248 + }, + { + "epoch": 1.6261829652996846, + "grad_norm": 0.4682239861424123, + "learning_rate": 1.2910076944715394e-05, + "loss": 0.3258, + "step": 8249 + }, + { + "epoch": 1.6263801261829653, + "grad_norm": 0.4926977054765841, + "learning_rate": 1.2908593967804117e-05, + "loss": 0.3357, + "step": 8250 + }, + { + "epoch": 1.626577287066246, + "grad_norm": 0.47277698075135877, + "learning_rate": 1.290711092101153e-05, + "loss": 0.3236, + "step": 8251 + }, + { + "epoch": 1.626774447949527, + "grad_norm": 0.46612901619189784, + "learning_rate": 1.2905627804373259e-05, + "loss": 0.3218, + "step": 8252 + }, + { + "epoch": 1.6269716088328074, + "grad_norm": 0.46710838609859795, + "learning_rate": 1.2904144617924946e-05, + "loss": 0.32, + "step": 8253 + }, + { + "epoch": 1.6271687697160884, + "grad_norm": 0.46480402221238837, + "learning_rate": 1.2902661361702214e-05, + "loss": 0.322, + "step": 8254 + }, + { + "epoch": 1.627365930599369, + "grad_norm": 0.4992203321218696, + "learning_rate": 1.2901178035740709e-05, + "loss": 0.317, + "step": 8255 + }, + { + "epoch": 1.6275630914826498, + "grad_norm": 0.5101087513586251, + "learning_rate": 1.2899694640076062e-05, + "loss": 0.3448, + "step": 8256 + }, + { + "epoch": 1.6277602523659307, + "grad_norm": 0.48008287555221985, + "learning_rate": 1.289821117474392e-05, + "loss": 0.3251, + "step": 8257 + }, + { + "epoch": 1.6279574132492114, + "grad_norm": 0.4797668818388636, + "learning_rate": 1.2896727639779916e-05, + "loss": 0.3361, + "step": 8258 + }, + { + "epoch": 1.628154574132492, + "grad_norm": 0.4713917003117169, + "learning_rate": 1.2895244035219701e-05, + "loss": 0.3121, + "step": 8259 + }, + { + "epoch": 1.628351735015773, + "grad_norm": 0.8057901594512715, + "learning_rate": 1.2893760361098915e-05, + "loss": 0.3256, + "step": 8260 + }, + { + "epoch": 1.6285488958990535, + "grad_norm": 0.4876468535149187, + "learning_rate": 1.2892276617453208e-05, + "loss": 0.3393, + "step": 8261 + }, + { + "epoch": 1.6287460567823344, + "grad_norm": 0.5166514990575025, + "learning_rate": 1.2890792804318224e-05, + "loss": 0.3577, + "step": 8262 + }, + { + "epoch": 1.6289432176656151, + "grad_norm": 0.4791140344047714, + "learning_rate": 1.2889308921729616e-05, + "loss": 0.3388, + "step": 8263 + }, + { + "epoch": 1.6291403785488958, + "grad_norm": 0.4812157889038476, + "learning_rate": 1.2887824969723035e-05, + "loss": 0.3236, + "step": 8264 + }, + { + "epoch": 1.6293375394321767, + "grad_norm": 0.45798962884288835, + "learning_rate": 1.2886340948334132e-05, + "loss": 0.328, + "step": 8265 + }, + { + "epoch": 1.6295347003154574, + "grad_norm": 0.8222202615392995, + "learning_rate": 1.2884856857598564e-05, + "loss": 0.3577, + "step": 8266 + }, + { + "epoch": 1.6297318611987381, + "grad_norm": 0.5528989353996063, + "learning_rate": 1.2883372697551987e-05, + "loss": 0.3358, + "step": 8267 + }, + { + "epoch": 1.629929022082019, + "grad_norm": 0.49179851604347374, + "learning_rate": 1.2881888468230059e-05, + "loss": 0.3182, + "step": 8268 + }, + { + "epoch": 1.6301261829652995, + "grad_norm": 0.6056266880805475, + "learning_rate": 1.2880404169668438e-05, + "loss": 0.3249, + "step": 8269 + }, + { + "epoch": 1.6303233438485805, + "grad_norm": 0.7467253138778867, + "learning_rate": 1.2878919801902791e-05, + "loss": 0.3381, + "step": 8270 + }, + { + "epoch": 1.6305205047318612, + "grad_norm": 0.48977301540090146, + "learning_rate": 1.2877435364968776e-05, + "loss": 0.3175, + "step": 8271 + }, + { + "epoch": 1.6307176656151419, + "grad_norm": 0.507009351089707, + "learning_rate": 1.2875950858902057e-05, + "loss": 0.3186, + "step": 8272 + }, + { + "epoch": 1.6309148264984228, + "grad_norm": 0.48004983350279057, + "learning_rate": 1.2874466283738303e-05, + "loss": 0.3183, + "step": 8273 + }, + { + "epoch": 1.6311119873817035, + "grad_norm": 0.6427227001497235, + "learning_rate": 1.2872981639513187e-05, + "loss": 0.36, + "step": 8274 + }, + { + "epoch": 1.6313091482649842, + "grad_norm": 0.5043727739166707, + "learning_rate": 1.2871496926262365e-05, + "loss": 0.3388, + "step": 8275 + }, + { + "epoch": 1.631506309148265, + "grad_norm": 0.537296723687631, + "learning_rate": 1.2870012144021524e-05, + "loss": 0.3812, + "step": 8276 + }, + { + "epoch": 1.6317034700315456, + "grad_norm": 0.4600951770649924, + "learning_rate": 1.2868527292826325e-05, + "loss": 0.3042, + "step": 8277 + }, + { + "epoch": 1.6319006309148265, + "grad_norm": 0.4961373527723629, + "learning_rate": 1.2867042372712453e-05, + "loss": 0.3288, + "step": 8278 + }, + { + "epoch": 1.6320977917981072, + "grad_norm": 0.49399953361553006, + "learning_rate": 1.2865557383715574e-05, + "loss": 0.3309, + "step": 8279 + }, + { + "epoch": 1.632294952681388, + "grad_norm": 0.4951800992829749, + "learning_rate": 1.2864072325871372e-05, + "loss": 0.316, + "step": 8280 + }, + { + "epoch": 1.6324921135646688, + "grad_norm": 0.5172023636303266, + "learning_rate": 1.2862587199215528e-05, + "loss": 0.3361, + "step": 8281 + }, + { + "epoch": 1.6326892744479495, + "grad_norm": 0.44561120909851326, + "learning_rate": 1.2861102003783722e-05, + "loss": 0.3092, + "step": 8282 + }, + { + "epoch": 1.6328864353312302, + "grad_norm": 0.4804875200058334, + "learning_rate": 1.2859616739611636e-05, + "loss": 0.3166, + "step": 8283 + }, + { + "epoch": 1.6330835962145112, + "grad_norm": 1.2364757919941514, + "learning_rate": 1.2858131406734953e-05, + "loss": 0.3109, + "step": 8284 + }, + { + "epoch": 1.6332807570977916, + "grad_norm": 0.4655120000897447, + "learning_rate": 1.2856646005189367e-05, + "loss": 0.3197, + "step": 8285 + }, + { + "epoch": 1.6334779179810726, + "grad_norm": 0.49106292754923186, + "learning_rate": 1.285516053501055e-05, + "loss": 0.3198, + "step": 8286 + }, + { + "epoch": 1.6336750788643533, + "grad_norm": 0.48718403958723305, + "learning_rate": 1.2853674996234209e-05, + "loss": 0.3309, + "step": 8287 + }, + { + "epoch": 1.633872239747634, + "grad_norm": 0.5300102271215745, + "learning_rate": 1.2852189388896027e-05, + "loss": 0.3611, + "step": 8288 + }, + { + "epoch": 1.6340694006309149, + "grad_norm": 0.5758537724813633, + "learning_rate": 1.2850703713031698e-05, + "loss": 0.321, + "step": 8289 + }, + { + "epoch": 1.6342665615141956, + "grad_norm": 0.4930932386464715, + "learning_rate": 1.2849217968676916e-05, + "loss": 0.3411, + "step": 8290 + }, + { + "epoch": 1.6344637223974763, + "grad_norm": 0.47643339025266535, + "learning_rate": 1.284773215586738e-05, + "loss": 0.3325, + "step": 8291 + }, + { + "epoch": 1.6346608832807572, + "grad_norm": 0.5102517200602528, + "learning_rate": 1.2846246274638783e-05, + "loss": 0.3085, + "step": 8292 + }, + { + "epoch": 1.634858044164038, + "grad_norm": 0.4964470884863604, + "learning_rate": 1.2844760325026827e-05, + "loss": 0.3336, + "step": 8293 + }, + { + "epoch": 1.6350552050473186, + "grad_norm": 0.4830571944820674, + "learning_rate": 1.2843274307067212e-05, + "loss": 0.2993, + "step": 8294 + }, + { + "epoch": 1.6352523659305995, + "grad_norm": 0.604904792281781, + "learning_rate": 1.2841788220795648e-05, + "loss": 0.382, + "step": 8295 + }, + { + "epoch": 1.63544952681388, + "grad_norm": 0.5476960221303971, + "learning_rate": 1.2840302066247828e-05, + "loss": 0.3219, + "step": 8296 + }, + { + "epoch": 1.635646687697161, + "grad_norm": 0.5578041142423232, + "learning_rate": 1.2838815843459467e-05, + "loss": 0.3686, + "step": 8297 + }, + { + "epoch": 1.6358438485804416, + "grad_norm": 0.4786603253362986, + "learning_rate": 1.2837329552466268e-05, + "loss": 0.3073, + "step": 8298 + }, + { + "epoch": 1.6360410094637223, + "grad_norm": 0.5103008039139185, + "learning_rate": 1.2835843193303941e-05, + "loss": 0.3315, + "step": 8299 + }, + { + "epoch": 1.6362381703470033, + "grad_norm": 0.5547846433262801, + "learning_rate": 1.2834356766008198e-05, + "loss": 0.3745, + "step": 8300 + }, + { + "epoch": 1.636435331230284, + "grad_norm": 0.47914878602175826, + "learning_rate": 1.283287027061475e-05, + "loss": 0.3504, + "step": 8301 + }, + { + "epoch": 1.6366324921135647, + "grad_norm": 0.48804434383979656, + "learning_rate": 1.2831383707159316e-05, + "loss": 0.2993, + "step": 8302 + }, + { + "epoch": 1.6368296529968456, + "grad_norm": 0.4802703000186615, + "learning_rate": 1.2829897075677602e-05, + "loss": 0.3225, + "step": 8303 + }, + { + "epoch": 1.637026813880126, + "grad_norm": 0.4776966853109427, + "learning_rate": 1.2828410376205338e-05, + "loss": 0.3372, + "step": 8304 + }, + { + "epoch": 1.637223974763407, + "grad_norm": 0.4575440033807344, + "learning_rate": 1.2826923608778234e-05, + "loss": 0.3184, + "step": 8305 + }, + { + "epoch": 1.6374211356466877, + "grad_norm": 0.5436332250882948, + "learning_rate": 1.2825436773432014e-05, + "loss": 0.3861, + "step": 8306 + }, + { + "epoch": 1.6376182965299684, + "grad_norm": 0.48668387601687046, + "learning_rate": 1.2823949870202402e-05, + "loss": 0.3331, + "step": 8307 + }, + { + "epoch": 1.6378154574132493, + "grad_norm": 0.5285033833556001, + "learning_rate": 1.2822462899125118e-05, + "loss": 0.3535, + "step": 8308 + }, + { + "epoch": 1.63801261829653, + "grad_norm": 0.47806217212282204, + "learning_rate": 1.2820975860235892e-05, + "loss": 0.3247, + "step": 8309 + }, + { + "epoch": 1.6382097791798107, + "grad_norm": 0.47295818635975434, + "learning_rate": 1.2819488753570448e-05, + "loss": 0.3155, + "step": 8310 + }, + { + "epoch": 1.6384069400630916, + "grad_norm": 0.49044071641561593, + "learning_rate": 1.2818001579164516e-05, + "loss": 0.3129, + "step": 8311 + }, + { + "epoch": 1.638604100946372, + "grad_norm": 0.49641282989980273, + "learning_rate": 1.2816514337053829e-05, + "loss": 0.339, + "step": 8312 + }, + { + "epoch": 1.638801261829653, + "grad_norm": 0.49283456075636484, + "learning_rate": 1.2815027027274114e-05, + "loss": 0.3328, + "step": 8313 + }, + { + "epoch": 1.6389984227129337, + "grad_norm": 0.509363681934031, + "learning_rate": 1.281353964986111e-05, + "loss": 0.338, + "step": 8314 + }, + { + "epoch": 1.6391955835962144, + "grad_norm": 0.4973258630677114, + "learning_rate": 1.2812052204850547e-05, + "loss": 0.3491, + "step": 8315 + }, + { + "epoch": 1.6393927444794953, + "grad_norm": 0.5065183080300542, + "learning_rate": 1.2810564692278167e-05, + "loss": 0.3394, + "step": 8316 + }, + { + "epoch": 1.639589905362776, + "grad_norm": 0.4506338288617181, + "learning_rate": 1.2809077112179708e-05, + "loss": 0.3088, + "step": 8317 + }, + { + "epoch": 1.6397870662460567, + "grad_norm": 0.5475036275756043, + "learning_rate": 1.2807589464590908e-05, + "loss": 0.3665, + "step": 8318 + }, + { + "epoch": 1.6399842271293377, + "grad_norm": 0.5165355996297737, + "learning_rate": 1.280610174954751e-05, + "loss": 0.3362, + "step": 8319 + }, + { + "epoch": 1.6401813880126181, + "grad_norm": 0.4791278974491389, + "learning_rate": 1.2804613967085258e-05, + "loss": 0.3059, + "step": 8320 + }, + { + "epoch": 1.640378548895899, + "grad_norm": 0.4683684775788071, + "learning_rate": 1.28031261172399e-05, + "loss": 0.3264, + "step": 8321 + }, + { + "epoch": 1.6405757097791798, + "grad_norm": 0.4783064134174778, + "learning_rate": 1.2801638200047173e-05, + "loss": 0.3351, + "step": 8322 + }, + { + "epoch": 1.6407728706624605, + "grad_norm": 0.4818861779472011, + "learning_rate": 1.2800150215542839e-05, + "loss": 0.3351, + "step": 8323 + }, + { + "epoch": 1.6409700315457414, + "grad_norm": 0.46380844410707833, + "learning_rate": 1.2798662163762635e-05, + "loss": 0.3361, + "step": 8324 + }, + { + "epoch": 1.641167192429022, + "grad_norm": 1.7966899299353296, + "learning_rate": 1.2797174044742324e-05, + "loss": 0.3874, + "step": 8325 + }, + { + "epoch": 1.6413643533123028, + "grad_norm": 0.4735261266189423, + "learning_rate": 1.2795685858517651e-05, + "loss": 0.3191, + "step": 8326 + }, + { + "epoch": 1.6415615141955837, + "grad_norm": 0.4845463952330695, + "learning_rate": 1.2794197605124375e-05, + "loss": 0.356, + "step": 8327 + }, + { + "epoch": 1.6417586750788642, + "grad_norm": 0.503438921518367, + "learning_rate": 1.279270928459825e-05, + "loss": 0.3515, + "step": 8328 + }, + { + "epoch": 1.6419558359621451, + "grad_norm": 0.4899624719397338, + "learning_rate": 1.2791220896975037e-05, + "loss": 0.3158, + "step": 8329 + }, + { + "epoch": 1.6421529968454258, + "grad_norm": 0.4669534661764057, + "learning_rate": 1.2789732442290493e-05, + "loss": 0.3215, + "step": 8330 + }, + { + "epoch": 1.6423501577287065, + "grad_norm": 0.5173134441620917, + "learning_rate": 1.2788243920580381e-05, + "loss": 0.3774, + "step": 8331 + }, + { + "epoch": 1.6425473186119874, + "grad_norm": 0.4980287720201502, + "learning_rate": 1.2786755331880464e-05, + "loss": 0.3449, + "step": 8332 + }, + { + "epoch": 1.6427444794952681, + "grad_norm": 0.5047789983811025, + "learning_rate": 1.2785266676226507e-05, + "loss": 0.3501, + "step": 8333 + }, + { + "epoch": 1.6429416403785488, + "grad_norm": 0.468218461046473, + "learning_rate": 1.2783777953654273e-05, + "loss": 0.3191, + "step": 8334 + }, + { + "epoch": 1.6431388012618298, + "grad_norm": 0.48844626736338487, + "learning_rate": 1.2782289164199534e-05, + "loss": 0.3438, + "step": 8335 + }, + { + "epoch": 1.6433359621451105, + "grad_norm": 0.5111075206278436, + "learning_rate": 1.2780800307898057e-05, + "loss": 0.3595, + "step": 8336 + }, + { + "epoch": 1.6435331230283912, + "grad_norm": 0.5097957258324088, + "learning_rate": 1.2779311384785609e-05, + "loss": 0.3501, + "step": 8337 + }, + { + "epoch": 1.643730283911672, + "grad_norm": 0.4715524753760921, + "learning_rate": 1.2777822394897971e-05, + "loss": 0.3133, + "step": 8338 + }, + { + "epoch": 1.6439274447949526, + "grad_norm": 0.5158213364397016, + "learning_rate": 1.2776333338270912e-05, + "loss": 0.3584, + "step": 8339 + }, + { + "epoch": 1.6441246056782335, + "grad_norm": 0.5119422614957586, + "learning_rate": 1.277484421494021e-05, + "loss": 0.3578, + "step": 8340 + }, + { + "epoch": 1.6443217665615142, + "grad_norm": 0.4651826749966539, + "learning_rate": 1.2773355024941636e-05, + "loss": 0.3213, + "step": 8341 + }, + { + "epoch": 1.6445189274447949, + "grad_norm": 0.5550710370506761, + "learning_rate": 1.277186576831098e-05, + "loss": 0.3653, + "step": 8342 + }, + { + "epoch": 1.6447160883280758, + "grad_norm": 0.4788442285113236, + "learning_rate": 1.2770376445084014e-05, + "loss": 0.3207, + "step": 8343 + }, + { + "epoch": 1.6449132492113565, + "grad_norm": 0.5097909767918353, + "learning_rate": 1.2768887055296527e-05, + "loss": 0.3457, + "step": 8344 + }, + { + "epoch": 1.6451104100946372, + "grad_norm": 0.5052842170050793, + "learning_rate": 1.2767397598984293e-05, + "loss": 0.3592, + "step": 8345 + }, + { + "epoch": 1.6453075709779181, + "grad_norm": 0.525965947087887, + "learning_rate": 1.2765908076183107e-05, + "loss": 0.3717, + "step": 8346 + }, + { + "epoch": 1.6455047318611986, + "grad_norm": 0.4520388814224274, + "learning_rate": 1.2764418486928748e-05, + "loss": 0.3079, + "step": 8347 + }, + { + "epoch": 1.6457018927444795, + "grad_norm": 0.4948960396210387, + "learning_rate": 1.276292883125701e-05, + "loss": 0.3625, + "step": 8348 + }, + { + "epoch": 1.6458990536277602, + "grad_norm": 0.48398295575215156, + "learning_rate": 1.2761439109203683e-05, + "loss": 0.3362, + "step": 8349 + }, + { + "epoch": 1.646096214511041, + "grad_norm": 0.45308149007619875, + "learning_rate": 1.2759949320804559e-05, + "loss": 0.3072, + "step": 8350 + }, + { + "epoch": 1.6462933753943219, + "grad_norm": 0.47227300867152255, + "learning_rate": 1.2758459466095432e-05, + "loss": 0.3099, + "step": 8351 + }, + { + "epoch": 1.6464905362776026, + "grad_norm": 0.5261569725940197, + "learning_rate": 1.275696954511209e-05, + "loss": 0.3195, + "step": 8352 + }, + { + "epoch": 1.6466876971608833, + "grad_norm": 0.5172694259251647, + "learning_rate": 1.2755479557890337e-05, + "loss": 0.3382, + "step": 8353 + }, + { + "epoch": 1.6468848580441642, + "grad_norm": 0.4596032582706732, + "learning_rate": 1.2753989504465967e-05, + "loss": 0.3264, + "step": 8354 + }, + { + "epoch": 1.6470820189274447, + "grad_norm": 0.4700775556886168, + "learning_rate": 1.275249938487478e-05, + "loss": 0.3145, + "step": 8355 + }, + { + "epoch": 1.6472791798107256, + "grad_norm": 0.5182439165674259, + "learning_rate": 1.2751009199152584e-05, + "loss": 0.3204, + "step": 8356 + }, + { + "epoch": 1.6474763406940063, + "grad_norm": 0.4892848082259555, + "learning_rate": 1.2749518947335173e-05, + "loss": 0.3589, + "step": 8357 + }, + { + "epoch": 1.647673501577287, + "grad_norm": 0.5181167588333921, + "learning_rate": 1.2748028629458356e-05, + "loss": 0.3517, + "step": 8358 + }, + { + "epoch": 1.647870662460568, + "grad_norm": 0.5273555196791303, + "learning_rate": 1.2746538245557938e-05, + "loss": 0.3611, + "step": 8359 + }, + { + "epoch": 1.6480678233438486, + "grad_norm": 0.4769301710367526, + "learning_rate": 1.2745047795669728e-05, + "loss": 0.3385, + "step": 8360 + }, + { + "epoch": 1.6482649842271293, + "grad_norm": 0.5168568047514019, + "learning_rate": 1.274355727982953e-05, + "loss": 0.3357, + "step": 8361 + }, + { + "epoch": 1.6484621451104102, + "grad_norm": 0.47721045303944054, + "learning_rate": 1.2742066698073164e-05, + "loss": 0.3318, + "step": 8362 + }, + { + "epoch": 1.6486593059936907, + "grad_norm": 0.5244966742590467, + "learning_rate": 1.2740576050436433e-05, + "loss": 0.3629, + "step": 8363 + }, + { + "epoch": 1.6488564668769716, + "grad_norm": 0.4763494332427964, + "learning_rate": 1.273908533695516e-05, + "loss": 0.355, + "step": 8364 + }, + { + "epoch": 1.6490536277602523, + "grad_norm": 0.4791256346523753, + "learning_rate": 1.2737594557665152e-05, + "loss": 0.3291, + "step": 8365 + }, + { + "epoch": 1.649250788643533, + "grad_norm": 0.49789637045484475, + "learning_rate": 1.2736103712602232e-05, + "loss": 0.3177, + "step": 8366 + }, + { + "epoch": 1.649447949526814, + "grad_norm": 0.4568801611649524, + "learning_rate": 1.2734612801802217e-05, + "loss": 0.3141, + "step": 8367 + }, + { + "epoch": 1.6496451104100947, + "grad_norm": 0.49441685311553696, + "learning_rate": 1.2733121825300927e-05, + "loss": 0.3359, + "step": 8368 + }, + { + "epoch": 1.6498422712933754, + "grad_norm": 0.5050247460468098, + "learning_rate": 1.2731630783134182e-05, + "loss": 0.3715, + "step": 8369 + }, + { + "epoch": 1.6500394321766563, + "grad_norm": 0.5015943491120733, + "learning_rate": 1.273013967533781e-05, + "loss": 0.3383, + "step": 8370 + }, + { + "epoch": 1.6502365930599368, + "grad_norm": 0.506827984233287, + "learning_rate": 1.2728648501947633e-05, + "loss": 0.3357, + "step": 8371 + }, + { + "epoch": 1.6504337539432177, + "grad_norm": 0.44237770072174254, + "learning_rate": 1.2727157262999481e-05, + "loss": 0.2997, + "step": 8372 + }, + { + "epoch": 1.6506309148264984, + "grad_norm": 0.48053734591353375, + "learning_rate": 1.2725665958529177e-05, + "loss": 0.3193, + "step": 8373 + }, + { + "epoch": 1.650828075709779, + "grad_norm": 0.5146248090989906, + "learning_rate": 1.2724174588572556e-05, + "loss": 0.3481, + "step": 8374 + }, + { + "epoch": 1.65102523659306, + "grad_norm": 0.47551521631075744, + "learning_rate": 1.272268315316544e-05, + "loss": 0.327, + "step": 8375 + }, + { + "epoch": 1.6512223974763407, + "grad_norm": 0.4877727357397369, + "learning_rate": 1.2721191652343674e-05, + "loss": 0.3283, + "step": 8376 + }, + { + "epoch": 1.6514195583596214, + "grad_norm": 1.4180824108129741, + "learning_rate": 1.2719700086143088e-05, + "loss": 0.3418, + "step": 8377 + }, + { + "epoch": 1.6516167192429023, + "grad_norm": 0.5155193108625191, + "learning_rate": 1.2718208454599515e-05, + "loss": 0.3532, + "step": 8378 + }, + { + "epoch": 1.651813880126183, + "grad_norm": 0.47241976046112605, + "learning_rate": 1.2716716757748795e-05, + "loss": 0.3526, + "step": 8379 + }, + { + "epoch": 1.6520110410094637, + "grad_norm": 0.4883813868684643, + "learning_rate": 1.2715224995626769e-05, + "loss": 0.321, + "step": 8380 + }, + { + "epoch": 1.6522082018927446, + "grad_norm": 0.468988201610082, + "learning_rate": 1.2713733168269275e-05, + "loss": 0.3357, + "step": 8381 + }, + { + "epoch": 1.6524053627760251, + "grad_norm": 0.4474868601000789, + "learning_rate": 1.2712241275712156e-05, + "loss": 0.3386, + "step": 8382 + }, + { + "epoch": 1.652602523659306, + "grad_norm": 1.0602180659250304, + "learning_rate": 1.2710749317991255e-05, + "loss": 0.3677, + "step": 8383 + }, + { + "epoch": 1.6527996845425867, + "grad_norm": 0.4626736577510362, + "learning_rate": 1.2709257295142421e-05, + "loss": 0.3171, + "step": 8384 + }, + { + "epoch": 1.6529968454258674, + "grad_norm": 0.7287485310553602, + "learning_rate": 1.2707765207201497e-05, + "loss": 0.3376, + "step": 8385 + }, + { + "epoch": 1.6531940063091484, + "grad_norm": 0.49508196335788296, + "learning_rate": 1.2706273054204334e-05, + "loss": 0.3154, + "step": 8386 + }, + { + "epoch": 1.653391167192429, + "grad_norm": 0.5335563258052243, + "learning_rate": 1.2704780836186781e-05, + "loss": 0.335, + "step": 8387 + }, + { + "epoch": 1.6535883280757098, + "grad_norm": 0.49204084068864695, + "learning_rate": 1.270328855318469e-05, + "loss": 0.3457, + "step": 8388 + }, + { + "epoch": 1.6537854889589907, + "grad_norm": 0.7179327613194991, + "learning_rate": 1.2701796205233916e-05, + "loss": 0.3262, + "step": 8389 + }, + { + "epoch": 1.6539826498422712, + "grad_norm": 0.5875255445299163, + "learning_rate": 1.270030379237031e-05, + "loss": 0.3445, + "step": 8390 + }, + { + "epoch": 1.654179810725552, + "grad_norm": 0.48571857002344315, + "learning_rate": 1.2698811314629734e-05, + "loss": 0.3384, + "step": 8391 + }, + { + "epoch": 1.6543769716088328, + "grad_norm": 0.5420200873446145, + "learning_rate": 1.269731877204804e-05, + "loss": 0.3471, + "step": 8392 + }, + { + "epoch": 1.6545741324921135, + "grad_norm": 0.4978969822403536, + "learning_rate": 1.2695826164661093e-05, + "loss": 0.3361, + "step": 8393 + }, + { + "epoch": 1.6547712933753944, + "grad_norm": 0.5619686050490151, + "learning_rate": 1.269433349250475e-05, + "loss": 0.3448, + "step": 8394 + }, + { + "epoch": 1.6549684542586751, + "grad_norm": 0.5254844107289572, + "learning_rate": 1.2692840755614873e-05, + "loss": 0.3367, + "step": 8395 + }, + { + "epoch": 1.6551656151419558, + "grad_norm": 0.5011006680947409, + "learning_rate": 1.269134795402733e-05, + "loss": 0.3418, + "step": 8396 + }, + { + "epoch": 1.6553627760252367, + "grad_norm": 0.569081232736909, + "learning_rate": 1.2689855087777988e-05, + "loss": 0.3265, + "step": 8397 + }, + { + "epoch": 1.6555599369085172, + "grad_norm": 0.49741199855508267, + "learning_rate": 1.2688362156902707e-05, + "loss": 0.3367, + "step": 8398 + }, + { + "epoch": 1.6557570977917981, + "grad_norm": 0.5473920091128143, + "learning_rate": 1.2686869161437364e-05, + "loss": 0.3747, + "step": 8399 + }, + { + "epoch": 1.6559542586750788, + "grad_norm": 0.5076794965437509, + "learning_rate": 1.2685376101417823e-05, + "loss": 0.3506, + "step": 8400 + }, + { + "epoch": 1.6561514195583595, + "grad_norm": 0.5162556295971144, + "learning_rate": 1.268388297687996e-05, + "loss": 0.3288, + "step": 8401 + }, + { + "epoch": 1.6563485804416405, + "grad_norm": 0.5519110540607234, + "learning_rate": 1.2682389787859646e-05, + "loss": 0.3312, + "step": 8402 + }, + { + "epoch": 1.6565457413249212, + "grad_norm": 0.5118402740090534, + "learning_rate": 1.268089653439276e-05, + "loss": 0.2988, + "step": 8403 + }, + { + "epoch": 1.6567429022082019, + "grad_norm": 0.5055886550149764, + "learning_rate": 1.2679403216515171e-05, + "loss": 0.3293, + "step": 8404 + }, + { + "epoch": 1.6569400630914828, + "grad_norm": 0.5299784103251353, + "learning_rate": 1.2677909834262764e-05, + "loss": 0.3818, + "step": 8405 + }, + { + "epoch": 1.6571372239747633, + "grad_norm": 0.4721883334342438, + "learning_rate": 1.267641638767142e-05, + "loss": 0.3164, + "step": 8406 + }, + { + "epoch": 1.6573343848580442, + "grad_norm": 0.46598843868067574, + "learning_rate": 1.2674922876777014e-05, + "loss": 0.3242, + "step": 8407 + }, + { + "epoch": 1.6575315457413249, + "grad_norm": 0.4908947433651475, + "learning_rate": 1.2673429301615431e-05, + "loss": 0.361, + "step": 8408 + }, + { + "epoch": 1.6577287066246056, + "grad_norm": 0.5044190970850814, + "learning_rate": 1.2671935662222556e-05, + "loss": 0.3492, + "step": 8409 + }, + { + "epoch": 1.6579258675078865, + "grad_norm": 0.4885883648458966, + "learning_rate": 1.2670441958634278e-05, + "loss": 0.3405, + "step": 8410 + }, + { + "epoch": 1.6581230283911672, + "grad_norm": 0.4992792901348933, + "learning_rate": 1.2668948190886479e-05, + "loss": 0.3495, + "step": 8411 + }, + { + "epoch": 1.658320189274448, + "grad_norm": 0.4766119795887696, + "learning_rate": 1.2667454359015053e-05, + "loss": 0.324, + "step": 8412 + }, + { + "epoch": 1.6585173501577288, + "grad_norm": 0.49210628144064317, + "learning_rate": 1.2665960463055884e-05, + "loss": 0.3262, + "step": 8413 + }, + { + "epoch": 1.6587145110410093, + "grad_norm": 0.5123201240998976, + "learning_rate": 1.2664466503044872e-05, + "loss": 0.3375, + "step": 8414 + }, + { + "epoch": 1.6589116719242902, + "grad_norm": 0.48206936397556427, + "learning_rate": 1.2662972479017905e-05, + "loss": 0.3495, + "step": 8415 + }, + { + "epoch": 1.659108832807571, + "grad_norm": 0.46491900044693, + "learning_rate": 1.2661478391010877e-05, + "loss": 0.3371, + "step": 8416 + }, + { + "epoch": 1.6593059936908516, + "grad_norm": 0.4754604595322926, + "learning_rate": 1.2659984239059693e-05, + "loss": 0.345, + "step": 8417 + }, + { + "epoch": 1.6595031545741326, + "grad_norm": 0.485110553505231, + "learning_rate": 1.2658490023200238e-05, + "loss": 0.3457, + "step": 8418 + }, + { + "epoch": 1.6597003154574133, + "grad_norm": 0.5020242501635195, + "learning_rate": 1.2656995743468428e-05, + "loss": 0.3449, + "step": 8419 + }, + { + "epoch": 1.659897476340694, + "grad_norm": 0.47335087548835225, + "learning_rate": 1.2655501399900147e-05, + "loss": 0.3344, + "step": 8420 + }, + { + "epoch": 1.6600946372239749, + "grad_norm": 0.7292549190195249, + "learning_rate": 1.2654006992531314e-05, + "loss": 0.3618, + "step": 8421 + }, + { + "epoch": 1.6602917981072554, + "grad_norm": 0.46704737896112347, + "learning_rate": 1.265251252139782e-05, + "loss": 0.3379, + "step": 8422 + }, + { + "epoch": 1.6604889589905363, + "grad_norm": 0.47828307265868714, + "learning_rate": 1.2651017986535578e-05, + "loss": 0.3546, + "step": 8423 + }, + { + "epoch": 1.660686119873817, + "grad_norm": 0.5046362588351052, + "learning_rate": 1.2649523387980497e-05, + "loss": 0.3535, + "step": 8424 + }, + { + "epoch": 1.6608832807570977, + "grad_norm": 0.529973462052246, + "learning_rate": 1.264802872576848e-05, + "loss": 0.3614, + "step": 8425 + }, + { + "epoch": 1.6610804416403786, + "grad_norm": 0.47162369718261343, + "learning_rate": 1.2646533999935442e-05, + "loss": 0.3486, + "step": 8426 + }, + { + "epoch": 1.6612776025236593, + "grad_norm": 0.5125003435688313, + "learning_rate": 1.2645039210517291e-05, + "loss": 0.35, + "step": 8427 + }, + { + "epoch": 1.66147476340694, + "grad_norm": 0.49775247171182835, + "learning_rate": 1.2643544357549946e-05, + "loss": 0.3468, + "step": 8428 + }, + { + "epoch": 1.661671924290221, + "grad_norm": 0.4979574735742323, + "learning_rate": 1.2642049441069318e-05, + "loss": 0.3479, + "step": 8429 + }, + { + "epoch": 1.6618690851735016, + "grad_norm": 0.49155198733720284, + "learning_rate": 1.2640554461111324e-05, + "loss": 0.3534, + "step": 8430 + }, + { + "epoch": 1.6620662460567823, + "grad_norm": 0.49261265160510403, + "learning_rate": 1.2639059417711882e-05, + "loss": 0.3377, + "step": 8431 + }, + { + "epoch": 1.6622634069400632, + "grad_norm": 0.47430655424794055, + "learning_rate": 1.2637564310906913e-05, + "loss": 0.3409, + "step": 8432 + }, + { + "epoch": 1.6624605678233437, + "grad_norm": 0.4847102680872918, + "learning_rate": 1.2636069140732338e-05, + "loss": 0.3385, + "step": 8433 + }, + { + "epoch": 1.6626577287066246, + "grad_norm": 0.47723013743075726, + "learning_rate": 1.263457390722408e-05, + "loss": 0.3318, + "step": 8434 + }, + { + "epoch": 1.6628548895899053, + "grad_norm": 0.4724871482409827, + "learning_rate": 1.2633078610418062e-05, + "loss": 0.3122, + "step": 8435 + }, + { + "epoch": 1.663052050473186, + "grad_norm": 0.4828083436400746, + "learning_rate": 1.2631583250350208e-05, + "loss": 0.344, + "step": 8436 + }, + { + "epoch": 1.663249211356467, + "grad_norm": 0.4461653149295655, + "learning_rate": 1.2630087827056445e-05, + "loss": 0.3247, + "step": 8437 + }, + { + "epoch": 1.6634463722397477, + "grad_norm": 0.5069753338240224, + "learning_rate": 1.262859234057271e-05, + "loss": 0.3152, + "step": 8438 + }, + { + "epoch": 1.6636435331230284, + "grad_norm": 0.460521307249503, + "learning_rate": 1.2627096790934921e-05, + "loss": 0.3223, + "step": 8439 + }, + { + "epoch": 1.6638406940063093, + "grad_norm": 0.4604716994762646, + "learning_rate": 1.2625601178179021e-05, + "loss": 0.3283, + "step": 8440 + }, + { + "epoch": 1.6640378548895898, + "grad_norm": 0.5234481706217016, + "learning_rate": 1.2624105502340935e-05, + "loss": 0.3534, + "step": 8441 + }, + { + "epoch": 1.6642350157728707, + "grad_norm": 0.5252701025916288, + "learning_rate": 1.2622609763456604e-05, + "loss": 0.3829, + "step": 8442 + }, + { + "epoch": 1.6644321766561514, + "grad_norm": 0.48070551451863064, + "learning_rate": 1.262111396156196e-05, + "loss": 0.3495, + "step": 8443 + }, + { + "epoch": 1.664629337539432, + "grad_norm": 0.48384766288562747, + "learning_rate": 1.2619618096692942e-05, + "loss": 0.3316, + "step": 8444 + }, + { + "epoch": 1.664826498422713, + "grad_norm": 0.5062594443871286, + "learning_rate": 1.2618122168885489e-05, + "loss": 0.3348, + "step": 8445 + }, + { + "epoch": 1.6650236593059937, + "grad_norm": 0.4907130059730268, + "learning_rate": 1.2616626178175544e-05, + "loss": 0.3559, + "step": 8446 + }, + { + "epoch": 1.6652208201892744, + "grad_norm": 0.5082080738735469, + "learning_rate": 1.2615130124599047e-05, + "loss": 0.3292, + "step": 8447 + }, + { + "epoch": 1.6654179810725553, + "grad_norm": 0.5032607745506708, + "learning_rate": 1.2613634008191944e-05, + "loss": 0.3291, + "step": 8448 + }, + { + "epoch": 1.6656151419558358, + "grad_norm": 0.47422112873571903, + "learning_rate": 1.2612137828990178e-05, + "loss": 0.341, + "step": 8449 + }, + { + "epoch": 1.6658123028391167, + "grad_norm": 0.48408723981425067, + "learning_rate": 1.2610641587029697e-05, + "loss": 0.342, + "step": 8450 + }, + { + "epoch": 1.6660094637223974, + "grad_norm": 0.47815519859652317, + "learning_rate": 1.2609145282346452e-05, + "loss": 0.3042, + "step": 8451 + }, + { + "epoch": 1.6662066246056781, + "grad_norm": 0.45905147453962514, + "learning_rate": 1.2607648914976386e-05, + "loss": 0.3413, + "step": 8452 + }, + { + "epoch": 1.666403785488959, + "grad_norm": 0.4686501349889585, + "learning_rate": 1.2606152484955458e-05, + "loss": 0.32, + "step": 8453 + }, + { + "epoch": 1.6666009463722398, + "grad_norm": 0.5027813358909118, + "learning_rate": 1.2604655992319618e-05, + "loss": 0.3577, + "step": 8454 + }, + { + "epoch": 1.6667981072555205, + "grad_norm": 0.48965015962544906, + "learning_rate": 1.260315943710482e-05, + "loss": 0.3383, + "step": 8455 + }, + { + "epoch": 1.6669952681388014, + "grad_norm": 0.48574589911473337, + "learning_rate": 1.2601662819347017e-05, + "loss": 0.3396, + "step": 8456 + }, + { + "epoch": 1.6671924290220819, + "grad_norm": 0.44679877307708993, + "learning_rate": 1.2600166139082175e-05, + "loss": 0.3054, + "step": 8457 + }, + { + "epoch": 1.6673895899053628, + "grad_norm": 0.5131395778523016, + "learning_rate": 1.2598669396346244e-05, + "loss": 0.3606, + "step": 8458 + }, + { + "epoch": 1.6675867507886435, + "grad_norm": 0.48971170022752913, + "learning_rate": 1.259717259117519e-05, + "loss": 0.3588, + "step": 8459 + }, + { + "epoch": 1.6677839116719242, + "grad_norm": 0.49066716607132943, + "learning_rate": 1.259567572360497e-05, + "loss": 0.3364, + "step": 8460 + }, + { + "epoch": 1.6679810725552051, + "grad_norm": 0.4509623703464088, + "learning_rate": 1.2594178793671554e-05, + "loss": 0.3219, + "step": 8461 + }, + { + "epoch": 1.6681782334384858, + "grad_norm": 0.46371182929662663, + "learning_rate": 1.25926818014109e-05, + "loss": 0.3491, + "step": 8462 + }, + { + "epoch": 1.6683753943217665, + "grad_norm": 0.49080219225409744, + "learning_rate": 1.259118474685898e-05, + "loss": 0.3511, + "step": 8463 + }, + { + "epoch": 1.6685725552050474, + "grad_norm": 0.4985969802063767, + "learning_rate": 1.258968763005176e-05, + "loss": 0.3487, + "step": 8464 + }, + { + "epoch": 1.668769716088328, + "grad_norm": 0.49622170543145033, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.333, + "step": 8465 + }, + { + "epoch": 1.6689668769716088, + "grad_norm": 0.4511106753090801, + "learning_rate": 1.2586693209815298e-05, + "loss": 0.3192, + "step": 8466 + }, + { + "epoch": 1.6691640378548895, + "grad_norm": 0.479506900351286, + "learning_rate": 1.2585195906457998e-05, + "loss": 0.3485, + "step": 8467 + }, + { + "epoch": 1.6693611987381702, + "grad_norm": 0.4715197277159651, + "learning_rate": 1.2583698540989288e-05, + "loss": 0.3276, + "step": 8468 + }, + { + "epoch": 1.6695583596214512, + "grad_norm": 0.4798793765470226, + "learning_rate": 1.2582201113445136e-05, + "loss": 0.3414, + "step": 8469 + }, + { + "epoch": 1.6697555205047319, + "grad_norm": 0.4505832073301154, + "learning_rate": 1.2580703623861525e-05, + "loss": 0.3292, + "step": 8470 + }, + { + "epoch": 1.6699526813880126, + "grad_norm": 0.4695096328518373, + "learning_rate": 1.257920607227443e-05, + "loss": 0.3511, + "step": 8471 + }, + { + "epoch": 1.6701498422712935, + "grad_norm": 0.475920897049246, + "learning_rate": 1.2577708458719836e-05, + "loss": 0.3333, + "step": 8472 + }, + { + "epoch": 1.6703470031545742, + "grad_norm": 0.4629881113952917, + "learning_rate": 1.2576210783233715e-05, + "loss": 0.3427, + "step": 8473 + }, + { + "epoch": 1.6705441640378549, + "grad_norm": 0.4579101805999366, + "learning_rate": 1.2574713045852059e-05, + "loss": 0.3249, + "step": 8474 + }, + { + "epoch": 1.6707413249211358, + "grad_norm": 0.5062337208811096, + "learning_rate": 1.2573215246610845e-05, + "loss": 0.3161, + "step": 8475 + }, + { + "epoch": 1.6709384858044163, + "grad_norm": 0.5716309498702405, + "learning_rate": 1.2571717385546067e-05, + "loss": 0.3362, + "step": 8476 + }, + { + "epoch": 1.6711356466876972, + "grad_norm": 0.45248380776560626, + "learning_rate": 1.2570219462693703e-05, + "loss": 0.3345, + "step": 8477 + }, + { + "epoch": 1.671332807570978, + "grad_norm": 0.4930136541420434, + "learning_rate": 1.2568721478089752e-05, + "loss": 0.3379, + "step": 8478 + }, + { + "epoch": 1.6715299684542586, + "grad_norm": 0.5282094905423682, + "learning_rate": 1.2567223431770193e-05, + "loss": 0.3808, + "step": 8479 + }, + { + "epoch": 1.6717271293375395, + "grad_norm": 0.49446422793627104, + "learning_rate": 1.256572532377103e-05, + "loss": 0.3307, + "step": 8480 + }, + { + "epoch": 1.6719242902208202, + "grad_norm": 0.4665091200670554, + "learning_rate": 1.2564227154128248e-05, + "loss": 0.3129, + "step": 8481 + }, + { + "epoch": 1.672121451104101, + "grad_norm": 0.5106508095556581, + "learning_rate": 1.256272892287784e-05, + "loss": 0.3647, + "step": 8482 + }, + { + "epoch": 1.6723186119873819, + "grad_norm": 0.48462370149465533, + "learning_rate": 1.256123063005581e-05, + "loss": 0.345, + "step": 8483 + }, + { + "epoch": 1.6725157728706623, + "grad_norm": 0.46959052479288327, + "learning_rate": 1.2559732275698147e-05, + "loss": 0.3305, + "step": 8484 + }, + { + "epoch": 1.6727129337539433, + "grad_norm": 0.4863277709967929, + "learning_rate": 1.2558233859840861e-05, + "loss": 0.3402, + "step": 8485 + }, + { + "epoch": 1.672910094637224, + "grad_norm": 0.48685079228972755, + "learning_rate": 1.255673538251994e-05, + "loss": 0.3153, + "step": 8486 + }, + { + "epoch": 1.6731072555205047, + "grad_norm": 0.4893988869401554, + "learning_rate": 1.2555236843771398e-05, + "loss": 0.3546, + "step": 8487 + }, + { + "epoch": 1.6733044164037856, + "grad_norm": 0.4806560567289344, + "learning_rate": 1.2553738243631228e-05, + "loss": 0.3762, + "step": 8488 + }, + { + "epoch": 1.6735015772870663, + "grad_norm": 0.4744744228607181, + "learning_rate": 1.2552239582135446e-05, + "loss": 0.3557, + "step": 8489 + }, + { + "epoch": 1.673698738170347, + "grad_norm": 0.5065072339151813, + "learning_rate": 1.2550740859320047e-05, + "loss": 0.3514, + "step": 8490 + }, + { + "epoch": 1.673895899053628, + "grad_norm": 0.49720037597529226, + "learning_rate": 1.2549242075221047e-05, + "loss": 0.3621, + "step": 8491 + }, + { + "epoch": 1.6740930599369084, + "grad_norm": 0.4862962373562821, + "learning_rate": 1.2547743229874452e-05, + "loss": 0.3278, + "step": 8492 + }, + { + "epoch": 1.6742902208201893, + "grad_norm": 0.4862623198322913, + "learning_rate": 1.2546244323316276e-05, + "loss": 0.3649, + "step": 8493 + }, + { + "epoch": 1.67448738170347, + "grad_norm": 0.46675920503742757, + "learning_rate": 1.254474535558253e-05, + "loss": 0.3142, + "step": 8494 + }, + { + "epoch": 1.6746845425867507, + "grad_norm": 6.314767157106249, + "learning_rate": 1.2543246326709227e-05, + "loss": 0.3485, + "step": 8495 + }, + { + "epoch": 1.6748817034700316, + "grad_norm": 0.5032826105490751, + "learning_rate": 1.2541747236732382e-05, + "loss": 0.334, + "step": 8496 + }, + { + "epoch": 1.6750788643533123, + "grad_norm": 0.4565883254189662, + "learning_rate": 1.2540248085688013e-05, + "loss": 0.3298, + "step": 8497 + }, + { + "epoch": 1.675276025236593, + "grad_norm": 0.47701955741607316, + "learning_rate": 1.253874887361214e-05, + "loss": 0.3419, + "step": 8498 + }, + { + "epoch": 1.675473186119874, + "grad_norm": 0.5192006825879065, + "learning_rate": 1.253724960054078e-05, + "loss": 0.349, + "step": 8499 + }, + { + "epoch": 1.6756703470031544, + "grad_norm": 0.498046580157121, + "learning_rate": 1.2535750266509955e-05, + "loss": 0.359, + "step": 8500 + }, + { + "epoch": 1.6758675078864353, + "grad_norm": 0.45577657594308446, + "learning_rate": 1.2534250871555687e-05, + "loss": 0.3262, + "step": 8501 + }, + { + "epoch": 1.676064668769716, + "grad_norm": 0.4735721377691079, + "learning_rate": 1.2532751415714001e-05, + "loss": 0.2878, + "step": 8502 + }, + { + "epoch": 1.6762618296529967, + "grad_norm": 0.4771932029118233, + "learning_rate": 1.2531251899020925e-05, + "loss": 0.3162, + "step": 8503 + }, + { + "epoch": 1.6764589905362777, + "grad_norm": 0.4757910741943044, + "learning_rate": 1.252975232151248e-05, + "loss": 0.3357, + "step": 8504 + }, + { + "epoch": 1.6766561514195584, + "grad_norm": 0.5121567601038779, + "learning_rate": 1.2528252683224697e-05, + "loss": 0.3436, + "step": 8505 + }, + { + "epoch": 1.676853312302839, + "grad_norm": 0.5520693211608968, + "learning_rate": 1.2526752984193613e-05, + "loss": 0.3188, + "step": 8506 + }, + { + "epoch": 1.67705047318612, + "grad_norm": 0.47244929893727605, + "learning_rate": 1.2525253224455249e-05, + "loss": 0.3074, + "step": 8507 + }, + { + "epoch": 1.6772476340694005, + "grad_norm": 0.4911902577750642, + "learning_rate": 1.252375340404565e-05, + "loss": 0.3437, + "step": 8508 + }, + { + "epoch": 1.6774447949526814, + "grad_norm": 0.4702219725532421, + "learning_rate": 1.2522253523000834e-05, + "loss": 0.2994, + "step": 8509 + }, + { + "epoch": 1.677641955835962, + "grad_norm": 0.4719943895332728, + "learning_rate": 1.2520753581356852e-05, + "loss": 0.3319, + "step": 8510 + }, + { + "epoch": 1.6778391167192428, + "grad_norm": 0.46273321896963937, + "learning_rate": 1.251925357914973e-05, + "loss": 0.3156, + "step": 8511 + }, + { + "epoch": 1.6780362776025237, + "grad_norm": 0.45943840807810565, + "learning_rate": 1.2517753516415516e-05, + "loss": 0.3108, + "step": 8512 + }, + { + "epoch": 1.6782334384858044, + "grad_norm": 0.4958574879886677, + "learning_rate": 1.2516253393190245e-05, + "loss": 0.3415, + "step": 8513 + }, + { + "epoch": 1.6784305993690851, + "grad_norm": 0.471413234518349, + "learning_rate": 1.251475320950996e-05, + "loss": 0.3336, + "step": 8514 + }, + { + "epoch": 1.678627760252366, + "grad_norm": 3.3034750186313615, + "learning_rate": 1.2513252965410706e-05, + "loss": 0.3529, + "step": 8515 + }, + { + "epoch": 1.6788249211356467, + "grad_norm": 0.5058575196517642, + "learning_rate": 1.2511752660928523e-05, + "loss": 0.3583, + "step": 8516 + }, + { + "epoch": 1.6790220820189274, + "grad_norm": 0.5210370780798075, + "learning_rate": 1.251025229609946e-05, + "loss": 0.3627, + "step": 8517 + }, + { + "epoch": 1.6792192429022084, + "grad_norm": 0.9679223222367713, + "learning_rate": 1.2508751870959563e-05, + "loss": 0.3305, + "step": 8518 + }, + { + "epoch": 1.6794164037854888, + "grad_norm": 0.955205787142468, + "learning_rate": 1.2507251385544885e-05, + "loss": 0.3384, + "step": 8519 + }, + { + "epoch": 1.6796135646687698, + "grad_norm": 0.5760713418529518, + "learning_rate": 1.2505750839891473e-05, + "loss": 0.3412, + "step": 8520 + }, + { + "epoch": 1.6798107255520505, + "grad_norm": 0.4672872118729301, + "learning_rate": 1.2504250234035378e-05, + "loss": 0.3238, + "step": 8521 + }, + { + "epoch": 1.6800078864353312, + "grad_norm": 0.46960680724931947, + "learning_rate": 1.2502749568012655e-05, + "loss": 0.3368, + "step": 8522 + }, + { + "epoch": 1.680205047318612, + "grad_norm": 0.4892990248131517, + "learning_rate": 1.2501248841859358e-05, + "loss": 0.3222, + "step": 8523 + }, + { + "epoch": 1.6804022082018928, + "grad_norm": 0.9521603633960574, + "learning_rate": 1.2499748055611543e-05, + "loss": 0.3473, + "step": 8524 + }, + { + "epoch": 1.6805993690851735, + "grad_norm": 0.4961229036962787, + "learning_rate": 1.2498247209305267e-05, + "loss": 0.34, + "step": 8525 + }, + { + "epoch": 1.6807965299684544, + "grad_norm": 0.4641387089623987, + "learning_rate": 1.2496746302976588e-05, + "loss": 0.3262, + "step": 8526 + }, + { + "epoch": 1.680993690851735, + "grad_norm": 0.5208046380242989, + "learning_rate": 1.2495245336661575e-05, + "loss": 0.357, + "step": 8527 + }, + { + "epoch": 1.6811908517350158, + "grad_norm": 0.6377815976714571, + "learning_rate": 1.2493744310396276e-05, + "loss": 0.3446, + "step": 8528 + }, + { + "epoch": 1.6813880126182965, + "grad_norm": 0.48508566333163605, + "learning_rate": 1.249224322421677e-05, + "loss": 0.3265, + "step": 8529 + }, + { + "epoch": 1.6815851735015772, + "grad_norm": 0.4640062435177377, + "learning_rate": 1.2490742078159107e-05, + "loss": 0.3453, + "step": 8530 + }, + { + "epoch": 1.6817823343848581, + "grad_norm": 0.48082524719773534, + "learning_rate": 1.248924087225936e-05, + "loss": 0.3428, + "step": 8531 + }, + { + "epoch": 1.6819794952681388, + "grad_norm": 0.46881859443066326, + "learning_rate": 1.24877396065536e-05, + "loss": 0.3073, + "step": 8532 + }, + { + "epoch": 1.6821766561514195, + "grad_norm": 0.4883737738622232, + "learning_rate": 1.248623828107789e-05, + "loss": 0.3472, + "step": 8533 + }, + { + "epoch": 1.6823738170347005, + "grad_norm": 0.45689282889724375, + "learning_rate": 1.2484736895868306e-05, + "loss": 0.3097, + "step": 8534 + }, + { + "epoch": 1.682570977917981, + "grad_norm": 0.47112167138042593, + "learning_rate": 1.2483235450960914e-05, + "loss": 0.3444, + "step": 8535 + }, + { + "epoch": 1.6827681388012619, + "grad_norm": 0.46214308971573514, + "learning_rate": 1.2481733946391792e-05, + "loss": 0.3189, + "step": 8536 + }, + { + "epoch": 1.6829652996845426, + "grad_norm": 0.46005114791319535, + "learning_rate": 1.2480232382197013e-05, + "loss": 0.3178, + "step": 8537 + }, + { + "epoch": 1.6831624605678233, + "grad_norm": 0.4767717118901013, + "learning_rate": 1.2478730758412652e-05, + "loss": 0.3362, + "step": 8538 + }, + { + "epoch": 1.6833596214511042, + "grad_norm": 0.4599960998922992, + "learning_rate": 1.247722907507479e-05, + "loss": 0.3195, + "step": 8539 + }, + { + "epoch": 1.6835567823343849, + "grad_norm": 0.4951705881711061, + "learning_rate": 1.2475727332219505e-05, + "loss": 0.346, + "step": 8540 + }, + { + "epoch": 1.6837539432176656, + "grad_norm": 0.5046721506851792, + "learning_rate": 1.2474225529882878e-05, + "loss": 0.3255, + "step": 8541 + }, + { + "epoch": 1.6839511041009465, + "grad_norm": 0.4829302359712838, + "learning_rate": 1.247272366810099e-05, + "loss": 0.3429, + "step": 8542 + }, + { + "epoch": 1.684148264984227, + "grad_norm": 0.46391312466042783, + "learning_rate": 1.2471221746909923e-05, + "loss": 0.3325, + "step": 8543 + }, + { + "epoch": 1.684345425867508, + "grad_norm": 0.47200457284310665, + "learning_rate": 1.2469719766345763e-05, + "loss": 0.3388, + "step": 8544 + }, + { + "epoch": 1.6845425867507886, + "grad_norm": 0.4746525342051994, + "learning_rate": 1.2468217726444595e-05, + "loss": 0.3293, + "step": 8545 + }, + { + "epoch": 1.6847397476340693, + "grad_norm": 0.523726717599289, + "learning_rate": 1.2466715627242514e-05, + "loss": 0.3564, + "step": 8546 + }, + { + "epoch": 1.6849369085173502, + "grad_norm": 0.47113498305611295, + "learning_rate": 1.2465213468775602e-05, + "loss": 0.3459, + "step": 8547 + }, + { + "epoch": 1.685134069400631, + "grad_norm": 0.4768683631487932, + "learning_rate": 1.2463711251079951e-05, + "loss": 0.3419, + "step": 8548 + }, + { + "epoch": 1.6853312302839116, + "grad_norm": 0.4796081831360892, + "learning_rate": 1.2462208974191652e-05, + "loss": 0.337, + "step": 8549 + }, + { + "epoch": 1.6855283911671926, + "grad_norm": 0.47022166990324243, + "learning_rate": 1.24607066381468e-05, + "loss": 0.3343, + "step": 8550 + }, + { + "epoch": 1.685725552050473, + "grad_norm": 0.47554608946434046, + "learning_rate": 1.245920424298149e-05, + "loss": 0.3205, + "step": 8551 + }, + { + "epoch": 1.685922712933754, + "grad_norm": 0.616505992571845, + "learning_rate": 1.2457701788731812e-05, + "loss": 0.3296, + "step": 8552 + }, + { + "epoch": 1.6861198738170347, + "grad_norm": 0.4878539182332263, + "learning_rate": 1.2456199275433878e-05, + "loss": 0.3525, + "step": 8553 + }, + { + "epoch": 1.6863170347003154, + "grad_norm": 0.4414899321505671, + "learning_rate": 1.2454696703123773e-05, + "loss": 0.3206, + "step": 8554 + }, + { + "epoch": 1.6865141955835963, + "grad_norm": 0.48286881495321865, + "learning_rate": 1.2453194071837606e-05, + "loss": 0.3238, + "step": 8555 + }, + { + "epoch": 1.686711356466877, + "grad_norm": 0.4742179590973439, + "learning_rate": 1.2451691381611472e-05, + "loss": 0.3197, + "step": 8556 + }, + { + "epoch": 1.6869085173501577, + "grad_norm": 0.47834655573536045, + "learning_rate": 1.2450188632481484e-05, + "loss": 0.3265, + "step": 8557 + }, + { + "epoch": 1.6871056782334386, + "grad_norm": 0.4343626858698186, + "learning_rate": 1.2448685824483735e-05, + "loss": 0.3035, + "step": 8558 + }, + { + "epoch": 1.687302839116719, + "grad_norm": 0.48021663657417696, + "learning_rate": 1.244718295765434e-05, + "loss": 0.3275, + "step": 8559 + }, + { + "epoch": 1.6875, + "grad_norm": 0.4680145546040287, + "learning_rate": 1.2445680032029403e-05, + "loss": 0.3164, + "step": 8560 + }, + { + "epoch": 1.687697160883281, + "grad_norm": 0.46324619631494696, + "learning_rate": 1.2444177047645036e-05, + "loss": 0.3164, + "step": 8561 + }, + { + "epoch": 1.6878943217665614, + "grad_norm": 0.4788496971926555, + "learning_rate": 1.2442674004537345e-05, + "loss": 0.3412, + "step": 8562 + }, + { + "epoch": 1.6880914826498423, + "grad_norm": 0.47439352264543966, + "learning_rate": 1.2441170902742445e-05, + "loss": 0.3272, + "step": 8563 + }, + { + "epoch": 1.688288643533123, + "grad_norm": 0.49561553090817284, + "learning_rate": 1.2439667742296448e-05, + "loss": 0.3302, + "step": 8564 + }, + { + "epoch": 1.6884858044164037, + "grad_norm": 0.4800560302995904, + "learning_rate": 1.2438164523235467e-05, + "loss": 0.3406, + "step": 8565 + }, + { + "epoch": 1.6886829652996846, + "grad_norm": 0.49067895256143157, + "learning_rate": 1.2436661245595623e-05, + "loss": 0.325, + "step": 8566 + }, + { + "epoch": 1.6888801261829653, + "grad_norm": 0.47817903605159684, + "learning_rate": 1.2435157909413029e-05, + "loss": 0.3247, + "step": 8567 + }, + { + "epoch": 1.689077287066246, + "grad_norm": 0.48136123183849766, + "learning_rate": 1.2433654514723806e-05, + "loss": 0.3454, + "step": 8568 + }, + { + "epoch": 1.689274447949527, + "grad_norm": 0.5051580981062126, + "learning_rate": 1.2432151061564071e-05, + "loss": 0.3601, + "step": 8569 + }, + { + "epoch": 1.6894716088328074, + "grad_norm": 0.461511250819562, + "learning_rate": 1.2430647549969949e-05, + "loss": 0.3233, + "step": 8570 + }, + { + "epoch": 1.6896687697160884, + "grad_norm": 0.47678716555909567, + "learning_rate": 1.2429143979977562e-05, + "loss": 0.3427, + "step": 8571 + }, + { + "epoch": 1.689865930599369, + "grad_norm": 0.4486388389800133, + "learning_rate": 1.2427640351623037e-05, + "loss": 0.3183, + "step": 8572 + }, + { + "epoch": 1.6900630914826498, + "grad_norm": 0.5299380107151339, + "learning_rate": 1.2426136664942495e-05, + "loss": 0.3656, + "step": 8573 + }, + { + "epoch": 1.6902602523659307, + "grad_norm": 0.46937533374559753, + "learning_rate": 1.2424632919972068e-05, + "loss": 0.33, + "step": 8574 + }, + { + "epoch": 1.6904574132492114, + "grad_norm": 0.4979192584567516, + "learning_rate": 1.2423129116747878e-05, + "loss": 0.3224, + "step": 8575 + }, + { + "epoch": 1.690654574132492, + "grad_norm": 1.0231877748006113, + "learning_rate": 1.2421625255306067e-05, + "loss": 0.3506, + "step": 8576 + }, + { + "epoch": 1.690851735015773, + "grad_norm": 0.47898806414813694, + "learning_rate": 1.242012133568275e-05, + "loss": 0.3408, + "step": 8577 + }, + { + "epoch": 1.6910488958990535, + "grad_norm": 0.47674507248699505, + "learning_rate": 1.2418617357914078e-05, + "loss": 0.3402, + "step": 8578 + }, + { + "epoch": 1.6912460567823344, + "grad_norm": 0.45179630277653227, + "learning_rate": 1.2417113322036172e-05, + "loss": 0.3154, + "step": 8579 + }, + { + "epoch": 1.6914432176656151, + "grad_norm": 0.46052841582017556, + "learning_rate": 1.2415609228085171e-05, + "loss": 0.3126, + "step": 8580 + }, + { + "epoch": 1.6916403785488958, + "grad_norm": 0.44611469096894074, + "learning_rate": 1.2414105076097214e-05, + "loss": 0.3166, + "step": 8581 + }, + { + "epoch": 1.6918375394321767, + "grad_norm": 0.4576223803492312, + "learning_rate": 1.241260086610844e-05, + "loss": 0.2939, + "step": 8582 + }, + { + "epoch": 1.6920347003154574, + "grad_norm": 0.4557877387492244, + "learning_rate": 1.2411096598154985e-05, + "loss": 0.3268, + "step": 8583 + }, + { + "epoch": 1.6922318611987381, + "grad_norm": 0.46572552067045964, + "learning_rate": 1.2409592272272995e-05, + "loss": 0.351, + "step": 8584 + }, + { + "epoch": 1.692429022082019, + "grad_norm": 0.5122610064442623, + "learning_rate": 1.2408087888498608e-05, + "loss": 0.3605, + "step": 8585 + }, + { + "epoch": 1.6926261829652995, + "grad_norm": 0.49403230477515936, + "learning_rate": 1.2406583446867972e-05, + "loss": 0.3342, + "step": 8586 + }, + { + "epoch": 1.6928233438485805, + "grad_norm": 0.5017202216659983, + "learning_rate": 1.240507894741723e-05, + "loss": 0.3491, + "step": 8587 + }, + { + "epoch": 1.6930205047318612, + "grad_norm": 0.4899331141459395, + "learning_rate": 1.2403574390182529e-05, + "loss": 0.3607, + "step": 8588 + }, + { + "epoch": 1.6932176656151419, + "grad_norm": 0.4930643507100292, + "learning_rate": 1.2402069775200018e-05, + "loss": 0.3346, + "step": 8589 + }, + { + "epoch": 1.6934148264984228, + "grad_norm": 0.47713702195641194, + "learning_rate": 1.2400565102505846e-05, + "loss": 0.3293, + "step": 8590 + }, + { + "epoch": 1.6936119873817035, + "grad_norm": 0.4783126509399956, + "learning_rate": 1.2399060372136165e-05, + "loss": 0.343, + "step": 8591 + }, + { + "epoch": 1.6938091482649842, + "grad_norm": 0.48348496039775457, + "learning_rate": 1.2397555584127127e-05, + "loss": 0.3281, + "step": 8592 + }, + { + "epoch": 1.694006309148265, + "grad_norm": 0.5160059605204272, + "learning_rate": 1.2396050738514884e-05, + "loss": 0.3423, + "step": 8593 + }, + { + "epoch": 1.6942034700315456, + "grad_norm": 0.45441215261046347, + "learning_rate": 1.2394545835335591e-05, + "loss": 0.314, + "step": 8594 + }, + { + "epoch": 1.6944006309148265, + "grad_norm": 0.4836399537284616, + "learning_rate": 1.239304087462541e-05, + "loss": 0.327, + "step": 8595 + }, + { + "epoch": 1.6945977917981072, + "grad_norm": 0.46510484102779165, + "learning_rate": 1.2391535856420492e-05, + "loss": 0.3528, + "step": 8596 + }, + { + "epoch": 1.694794952681388, + "grad_norm": 0.4430475484441221, + "learning_rate": 1.2390030780757e-05, + "loss": 0.31, + "step": 8597 + }, + { + "epoch": 1.6949921135646688, + "grad_norm": 0.4674652671591372, + "learning_rate": 1.2388525647671092e-05, + "loss": 0.3148, + "step": 8598 + }, + { + "epoch": 1.6951892744479495, + "grad_norm": 0.5054421188784337, + "learning_rate": 1.2387020457198937e-05, + "loss": 0.3795, + "step": 8599 + }, + { + "epoch": 1.6953864353312302, + "grad_norm": 0.4930491695671332, + "learning_rate": 1.238551520937669e-05, + "loss": 0.3515, + "step": 8600 + }, + { + "epoch": 1.6955835962145112, + "grad_norm": 0.464472471942234, + "learning_rate": 1.2384009904240517e-05, + "loss": 0.3178, + "step": 8601 + }, + { + "epoch": 1.6957807570977916, + "grad_norm": 0.46922888632843546, + "learning_rate": 1.238250454182659e-05, + "loss": 0.3421, + "step": 8602 + }, + { + "epoch": 1.6959779179810726, + "grad_norm": 0.5117715606317115, + "learning_rate": 1.238099912217107e-05, + "loss": 0.3657, + "step": 8603 + }, + { + "epoch": 1.6961750788643533, + "grad_norm": 0.45140287026735587, + "learning_rate": 1.237949364531013e-05, + "loss": 0.3224, + "step": 8604 + }, + { + "epoch": 1.696372239747634, + "grad_norm": 0.49574769053532836, + "learning_rate": 1.2377988111279937e-05, + "loss": 0.3462, + "step": 8605 + }, + { + "epoch": 1.6965694006309149, + "grad_norm": 0.46230478380221307, + "learning_rate": 1.2376482520116666e-05, + "loss": 0.3373, + "step": 8606 + }, + { + "epoch": 1.6967665615141956, + "grad_norm": 0.5059151436193712, + "learning_rate": 1.237497687185649e-05, + "loss": 0.3359, + "step": 8607 + }, + { + "epoch": 1.6969637223974763, + "grad_norm": 0.48234255694436523, + "learning_rate": 1.237347116653558e-05, + "loss": 0.3479, + "step": 8608 + }, + { + "epoch": 1.6971608832807572, + "grad_norm": 0.4396592143423356, + "learning_rate": 1.2371965404190116e-05, + "loss": 0.3067, + "step": 8609 + }, + { + "epoch": 1.697358044164038, + "grad_norm": 0.49065585389386196, + "learning_rate": 1.2370459584856271e-05, + "loss": 0.3445, + "step": 8610 + }, + { + "epoch": 1.6975552050473186, + "grad_norm": 0.49051628604620695, + "learning_rate": 1.2368953708570226e-05, + "loss": 0.3402, + "step": 8611 + }, + { + "epoch": 1.6977523659305995, + "grad_norm": 0.5254870988260015, + "learning_rate": 1.2367447775368163e-05, + "loss": 0.3454, + "step": 8612 + }, + { + "epoch": 1.69794952681388, + "grad_norm": 0.4562980109810731, + "learning_rate": 1.2365941785286258e-05, + "loss": 0.3074, + "step": 8613 + }, + { + "epoch": 1.698146687697161, + "grad_norm": 0.5678148667222066, + "learning_rate": 1.2364435738360696e-05, + "loss": 0.3396, + "step": 8614 + }, + { + "epoch": 1.6983438485804416, + "grad_norm": 0.4911585049022961, + "learning_rate": 1.2362929634627663e-05, + "loss": 0.3346, + "step": 8615 + }, + { + "epoch": 1.6985410094637223, + "grad_norm": 0.4681297072028885, + "learning_rate": 1.2361423474123343e-05, + "loss": 0.3082, + "step": 8616 + }, + { + "epoch": 1.6987381703470033, + "grad_norm": 0.5174736460144449, + "learning_rate": 1.235991725688392e-05, + "loss": 0.3814, + "step": 8617 + }, + { + "epoch": 1.698935331230284, + "grad_norm": 0.46929685353298345, + "learning_rate": 1.2358410982945586e-05, + "loss": 0.3355, + "step": 8618 + }, + { + "epoch": 1.6991324921135647, + "grad_norm": 0.4922370451892625, + "learning_rate": 1.2356904652344528e-05, + "loss": 0.3268, + "step": 8619 + }, + { + "epoch": 1.6993296529968456, + "grad_norm": 0.4596598424424867, + "learning_rate": 1.2355398265116937e-05, + "loss": 0.3293, + "step": 8620 + }, + { + "epoch": 1.699526813880126, + "grad_norm": 0.5001218713531277, + "learning_rate": 1.235389182129901e-05, + "loss": 0.3669, + "step": 8621 + }, + { + "epoch": 1.699723974763407, + "grad_norm": 0.4896954521807538, + "learning_rate": 1.2352385320926929e-05, + "loss": 0.3477, + "step": 8622 + }, + { + "epoch": 1.6999211356466877, + "grad_norm": 0.49063376422553284, + "learning_rate": 1.2350878764036904e-05, + "loss": 0.372, + "step": 8623 + }, + { + "epoch": 1.7001182965299684, + "grad_norm": 0.46066188918467005, + "learning_rate": 1.2349372150665117e-05, + "loss": 0.3251, + "step": 8624 + }, + { + "epoch": 1.7003154574132493, + "grad_norm": 0.48615185578096526, + "learning_rate": 1.2347865480847778e-05, + "loss": 0.3356, + "step": 8625 + }, + { + "epoch": 1.70051261829653, + "grad_norm": 0.46350483669286957, + "learning_rate": 1.2346358754621078e-05, + "loss": 0.3367, + "step": 8626 + }, + { + "epoch": 1.7007097791798107, + "grad_norm": 0.4534163321107968, + "learning_rate": 1.2344851972021219e-05, + "loss": 0.3297, + "step": 8627 + }, + { + "epoch": 1.7009069400630916, + "grad_norm": 0.46024896781400154, + "learning_rate": 1.2343345133084403e-05, + "loss": 0.3161, + "step": 8628 + }, + { + "epoch": 1.701104100946372, + "grad_norm": 0.4597911141388873, + "learning_rate": 1.2341838237846833e-05, + "loss": 0.3251, + "step": 8629 + }, + { + "epoch": 1.701301261829653, + "grad_norm": 0.47755657347184777, + "learning_rate": 1.2340331286344713e-05, + "loss": 0.3439, + "step": 8630 + }, + { + "epoch": 1.7014984227129337, + "grad_norm": 0.4595385546420116, + "learning_rate": 1.233882427861425e-05, + "loss": 0.3236, + "step": 8631 + }, + { + "epoch": 1.7016955835962144, + "grad_norm": 0.47878136809716293, + "learning_rate": 1.233731721469165e-05, + "loss": 0.3182, + "step": 8632 + }, + { + "epoch": 1.7018927444794953, + "grad_norm": 0.46749188234424455, + "learning_rate": 1.2335810094613123e-05, + "loss": 0.3346, + "step": 8633 + }, + { + "epoch": 1.702089905362776, + "grad_norm": 0.4620951279690006, + "learning_rate": 1.2334302918414875e-05, + "loss": 0.324, + "step": 8634 + }, + { + "epoch": 1.7022870662460567, + "grad_norm": 0.489807733516085, + "learning_rate": 1.2332795686133121e-05, + "loss": 0.3252, + "step": 8635 + }, + { + "epoch": 1.7024842271293377, + "grad_norm": 0.4804183436816162, + "learning_rate": 1.2331288397804072e-05, + "loss": 0.3598, + "step": 8636 + }, + { + "epoch": 1.7026813880126181, + "grad_norm": 0.5026867518086552, + "learning_rate": 1.2329781053463944e-05, + "loss": 0.36, + "step": 8637 + }, + { + "epoch": 1.702878548895899, + "grad_norm": 0.4759113828318395, + "learning_rate": 1.2328273653148945e-05, + "loss": 0.3411, + "step": 8638 + }, + { + "epoch": 1.7030757097791798, + "grad_norm": 0.48031817244407715, + "learning_rate": 1.2326766196895301e-05, + "loss": 0.3496, + "step": 8639 + }, + { + "epoch": 1.7032728706624605, + "grad_norm": 0.47282002859928735, + "learning_rate": 1.2325258684739223e-05, + "loss": 0.3215, + "step": 8640 + }, + { + "epoch": 1.7034700315457414, + "grad_norm": 3.191557994037556, + "learning_rate": 1.2323751116716932e-05, + "loss": 0.3359, + "step": 8641 + }, + { + "epoch": 1.703667192429022, + "grad_norm": 0.5967525800282695, + "learning_rate": 1.2322243492864651e-05, + "loss": 0.3469, + "step": 8642 + }, + { + "epoch": 1.7038643533123028, + "grad_norm": 0.4551107597567062, + "learning_rate": 1.2320735813218599e-05, + "loss": 0.3291, + "step": 8643 + }, + { + "epoch": 1.7040615141955837, + "grad_norm": 0.4620889896670438, + "learning_rate": 1.2319228077815001e-05, + "loss": 0.33, + "step": 8644 + }, + { + "epoch": 1.7042586750788642, + "grad_norm": 0.4853258209863878, + "learning_rate": 1.231772028669008e-05, + "loss": 0.3261, + "step": 8645 + }, + { + "epoch": 1.7044558359621451, + "grad_norm": 0.48368889919551544, + "learning_rate": 1.2316212439880065e-05, + "loss": 0.354, + "step": 8646 + }, + { + "epoch": 1.7046529968454258, + "grad_norm": 0.5542920344424767, + "learning_rate": 1.2314704537421177e-05, + "loss": 0.3408, + "step": 8647 + }, + { + "epoch": 1.7048501577287065, + "grad_norm": 0.47647351780117475, + "learning_rate": 1.2313196579349648e-05, + "loss": 0.3274, + "step": 8648 + }, + { + "epoch": 1.7050473186119874, + "grad_norm": 0.46934633413559235, + "learning_rate": 1.2311688565701711e-05, + "loss": 0.3418, + "step": 8649 + }, + { + "epoch": 1.7052444794952681, + "grad_norm": 0.48432919465079566, + "learning_rate": 1.2310180496513595e-05, + "loss": 0.3193, + "step": 8650 + }, + { + "epoch": 1.7054416403785488, + "grad_norm": 0.4661153258537868, + "learning_rate": 1.2308672371821532e-05, + "loss": 0.3321, + "step": 8651 + }, + { + "epoch": 1.7056388012618298, + "grad_norm": 0.4826086437714458, + "learning_rate": 1.2307164191661756e-05, + "loss": 0.338, + "step": 8652 + }, + { + "epoch": 1.7058359621451105, + "grad_norm": 0.4637877696238651, + "learning_rate": 1.2305655956070504e-05, + "loss": 0.3245, + "step": 8653 + }, + { + "epoch": 1.7060331230283912, + "grad_norm": 0.4640340301452536, + "learning_rate": 1.2304147665084007e-05, + "loss": 0.3176, + "step": 8654 + }, + { + "epoch": 1.706230283911672, + "grad_norm": 0.49786805036849485, + "learning_rate": 1.230263931873851e-05, + "loss": 0.3619, + "step": 8655 + }, + { + "epoch": 1.7064274447949526, + "grad_norm": 0.5747013253276394, + "learning_rate": 1.2301130917070245e-05, + "loss": 0.3151, + "step": 8656 + }, + { + "epoch": 1.7066246056782335, + "grad_norm": 0.5511325174270215, + "learning_rate": 1.2299622460115461e-05, + "loss": 0.36, + "step": 8657 + }, + { + "epoch": 1.7068217665615142, + "grad_norm": 0.5072499961226788, + "learning_rate": 1.2298113947910393e-05, + "loss": 0.3349, + "step": 8658 + }, + { + "epoch": 1.7070189274447949, + "grad_norm": 0.48091952766514195, + "learning_rate": 1.2296605380491288e-05, + "loss": 0.3491, + "step": 8659 + }, + { + "epoch": 1.7072160883280758, + "grad_norm": 0.4798754544496718, + "learning_rate": 1.2295096757894389e-05, + "loss": 0.3471, + "step": 8660 + }, + { + "epoch": 1.7074132492113565, + "grad_norm": 0.6307603113857772, + "learning_rate": 1.2293588080155943e-05, + "loss": 0.3413, + "step": 8661 + }, + { + "epoch": 1.7076104100946372, + "grad_norm": 0.4779048637501002, + "learning_rate": 1.2292079347312194e-05, + "loss": 0.3337, + "step": 8662 + }, + { + "epoch": 1.7078075709779181, + "grad_norm": 0.5247659710582877, + "learning_rate": 1.2290570559399395e-05, + "loss": 0.37, + "step": 8663 + }, + { + "epoch": 1.7080047318611986, + "grad_norm": 0.4617016132383226, + "learning_rate": 1.2289061716453795e-05, + "loss": 0.3223, + "step": 8664 + }, + { + "epoch": 1.7082018927444795, + "grad_norm": 0.47698713563356776, + "learning_rate": 1.2287552818511641e-05, + "loss": 0.3247, + "step": 8665 + }, + { + "epoch": 1.7083990536277602, + "grad_norm": 0.46484751275332564, + "learning_rate": 1.2286043865609188e-05, + "loss": 0.3319, + "step": 8666 + }, + { + "epoch": 1.708596214511041, + "grad_norm": 0.473092049126572, + "learning_rate": 1.2284534857782694e-05, + "loss": 0.3515, + "step": 8667 + }, + { + "epoch": 1.7087933753943219, + "grad_norm": 0.49275200785968454, + "learning_rate": 1.2283025795068407e-05, + "loss": 0.3342, + "step": 8668 + }, + { + "epoch": 1.7089905362776026, + "grad_norm": 0.5026513536426332, + "learning_rate": 1.2281516677502586e-05, + "loss": 0.3411, + "step": 8669 + }, + { + "epoch": 1.7091876971608833, + "grad_norm": 0.46869592654591175, + "learning_rate": 1.2280007505121491e-05, + "loss": 0.332, + "step": 8670 + }, + { + "epoch": 1.7093848580441642, + "grad_norm": 0.5099270759799205, + "learning_rate": 1.2278498277961377e-05, + "loss": 0.3446, + "step": 8671 + }, + { + "epoch": 1.7095820189274447, + "grad_norm": 2.050432417214253, + "learning_rate": 1.2276988996058511e-05, + "loss": 0.4219, + "step": 8672 + }, + { + "epoch": 1.7097791798107256, + "grad_norm": 0.4725441540525176, + "learning_rate": 1.227547965944915e-05, + "loss": 0.3432, + "step": 8673 + }, + { + "epoch": 1.7099763406940063, + "grad_norm": 0.4993708255622457, + "learning_rate": 1.227397026816956e-05, + "loss": 0.3513, + "step": 8674 + }, + { + "epoch": 1.710173501577287, + "grad_norm": 0.4887912737158361, + "learning_rate": 1.2272460822255996e-05, + "loss": 0.322, + "step": 8675 + }, + { + "epoch": 1.710370662460568, + "grad_norm": 0.46993623199235296, + "learning_rate": 1.2270951321744736e-05, + "loss": 0.3599, + "step": 8676 + }, + { + "epoch": 1.7105678233438486, + "grad_norm": 0.4643011227098681, + "learning_rate": 1.2269441766672042e-05, + "loss": 0.3012, + "step": 8677 + }, + { + "epoch": 1.7107649842271293, + "grad_norm": 0.5506263319435446, + "learning_rate": 1.2267932157074178e-05, + "loss": 0.3621, + "step": 8678 + }, + { + "epoch": 1.7109621451104102, + "grad_norm": 0.4919225236599655, + "learning_rate": 1.2266422492987423e-05, + "loss": 0.338, + "step": 8679 + }, + { + "epoch": 1.7111593059936907, + "grad_norm": 0.4896517148433233, + "learning_rate": 1.2264912774448037e-05, + "loss": 0.3527, + "step": 8680 + }, + { + "epoch": 1.7113564668769716, + "grad_norm": 0.48839509275100296, + "learning_rate": 1.22634030014923e-05, + "loss": 0.3464, + "step": 8681 + }, + { + "epoch": 1.7115536277602523, + "grad_norm": 0.4721548932185601, + "learning_rate": 1.2261893174156485e-05, + "loss": 0.3297, + "step": 8682 + }, + { + "epoch": 1.711750788643533, + "grad_norm": 0.4911637333121339, + "learning_rate": 1.2260383292476862e-05, + "loss": 0.3285, + "step": 8683 + }, + { + "epoch": 1.711947949526814, + "grad_norm": 0.459378104968867, + "learning_rate": 1.2258873356489713e-05, + "loss": 0.3292, + "step": 8684 + }, + { + "epoch": 1.7121451104100947, + "grad_norm": 0.4643594445198744, + "learning_rate": 1.2257363366231311e-05, + "loss": 0.3354, + "step": 8685 + }, + { + "epoch": 1.7123422712933754, + "grad_norm": 0.4875758149219096, + "learning_rate": 1.2255853321737935e-05, + "loss": 0.3461, + "step": 8686 + }, + { + "epoch": 1.7125394321766563, + "grad_norm": 0.48234734526715584, + "learning_rate": 1.225434322304587e-05, + "loss": 0.3432, + "step": 8687 + }, + { + "epoch": 1.7127365930599368, + "grad_norm": 0.4765321853764992, + "learning_rate": 1.2252833070191388e-05, + "loss": 0.335, + "step": 8688 + }, + { + "epoch": 1.7129337539432177, + "grad_norm": 0.4922172475680278, + "learning_rate": 1.2251322863210785e-05, + "loss": 0.3532, + "step": 8689 + }, + { + "epoch": 1.7131309148264984, + "grad_norm": 0.49132588894060586, + "learning_rate": 1.224981260214033e-05, + "loss": 0.3471, + "step": 8690 + }, + { + "epoch": 1.713328075709779, + "grad_norm": 0.4726091865577661, + "learning_rate": 1.2248302287016321e-05, + "loss": 0.3148, + "step": 8691 + }, + { + "epoch": 1.71352523659306, + "grad_norm": 0.44365990430314184, + "learning_rate": 1.2246791917875034e-05, + "loss": 0.2972, + "step": 8692 + }, + { + "epoch": 1.7137223974763407, + "grad_norm": 0.5487336291279901, + "learning_rate": 1.2245281494752765e-05, + "loss": 0.3856, + "step": 8693 + }, + { + "epoch": 1.7139195583596214, + "grad_norm": 0.4920272311780737, + "learning_rate": 1.2243771017685797e-05, + "loss": 0.3366, + "step": 8694 + }, + { + "epoch": 1.7141167192429023, + "grad_norm": 0.5015165415124818, + "learning_rate": 1.2242260486710427e-05, + "loss": 0.3512, + "step": 8695 + }, + { + "epoch": 1.714313880126183, + "grad_norm": 0.5239711415916627, + "learning_rate": 1.224074990186294e-05, + "loss": 0.381, + "step": 8696 + }, + { + "epoch": 1.7145110410094637, + "grad_norm": 0.4568955886255038, + "learning_rate": 1.2239239263179635e-05, + "loss": 0.3209, + "step": 8697 + }, + { + "epoch": 1.7147082018927446, + "grad_norm": 0.49060093715398756, + "learning_rate": 1.2237728570696801e-05, + "loss": 0.3439, + "step": 8698 + }, + { + "epoch": 1.7149053627760251, + "grad_norm": 0.47975117071356893, + "learning_rate": 1.2236217824450739e-05, + "loss": 0.344, + "step": 8699 + }, + { + "epoch": 1.715102523659306, + "grad_norm": 0.5258786604080068, + "learning_rate": 1.2234707024477742e-05, + "loss": 0.3348, + "step": 8700 + }, + { + "epoch": 1.7152996845425867, + "grad_norm": 0.48137432039450917, + "learning_rate": 1.2233196170814105e-05, + "loss": 0.3525, + "step": 8701 + }, + { + "epoch": 1.7154968454258674, + "grad_norm": 0.5014397647686436, + "learning_rate": 1.2231685263496137e-05, + "loss": 0.3438, + "step": 8702 + }, + { + "epoch": 1.7156940063091484, + "grad_norm": 0.4708182769624843, + "learning_rate": 1.2230174302560132e-05, + "loss": 0.3342, + "step": 8703 + }, + { + "epoch": 1.715891167192429, + "grad_norm": 0.512195415565872, + "learning_rate": 1.2228663288042392e-05, + "loss": 0.3553, + "step": 8704 + }, + { + "epoch": 1.7160883280757098, + "grad_norm": 0.4788767330139541, + "learning_rate": 1.2227152219979224e-05, + "loss": 0.3536, + "step": 8705 + }, + { + "epoch": 1.7162854889589907, + "grad_norm": 0.4625920435496106, + "learning_rate": 1.2225641098406928e-05, + "loss": 0.2878, + "step": 8706 + }, + { + "epoch": 1.7164826498422712, + "grad_norm": 0.5070101532510752, + "learning_rate": 1.2224129923361813e-05, + "loss": 0.3453, + "step": 8707 + }, + { + "epoch": 1.716679810725552, + "grad_norm": 0.4988378522423907, + "learning_rate": 1.2222618694880187e-05, + "loss": 0.3697, + "step": 8708 + }, + { + "epoch": 1.7168769716088328, + "grad_norm": 0.5033574544758712, + "learning_rate": 1.2221107412998352e-05, + "loss": 0.3118, + "step": 8709 + }, + { + "epoch": 1.7170741324921135, + "grad_norm": 0.4890282116096399, + "learning_rate": 1.2219596077752629e-05, + "loss": 0.325, + "step": 8710 + }, + { + "epoch": 1.7172712933753944, + "grad_norm": 0.4760094568747338, + "learning_rate": 1.221808468917932e-05, + "loss": 0.3281, + "step": 8711 + }, + { + "epoch": 1.7174684542586751, + "grad_norm": 0.48764166466975545, + "learning_rate": 1.221657324731474e-05, + "loss": 0.3331, + "step": 8712 + }, + { + "epoch": 1.7176656151419558, + "grad_norm": 0.4517243985708021, + "learning_rate": 1.22150617521952e-05, + "loss": 0.34, + "step": 8713 + }, + { + "epoch": 1.7178627760252367, + "grad_norm": 0.5139727951740557, + "learning_rate": 1.2213550203857025e-05, + "loss": 0.3511, + "step": 8714 + }, + { + "epoch": 1.7180599369085172, + "grad_norm": 0.45072979492101384, + "learning_rate": 1.2212038602336518e-05, + "loss": 0.3228, + "step": 8715 + }, + { + "epoch": 1.7182570977917981, + "grad_norm": 0.4844004632125416, + "learning_rate": 1.2210526947670003e-05, + "loss": 0.3361, + "step": 8716 + }, + { + "epoch": 1.7184542586750788, + "grad_norm": 0.48156386430034853, + "learning_rate": 1.22090152398938e-05, + "loss": 0.3579, + "step": 8717 + }, + { + "epoch": 1.7186514195583595, + "grad_norm": 0.4801222110814163, + "learning_rate": 1.2207503479044224e-05, + "loss": 0.3327, + "step": 8718 + }, + { + "epoch": 1.7188485804416405, + "grad_norm": 0.4633028361510336, + "learning_rate": 1.2205991665157604e-05, + "loss": 0.3361, + "step": 8719 + }, + { + "epoch": 1.7190457413249212, + "grad_norm": 0.4954639943287471, + "learning_rate": 1.2204479798270252e-05, + "loss": 0.3555, + "step": 8720 + }, + { + "epoch": 1.7192429022082019, + "grad_norm": 0.5071392340252393, + "learning_rate": 1.2202967878418504e-05, + "loss": 0.3311, + "step": 8721 + }, + { + "epoch": 1.7194400630914828, + "grad_norm": 0.4924112369717431, + "learning_rate": 1.2201455905638673e-05, + "loss": 0.3343, + "step": 8722 + }, + { + "epoch": 1.7196372239747633, + "grad_norm": 0.5281013648224745, + "learning_rate": 1.2199943879967092e-05, + "loss": 0.3661, + "step": 8723 + }, + { + "epoch": 1.7198343848580442, + "grad_norm": 0.4188521660470496, + "learning_rate": 1.2198431801440087e-05, + "loss": 0.2792, + "step": 8724 + }, + { + "epoch": 1.7200315457413249, + "grad_norm": 0.514554223490542, + "learning_rate": 1.2196919670093989e-05, + "loss": 0.351, + "step": 8725 + }, + { + "epoch": 1.7202287066246056, + "grad_norm": 0.4773669903370585, + "learning_rate": 1.2195407485965129e-05, + "loss": 0.3371, + "step": 8726 + }, + { + "epoch": 1.7204258675078865, + "grad_norm": 0.4486097522414558, + "learning_rate": 1.2193895249089833e-05, + "loss": 0.3221, + "step": 8727 + }, + { + "epoch": 1.7206230283911672, + "grad_norm": 0.4514574538208784, + "learning_rate": 1.2192382959504438e-05, + "loss": 0.3146, + "step": 8728 + }, + { + "epoch": 1.720820189274448, + "grad_norm": 0.4906075691840977, + "learning_rate": 1.2190870617245279e-05, + "loss": 0.3452, + "step": 8729 + }, + { + "epoch": 1.7210173501577288, + "grad_norm": 0.4728367195594932, + "learning_rate": 1.2189358222348685e-05, + "loss": 0.34, + "step": 8730 + }, + { + "epoch": 1.7212145110410093, + "grad_norm": 0.47426809412562415, + "learning_rate": 1.2187845774850999e-05, + "loss": 0.2974, + "step": 8731 + }, + { + "epoch": 1.7214116719242902, + "grad_norm": 0.4676329793425546, + "learning_rate": 1.2186333274788558e-05, + "loss": 0.3375, + "step": 8732 + }, + { + "epoch": 1.721608832807571, + "grad_norm": 0.467140693302685, + "learning_rate": 1.2184820722197696e-05, + "loss": 0.3224, + "step": 8733 + }, + { + "epoch": 1.7218059936908516, + "grad_norm": 0.492299563007178, + "learning_rate": 1.2183308117114759e-05, + "loss": 0.3401, + "step": 8734 + }, + { + "epoch": 1.7220031545741326, + "grad_norm": 0.47469376128492763, + "learning_rate": 1.2181795459576085e-05, + "loss": 0.3676, + "step": 8735 + }, + { + "epoch": 1.7222003154574133, + "grad_norm": 0.482130929800137, + "learning_rate": 1.2180282749618017e-05, + "loss": 0.332, + "step": 8736 + }, + { + "epoch": 1.722397476340694, + "grad_norm": 0.5031498208900129, + "learning_rate": 1.2178769987276902e-05, + "loss": 0.3525, + "step": 8737 + }, + { + "epoch": 1.7225946372239749, + "grad_norm": 0.48826854933895736, + "learning_rate": 1.2177257172589086e-05, + "loss": 0.3464, + "step": 8738 + }, + { + "epoch": 1.7227917981072554, + "grad_norm": 0.475972420949733, + "learning_rate": 1.2175744305590907e-05, + "loss": 0.3193, + "step": 8739 + }, + { + "epoch": 1.7229889589905363, + "grad_norm": 0.49949539660408465, + "learning_rate": 1.2174231386318724e-05, + "loss": 0.3567, + "step": 8740 + }, + { + "epoch": 1.723186119873817, + "grad_norm": 0.49325281815052363, + "learning_rate": 1.2172718414808877e-05, + "loss": 0.3581, + "step": 8741 + }, + { + "epoch": 1.7233832807570977, + "grad_norm": 0.4777530691912601, + "learning_rate": 1.2171205391097724e-05, + "loss": 0.3277, + "step": 8742 + }, + { + "epoch": 1.7235804416403786, + "grad_norm": 0.47301218354355706, + "learning_rate": 1.216969231522161e-05, + "loss": 0.3346, + "step": 8743 + }, + { + "epoch": 1.7237776025236593, + "grad_norm": 0.6201310559734784, + "learning_rate": 1.2168179187216893e-05, + "loss": 0.3247, + "step": 8744 + }, + { + "epoch": 1.72397476340694, + "grad_norm": 0.4669955469618345, + "learning_rate": 1.2166666007119925e-05, + "loss": 0.3382, + "step": 8745 + }, + { + "epoch": 1.724171924290221, + "grad_norm": 0.47156930793491, + "learning_rate": 1.2165152774967061e-05, + "loss": 0.348, + "step": 8746 + }, + { + "epoch": 1.7243690851735016, + "grad_norm": 0.5146898131222495, + "learning_rate": 1.2163639490794659e-05, + "loss": 0.3423, + "step": 8747 + }, + { + "epoch": 1.7245662460567823, + "grad_norm": 0.4748624471052609, + "learning_rate": 1.2162126154639073e-05, + "loss": 0.3329, + "step": 8748 + }, + { + "epoch": 1.7247634069400632, + "grad_norm": 0.5041545803945943, + "learning_rate": 1.2160612766536668e-05, + "loss": 0.3429, + "step": 8749 + }, + { + "epoch": 1.7249605678233437, + "grad_norm": 0.4560606048941914, + "learning_rate": 1.21590993265238e-05, + "loss": 0.3329, + "step": 8750 + }, + { + "epoch": 1.7251577287066246, + "grad_norm": 0.4920455686195489, + "learning_rate": 1.2157585834636834e-05, + "loss": 0.3369, + "step": 8751 + }, + { + "epoch": 1.7253548895899053, + "grad_norm": 0.4817930597858489, + "learning_rate": 1.2156072290912126e-05, + "loss": 0.3123, + "step": 8752 + }, + { + "epoch": 1.725552050473186, + "grad_norm": 0.4933196910433103, + "learning_rate": 1.2154558695386049e-05, + "loss": 0.3625, + "step": 8753 + }, + { + "epoch": 1.725749211356467, + "grad_norm": 0.43787576840008613, + "learning_rate": 1.2153045048094963e-05, + "loss": 0.3305, + "step": 8754 + }, + { + "epoch": 1.7259463722397477, + "grad_norm": 0.4646297207054528, + "learning_rate": 1.2151531349075236e-05, + "loss": 0.3276, + "step": 8755 + }, + { + "epoch": 1.7261435331230284, + "grad_norm": 0.459618597411763, + "learning_rate": 1.2150017598363236e-05, + "loss": 0.3039, + "step": 8756 + }, + { + "epoch": 1.7263406940063093, + "grad_norm": 0.48044932107882354, + "learning_rate": 1.2148503795995332e-05, + "loss": 0.3209, + "step": 8757 + }, + { + "epoch": 1.7265378548895898, + "grad_norm": 0.48647578029445776, + "learning_rate": 1.2146989942007891e-05, + "loss": 0.3308, + "step": 8758 + }, + { + "epoch": 1.7267350157728707, + "grad_norm": 0.4518527350354567, + "learning_rate": 1.2145476036437294e-05, + "loss": 0.3233, + "step": 8759 + }, + { + "epoch": 1.7269321766561514, + "grad_norm": 0.48074502143691444, + "learning_rate": 1.21439620793199e-05, + "loss": 0.3538, + "step": 8760 + }, + { + "epoch": 1.727129337539432, + "grad_norm": 0.4662467836506099, + "learning_rate": 1.2142448070692096e-05, + "loss": 0.3411, + "step": 8761 + }, + { + "epoch": 1.727326498422713, + "grad_norm": 0.5184102450636691, + "learning_rate": 1.2140934010590249e-05, + "loss": 0.3508, + "step": 8762 + }, + { + "epoch": 1.7275236593059937, + "grad_norm": 0.54097531724488, + "learning_rate": 1.213941989905074e-05, + "loss": 0.3291, + "step": 8763 + }, + { + "epoch": 1.7277208201892744, + "grad_norm": 0.4815838736915963, + "learning_rate": 1.2137905736109946e-05, + "loss": 0.3279, + "step": 8764 + }, + { + "epoch": 1.7279179810725553, + "grad_norm": 0.47938770142115866, + "learning_rate": 1.213639152180424e-05, + "loss": 0.3222, + "step": 8765 + }, + { + "epoch": 1.7281151419558358, + "grad_norm": 0.44462625843755743, + "learning_rate": 1.2134877256170012e-05, + "loss": 0.3056, + "step": 8766 + }, + { + "epoch": 1.7283123028391167, + "grad_norm": 0.48782541427748793, + "learning_rate": 1.2133362939243638e-05, + "loss": 0.3382, + "step": 8767 + }, + { + "epoch": 1.7285094637223974, + "grad_norm": 0.47686312944629256, + "learning_rate": 1.2131848571061501e-05, + "loss": 0.3336, + "step": 8768 + }, + { + "epoch": 1.7287066246056781, + "grad_norm": 0.4972392941654052, + "learning_rate": 1.2130334151659987e-05, + "loss": 0.3457, + "step": 8769 + }, + { + "epoch": 1.728903785488959, + "grad_norm": 0.44658596263964356, + "learning_rate": 1.2128819681075476e-05, + "loss": 0.3113, + "step": 8770 + }, + { + "epoch": 1.7291009463722398, + "grad_norm": 0.6132762769001334, + "learning_rate": 1.2127305159344358e-05, + "loss": 0.2762, + "step": 8771 + }, + { + "epoch": 1.7292981072555205, + "grad_norm": 0.5184653807356759, + "learning_rate": 1.2125790586503024e-05, + "loss": 0.3589, + "step": 8772 + }, + { + "epoch": 1.7294952681388014, + "grad_norm": 0.48887264222267973, + "learning_rate": 1.2124275962587857e-05, + "loss": 0.3458, + "step": 8773 + }, + { + "epoch": 1.7296924290220819, + "grad_norm": 0.48006157666774124, + "learning_rate": 1.212276128763525e-05, + "loss": 0.3261, + "step": 8774 + }, + { + "epoch": 1.7298895899053628, + "grad_norm": 0.4705936894192923, + "learning_rate": 1.2121246561681592e-05, + "loss": 0.3318, + "step": 8775 + }, + { + "epoch": 1.7300867507886435, + "grad_norm": 0.4680969494433612, + "learning_rate": 1.2119731784763278e-05, + "loss": 0.3278, + "step": 8776 + }, + { + "epoch": 1.7302839116719242, + "grad_norm": 0.5204916184909986, + "learning_rate": 1.21182169569167e-05, + "loss": 0.3461, + "step": 8777 + }, + { + "epoch": 1.7304810725552051, + "grad_norm": 0.4682144834045916, + "learning_rate": 1.2116702078178255e-05, + "loss": 0.3115, + "step": 8778 + }, + { + "epoch": 1.7306782334384858, + "grad_norm": 0.4684797179301652, + "learning_rate": 1.2115187148584338e-05, + "loss": 0.3297, + "step": 8779 + }, + { + "epoch": 1.7308753943217665, + "grad_norm": 0.5149570001266986, + "learning_rate": 1.2113672168171347e-05, + "loss": 0.3411, + "step": 8780 + }, + { + "epoch": 1.7310725552050474, + "grad_norm": 0.4918293028719042, + "learning_rate": 1.2112157136975678e-05, + "loss": 0.3302, + "step": 8781 + }, + { + "epoch": 1.731269716088328, + "grad_norm": 0.4680392185456968, + "learning_rate": 1.2110642055033737e-05, + "loss": 0.313, + "step": 8782 + }, + { + "epoch": 1.7314668769716088, + "grad_norm": 0.4675694840904993, + "learning_rate": 1.2109126922381917e-05, + "loss": 0.3383, + "step": 8783 + }, + { + "epoch": 1.7316640378548895, + "grad_norm": 0.5051346681197235, + "learning_rate": 1.2107611739056624e-05, + "loss": 0.3679, + "step": 8784 + }, + { + "epoch": 1.7318611987381702, + "grad_norm": 0.48965432777518686, + "learning_rate": 1.2106096505094264e-05, + "loss": 0.3412, + "step": 8785 + }, + { + "epoch": 1.7320583596214512, + "grad_norm": 0.5179854164790161, + "learning_rate": 1.2104581220531237e-05, + "loss": 0.3659, + "step": 8786 + }, + { + "epoch": 1.7322555205047319, + "grad_norm": 0.4814569942652689, + "learning_rate": 1.2103065885403955e-05, + "loss": 0.3369, + "step": 8787 + }, + { + "epoch": 1.7324526813880126, + "grad_norm": 0.4885561961779997, + "learning_rate": 1.2101550499748818e-05, + "loss": 0.3446, + "step": 8788 + }, + { + "epoch": 1.7326498422712935, + "grad_norm": 0.49161198978038273, + "learning_rate": 1.210003506360224e-05, + "loss": 0.3177, + "step": 8789 + }, + { + "epoch": 1.7328470031545742, + "grad_norm": 0.48634200842172337, + "learning_rate": 1.2098519577000627e-05, + "loss": 0.3553, + "step": 8790 + }, + { + "epoch": 1.7330441640378549, + "grad_norm": 0.48325106568491305, + "learning_rate": 1.2097004039980391e-05, + "loss": 0.3519, + "step": 8791 + }, + { + "epoch": 1.7332413249211358, + "grad_norm": 0.4641044710550502, + "learning_rate": 1.2095488452577946e-05, + "loss": 0.3123, + "step": 8792 + }, + { + "epoch": 1.7334384858044163, + "grad_norm": 0.47358093899817655, + "learning_rate": 1.2093972814829701e-05, + "loss": 0.3162, + "step": 8793 + }, + { + "epoch": 1.7336356466876972, + "grad_norm": 0.4913256828106219, + "learning_rate": 1.2092457126772074e-05, + "loss": 0.3531, + "step": 8794 + }, + { + "epoch": 1.733832807570978, + "grad_norm": 0.4912311679309798, + "learning_rate": 1.2090941388441482e-05, + "loss": 0.3359, + "step": 8795 + }, + { + "epoch": 1.7340299684542586, + "grad_norm": 0.47304417855315983, + "learning_rate": 1.2089425599874335e-05, + "loss": 0.3022, + "step": 8796 + }, + { + "epoch": 1.7342271293375395, + "grad_norm": 0.4672250909616135, + "learning_rate": 1.208790976110706e-05, + "loss": 0.3027, + "step": 8797 + }, + { + "epoch": 1.7344242902208202, + "grad_norm": 0.4647836250132962, + "learning_rate": 1.2086393872176067e-05, + "loss": 0.3144, + "step": 8798 + }, + { + "epoch": 1.734621451104101, + "grad_norm": 0.5010956167290279, + "learning_rate": 1.2084877933117784e-05, + "loss": 0.3641, + "step": 8799 + }, + { + "epoch": 1.7348186119873819, + "grad_norm": 0.46610623517280136, + "learning_rate": 1.2083361943968628e-05, + "loss": 0.3291, + "step": 8800 + }, + { + "epoch": 1.7350157728706623, + "grad_norm": 0.5069304515479853, + "learning_rate": 1.2081845904765026e-05, + "loss": 0.3766, + "step": 8801 + }, + { + "epoch": 1.7352129337539433, + "grad_norm": 0.4418811498806394, + "learning_rate": 1.2080329815543398e-05, + "loss": 0.3065, + "step": 8802 + }, + { + "epoch": 1.735410094637224, + "grad_norm": 0.5417087924566732, + "learning_rate": 1.2078813676340171e-05, + "loss": 0.322, + "step": 8803 + }, + { + "epoch": 1.7356072555205047, + "grad_norm": 0.4885878208354627, + "learning_rate": 1.2077297487191771e-05, + "loss": 0.3166, + "step": 8804 + }, + { + "epoch": 1.7358044164037856, + "grad_norm": 0.46819488040628704, + "learning_rate": 1.2075781248134624e-05, + "loss": 0.3308, + "step": 8805 + }, + { + "epoch": 1.7360015772870663, + "grad_norm": 0.4725168433278604, + "learning_rate": 1.2074264959205167e-05, + "loss": 0.3248, + "step": 8806 + }, + { + "epoch": 1.736198738170347, + "grad_norm": 0.4685757443962958, + "learning_rate": 1.2072748620439816e-05, + "loss": 0.3162, + "step": 8807 + }, + { + "epoch": 1.736395899053628, + "grad_norm": 0.4944655388806196, + "learning_rate": 1.2071232231875017e-05, + "loss": 0.3465, + "step": 8808 + }, + { + "epoch": 1.7365930599369084, + "grad_norm": 0.48192011814447067, + "learning_rate": 1.2069715793547192e-05, + "loss": 0.3312, + "step": 8809 + }, + { + "epoch": 1.7367902208201893, + "grad_norm": 0.5000553460824155, + "learning_rate": 1.2068199305492781e-05, + "loss": 0.3439, + "step": 8810 + }, + { + "epoch": 1.73698738170347, + "grad_norm": 0.47422303429265467, + "learning_rate": 1.2066682767748212e-05, + "loss": 0.3176, + "step": 8811 + }, + { + "epoch": 1.7371845425867507, + "grad_norm": 0.449422917568121, + "learning_rate": 1.2065166180349928e-05, + "loss": 0.3241, + "step": 8812 + }, + { + "epoch": 1.7373817034700316, + "grad_norm": 0.49595270985884926, + "learning_rate": 1.2063649543334364e-05, + "loss": 0.3461, + "step": 8813 + }, + { + "epoch": 1.7375788643533123, + "grad_norm": 0.44630470527387806, + "learning_rate": 1.2062132856737958e-05, + "loss": 0.3099, + "step": 8814 + }, + { + "epoch": 1.737776025236593, + "grad_norm": 2.0412021740875836, + "learning_rate": 1.2060616120597149e-05, + "loss": 0.3194, + "step": 8815 + }, + { + "epoch": 1.737973186119874, + "grad_norm": 0.47472434744895176, + "learning_rate": 1.2059099334948376e-05, + "loss": 0.3325, + "step": 8816 + }, + { + "epoch": 1.7381703470031544, + "grad_norm": 0.481225491809552, + "learning_rate": 1.2057582499828086e-05, + "loss": 0.336, + "step": 8817 + }, + { + "epoch": 1.7383675078864353, + "grad_norm": 0.44821446660142955, + "learning_rate": 1.205606561527272e-05, + "loss": 0.3134, + "step": 8818 + }, + { + "epoch": 1.738564668769716, + "grad_norm": 0.4813063073960643, + "learning_rate": 1.205454868131872e-05, + "loss": 0.3246, + "step": 8819 + }, + { + "epoch": 1.7387618296529967, + "grad_norm": 0.4753107258555097, + "learning_rate": 1.2053031698002533e-05, + "loss": 0.333, + "step": 8820 + }, + { + "epoch": 1.7389589905362777, + "grad_norm": 0.5613865154278634, + "learning_rate": 1.2051514665360606e-05, + "loss": 0.3334, + "step": 8821 + }, + { + "epoch": 1.7391561514195584, + "grad_norm": 0.5166325707314708, + "learning_rate": 1.2049997583429389e-05, + "loss": 0.3388, + "step": 8822 + }, + { + "epoch": 1.739353312302839, + "grad_norm": 0.4689529203647351, + "learning_rate": 1.2048480452245328e-05, + "loss": 0.3409, + "step": 8823 + }, + { + "epoch": 1.73955047318612, + "grad_norm": 0.5124805778270197, + "learning_rate": 1.2046963271844876e-05, + "loss": 0.3589, + "step": 8824 + }, + { + "epoch": 1.7397476340694005, + "grad_norm": 0.4555862428808988, + "learning_rate": 1.2045446042264482e-05, + "loss": 0.3218, + "step": 8825 + }, + { + "epoch": 1.7399447949526814, + "grad_norm": 0.46880230449353444, + "learning_rate": 1.2043928763540598e-05, + "loss": 0.3204, + "step": 8826 + }, + { + "epoch": 1.740141955835962, + "grad_norm": 0.4661584032901464, + "learning_rate": 1.2042411435709683e-05, + "loss": 0.3008, + "step": 8827 + }, + { + "epoch": 1.7403391167192428, + "grad_norm": 0.49560394022565896, + "learning_rate": 1.2040894058808183e-05, + "loss": 0.3304, + "step": 8828 + }, + { + "epoch": 1.7405362776025237, + "grad_norm": 0.477044636888031, + "learning_rate": 1.2039376632872565e-05, + "loss": 0.3348, + "step": 8829 + }, + { + "epoch": 1.7407334384858044, + "grad_norm": 0.4768628146082518, + "learning_rate": 1.2037859157939278e-05, + "loss": 0.3517, + "step": 8830 + }, + { + "epoch": 1.7409305993690851, + "grad_norm": 0.46710532512756137, + "learning_rate": 1.2036341634044785e-05, + "loss": 0.3378, + "step": 8831 + }, + { + "epoch": 1.741127760252366, + "grad_norm": 0.4838108367315246, + "learning_rate": 1.2034824061225545e-05, + "loss": 0.3403, + "step": 8832 + }, + { + "epoch": 1.7413249211356467, + "grad_norm": 0.4814792464848545, + "learning_rate": 1.2033306439518017e-05, + "loss": 0.328, + "step": 8833 + }, + { + "epoch": 1.7415220820189274, + "grad_norm": 0.4927086846869189, + "learning_rate": 1.2031788768958666e-05, + "loss": 0.3458, + "step": 8834 + }, + { + "epoch": 1.7417192429022084, + "grad_norm": 0.4617442660897952, + "learning_rate": 1.203027104958395e-05, + "loss": 0.3377, + "step": 8835 + }, + { + "epoch": 1.7419164037854888, + "grad_norm": 0.48638088204825836, + "learning_rate": 1.2028753281430343e-05, + "loss": 0.3269, + "step": 8836 + }, + { + "epoch": 1.7421135646687698, + "grad_norm": 0.5049199749667796, + "learning_rate": 1.20272354645343e-05, + "loss": 0.3766, + "step": 8837 + }, + { + "epoch": 1.7423107255520505, + "grad_norm": 0.48188659695790537, + "learning_rate": 1.2025717598932293e-05, + "loss": 0.3414, + "step": 8838 + }, + { + "epoch": 1.7425078864353312, + "grad_norm": 0.4738832139863155, + "learning_rate": 1.2024199684660792e-05, + "loss": 0.3253, + "step": 8839 + }, + { + "epoch": 1.742705047318612, + "grad_norm": 0.477503317970411, + "learning_rate": 1.202268172175626e-05, + "loss": 0.3344, + "step": 8840 + }, + { + "epoch": 1.7429022082018928, + "grad_norm": 0.5304814544412956, + "learning_rate": 1.2021163710255173e-05, + "loss": 0.334, + "step": 8841 + }, + { + "epoch": 1.7430993690851735, + "grad_norm": 0.508157775015286, + "learning_rate": 1.2019645650193999e-05, + "loss": 0.3512, + "step": 8842 + }, + { + "epoch": 1.7432965299684544, + "grad_norm": 0.5017879226850557, + "learning_rate": 1.2018127541609212e-05, + "loss": 0.333, + "step": 8843 + }, + { + "epoch": 1.743493690851735, + "grad_norm": 0.4867632085277509, + "learning_rate": 1.2016609384537287e-05, + "loss": 0.3366, + "step": 8844 + }, + { + "epoch": 1.7436908517350158, + "grad_norm": 0.5105248897104497, + "learning_rate": 1.2015091179014696e-05, + "loss": 0.3556, + "step": 8845 + }, + { + "epoch": 1.7438880126182965, + "grad_norm": 0.4608424473916468, + "learning_rate": 1.2013572925077919e-05, + "loss": 0.3371, + "step": 8846 + }, + { + "epoch": 1.7440851735015772, + "grad_norm": 0.45038670620879856, + "learning_rate": 1.2012054622763425e-05, + "loss": 0.3052, + "step": 8847 + }, + { + "epoch": 1.7442823343848581, + "grad_norm": 0.5081471358396383, + "learning_rate": 1.2010536272107706e-05, + "loss": 0.3455, + "step": 8848 + }, + { + "epoch": 1.7444794952681388, + "grad_norm": 0.4644321538998626, + "learning_rate": 1.200901787314723e-05, + "loss": 0.3091, + "step": 8849 + }, + { + "epoch": 1.7446766561514195, + "grad_norm": 0.48115698751245345, + "learning_rate": 1.2007499425918483e-05, + "loss": 0.3482, + "step": 8850 + }, + { + "epoch": 1.7448738170347005, + "grad_norm": 0.5228016067341045, + "learning_rate": 1.2005980930457946e-05, + "loss": 0.3275, + "step": 8851 + }, + { + "epoch": 1.745070977917981, + "grad_norm": 0.583985733994483, + "learning_rate": 1.2004462386802098e-05, + "loss": 0.3648, + "step": 8852 + }, + { + "epoch": 1.7452681388012619, + "grad_norm": 0.45939749735947777, + "learning_rate": 1.2002943794987432e-05, + "loss": 0.3389, + "step": 8853 + }, + { + "epoch": 1.7454652996845426, + "grad_norm": 0.4648960544103313, + "learning_rate": 1.2001425155050423e-05, + "loss": 0.3412, + "step": 8854 + }, + { + "epoch": 1.7456624605678233, + "grad_norm": 0.4793461007710896, + "learning_rate": 1.1999906467027568e-05, + "loss": 0.3385, + "step": 8855 + }, + { + "epoch": 1.7458596214511042, + "grad_norm": 0.4611557554227105, + "learning_rate": 1.1998387730955345e-05, + "loss": 0.2932, + "step": 8856 + }, + { + "epoch": 1.7460567823343849, + "grad_norm": 0.4679923259603527, + "learning_rate": 1.1996868946870252e-05, + "loss": 0.3347, + "step": 8857 + }, + { + "epoch": 1.7462539432176656, + "grad_norm": 0.47215734774680346, + "learning_rate": 1.1995350114808772e-05, + "loss": 0.3437, + "step": 8858 + }, + { + "epoch": 1.7464511041009465, + "grad_norm": 0.4648472195023899, + "learning_rate": 1.1993831234807401e-05, + "loss": 0.3623, + "step": 8859 + }, + { + "epoch": 1.746648264984227, + "grad_norm": 0.473482682420203, + "learning_rate": 1.1992312306902625e-05, + "loss": 0.3372, + "step": 8860 + }, + { + "epoch": 1.746845425867508, + "grad_norm": 0.46094418222077305, + "learning_rate": 1.1990793331130944e-05, + "loss": 0.3221, + "step": 8861 + }, + { + "epoch": 1.7470425867507886, + "grad_norm": 0.5016161341201105, + "learning_rate": 1.1989274307528848e-05, + "loss": 0.3539, + "step": 8862 + }, + { + "epoch": 1.7472397476340693, + "grad_norm": 0.49099642673007926, + "learning_rate": 1.1987755236132839e-05, + "loss": 0.3462, + "step": 8863 + }, + { + "epoch": 1.7474369085173502, + "grad_norm": 0.48487743130133104, + "learning_rate": 1.1986236116979406e-05, + "loss": 0.3585, + "step": 8864 + }, + { + "epoch": 1.747634069400631, + "grad_norm": 0.46908185404595404, + "learning_rate": 1.1984716950105054e-05, + "loss": 0.3306, + "step": 8865 + }, + { + "epoch": 1.7478312302839116, + "grad_norm": 0.4832688212778634, + "learning_rate": 1.1983197735546275e-05, + "loss": 0.3631, + "step": 8866 + }, + { + "epoch": 1.7480283911671926, + "grad_norm": 0.4602457165492578, + "learning_rate": 1.1981678473339576e-05, + "loss": 0.3071, + "step": 8867 + }, + { + "epoch": 1.748225552050473, + "grad_norm": 0.43528249927314083, + "learning_rate": 1.1980159163521454e-05, + "loss": 0.3135, + "step": 8868 + }, + { + "epoch": 1.748422712933754, + "grad_norm": 0.4695790802517424, + "learning_rate": 1.1978639806128416e-05, + "loss": 0.331, + "step": 8869 + }, + { + "epoch": 1.7486198738170347, + "grad_norm": 0.5280131284138565, + "learning_rate": 1.1977120401196963e-05, + "loss": 0.3353, + "step": 8870 + }, + { + "epoch": 1.7488170347003154, + "grad_norm": 0.4681473144779894, + "learning_rate": 1.1975600948763597e-05, + "loss": 0.3286, + "step": 8871 + }, + { + "epoch": 1.7490141955835963, + "grad_norm": 0.4604810167753171, + "learning_rate": 1.197408144886483e-05, + "loss": 0.3244, + "step": 8872 + }, + { + "epoch": 1.749211356466877, + "grad_norm": 0.46975125282307023, + "learning_rate": 1.1972561901537164e-05, + "loss": 0.3175, + "step": 8873 + }, + { + "epoch": 1.7494085173501577, + "grad_norm": 0.4932348572031116, + "learning_rate": 1.1971042306817113e-05, + "loss": 0.3538, + "step": 8874 + }, + { + "epoch": 1.7496056782334386, + "grad_norm": 0.46969073205555756, + "learning_rate": 1.196952266474118e-05, + "loss": 0.3453, + "step": 8875 + }, + { + "epoch": 1.749802839116719, + "grad_norm": 0.4808934210094641, + "learning_rate": 1.1968002975345882e-05, + "loss": 0.3415, + "step": 8876 + }, + { + "epoch": 1.749802839116719, + "eval_loss": 0.42366865277290344, + "eval_runtime": 344.3756, + "eval_samples_per_second": 23.608, + "eval_steps_per_second": 1.478, + "step": 8876 + }, + { + "epoch": 1.75, + "grad_norm": 0.48060930083956127, + "learning_rate": 1.1966483238667725e-05, + "loss": 0.3601, + "step": 8877 + }, + { + "epoch": 1.750197160883281, + "grad_norm": 0.4821573663894791, + "learning_rate": 1.1964963454743228e-05, + "loss": 0.3615, + "step": 8878 + }, + { + "epoch": 1.7503943217665614, + "grad_norm": 0.47251836827858207, + "learning_rate": 1.1963443623608897e-05, + "loss": 0.3372, + "step": 8879 + }, + { + "epoch": 1.7505914826498423, + "grad_norm": 0.47137428118391156, + "learning_rate": 1.1961923745301256e-05, + "loss": 0.3314, + "step": 8880 + }, + { + "epoch": 1.750788643533123, + "grad_norm": 0.4957152153915973, + "learning_rate": 1.1960403819856815e-05, + "loss": 0.3537, + "step": 8881 + }, + { + "epoch": 1.7509858044164037, + "grad_norm": 0.4470537419100556, + "learning_rate": 1.1958883847312092e-05, + "loss": 0.3231, + "step": 8882 + }, + { + "epoch": 1.7511829652996846, + "grad_norm": 0.4608113402256386, + "learning_rate": 1.1957363827703612e-05, + "loss": 0.3211, + "step": 8883 + }, + { + "epoch": 1.7513801261829653, + "grad_norm": 0.49969096886092734, + "learning_rate": 1.1955843761067886e-05, + "loss": 0.337, + "step": 8884 + }, + { + "epoch": 1.751577287066246, + "grad_norm": 0.480041688896454, + "learning_rate": 1.1954323647441439e-05, + "loss": 0.3579, + "step": 8885 + }, + { + "epoch": 1.751774447949527, + "grad_norm": 0.45142752291944904, + "learning_rate": 1.1952803486860794e-05, + "loss": 0.314, + "step": 8886 + }, + { + "epoch": 1.7519716088328074, + "grad_norm": 0.48553207420875816, + "learning_rate": 1.1951283279362471e-05, + "loss": 0.3485, + "step": 8887 + }, + { + "epoch": 1.7521687697160884, + "grad_norm": 0.4724742952583676, + "learning_rate": 1.1949763024982997e-05, + "loss": 0.3496, + "step": 8888 + }, + { + "epoch": 1.752365930599369, + "grad_norm": 0.4854477143262649, + "learning_rate": 1.1948242723758896e-05, + "loss": 0.3421, + "step": 8889 + }, + { + "epoch": 1.7525630914826498, + "grad_norm": 0.46735796360977727, + "learning_rate": 1.1946722375726694e-05, + "loss": 0.3375, + "step": 8890 + }, + { + "epoch": 1.7527602523659307, + "grad_norm": 0.4637010683614967, + "learning_rate": 1.194520198092292e-05, + "loss": 0.3038, + "step": 8891 + }, + { + "epoch": 1.7529574132492114, + "grad_norm": 0.4793650345380348, + "learning_rate": 1.1943681539384103e-05, + "loss": 0.3252, + "step": 8892 + }, + { + "epoch": 1.753154574132492, + "grad_norm": 0.4636919118333324, + "learning_rate": 1.194216105114677e-05, + "loss": 0.3256, + "step": 8893 + }, + { + "epoch": 1.753351735015773, + "grad_norm": 0.47895800206523675, + "learning_rate": 1.194064051624745e-05, + "loss": 0.325, + "step": 8894 + }, + { + "epoch": 1.7535488958990535, + "grad_norm": 0.46796297495549594, + "learning_rate": 1.1939119934722685e-05, + "loss": 0.322, + "step": 8895 + }, + { + "epoch": 1.7537460567823344, + "grad_norm": 0.46995032074065873, + "learning_rate": 1.1937599306609e-05, + "loss": 0.35, + "step": 8896 + }, + { + "epoch": 1.7539432176656151, + "grad_norm": 55.908718330694654, + "learning_rate": 1.193607863194293e-05, + "loss": 0.5092, + "step": 8897 + }, + { + "epoch": 1.7541403785488958, + "grad_norm": 0.5121025623411513, + "learning_rate": 1.1934557910761013e-05, + "loss": 0.3565, + "step": 8898 + }, + { + "epoch": 1.7543375394321767, + "grad_norm": 0.45082554071248027, + "learning_rate": 1.1933037143099786e-05, + "loss": 0.3044, + "step": 8899 + }, + { + "epoch": 1.7545347003154574, + "grad_norm": 0.47611001136680525, + "learning_rate": 1.1931516328995782e-05, + "loss": 0.3135, + "step": 8900 + }, + { + "epoch": 1.7547318611987381, + "grad_norm": 0.4768767652243685, + "learning_rate": 1.1929995468485545e-05, + "loss": 0.3442, + "step": 8901 + }, + { + "epoch": 1.754929022082019, + "grad_norm": 0.49669976262352766, + "learning_rate": 1.1928474561605612e-05, + "loss": 0.3118, + "step": 8902 + }, + { + "epoch": 1.7551261829652995, + "grad_norm": 0.4900205596977599, + "learning_rate": 1.1926953608392522e-05, + "loss": 0.3165, + "step": 8903 + }, + { + "epoch": 1.7553233438485805, + "grad_norm": 0.48731950619097536, + "learning_rate": 1.1925432608882826e-05, + "loss": 0.3524, + "step": 8904 + }, + { + "epoch": 1.7555205047318612, + "grad_norm": 0.5192299995573778, + "learning_rate": 1.1923911563113053e-05, + "loss": 0.3761, + "step": 8905 + }, + { + "epoch": 1.7557176656151419, + "grad_norm": 0.4698410536685136, + "learning_rate": 1.1922390471119763e-05, + "loss": 0.3271, + "step": 8906 + }, + { + "epoch": 1.7559148264984228, + "grad_norm": 0.4891636860035787, + "learning_rate": 1.1920869332939488e-05, + "loss": 0.3466, + "step": 8907 + }, + { + "epoch": 1.7561119873817035, + "grad_norm": 0.49905335282315777, + "learning_rate": 1.1919348148608782e-05, + "loss": 0.3497, + "step": 8908 + }, + { + "epoch": 1.7563091482649842, + "grad_norm": 0.5085891428975882, + "learning_rate": 1.1917826918164193e-05, + "loss": 0.3384, + "step": 8909 + }, + { + "epoch": 1.756506309148265, + "grad_norm": 0.49833646762619443, + "learning_rate": 1.1916305641642265e-05, + "loss": 0.3471, + "step": 8910 + }, + { + "epoch": 1.7567034700315456, + "grad_norm": 0.49886395851113163, + "learning_rate": 1.1914784319079554e-05, + "loss": 0.3834, + "step": 8911 + }, + { + "epoch": 1.7569006309148265, + "grad_norm": 0.5085736701155745, + "learning_rate": 1.1913262950512605e-05, + "loss": 0.348, + "step": 8912 + }, + { + "epoch": 1.7570977917981072, + "grad_norm": 0.5032507524362276, + "learning_rate": 1.1911741535977972e-05, + "loss": 0.3574, + "step": 8913 + }, + { + "epoch": 1.757294952681388, + "grad_norm": 0.48298477571501336, + "learning_rate": 1.1910220075512213e-05, + "loss": 0.318, + "step": 8914 + }, + { + "epoch": 1.7574921135646688, + "grad_norm": 0.48667269740164176, + "learning_rate": 1.1908698569151877e-05, + "loss": 0.3329, + "step": 8915 + }, + { + "epoch": 1.7576892744479495, + "grad_norm": 0.47116779180865714, + "learning_rate": 1.190717701693352e-05, + "loss": 0.3365, + "step": 8916 + }, + { + "epoch": 1.7578864353312302, + "grad_norm": 0.4997514621039943, + "learning_rate": 1.19056554188937e-05, + "loss": 0.3235, + "step": 8917 + }, + { + "epoch": 1.7580835962145112, + "grad_norm": 0.48309653739189073, + "learning_rate": 1.1904133775068974e-05, + "loss": 0.3161, + "step": 8918 + }, + { + "epoch": 1.7582807570977916, + "grad_norm": 0.5210440531213502, + "learning_rate": 1.1902612085495902e-05, + "loss": 0.3631, + "step": 8919 + }, + { + "epoch": 1.7584779179810726, + "grad_norm": 0.4921227572826234, + "learning_rate": 1.1901090350211037e-05, + "loss": 0.3294, + "step": 8920 + }, + { + "epoch": 1.7586750788643533, + "grad_norm": 0.49727990636910413, + "learning_rate": 1.1899568569250951e-05, + "loss": 0.3416, + "step": 8921 + }, + { + "epoch": 1.758872239747634, + "grad_norm": 0.5053613903983315, + "learning_rate": 1.1898046742652196e-05, + "loss": 0.3138, + "step": 8922 + }, + { + "epoch": 1.7590694006309149, + "grad_norm": 0.45734876191997076, + "learning_rate": 1.1896524870451344e-05, + "loss": 0.3403, + "step": 8923 + }, + { + "epoch": 1.7592665615141956, + "grad_norm": 0.43524997755300077, + "learning_rate": 1.1895002952684952e-05, + "loss": 0.3275, + "step": 8924 + }, + { + "epoch": 1.7594637223974763, + "grad_norm": 0.46379750213806303, + "learning_rate": 1.189348098938959e-05, + "loss": 0.3241, + "step": 8925 + }, + { + "epoch": 1.7596608832807572, + "grad_norm": 0.5193732674368089, + "learning_rate": 1.1891958980601819e-05, + "loss": 0.3229, + "step": 8926 + }, + { + "epoch": 1.759858044164038, + "grad_norm": 0.5374560365743167, + "learning_rate": 1.1890436926358214e-05, + "loss": 0.3549, + "step": 8927 + }, + { + "epoch": 1.7600552050473186, + "grad_norm": 0.5510070896034697, + "learning_rate": 1.1888914826695336e-05, + "loss": 0.3427, + "step": 8928 + }, + { + "epoch": 1.7602523659305995, + "grad_norm": 0.500101497587192, + "learning_rate": 1.1887392681649761e-05, + "loss": 0.3519, + "step": 8929 + }, + { + "epoch": 1.76044952681388, + "grad_norm": 0.5025929287914618, + "learning_rate": 1.1885870491258054e-05, + "loss": 0.3415, + "step": 8930 + }, + { + "epoch": 1.760646687697161, + "grad_norm": 0.4493713242328926, + "learning_rate": 1.1884348255556793e-05, + "loss": 0.327, + "step": 8931 + }, + { + "epoch": 1.7608438485804416, + "grad_norm": 0.49432274508122315, + "learning_rate": 1.1882825974582546e-05, + "loss": 0.3382, + "step": 8932 + }, + { + "epoch": 1.7610410094637223, + "grad_norm": 0.4970567230452097, + "learning_rate": 1.1881303648371889e-05, + "loss": 0.3509, + "step": 8933 + }, + { + "epoch": 1.7612381703470033, + "grad_norm": 0.47318188032190406, + "learning_rate": 1.1879781276961396e-05, + "loss": 0.3271, + "step": 8934 + }, + { + "epoch": 1.761435331230284, + "grad_norm": 0.5242959226678022, + "learning_rate": 1.1878258860387644e-05, + "loss": 0.3749, + "step": 8935 + }, + { + "epoch": 1.7616324921135647, + "grad_norm": 0.46123437786870813, + "learning_rate": 1.1876736398687212e-05, + "loss": 0.3346, + "step": 8936 + }, + { + "epoch": 1.7618296529968456, + "grad_norm": 0.4597024041990773, + "learning_rate": 1.1875213891896676e-05, + "loss": 0.3226, + "step": 8937 + }, + { + "epoch": 1.762026813880126, + "grad_norm": 0.4708652177872067, + "learning_rate": 1.1873691340052615e-05, + "loss": 0.3292, + "step": 8938 + }, + { + "epoch": 1.762223974763407, + "grad_norm": 0.4520141786068151, + "learning_rate": 1.1872168743191613e-05, + "loss": 0.3026, + "step": 8939 + }, + { + "epoch": 1.7624211356466877, + "grad_norm": 0.45200547022438, + "learning_rate": 1.1870646101350247e-05, + "loss": 0.3254, + "step": 8940 + }, + { + "epoch": 1.7626182965299684, + "grad_norm": 0.4656284736142514, + "learning_rate": 1.18691234145651e-05, + "loss": 0.3146, + "step": 8941 + }, + { + "epoch": 1.7628154574132493, + "grad_norm": 0.45349874844695887, + "learning_rate": 1.1867600682872764e-05, + "loss": 0.3145, + "step": 8942 + }, + { + "epoch": 1.76301261829653, + "grad_norm": 0.4977307339476207, + "learning_rate": 1.1866077906309812e-05, + "loss": 0.3329, + "step": 8943 + }, + { + "epoch": 1.7632097791798107, + "grad_norm": 0.47172564633086056, + "learning_rate": 1.1864555084912839e-05, + "loss": 0.32, + "step": 8944 + }, + { + "epoch": 1.7634069400630916, + "grad_norm": 0.6274170785811407, + "learning_rate": 1.1863032218718424e-05, + "loss": 0.3686, + "step": 8945 + }, + { + "epoch": 1.763604100946372, + "grad_norm": 0.4976279806243257, + "learning_rate": 1.1861509307763166e-05, + "loss": 0.3433, + "step": 8946 + }, + { + "epoch": 1.763801261829653, + "grad_norm": 0.5010044169487304, + "learning_rate": 1.1859986352083644e-05, + "loss": 0.3308, + "step": 8947 + }, + { + "epoch": 1.7639984227129337, + "grad_norm": 0.4709732593585705, + "learning_rate": 1.185846335171645e-05, + "loss": 0.3243, + "step": 8948 + }, + { + "epoch": 1.7641955835962144, + "grad_norm": 0.46672840740361604, + "learning_rate": 1.1856940306698182e-05, + "loss": 0.3288, + "step": 8949 + }, + { + "epoch": 1.7643927444794953, + "grad_norm": 0.4876404296213418, + "learning_rate": 1.1855417217065427e-05, + "loss": 0.3454, + "step": 8950 + }, + { + "epoch": 1.764589905362776, + "grad_norm": 0.7344474694941918, + "learning_rate": 1.1853894082854778e-05, + "loss": 0.3343, + "step": 8951 + }, + { + "epoch": 1.7647870662460567, + "grad_norm": 0.5281718261013864, + "learning_rate": 1.185237090410283e-05, + "loss": 0.3559, + "step": 8952 + }, + { + "epoch": 1.7649842271293377, + "grad_norm": 0.45336944968705656, + "learning_rate": 1.1850847680846181e-05, + "loss": 0.3367, + "step": 8953 + }, + { + "epoch": 1.7651813880126181, + "grad_norm": 0.48515961309525846, + "learning_rate": 1.1849324413121424e-05, + "loss": 0.35, + "step": 8954 + }, + { + "epoch": 1.765378548895899, + "grad_norm": 0.4683396505847219, + "learning_rate": 1.184780110096516e-05, + "loss": 0.3291, + "step": 8955 + }, + { + "epoch": 1.7655757097791798, + "grad_norm": 0.47251430009899725, + "learning_rate": 1.1846277744413988e-05, + "loss": 0.3143, + "step": 8956 + }, + { + "epoch": 1.7657728706624605, + "grad_norm": 0.47184209011286815, + "learning_rate": 1.1844754343504503e-05, + "loss": 0.3256, + "step": 8957 + }, + { + "epoch": 1.7659700315457414, + "grad_norm": 0.4783002948832379, + "learning_rate": 1.1843230898273312e-05, + "loss": 0.3487, + "step": 8958 + }, + { + "epoch": 1.766167192429022, + "grad_norm": 0.5175752592893191, + "learning_rate": 1.1841707408757012e-05, + "loss": 0.348, + "step": 8959 + }, + { + "epoch": 1.7663643533123028, + "grad_norm": 0.48774710368554364, + "learning_rate": 1.184018387499221e-05, + "loss": 0.3489, + "step": 8960 + }, + { + "epoch": 1.7665615141955837, + "grad_norm": 0.5091512140833712, + "learning_rate": 1.183866029701551e-05, + "loss": 0.3437, + "step": 8961 + }, + { + "epoch": 1.7667586750788642, + "grad_norm": 0.4579012245288681, + "learning_rate": 1.1837136674863512e-05, + "loss": 0.3116, + "step": 8962 + }, + { + "epoch": 1.7669558359621451, + "grad_norm": 0.4607106881483517, + "learning_rate": 1.1835613008572828e-05, + "loss": 0.3182, + "step": 8963 + }, + { + "epoch": 1.7671529968454258, + "grad_norm": 0.48460789200010285, + "learning_rate": 1.1834089298180062e-05, + "loss": 0.3515, + "step": 8964 + }, + { + "epoch": 1.7673501577287065, + "grad_norm": 0.5066322063807643, + "learning_rate": 1.1832565543721828e-05, + "loss": 0.3611, + "step": 8965 + }, + { + "epoch": 1.7675473186119874, + "grad_norm": 0.4619333716892967, + "learning_rate": 1.1831041745234728e-05, + "loss": 0.3183, + "step": 8966 + }, + { + "epoch": 1.7677444794952681, + "grad_norm": 0.46313697004860765, + "learning_rate": 1.1829517902755375e-05, + "loss": 0.322, + "step": 8967 + }, + { + "epoch": 1.7679416403785488, + "grad_norm": 0.6763284941264115, + "learning_rate": 1.1827994016320381e-05, + "loss": 0.3173, + "step": 8968 + }, + { + "epoch": 1.7681388012618298, + "grad_norm": 0.47210950027001125, + "learning_rate": 1.1826470085966357e-05, + "loss": 0.3312, + "step": 8969 + }, + { + "epoch": 1.7683359621451105, + "grad_norm": 0.47290245061079667, + "learning_rate": 1.1824946111729922e-05, + "loss": 0.3043, + "step": 8970 + }, + { + "epoch": 1.7685331230283912, + "grad_norm": 0.479790151905134, + "learning_rate": 1.1823422093647684e-05, + "loss": 0.3469, + "step": 8971 + }, + { + "epoch": 1.768730283911672, + "grad_norm": 0.4581566972384319, + "learning_rate": 1.1821898031756265e-05, + "loss": 0.33, + "step": 8972 + }, + { + "epoch": 1.7689274447949526, + "grad_norm": 0.4721423210829706, + "learning_rate": 1.1820373926092274e-05, + "loss": 0.3159, + "step": 8973 + }, + { + "epoch": 1.7691246056782335, + "grad_norm": 0.49894798175464766, + "learning_rate": 1.181884977669234e-05, + "loss": 0.3572, + "step": 8974 + }, + { + "epoch": 1.7693217665615142, + "grad_norm": 0.49659191839512695, + "learning_rate": 1.181732558359307e-05, + "loss": 0.334, + "step": 8975 + }, + { + "epoch": 1.7695189274447949, + "grad_norm": 0.5171496834529237, + "learning_rate": 1.181580134683109e-05, + "loss": 0.345, + "step": 8976 + }, + { + "epoch": 1.7697160883280758, + "grad_norm": 0.448236524997749, + "learning_rate": 1.1814277066443023e-05, + "loss": 0.2887, + "step": 8977 + }, + { + "epoch": 1.7699132492113565, + "grad_norm": 5.333849505830885, + "learning_rate": 1.1812752742465488e-05, + "loss": 0.3734, + "step": 8978 + }, + { + "epoch": 1.7701104100946372, + "grad_norm": 0.49076446223605413, + "learning_rate": 1.1811228374935107e-05, + "loss": 0.3396, + "step": 8979 + }, + { + "epoch": 1.7703075709779181, + "grad_norm": 0.5459828067722964, + "learning_rate": 1.1809703963888506e-05, + "loss": 0.3123, + "step": 8980 + }, + { + "epoch": 1.7705047318611986, + "grad_norm": 0.48374652525635975, + "learning_rate": 1.180817950936231e-05, + "loss": 0.3285, + "step": 8981 + }, + { + "epoch": 1.7707018927444795, + "grad_norm": 0.4755660557838733, + "learning_rate": 1.1806655011393144e-05, + "loss": 0.3507, + "step": 8982 + }, + { + "epoch": 1.7708990536277602, + "grad_norm": 0.6006920450403271, + "learning_rate": 1.1805130470017639e-05, + "loss": 0.3379, + "step": 8983 + }, + { + "epoch": 1.771096214511041, + "grad_norm": 0.5009897132212682, + "learning_rate": 1.180360588527242e-05, + "loss": 0.3573, + "step": 8984 + }, + { + "epoch": 1.7712933753943219, + "grad_norm": 0.4620563729527647, + "learning_rate": 1.1802081257194116e-05, + "loss": 0.3262, + "step": 8985 + }, + { + "epoch": 1.7714905362776026, + "grad_norm": 0.4769119308524991, + "learning_rate": 1.180055658581936e-05, + "loss": 0.3184, + "step": 8986 + }, + { + "epoch": 1.7716876971608833, + "grad_norm": 0.4487850697936526, + "learning_rate": 1.179903187118478e-05, + "loss": 0.3179, + "step": 8987 + }, + { + "epoch": 1.7718848580441642, + "grad_norm": 0.4677363529785754, + "learning_rate": 1.179750711332701e-05, + "loss": 0.3009, + "step": 8988 + }, + { + "epoch": 1.7720820189274447, + "grad_norm": 0.48154564362378643, + "learning_rate": 1.179598231228269e-05, + "loss": 0.3517, + "step": 8989 + }, + { + "epoch": 1.7722791798107256, + "grad_norm": 0.4608411338811139, + "learning_rate": 1.1794457468088443e-05, + "loss": 0.3284, + "step": 8990 + }, + { + "epoch": 1.7724763406940063, + "grad_norm": 0.5027074279023396, + "learning_rate": 1.1792932580780913e-05, + "loss": 0.3442, + "step": 8991 + }, + { + "epoch": 1.772673501577287, + "grad_norm": 0.4399957421863353, + "learning_rate": 1.1791407650396731e-05, + "loss": 0.3108, + "step": 8992 + }, + { + "epoch": 1.772870662460568, + "grad_norm": 0.45380590331375986, + "learning_rate": 1.1789882676972541e-05, + "loss": 0.2994, + "step": 8993 + }, + { + "epoch": 1.7730678233438486, + "grad_norm": 0.5204613664594859, + "learning_rate": 1.1788357660544976e-05, + "loss": 0.3543, + "step": 8994 + }, + { + "epoch": 1.7732649842271293, + "grad_norm": 0.48732752300245125, + "learning_rate": 1.1786832601150677e-05, + "loss": 0.3421, + "step": 8995 + }, + { + "epoch": 1.7734621451104102, + "grad_norm": 0.473467187951441, + "learning_rate": 1.1785307498826288e-05, + "loss": 0.3338, + "step": 8996 + }, + { + "epoch": 1.7736593059936907, + "grad_norm": 0.48137614570470716, + "learning_rate": 1.1783782353608449e-05, + "loss": 0.3533, + "step": 8997 + }, + { + "epoch": 1.7738564668769716, + "grad_norm": 0.4665674415229966, + "learning_rate": 1.1782257165533802e-05, + "loss": 0.3309, + "step": 8998 + }, + { + "epoch": 1.7740536277602523, + "grad_norm": 0.47639470639484743, + "learning_rate": 1.1780731934638992e-05, + "loss": 0.3181, + "step": 8999 + }, + { + "epoch": 1.774250788643533, + "grad_norm": 0.4888316448783417, + "learning_rate": 1.177920666096066e-05, + "loss": 0.3469, + "step": 9000 + }, + { + "epoch": 1.774447949526814, + "grad_norm": 0.488778406351827, + "learning_rate": 1.177768134453546e-05, + "loss": 0.3744, + "step": 9001 + }, + { + "epoch": 1.7746451104100947, + "grad_norm": 0.44883233582815296, + "learning_rate": 1.177615598540003e-05, + "loss": 0.3301, + "step": 9002 + }, + { + "epoch": 1.7748422712933754, + "grad_norm": 0.4506843011240513, + "learning_rate": 1.1774630583591024e-05, + "loss": 0.3038, + "step": 9003 + }, + { + "epoch": 1.7750394321766563, + "grad_norm": 0.49508062610720943, + "learning_rate": 1.1773105139145088e-05, + "loss": 0.3583, + "step": 9004 + }, + { + "epoch": 1.7752365930599368, + "grad_norm": 0.5164716499986945, + "learning_rate": 1.1771579652098874e-05, + "loss": 0.382, + "step": 9005 + }, + { + "epoch": 1.7754337539432177, + "grad_norm": 0.4882485785032844, + "learning_rate": 1.1770054122489031e-05, + "loss": 0.337, + "step": 9006 + }, + { + "epoch": 1.7756309148264984, + "grad_norm": 0.4663980919945463, + "learning_rate": 1.1768528550352216e-05, + "loss": 0.3192, + "step": 9007 + }, + { + "epoch": 1.775828075709779, + "grad_norm": 0.43682922880005404, + "learning_rate": 1.1767002935725076e-05, + "loss": 0.3017, + "step": 9008 + }, + { + "epoch": 1.77602523659306, + "grad_norm": 0.4867060152007287, + "learning_rate": 1.1765477278644264e-05, + "loss": 0.3299, + "step": 9009 + }, + { + "epoch": 1.7762223974763407, + "grad_norm": 0.5124563775251622, + "learning_rate": 1.1763951579146444e-05, + "loss": 0.3266, + "step": 9010 + }, + { + "epoch": 1.7764195583596214, + "grad_norm": 0.47078691041664955, + "learning_rate": 1.1762425837268263e-05, + "loss": 0.3356, + "step": 9011 + }, + { + "epoch": 1.7766167192429023, + "grad_norm": 0.4916978260239702, + "learning_rate": 1.1760900053046386e-05, + "loss": 0.3288, + "step": 9012 + }, + { + "epoch": 1.776813880126183, + "grad_norm": 0.4956955563685568, + "learning_rate": 1.1759374226517464e-05, + "loss": 0.3384, + "step": 9013 + }, + { + "epoch": 1.7770110410094637, + "grad_norm": 0.4728851012371811, + "learning_rate": 1.1757848357718162e-05, + "loss": 0.3307, + "step": 9014 + }, + { + "epoch": 1.7772082018927446, + "grad_norm": 0.503512410418657, + "learning_rate": 1.1756322446685134e-05, + "loss": 0.3366, + "step": 9015 + }, + { + "epoch": 1.7774053627760251, + "grad_norm": 0.5196421133302855, + "learning_rate": 1.1754796493455048e-05, + "loss": 0.3633, + "step": 9016 + }, + { + "epoch": 1.777602523659306, + "grad_norm": 0.49369062439862366, + "learning_rate": 1.1753270498064561e-05, + "loss": 0.3453, + "step": 9017 + }, + { + "epoch": 1.7777996845425867, + "grad_norm": 0.4653735508627901, + "learning_rate": 1.1751744460550338e-05, + "loss": 0.3296, + "step": 9018 + }, + { + "epoch": 1.7779968454258674, + "grad_norm": 0.5068627718887688, + "learning_rate": 1.1750218380949047e-05, + "loss": 0.3514, + "step": 9019 + }, + { + "epoch": 1.7781940063091484, + "grad_norm": 0.47122281155877815, + "learning_rate": 1.1748692259297347e-05, + "loss": 0.3461, + "step": 9020 + }, + { + "epoch": 1.778391167192429, + "grad_norm": 0.4566301776497934, + "learning_rate": 1.174716609563191e-05, + "loss": 0.3396, + "step": 9021 + }, + { + "epoch": 1.7785883280757098, + "grad_norm": 0.4861860829634978, + "learning_rate": 1.1745639889989398e-05, + "loss": 0.3213, + "step": 9022 + }, + { + "epoch": 1.7787854889589907, + "grad_norm": 0.4783918537273916, + "learning_rate": 1.1744113642406483e-05, + "loss": 0.3139, + "step": 9023 + }, + { + "epoch": 1.7789826498422712, + "grad_norm": 0.4813446133963429, + "learning_rate": 1.1742587352919833e-05, + "loss": 0.3439, + "step": 9024 + }, + { + "epoch": 1.779179810725552, + "grad_norm": 0.5247316124213983, + "learning_rate": 1.1741061021566118e-05, + "loss": 0.3264, + "step": 9025 + }, + { + "epoch": 1.7793769716088328, + "grad_norm": 0.4666782968847508, + "learning_rate": 1.173953464838201e-05, + "loss": 0.3286, + "step": 9026 + }, + { + "epoch": 1.7795741324921135, + "grad_norm": 0.4530221240481721, + "learning_rate": 1.1738008233404181e-05, + "loss": 0.3428, + "step": 9027 + }, + { + "epoch": 1.7797712933753944, + "grad_norm": 0.4647953682965125, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.3247, + "step": 9028 + }, + { + "epoch": 1.7799684542586751, + "grad_norm": 0.49727784750492343, + "learning_rate": 1.1734955278214057e-05, + "loss": 0.3619, + "step": 9029 + }, + { + "epoch": 1.7801656151419558, + "grad_norm": 0.49005891060030987, + "learning_rate": 1.1733428738075108e-05, + "loss": 0.3562, + "step": 9030 + }, + { + "epoch": 1.7803627760252367, + "grad_norm": 0.734206410674185, + "learning_rate": 1.1731902156289142e-05, + "loss": 0.3285, + "step": 9031 + }, + { + "epoch": 1.7805599369085172, + "grad_norm": 0.4681761876719222, + "learning_rate": 1.173037553289283e-05, + "loss": 0.3409, + "step": 9032 + }, + { + "epoch": 1.7807570977917981, + "grad_norm": 0.46203491841893324, + "learning_rate": 1.1728848867922853e-05, + "loss": 0.3186, + "step": 9033 + }, + { + "epoch": 1.7809542586750788, + "grad_norm": 0.4835537269864709, + "learning_rate": 1.1727322161415888e-05, + "loss": 0.3312, + "step": 9034 + }, + { + "epoch": 1.7811514195583595, + "grad_norm": 0.44861177894612153, + "learning_rate": 1.1725795413408618e-05, + "loss": 0.3016, + "step": 9035 + }, + { + "epoch": 1.7813485804416405, + "grad_norm": 0.4566247359785835, + "learning_rate": 1.1724268623937725e-05, + "loss": 0.3094, + "step": 9036 + }, + { + "epoch": 1.7815457413249212, + "grad_norm": 0.5013017232922788, + "learning_rate": 1.1722741793039885e-05, + "loss": 0.3556, + "step": 9037 + }, + { + "epoch": 1.7817429022082019, + "grad_norm": 0.4828140840198569, + "learning_rate": 1.172121492075179e-05, + "loss": 0.3398, + "step": 9038 + }, + { + "epoch": 1.7819400630914828, + "grad_norm": 0.4836084111183976, + "learning_rate": 1.171968800711012e-05, + "loss": 0.3516, + "step": 9039 + }, + { + "epoch": 1.7821372239747633, + "grad_norm": 0.45421673308016836, + "learning_rate": 1.1718161052151562e-05, + "loss": 0.2943, + "step": 9040 + }, + { + "epoch": 1.7823343848580442, + "grad_norm": 0.46771437738564686, + "learning_rate": 1.1716634055912796e-05, + "loss": 0.358, + "step": 9041 + }, + { + "epoch": 1.7825315457413249, + "grad_norm": 0.47666003792216416, + "learning_rate": 1.1715107018430522e-05, + "loss": 0.3418, + "step": 9042 + }, + { + "epoch": 1.7827287066246056, + "grad_norm": 0.4840693365790069, + "learning_rate": 1.1713579939741415e-05, + "loss": 0.3198, + "step": 9043 + }, + { + "epoch": 1.7829258675078865, + "grad_norm": 0.46910784593485333, + "learning_rate": 1.1712052819882171e-05, + "loss": 0.3262, + "step": 9044 + }, + { + "epoch": 1.7831230283911672, + "grad_norm": 0.4896822327451005, + "learning_rate": 1.171052565888948e-05, + "loss": 0.3433, + "step": 9045 + }, + { + "epoch": 1.783320189274448, + "grad_norm": 0.476154410913374, + "learning_rate": 1.1708998456800034e-05, + "loss": 0.3148, + "step": 9046 + }, + { + "epoch": 1.7835173501577288, + "grad_norm": 0.5217716610338343, + "learning_rate": 1.170747121365052e-05, + "loss": 0.3312, + "step": 9047 + }, + { + "epoch": 1.7837145110410093, + "grad_norm": 0.49551436556952305, + "learning_rate": 1.1705943929477639e-05, + "loss": 0.348, + "step": 9048 + }, + { + "epoch": 1.7839116719242902, + "grad_norm": 0.4920600225347391, + "learning_rate": 1.170441660431808e-05, + "loss": 0.349, + "step": 9049 + }, + { + "epoch": 1.784108832807571, + "grad_norm": 0.4378125833985353, + "learning_rate": 1.1702889238208539e-05, + "loss": 0.312, + "step": 9050 + }, + { + "epoch": 1.7843059936908516, + "grad_norm": 0.43674490442544767, + "learning_rate": 1.1701361831185714e-05, + "loss": 0.2936, + "step": 9051 + }, + { + "epoch": 1.7845031545741326, + "grad_norm": 0.4842053898639523, + "learning_rate": 1.1699834383286299e-05, + "loss": 0.3474, + "step": 9052 + }, + { + "epoch": 1.7847003154574133, + "grad_norm": 0.4529779374085598, + "learning_rate": 1.1698306894546995e-05, + "loss": 0.3128, + "step": 9053 + }, + { + "epoch": 1.784897476340694, + "grad_norm": 0.4801304915326945, + "learning_rate": 1.16967793650045e-05, + "loss": 0.3324, + "step": 9054 + }, + { + "epoch": 1.7850946372239749, + "grad_norm": 0.5173598334277061, + "learning_rate": 1.1695251794695514e-05, + "loss": 0.3415, + "step": 9055 + }, + { + "epoch": 1.7852917981072554, + "grad_norm": 0.4654828527782663, + "learning_rate": 1.169372418365674e-05, + "loss": 0.3172, + "step": 9056 + }, + { + "epoch": 1.7854889589905363, + "grad_norm": 0.4563252768971276, + "learning_rate": 1.1692196531924877e-05, + "loss": 0.3165, + "step": 9057 + }, + { + "epoch": 1.785686119873817, + "grad_norm": 0.48839701961063525, + "learning_rate": 1.169066883953663e-05, + "loss": 0.3321, + "step": 9058 + }, + { + "epoch": 1.7858832807570977, + "grad_norm": 0.4668548629020815, + "learning_rate": 1.1689141106528703e-05, + "loss": 0.3229, + "step": 9059 + }, + { + "epoch": 1.7860804416403786, + "grad_norm": 0.48668760035869535, + "learning_rate": 1.16876133329378e-05, + "loss": 0.3411, + "step": 9060 + }, + { + "epoch": 1.7862776025236593, + "grad_norm": 0.4648244886523526, + "learning_rate": 1.168608551880063e-05, + "loss": 0.337, + "step": 9061 + }, + { + "epoch": 1.78647476340694, + "grad_norm": 0.48604760080798454, + "learning_rate": 1.1684557664153893e-05, + "loss": 0.3144, + "step": 9062 + }, + { + "epoch": 1.786671924290221, + "grad_norm": 0.4783043678898303, + "learning_rate": 1.1683029769034304e-05, + "loss": 0.3225, + "step": 9063 + }, + { + "epoch": 1.7868690851735016, + "grad_norm": 0.47691055878103017, + "learning_rate": 1.168150183347857e-05, + "loss": 0.3317, + "step": 9064 + }, + { + "epoch": 1.7870662460567823, + "grad_norm": 0.4868775598571958, + "learning_rate": 1.16799738575234e-05, + "loss": 0.3326, + "step": 9065 + }, + { + "epoch": 1.7872634069400632, + "grad_norm": 0.45774853271321925, + "learning_rate": 1.1678445841205506e-05, + "loss": 0.3183, + "step": 9066 + }, + { + "epoch": 1.7874605678233437, + "grad_norm": 0.9497850183764717, + "learning_rate": 1.1676917784561599e-05, + "loss": 0.3714, + "step": 9067 + }, + { + "epoch": 1.7876577287066246, + "grad_norm": 0.4768647293354841, + "learning_rate": 1.1675389687628389e-05, + "loss": 0.3462, + "step": 9068 + }, + { + "epoch": 1.7878548895899053, + "grad_norm": 0.4783944603727662, + "learning_rate": 1.1673861550442596e-05, + "loss": 0.3372, + "step": 9069 + }, + { + "epoch": 1.788052050473186, + "grad_norm": 0.46436887776659636, + "learning_rate": 1.167233337304093e-05, + "loss": 0.3318, + "step": 9070 + }, + { + "epoch": 1.788249211356467, + "grad_norm": 4.345180913181195, + "learning_rate": 1.1670805155460108e-05, + "loss": 0.3453, + "step": 9071 + }, + { + "epoch": 1.7884463722397477, + "grad_norm": 0.5159025017330271, + "learning_rate": 1.1669276897736847e-05, + "loss": 0.3449, + "step": 9072 + }, + { + "epoch": 1.7886435331230284, + "grad_norm": 0.5595335747614689, + "learning_rate": 1.1667748599907864e-05, + "loss": 0.3788, + "step": 9073 + }, + { + "epoch": 1.7888406940063093, + "grad_norm": 0.5325186373611229, + "learning_rate": 1.1666220262009877e-05, + "loss": 0.353, + "step": 9074 + }, + { + "epoch": 1.7890378548895898, + "grad_norm": 1.2209348375435845, + "learning_rate": 1.1664691884079606e-05, + "loss": 0.3454, + "step": 9075 + }, + { + "epoch": 1.7892350157728707, + "grad_norm": 0.4529621721978549, + "learning_rate": 1.1663163466153775e-05, + "loss": 0.3219, + "step": 9076 + }, + { + "epoch": 1.7894321766561514, + "grad_norm": 0.5786960304033676, + "learning_rate": 1.16616350082691e-05, + "loss": 0.3458, + "step": 9077 + }, + { + "epoch": 1.789629337539432, + "grad_norm": 0.46513659246814454, + "learning_rate": 1.1660106510462305e-05, + "loss": 0.3205, + "step": 9078 + }, + { + "epoch": 1.789826498422713, + "grad_norm": 0.49129392716967757, + "learning_rate": 1.1658577972770115e-05, + "loss": 0.3643, + "step": 9079 + }, + { + "epoch": 1.7900236593059937, + "grad_norm": 0.4946634044459266, + "learning_rate": 1.1657049395229255e-05, + "loss": 0.3585, + "step": 9080 + }, + { + "epoch": 1.7902208201892744, + "grad_norm": 0.4625534608044079, + "learning_rate": 1.1655520777876446e-05, + "loss": 0.3295, + "step": 9081 + }, + { + "epoch": 1.7904179810725553, + "grad_norm": 0.4770599185465994, + "learning_rate": 1.1653992120748421e-05, + "loss": 0.3232, + "step": 9082 + }, + { + "epoch": 1.7906151419558358, + "grad_norm": 0.5732472260639709, + "learning_rate": 1.1652463423881898e-05, + "loss": 0.3939, + "step": 9083 + }, + { + "epoch": 1.7908123028391167, + "grad_norm": 0.5032610006719888, + "learning_rate": 1.1650934687313615e-05, + "loss": 0.3256, + "step": 9084 + }, + { + "epoch": 1.7910094637223974, + "grad_norm": 0.5315497537438166, + "learning_rate": 1.1649405911080298e-05, + "loss": 0.3533, + "step": 9085 + }, + { + "epoch": 1.7912066246056781, + "grad_norm": 0.47972476193122204, + "learning_rate": 1.1647877095218671e-05, + "loss": 0.3087, + "step": 9086 + }, + { + "epoch": 1.791403785488959, + "grad_norm": 0.49904975210777663, + "learning_rate": 1.1646348239765475e-05, + "loss": 0.333, + "step": 9087 + }, + { + "epoch": 1.7916009463722398, + "grad_norm": 0.4982533128788867, + "learning_rate": 1.164481934475743e-05, + "loss": 0.3569, + "step": 9088 + }, + { + "epoch": 1.7917981072555205, + "grad_norm": 0.9122269960563105, + "learning_rate": 1.1643290410231282e-05, + "loss": 0.3395, + "step": 9089 + }, + { + "epoch": 1.7919952681388014, + "grad_norm": 0.4557932362062137, + "learning_rate": 1.1641761436223753e-05, + "loss": 0.3024, + "step": 9090 + }, + { + "epoch": 1.7921924290220819, + "grad_norm": 0.4941263997632442, + "learning_rate": 1.1640232422771586e-05, + "loss": 0.3476, + "step": 9091 + }, + { + "epoch": 1.7923895899053628, + "grad_norm": 0.5155585824775918, + "learning_rate": 1.1638703369911517e-05, + "loss": 0.3596, + "step": 9092 + }, + { + "epoch": 1.7925867507886435, + "grad_norm": 1.4023286561635016, + "learning_rate": 1.1637174277680277e-05, + "loss": 0.3335, + "step": 9093 + }, + { + "epoch": 1.7927839116719242, + "grad_norm": 0.5065005447760449, + "learning_rate": 1.1635645146114607e-05, + "loss": 0.355, + "step": 9094 + }, + { + "epoch": 1.7929810725552051, + "grad_norm": 0.526370677958319, + "learning_rate": 1.1634115975251245e-05, + "loss": 0.3056, + "step": 9095 + }, + { + "epoch": 1.7931782334384858, + "grad_norm": 0.486266087331147, + "learning_rate": 1.1632586765126929e-05, + "loss": 0.3269, + "step": 9096 + }, + { + "epoch": 1.7933753943217665, + "grad_norm": 0.4580937992770503, + "learning_rate": 1.1631057515778403e-05, + "loss": 0.3137, + "step": 9097 + }, + { + "epoch": 1.7935725552050474, + "grad_norm": 0.5054061865722272, + "learning_rate": 1.1629528227242408e-05, + "loss": 0.3398, + "step": 9098 + }, + { + "epoch": 1.793769716088328, + "grad_norm": 0.47624328346608846, + "learning_rate": 1.1627998899555684e-05, + "loss": 0.3217, + "step": 9099 + }, + { + "epoch": 1.7939668769716088, + "grad_norm": 0.47501826137837944, + "learning_rate": 1.1626469532754975e-05, + "loss": 0.3436, + "step": 9100 + }, + { + "epoch": 1.7941640378548895, + "grad_norm": 0.5083570086033906, + "learning_rate": 1.1624940126877027e-05, + "loss": 0.3342, + "step": 9101 + }, + { + "epoch": 1.7943611987381702, + "grad_norm": 0.43834857149607065, + "learning_rate": 1.1623410681958583e-05, + "loss": 0.3176, + "step": 9102 + }, + { + "epoch": 1.7945583596214512, + "grad_norm": 0.46797550187905224, + "learning_rate": 1.1621881198036389e-05, + "loss": 0.3296, + "step": 9103 + }, + { + "epoch": 1.7947555205047319, + "grad_norm": 0.747473707199481, + "learning_rate": 1.1620351675147195e-05, + "loss": 0.336, + "step": 9104 + }, + { + "epoch": 1.7949526813880126, + "grad_norm": 0.4864213421889568, + "learning_rate": 1.1618822113327743e-05, + "loss": 0.346, + "step": 9105 + }, + { + "epoch": 1.7951498422712935, + "grad_norm": 0.48393670272857564, + "learning_rate": 1.1617292512614793e-05, + "loss": 0.3323, + "step": 9106 + }, + { + "epoch": 1.7953470031545742, + "grad_norm": 0.42975326675170367, + "learning_rate": 1.161576287304508e-05, + "loss": 0.2663, + "step": 9107 + }, + { + "epoch": 1.7955441640378549, + "grad_norm": 0.4861731628110563, + "learning_rate": 1.1614233194655371e-05, + "loss": 0.3257, + "step": 9108 + }, + { + "epoch": 1.7957413249211358, + "grad_norm": 0.8570529099252474, + "learning_rate": 1.1612703477482403e-05, + "loss": 0.32, + "step": 9109 + }, + { + "epoch": 1.7959384858044163, + "grad_norm": 0.4868050444217125, + "learning_rate": 1.161117372156294e-05, + "loss": 0.3324, + "step": 9110 + }, + { + "epoch": 1.7961356466876972, + "grad_norm": 0.5166058354822001, + "learning_rate": 1.1609643926933727e-05, + "loss": 0.3296, + "step": 9111 + }, + { + "epoch": 1.796332807570978, + "grad_norm": 0.454694260080299, + "learning_rate": 1.1608114093631523e-05, + "loss": 0.3057, + "step": 9112 + }, + { + "epoch": 1.7965299684542586, + "grad_norm": 0.4873463120948357, + "learning_rate": 1.1606584221693084e-05, + "loss": 0.3327, + "step": 9113 + }, + { + "epoch": 1.7967271293375395, + "grad_norm": 0.46258370737085464, + "learning_rate": 1.1605054311155165e-05, + "loss": 0.2937, + "step": 9114 + }, + { + "epoch": 1.7969242902208202, + "grad_norm": 0.5089951413840461, + "learning_rate": 1.1603524362054525e-05, + "loss": 0.347, + "step": 9115 + }, + { + "epoch": 1.797121451104101, + "grad_norm": 0.49112572937967497, + "learning_rate": 1.1601994374427921e-05, + "loss": 0.3328, + "step": 9116 + }, + { + "epoch": 1.7973186119873819, + "grad_norm": 0.4743944704648095, + "learning_rate": 1.160046434831211e-05, + "loss": 0.316, + "step": 9117 + }, + { + "epoch": 1.7975157728706623, + "grad_norm": 0.5012809900872015, + "learning_rate": 1.1598934283743855e-05, + "loss": 0.3618, + "step": 9118 + }, + { + "epoch": 1.7977129337539433, + "grad_norm": 0.4632607405847242, + "learning_rate": 1.1597404180759917e-05, + "loss": 0.3282, + "step": 9119 + }, + { + "epoch": 1.797910094637224, + "grad_norm": 0.4893372010154669, + "learning_rate": 1.1595874039397055e-05, + "loss": 0.3256, + "step": 9120 + }, + { + "epoch": 1.7981072555205047, + "grad_norm": 0.49627890794355706, + "learning_rate": 1.1594343859692037e-05, + "loss": 0.3487, + "step": 9121 + }, + { + "epoch": 1.7983044164037856, + "grad_norm": 0.49100214178136625, + "learning_rate": 1.1592813641681621e-05, + "loss": 0.3388, + "step": 9122 + }, + { + "epoch": 1.7985015772870663, + "grad_norm": 0.4542961165749031, + "learning_rate": 1.1591283385402577e-05, + "loss": 0.3299, + "step": 9123 + }, + { + "epoch": 1.798698738170347, + "grad_norm": 0.4801501150330402, + "learning_rate": 1.1589753090891667e-05, + "loss": 0.3323, + "step": 9124 + }, + { + "epoch": 1.798895899053628, + "grad_norm": 0.4851771812965912, + "learning_rate": 1.158822275818566e-05, + "loss": 0.3357, + "step": 9125 + }, + { + "epoch": 1.7990930599369084, + "grad_norm": 0.45182412694780133, + "learning_rate": 1.158669238732132e-05, + "loss": 0.305, + "step": 9126 + }, + { + "epoch": 1.7992902208201893, + "grad_norm": 0.47897297293682256, + "learning_rate": 1.158516197833542e-05, + "loss": 0.3231, + "step": 9127 + }, + { + "epoch": 1.79948738170347, + "grad_norm": 0.43389581021319834, + "learning_rate": 1.1583631531264723e-05, + "loss": 0.3173, + "step": 9128 + }, + { + "epoch": 1.7996845425867507, + "grad_norm": 0.47731478351982176, + "learning_rate": 1.1582101046146008e-05, + "loss": 0.3394, + "step": 9129 + }, + { + "epoch": 1.7998817034700316, + "grad_norm": 0.435886477568427, + "learning_rate": 1.1580570523016036e-05, + "loss": 0.2997, + "step": 9130 + }, + { + "epoch": 1.8000788643533123, + "grad_norm": 0.5238566166040188, + "learning_rate": 1.1579039961911591e-05, + "loss": 0.3432, + "step": 9131 + }, + { + "epoch": 1.800276025236593, + "grad_norm": 0.5379605041705956, + "learning_rate": 1.1577509362869433e-05, + "loss": 0.357, + "step": 9132 + }, + { + "epoch": 1.800473186119874, + "grad_norm": 0.449491662574799, + "learning_rate": 1.1575978725926347e-05, + "loss": 0.3106, + "step": 9133 + }, + { + "epoch": 1.8006703470031544, + "grad_norm": 0.5080608887234477, + "learning_rate": 1.1574448051119101e-05, + "loss": 0.3672, + "step": 9134 + }, + { + "epoch": 1.8008675078864353, + "grad_norm": 0.5323631360585213, + "learning_rate": 1.1572917338484471e-05, + "loss": 0.3742, + "step": 9135 + }, + { + "epoch": 1.801064668769716, + "grad_norm": 0.5031109008850254, + "learning_rate": 1.1571386588059236e-05, + "loss": 0.3406, + "step": 9136 + }, + { + "epoch": 1.8012618296529967, + "grad_norm": 0.49254801312177554, + "learning_rate": 1.1569855799880174e-05, + "loss": 0.3412, + "step": 9137 + }, + { + "epoch": 1.8014589905362777, + "grad_norm": 0.4836870483836016, + "learning_rate": 1.1568324973984065e-05, + "loss": 0.3256, + "step": 9138 + }, + { + "epoch": 1.8016561514195584, + "grad_norm": 0.5079333391604857, + "learning_rate": 1.1566794110407681e-05, + "loss": 0.3129, + "step": 9139 + }, + { + "epoch": 1.801853312302839, + "grad_norm": 0.48306063016614514, + "learning_rate": 1.156526320918781e-05, + "loss": 0.3263, + "step": 9140 + }, + { + "epoch": 1.80205047318612, + "grad_norm": 0.46236068282322035, + "learning_rate": 1.1563732270361228e-05, + "loss": 0.3218, + "step": 9141 + }, + { + "epoch": 1.8022476340694005, + "grad_norm": 0.46822948046376184, + "learning_rate": 1.1562201293964716e-05, + "loss": 0.3159, + "step": 9142 + }, + { + "epoch": 1.8024447949526814, + "grad_norm": 0.457275091142312, + "learning_rate": 1.1560670280035065e-05, + "loss": 0.3422, + "step": 9143 + }, + { + "epoch": 1.802641955835962, + "grad_norm": 0.45010333968693195, + "learning_rate": 1.155913922860905e-05, + "loss": 0.3038, + "step": 9144 + }, + { + "epoch": 1.8028391167192428, + "grad_norm": 0.460887214721349, + "learning_rate": 1.155760813972346e-05, + "loss": 0.3184, + "step": 9145 + }, + { + "epoch": 1.8030362776025237, + "grad_norm": 0.47074306673977423, + "learning_rate": 1.1556077013415084e-05, + "loss": 0.327, + "step": 9146 + }, + { + "epoch": 1.8032334384858044, + "grad_norm": 0.44541887094425797, + "learning_rate": 1.15545458497207e-05, + "loss": 0.314, + "step": 9147 + }, + { + "epoch": 1.8034305993690851, + "grad_norm": 0.46761848114178406, + "learning_rate": 1.1553014648677104e-05, + "loss": 0.3236, + "step": 9148 + }, + { + "epoch": 1.803627760252366, + "grad_norm": 0.47178941550054526, + "learning_rate": 1.1551483410321075e-05, + "loss": 0.3619, + "step": 9149 + }, + { + "epoch": 1.8038249211356467, + "grad_norm": 0.5060596355050806, + "learning_rate": 1.1549952134689414e-05, + "loss": 0.3555, + "step": 9150 + }, + { + "epoch": 1.8040220820189274, + "grad_norm": 0.48942752728337136, + "learning_rate": 1.1548420821818902e-05, + "loss": 0.3512, + "step": 9151 + }, + { + "epoch": 1.8042192429022084, + "grad_norm": 0.4656609142013564, + "learning_rate": 1.1546889471746333e-05, + "loss": 0.3254, + "step": 9152 + }, + { + "epoch": 1.8044164037854888, + "grad_norm": 0.5678872974738812, + "learning_rate": 1.1545358084508497e-05, + "loss": 0.3503, + "step": 9153 + }, + { + "epoch": 1.8046135646687698, + "grad_norm": 0.4746836161670529, + "learning_rate": 1.154382666014219e-05, + "loss": 0.3026, + "step": 9154 + }, + { + "epoch": 1.8048107255520505, + "grad_norm": 0.4919723933858667, + "learning_rate": 1.1542295198684206e-05, + "loss": 0.3567, + "step": 9155 + }, + { + "epoch": 1.8050078864353312, + "grad_norm": 0.4658920446396069, + "learning_rate": 1.1540763700171334e-05, + "loss": 0.3139, + "step": 9156 + }, + { + "epoch": 1.805205047318612, + "grad_norm": 0.4998610736336036, + "learning_rate": 1.1539232164640378e-05, + "loss": 0.3537, + "step": 9157 + }, + { + "epoch": 1.8054022082018928, + "grad_norm": 0.4838154631555964, + "learning_rate": 1.1537700592128126e-05, + "loss": 0.3231, + "step": 9158 + }, + { + "epoch": 1.8055993690851735, + "grad_norm": 0.481476456192672, + "learning_rate": 1.1536168982671378e-05, + "loss": 0.3587, + "step": 9159 + }, + { + "epoch": 1.8057965299684544, + "grad_norm": 0.6593255142170363, + "learning_rate": 1.1534637336306935e-05, + "loss": 0.3702, + "step": 9160 + }, + { + "epoch": 1.805993690851735, + "grad_norm": 0.47074519329216513, + "learning_rate": 1.1533105653071594e-05, + "loss": 0.3292, + "step": 9161 + }, + { + "epoch": 1.8061908517350158, + "grad_norm": 0.47104591661023276, + "learning_rate": 1.1531573933002156e-05, + "loss": 0.3273, + "step": 9162 + }, + { + "epoch": 1.8063880126182965, + "grad_norm": 0.4962259307049863, + "learning_rate": 1.153004217613542e-05, + "loss": 0.3381, + "step": 9163 + }, + { + "epoch": 1.8065851735015772, + "grad_norm": 0.47352216159289334, + "learning_rate": 1.152851038250819e-05, + "loss": 0.3293, + "step": 9164 + }, + { + "epoch": 1.8067823343848581, + "grad_norm": 0.4661605793322122, + "learning_rate": 1.1526978552157266e-05, + "loss": 0.3221, + "step": 9165 + }, + { + "epoch": 1.8069794952681388, + "grad_norm": 0.45860831067733454, + "learning_rate": 1.1525446685119452e-05, + "loss": 0.3341, + "step": 9166 + }, + { + "epoch": 1.8071766561514195, + "grad_norm": 0.4704413143236224, + "learning_rate": 1.1523914781431555e-05, + "loss": 0.3071, + "step": 9167 + }, + { + "epoch": 1.8073738170347005, + "grad_norm": 0.47559115700741583, + "learning_rate": 1.1522382841130377e-05, + "loss": 0.3467, + "step": 9168 + }, + { + "epoch": 1.807570977917981, + "grad_norm": 0.4852070321473458, + "learning_rate": 1.1520850864252724e-05, + "loss": 0.3132, + "step": 9169 + }, + { + "epoch": 1.8077681388012619, + "grad_norm": 0.5150731573019411, + "learning_rate": 1.1519318850835406e-05, + "loss": 0.3426, + "step": 9170 + }, + { + "epoch": 1.8079652996845426, + "grad_norm": 0.46442119276118826, + "learning_rate": 1.1517786800915229e-05, + "loss": 0.318, + "step": 9171 + }, + { + "epoch": 1.8081624605678233, + "grad_norm": 0.47760132684470824, + "learning_rate": 1.1516254714529001e-05, + "loss": 0.3138, + "step": 9172 + }, + { + "epoch": 1.8083596214511042, + "grad_norm": 0.5063732452560264, + "learning_rate": 1.1514722591713529e-05, + "loss": 0.3535, + "step": 9173 + }, + { + "epoch": 1.8085567823343849, + "grad_norm": 0.46353551339902077, + "learning_rate": 1.1513190432505634e-05, + "loss": 0.3085, + "step": 9174 + }, + { + "epoch": 1.8087539432176656, + "grad_norm": 0.45193354361169114, + "learning_rate": 1.1511658236942114e-05, + "loss": 0.2871, + "step": 9175 + }, + { + "epoch": 1.8089511041009465, + "grad_norm": 0.5095923560172854, + "learning_rate": 1.1510126005059793e-05, + "loss": 0.3514, + "step": 9176 + }, + { + "epoch": 1.809148264984227, + "grad_norm": 0.45401003471081053, + "learning_rate": 1.1508593736895475e-05, + "loss": 0.32, + "step": 9177 + }, + { + "epoch": 1.809345425867508, + "grad_norm": 0.4670090425518402, + "learning_rate": 1.150706143248598e-05, + "loss": 0.3338, + "step": 9178 + }, + { + "epoch": 1.8095425867507886, + "grad_norm": 0.49260267310440076, + "learning_rate": 1.1505529091868117e-05, + "loss": 0.3646, + "step": 9179 + }, + { + "epoch": 1.8097397476340693, + "grad_norm": 0.4892695211612479, + "learning_rate": 1.1503996715078707e-05, + "loss": 0.3159, + "step": 9180 + }, + { + "epoch": 1.8099369085173502, + "grad_norm": 0.5269511149927203, + "learning_rate": 1.1502464302154566e-05, + "loss": 0.3476, + "step": 9181 + }, + { + "epoch": 1.810134069400631, + "grad_norm": 0.477149389147672, + "learning_rate": 1.150093185313251e-05, + "loss": 0.3394, + "step": 9182 + }, + { + "epoch": 1.8103312302839116, + "grad_norm": 0.4911874732780398, + "learning_rate": 1.1499399368049356e-05, + "loss": 0.3367, + "step": 9183 + }, + { + "epoch": 1.8105283911671926, + "grad_norm": 0.7822176448681528, + "learning_rate": 1.1497866846941926e-05, + "loss": 0.3192, + "step": 9184 + }, + { + "epoch": 1.810725552050473, + "grad_norm": 0.4936248418394106, + "learning_rate": 1.1496334289847038e-05, + "loss": 0.3429, + "step": 9185 + }, + { + "epoch": 1.810922712933754, + "grad_norm": 0.47779103622401925, + "learning_rate": 1.1494801696801515e-05, + "loss": 0.3391, + "step": 9186 + }, + { + "epoch": 1.8111198738170347, + "grad_norm": 0.4911635374833865, + "learning_rate": 1.1493269067842175e-05, + "loss": 0.3392, + "step": 9187 + }, + { + "epoch": 1.8113170347003154, + "grad_norm": 0.4604725274713814, + "learning_rate": 1.1491736403005844e-05, + "loss": 0.324, + "step": 9188 + }, + { + "epoch": 1.8115141955835963, + "grad_norm": 0.5032560937410412, + "learning_rate": 1.1490203702329346e-05, + "loss": 0.3603, + "step": 9189 + }, + { + "epoch": 1.811711356466877, + "grad_norm": 0.6385600610637551, + "learning_rate": 1.1488670965849505e-05, + "loss": 0.3369, + "step": 9190 + }, + { + "epoch": 1.8119085173501577, + "grad_norm": 0.4885866435640685, + "learning_rate": 1.1487138193603142e-05, + "loss": 0.3309, + "step": 9191 + }, + { + "epoch": 1.8121056782334386, + "grad_norm": 0.5183397825685779, + "learning_rate": 1.1485605385627088e-05, + "loss": 0.3343, + "step": 9192 + }, + { + "epoch": 1.812302839116719, + "grad_norm": 0.4989032074820882, + "learning_rate": 1.1484072541958167e-05, + "loss": 0.3397, + "step": 9193 + }, + { + "epoch": 1.8125, + "grad_norm": 0.5033661808175196, + "learning_rate": 1.1482539662633208e-05, + "loss": 0.3417, + "step": 9194 + }, + { + "epoch": 1.812697160883281, + "grad_norm": 5.86204139038015, + "learning_rate": 1.1481006747689043e-05, + "loss": 0.328, + "step": 9195 + }, + { + "epoch": 1.8128943217665614, + "grad_norm": 0.5051699316872136, + "learning_rate": 1.1479473797162492e-05, + "loss": 0.3173, + "step": 9196 + }, + { + "epoch": 1.8130914826498423, + "grad_norm": 0.5070080136339032, + "learning_rate": 1.1477940811090398e-05, + "loss": 0.3271, + "step": 9197 + }, + { + "epoch": 1.813288643533123, + "grad_norm": 0.4811441468791982, + "learning_rate": 1.1476407789509583e-05, + "loss": 0.3268, + "step": 9198 + }, + { + "epoch": 1.8134858044164037, + "grad_norm": 0.4785568333077474, + "learning_rate": 1.1474874732456884e-05, + "loss": 0.3212, + "step": 9199 + }, + { + "epoch": 1.8136829652996846, + "grad_norm": 0.4698877579251847, + "learning_rate": 1.147334163996913e-05, + "loss": 0.3072, + "step": 9200 + }, + { + "epoch": 1.8138801261829653, + "grad_norm": 0.47991128581690556, + "learning_rate": 1.1471808512083156e-05, + "loss": 0.3416, + "step": 9201 + }, + { + "epoch": 1.814077287066246, + "grad_norm": 0.46961744858878407, + "learning_rate": 1.1470275348835797e-05, + "loss": 0.3187, + "step": 9202 + }, + { + "epoch": 1.814274447949527, + "grad_norm": 0.5056112272060148, + "learning_rate": 1.146874215026389e-05, + "loss": 0.3397, + "step": 9203 + }, + { + "epoch": 1.8144716088328074, + "grad_norm": 0.4972247643473326, + "learning_rate": 1.1467208916404271e-05, + "loss": 0.3404, + "step": 9204 + }, + { + "epoch": 1.8146687697160884, + "grad_norm": 0.4723317093599732, + "learning_rate": 1.1465675647293772e-05, + "loss": 0.304, + "step": 9205 + }, + { + "epoch": 1.814865930599369, + "grad_norm": 0.5198317290369266, + "learning_rate": 1.1464142342969242e-05, + "loss": 0.3535, + "step": 9206 + }, + { + "epoch": 1.8150630914826498, + "grad_norm": 2.3677318794574895, + "learning_rate": 1.1462609003467508e-05, + "loss": 0.4222, + "step": 9207 + }, + { + "epoch": 1.8152602523659307, + "grad_norm": 0.4794343279462834, + "learning_rate": 1.1461075628825416e-05, + "loss": 0.3156, + "step": 9208 + }, + { + "epoch": 1.8154574132492114, + "grad_norm": 0.48662660979986266, + "learning_rate": 1.1459542219079808e-05, + "loss": 0.3432, + "step": 9209 + }, + { + "epoch": 1.815654574132492, + "grad_norm": 0.48035488934776704, + "learning_rate": 1.1458008774267518e-05, + "loss": 0.3383, + "step": 9210 + }, + { + "epoch": 1.815851735015773, + "grad_norm": 0.48382873384961195, + "learning_rate": 1.1456475294425396e-05, + "loss": 0.327, + "step": 9211 + }, + { + "epoch": 1.8160488958990535, + "grad_norm": 0.47241477963626444, + "learning_rate": 1.1454941779590283e-05, + "loss": 0.3271, + "step": 9212 + }, + { + "epoch": 1.8162460567823344, + "grad_norm": 0.46721123265648445, + "learning_rate": 1.1453408229799017e-05, + "loss": 0.3125, + "step": 9213 + }, + { + "epoch": 1.8164432176656151, + "grad_norm": 0.4739931333373127, + "learning_rate": 1.1451874645088455e-05, + "loss": 0.3265, + "step": 9214 + }, + { + "epoch": 1.8166403785488958, + "grad_norm": 0.5189118810899088, + "learning_rate": 1.145034102549543e-05, + "loss": 0.3672, + "step": 9215 + }, + { + "epoch": 1.8168375394321767, + "grad_norm": 0.49357938108124044, + "learning_rate": 1.1448807371056798e-05, + "loss": 0.3343, + "step": 9216 + }, + { + "epoch": 1.8170347003154574, + "grad_norm": 0.476806812670295, + "learning_rate": 1.14472736818094e-05, + "loss": 0.3544, + "step": 9217 + }, + { + "epoch": 1.8172318611987381, + "grad_norm": 0.5426318526785301, + "learning_rate": 1.1445739957790087e-05, + "loss": 0.3388, + "step": 9218 + }, + { + "epoch": 1.817429022082019, + "grad_norm": 0.48940437942449916, + "learning_rate": 1.1444206199035708e-05, + "loss": 0.3436, + "step": 9219 + }, + { + "epoch": 1.8176261829652995, + "grad_norm": 0.47285073767586155, + "learning_rate": 1.1442672405583109e-05, + "loss": 0.342, + "step": 9220 + }, + { + "epoch": 1.8178233438485805, + "grad_norm": 0.4668025709102716, + "learning_rate": 1.1441138577469147e-05, + "loss": 0.327, + "step": 9221 + }, + { + "epoch": 1.8180205047318612, + "grad_norm": 0.45875055734217174, + "learning_rate": 1.1439604714730666e-05, + "loss": 0.3192, + "step": 9222 + }, + { + "epoch": 1.8182176656151419, + "grad_norm": 0.4409526335848642, + "learning_rate": 1.1438070817404527e-05, + "loss": 0.3101, + "step": 9223 + }, + { + "epoch": 1.8184148264984228, + "grad_norm": 0.45114553583024647, + "learning_rate": 1.1436536885527576e-05, + "loss": 0.3342, + "step": 9224 + }, + { + "epoch": 1.8186119873817035, + "grad_norm": 0.4806686518283818, + "learning_rate": 1.1435002919136671e-05, + "loss": 0.3391, + "step": 9225 + }, + { + "epoch": 1.8188091482649842, + "grad_norm": 0.5044312994268182, + "learning_rate": 1.1433468918268663e-05, + "loss": 0.3555, + "step": 9226 + }, + { + "epoch": 1.819006309148265, + "grad_norm": 0.49263755874516185, + "learning_rate": 1.1431934882960412e-05, + "loss": 0.3607, + "step": 9227 + }, + { + "epoch": 1.8192034700315456, + "grad_norm": 0.4536998690340305, + "learning_rate": 1.1430400813248772e-05, + "loss": 0.2918, + "step": 9228 + }, + { + "epoch": 1.8194006309148265, + "grad_norm": 0.532141725281099, + "learning_rate": 1.1428866709170599e-05, + "loss": 0.355, + "step": 9229 + }, + { + "epoch": 1.8195977917981072, + "grad_norm": 0.4635187686939784, + "learning_rate": 1.1427332570762754e-05, + "loss": 0.3211, + "step": 9230 + }, + { + "epoch": 1.819794952681388, + "grad_norm": 0.602694804528996, + "learning_rate": 1.1425798398062093e-05, + "loss": 0.3574, + "step": 9231 + }, + { + "epoch": 1.8199921135646688, + "grad_norm": 0.49418051715015165, + "learning_rate": 1.1424264191105481e-05, + "loss": 0.3526, + "step": 9232 + }, + { + "epoch": 1.8201892744479495, + "grad_norm": 0.47528793595220115, + "learning_rate": 1.1422729949929772e-05, + "loss": 0.3408, + "step": 9233 + }, + { + "epoch": 1.8203864353312302, + "grad_norm": 0.5040867868946186, + "learning_rate": 1.142119567457183e-05, + "loss": 0.3401, + "step": 9234 + }, + { + "epoch": 1.8205835962145112, + "grad_norm": 0.4709221541985399, + "learning_rate": 1.141966136506852e-05, + "loss": 0.322, + "step": 9235 + }, + { + "epoch": 1.8207807570977916, + "grad_norm": 0.5048500824063943, + "learning_rate": 1.14181270214567e-05, + "loss": 0.3449, + "step": 9236 + }, + { + "epoch": 1.8209779179810726, + "grad_norm": 0.5520156337084202, + "learning_rate": 1.1416592643773236e-05, + "loss": 0.3485, + "step": 9237 + }, + { + "epoch": 1.8211750788643533, + "grad_norm": 0.4662159013287956, + "learning_rate": 1.1415058232054995e-05, + "loss": 0.3116, + "step": 9238 + }, + { + "epoch": 1.821372239747634, + "grad_norm": 0.4583750639044251, + "learning_rate": 1.1413523786338838e-05, + "loss": 0.314, + "step": 9239 + }, + { + "epoch": 1.8215694006309149, + "grad_norm": 0.47754415155408275, + "learning_rate": 1.1411989306661635e-05, + "loss": 0.3322, + "step": 9240 + }, + { + "epoch": 1.8217665615141956, + "grad_norm": 0.4974506056160547, + "learning_rate": 1.1410454793060251e-05, + "loss": 0.3507, + "step": 9241 + }, + { + "epoch": 1.8219637223974763, + "grad_norm": 0.4734526011420336, + "learning_rate": 1.1408920245571558e-05, + "loss": 0.3168, + "step": 9242 + }, + { + "epoch": 1.8221608832807572, + "grad_norm": 0.46295318137196184, + "learning_rate": 1.1407385664232415e-05, + "loss": 0.3195, + "step": 9243 + }, + { + "epoch": 1.822358044164038, + "grad_norm": 0.4997671967465472, + "learning_rate": 1.1405851049079706e-05, + "loss": 0.3458, + "step": 9244 + }, + { + "epoch": 1.8225552050473186, + "grad_norm": 0.4727316629472926, + "learning_rate": 1.1404316400150288e-05, + "loss": 0.3358, + "step": 9245 + }, + { + "epoch": 1.8227523659305995, + "grad_norm": 0.4600159341304322, + "learning_rate": 1.1402781717481042e-05, + "loss": 0.3019, + "step": 9246 + }, + { + "epoch": 1.82294952681388, + "grad_norm": 0.4832490316233833, + "learning_rate": 1.1401247001108828e-05, + "loss": 0.3215, + "step": 9247 + }, + { + "epoch": 1.823146687697161, + "grad_norm": 0.4552192341859621, + "learning_rate": 1.1399712251070532e-05, + "loss": 0.3036, + "step": 9248 + }, + { + "epoch": 1.8233438485804416, + "grad_norm": 0.45254379685961227, + "learning_rate": 1.1398177467403022e-05, + "loss": 0.3207, + "step": 9249 + }, + { + "epoch": 1.8235410094637223, + "grad_norm": 0.48097837033082497, + "learning_rate": 1.1396642650143171e-05, + "loss": 0.3383, + "step": 9250 + }, + { + "epoch": 1.8237381703470033, + "grad_norm": 2.4890867749317755, + "learning_rate": 1.1395107799327856e-05, + "loss": 0.3532, + "step": 9251 + }, + { + "epoch": 1.823935331230284, + "grad_norm": 0.5014918794110886, + "learning_rate": 1.1393572914993954e-05, + "loss": 0.3663, + "step": 9252 + }, + { + "epoch": 1.8241324921135647, + "grad_norm": 0.5245523336186161, + "learning_rate": 1.1392037997178338e-05, + "loss": 0.3445, + "step": 9253 + }, + { + "epoch": 1.8243296529968456, + "grad_norm": 0.5134602963738575, + "learning_rate": 1.1390503045917892e-05, + "loss": 0.3631, + "step": 9254 + }, + { + "epoch": 1.824526813880126, + "grad_norm": 0.4701559391883266, + "learning_rate": 1.1388968061249486e-05, + "loss": 0.3283, + "step": 9255 + }, + { + "epoch": 1.824723974763407, + "grad_norm": 0.47107381684930266, + "learning_rate": 1.1387433043210006e-05, + "loss": 0.3278, + "step": 9256 + }, + { + "epoch": 1.8249211356466877, + "grad_norm": 0.47653058629900974, + "learning_rate": 1.138589799183633e-05, + "loss": 0.3238, + "step": 9257 + }, + { + "epoch": 1.8251182965299684, + "grad_norm": 0.48085309963347217, + "learning_rate": 1.138436290716534e-05, + "loss": 0.3355, + "step": 9258 + }, + { + "epoch": 1.8253154574132493, + "grad_norm": 0.46243620388093765, + "learning_rate": 1.1382827789233912e-05, + "loss": 0.3259, + "step": 9259 + }, + { + "epoch": 1.82551261829653, + "grad_norm": 0.45802315988824477, + "learning_rate": 1.1381292638078935e-05, + "loss": 0.3149, + "step": 9260 + }, + { + "epoch": 1.8257097791798107, + "grad_norm": 0.4794295563766952, + "learning_rate": 1.1379757453737293e-05, + "loss": 0.3337, + "step": 9261 + }, + { + "epoch": 1.8259069400630916, + "grad_norm": 0.4758846030069048, + "learning_rate": 1.1378222236245862e-05, + "loss": 0.324, + "step": 9262 + }, + { + "epoch": 1.826104100946372, + "grad_norm": 0.4709615758764996, + "learning_rate": 1.1376686985641536e-05, + "loss": 0.3223, + "step": 9263 + }, + { + "epoch": 1.826301261829653, + "grad_norm": 0.49999866958616157, + "learning_rate": 1.1375151701961191e-05, + "loss": 0.3621, + "step": 9264 + }, + { + "epoch": 1.8264984227129337, + "grad_norm": 0.5326305619030082, + "learning_rate": 1.1373616385241726e-05, + "loss": 0.3455, + "step": 9265 + }, + { + "epoch": 1.8266955835962144, + "grad_norm": 0.45691910409746533, + "learning_rate": 1.1372081035520015e-05, + "loss": 0.3085, + "step": 9266 + }, + { + "epoch": 1.8268927444794953, + "grad_norm": 0.4587323635393261, + "learning_rate": 1.1370545652832958e-05, + "loss": 0.3322, + "step": 9267 + }, + { + "epoch": 1.827089905362776, + "grad_norm": 0.4826907976994329, + "learning_rate": 1.1369010237217435e-05, + "loss": 0.3391, + "step": 9268 + }, + { + "epoch": 1.8272870662460567, + "grad_norm": 0.4816253057556002, + "learning_rate": 1.1367474788710338e-05, + "loss": 0.3408, + "step": 9269 + }, + { + "epoch": 1.8274842271293377, + "grad_norm": 0.5249937140583107, + "learning_rate": 1.1365939307348559e-05, + "loss": 0.3464, + "step": 9270 + }, + { + "epoch": 1.8276813880126181, + "grad_norm": 0.48583193536640634, + "learning_rate": 1.1364403793168988e-05, + "loss": 0.3469, + "step": 9271 + }, + { + "epoch": 1.827878548895899, + "grad_norm": 0.5051024554952468, + "learning_rate": 1.1362868246208519e-05, + "loss": 0.3261, + "step": 9272 + }, + { + "epoch": 1.8280757097791798, + "grad_norm": 0.4968850771719625, + "learning_rate": 1.1361332666504038e-05, + "loss": 0.3272, + "step": 9273 + }, + { + "epoch": 1.8282728706624605, + "grad_norm": 0.47294764285242963, + "learning_rate": 1.135979705409245e-05, + "loss": 0.3247, + "step": 9274 + }, + { + "epoch": 1.8284700315457414, + "grad_norm": 0.4606069743822721, + "learning_rate": 1.1358261409010636e-05, + "loss": 0.3035, + "step": 9275 + }, + { + "epoch": 1.828667192429022, + "grad_norm": 0.4771239186952894, + "learning_rate": 1.1356725731295501e-05, + "loss": 0.332, + "step": 9276 + }, + { + "epoch": 1.8288643533123028, + "grad_norm": 0.4478351388311611, + "learning_rate": 1.1355190020983937e-05, + "loss": 0.3076, + "step": 9277 + }, + { + "epoch": 1.8290615141955837, + "grad_norm": 0.46550837164514514, + "learning_rate": 1.1353654278112841e-05, + "loss": 0.3273, + "step": 9278 + }, + { + "epoch": 1.8292586750788642, + "grad_norm": 0.4811875611417182, + "learning_rate": 1.1352118502719115e-05, + "loss": 0.3424, + "step": 9279 + }, + { + "epoch": 1.8294558359621451, + "grad_norm": 0.4600111130971501, + "learning_rate": 1.135058269483965e-05, + "loss": 0.3362, + "step": 9280 + }, + { + "epoch": 1.8296529968454258, + "grad_norm": 0.4475616886590054, + "learning_rate": 1.1349046854511347e-05, + "loss": 0.3139, + "step": 9281 + }, + { + "epoch": 1.8298501577287065, + "grad_norm": 0.4806604678847091, + "learning_rate": 1.1347510981771108e-05, + "loss": 0.3672, + "step": 9282 + }, + { + "epoch": 1.8300473186119874, + "grad_norm": 0.5175341707370579, + "learning_rate": 1.1345975076655832e-05, + "loss": 0.339, + "step": 9283 + }, + { + "epoch": 1.8302444794952681, + "grad_norm": 0.5158084398771197, + "learning_rate": 1.134443913920242e-05, + "loss": 0.3422, + "step": 9284 + }, + { + "epoch": 1.8304416403785488, + "grad_norm": 0.44628621178473027, + "learning_rate": 1.1342903169447778e-05, + "loss": 0.3123, + "step": 9285 + }, + { + "epoch": 1.8306388012618298, + "grad_norm": 0.491244561834142, + "learning_rate": 1.1341367167428806e-05, + "loss": 0.3356, + "step": 9286 + }, + { + "epoch": 1.8308359621451105, + "grad_norm": 0.4618159349464898, + "learning_rate": 1.1339831133182405e-05, + "loss": 0.3195, + "step": 9287 + }, + { + "epoch": 1.8310331230283912, + "grad_norm": 0.46357096939810233, + "learning_rate": 1.1338295066745482e-05, + "loss": 0.3277, + "step": 9288 + }, + { + "epoch": 1.831230283911672, + "grad_norm": 0.44971140458495884, + "learning_rate": 1.1336758968154943e-05, + "loss": 0.3236, + "step": 9289 + }, + { + "epoch": 1.8314274447949526, + "grad_norm": 0.4528759786598488, + "learning_rate": 1.1335222837447692e-05, + "loss": 0.3091, + "step": 9290 + }, + { + "epoch": 1.8316246056782335, + "grad_norm": 0.4797326570038922, + "learning_rate": 1.1333686674660643e-05, + "loss": 0.3388, + "step": 9291 + }, + { + "epoch": 1.8318217665615142, + "grad_norm": 0.9510910814957004, + "learning_rate": 1.133215047983069e-05, + "loss": 0.3446, + "step": 9292 + }, + { + "epoch": 1.8320189274447949, + "grad_norm": 0.47801369290214624, + "learning_rate": 1.1330614252994753e-05, + "loss": 0.337, + "step": 9293 + }, + { + "epoch": 1.8322160883280758, + "grad_norm": 0.4798818841814811, + "learning_rate": 1.1329077994189736e-05, + "loss": 0.3283, + "step": 9294 + }, + { + "epoch": 1.8324132492113565, + "grad_norm": 0.4649250612908643, + "learning_rate": 1.132754170345255e-05, + "loss": 0.3143, + "step": 9295 + }, + { + "epoch": 1.8326104100946372, + "grad_norm": 0.5111682031018355, + "learning_rate": 1.1326005380820106e-05, + "loss": 0.3489, + "step": 9296 + }, + { + "epoch": 1.8328075709779181, + "grad_norm": 0.479500042771139, + "learning_rate": 1.1324469026329314e-05, + "loss": 0.3467, + "step": 9297 + }, + { + "epoch": 1.8330047318611986, + "grad_norm": 0.48760322704258524, + "learning_rate": 1.1322932640017087e-05, + "loss": 0.338, + "step": 9298 + }, + { + "epoch": 1.8332018927444795, + "grad_norm": 0.4615980037814281, + "learning_rate": 1.132139622192034e-05, + "loss": 0.3211, + "step": 9299 + }, + { + "epoch": 1.8333990536277602, + "grad_norm": 0.478582392329709, + "learning_rate": 1.1319859772075982e-05, + "loss": 0.3437, + "step": 9300 + }, + { + "epoch": 1.833596214511041, + "grad_norm": 0.45732711770150825, + "learning_rate": 1.1318323290520935e-05, + "loss": 0.307, + "step": 9301 + }, + { + "epoch": 1.8337933753943219, + "grad_norm": 0.4785336546942506, + "learning_rate": 1.1316786777292103e-05, + "loss": 0.3467, + "step": 9302 + }, + { + "epoch": 1.8339905362776026, + "grad_norm": 0.48770360639670446, + "learning_rate": 1.1315250232426411e-05, + "loss": 0.3178, + "step": 9303 + }, + { + "epoch": 1.8341876971608833, + "grad_norm": 0.4909997945985066, + "learning_rate": 1.1313713655960773e-05, + "loss": 0.339, + "step": 9304 + }, + { + "epoch": 1.8343848580441642, + "grad_norm": 0.46943047019703255, + "learning_rate": 1.1312177047932107e-05, + "loss": 0.3233, + "step": 9305 + }, + { + "epoch": 1.8345820189274447, + "grad_norm": 0.4637968220251219, + "learning_rate": 1.1310640408377331e-05, + "loss": 0.3354, + "step": 9306 + }, + { + "epoch": 1.8347791798107256, + "grad_norm": 0.4811381690374539, + "learning_rate": 1.1309103737333363e-05, + "loss": 0.3206, + "step": 9307 + }, + { + "epoch": 1.8349763406940063, + "grad_norm": 0.4841461834363835, + "learning_rate": 1.1307567034837123e-05, + "loss": 0.3355, + "step": 9308 + }, + { + "epoch": 1.835173501577287, + "grad_norm": 0.5075047264571099, + "learning_rate": 1.1306030300925531e-05, + "loss": 0.3364, + "step": 9309 + }, + { + "epoch": 1.835370662460568, + "grad_norm": 0.48761090026693843, + "learning_rate": 1.1304493535635512e-05, + "loss": 0.334, + "step": 9310 + }, + { + "epoch": 1.8355678233438486, + "grad_norm": 0.4707017965699666, + "learning_rate": 1.1302956739003981e-05, + "loss": 0.3206, + "step": 9311 + }, + { + "epoch": 1.8357649842271293, + "grad_norm": 0.47390782968886325, + "learning_rate": 1.1301419911067871e-05, + "loss": 0.3581, + "step": 9312 + }, + { + "epoch": 1.8359621451104102, + "grad_norm": 0.5179730801274194, + "learning_rate": 1.1299883051864095e-05, + "loss": 0.3345, + "step": 9313 + }, + { + "epoch": 1.8361593059936907, + "grad_norm": 0.4565817457776229, + "learning_rate": 1.1298346161429585e-05, + "loss": 0.324, + "step": 9314 + }, + { + "epoch": 1.8363564668769716, + "grad_norm": 0.4743170511252832, + "learning_rate": 1.1296809239801258e-05, + "loss": 0.3257, + "step": 9315 + }, + { + "epoch": 1.8365536277602523, + "grad_norm": 0.5062603340279239, + "learning_rate": 1.129527228701605e-05, + "loss": 0.3397, + "step": 9316 + }, + { + "epoch": 1.836750788643533, + "grad_norm": 0.4627524681098276, + "learning_rate": 1.129373530311088e-05, + "loss": 0.3008, + "step": 9317 + }, + { + "epoch": 1.836947949526814, + "grad_norm": 0.45366614630787505, + "learning_rate": 1.1292198288122678e-05, + "loss": 0.3239, + "step": 9318 + }, + { + "epoch": 1.8371451104100947, + "grad_norm": 0.49553583595035205, + "learning_rate": 1.1290661242088373e-05, + "loss": 0.3335, + "step": 9319 + }, + { + "epoch": 1.8373422712933754, + "grad_norm": 0.4656239756890831, + "learning_rate": 1.1289124165044889e-05, + "loss": 0.3343, + "step": 9320 + }, + { + "epoch": 1.8375394321766563, + "grad_norm": 0.4592207478297998, + "learning_rate": 1.1287587057029164e-05, + "loss": 0.31, + "step": 9321 + }, + { + "epoch": 1.8377365930599368, + "grad_norm": 0.48719866177332166, + "learning_rate": 1.1286049918078118e-05, + "loss": 0.3601, + "step": 9322 + }, + { + "epoch": 1.8379337539432177, + "grad_norm": 0.4609017660568222, + "learning_rate": 1.1284512748228686e-05, + "loss": 0.3532, + "step": 9323 + }, + { + "epoch": 1.8381309148264984, + "grad_norm": 0.471845332576064, + "learning_rate": 1.1282975547517805e-05, + "loss": 0.3428, + "step": 9324 + }, + { + "epoch": 1.838328075709779, + "grad_norm": 0.45141311669092293, + "learning_rate": 1.1281438315982403e-05, + "loss": 0.3313, + "step": 9325 + }, + { + "epoch": 1.83852523659306, + "grad_norm": 0.4846124283156861, + "learning_rate": 1.127990105365941e-05, + "loss": 0.3437, + "step": 9326 + }, + { + "epoch": 1.8387223974763407, + "grad_norm": 0.4706449746560727, + "learning_rate": 1.1278363760585767e-05, + "loss": 0.3239, + "step": 9327 + }, + { + "epoch": 1.8389195583596214, + "grad_norm": 0.46592281748659253, + "learning_rate": 1.1276826436798406e-05, + "loss": 0.2927, + "step": 9328 + }, + { + "epoch": 1.8391167192429023, + "grad_norm": 0.4506091488970719, + "learning_rate": 1.1275289082334257e-05, + "loss": 0.3299, + "step": 9329 + }, + { + "epoch": 1.839313880126183, + "grad_norm": 0.49247797496055923, + "learning_rate": 1.1273751697230262e-05, + "loss": 0.3177, + "step": 9330 + }, + { + "epoch": 1.8395110410094637, + "grad_norm": 0.4823856031434032, + "learning_rate": 1.1272214281523359e-05, + "loss": 0.3369, + "step": 9331 + }, + { + "epoch": 1.8397082018927446, + "grad_norm": 0.44703802630892214, + "learning_rate": 1.127067683525048e-05, + "loss": 0.3009, + "step": 9332 + }, + { + "epoch": 1.8399053627760251, + "grad_norm": 0.4352564519007158, + "learning_rate": 1.1269139358448573e-05, + "loss": 0.2883, + "step": 9333 + }, + { + "epoch": 1.840102523659306, + "grad_norm": 0.4897196424734099, + "learning_rate": 1.1267601851154569e-05, + "loss": 0.3292, + "step": 9334 + }, + { + "epoch": 1.8402996845425867, + "grad_norm": 0.47061110718447907, + "learning_rate": 1.1266064313405404e-05, + "loss": 0.3198, + "step": 9335 + }, + { + "epoch": 1.8404968454258674, + "grad_norm": 0.4725374845188905, + "learning_rate": 1.1264526745238032e-05, + "loss": 0.3303, + "step": 9336 + }, + { + "epoch": 1.8406940063091484, + "grad_norm": 0.46014350127590964, + "learning_rate": 1.1262989146689378e-05, + "loss": 0.3047, + "step": 9337 + }, + { + "epoch": 1.840891167192429, + "grad_norm": 0.48716019997900245, + "learning_rate": 1.12614515177964e-05, + "loss": 0.3345, + "step": 9338 + }, + { + "epoch": 1.8410883280757098, + "grad_norm": 0.462419364697719, + "learning_rate": 1.125991385859603e-05, + "loss": 0.3345, + "step": 9339 + }, + { + "epoch": 1.8412854889589907, + "grad_norm": 0.4838141067825107, + "learning_rate": 1.1258376169125218e-05, + "loss": 0.3418, + "step": 9340 + }, + { + "epoch": 1.8414826498422712, + "grad_norm": 0.4638010611429982, + "learning_rate": 1.1256838449420902e-05, + "loss": 0.3397, + "step": 9341 + }, + { + "epoch": 1.841679810725552, + "grad_norm": 0.4742578303058376, + "learning_rate": 1.125530069952003e-05, + "loss": 0.3369, + "step": 9342 + }, + { + "epoch": 1.8418769716088328, + "grad_norm": 0.6012207890435203, + "learning_rate": 1.1253762919459548e-05, + "loss": 0.3922, + "step": 9343 + }, + { + "epoch": 1.8420741324921135, + "grad_norm": 0.4640385924151846, + "learning_rate": 1.1252225109276404e-05, + "loss": 0.3276, + "step": 9344 + }, + { + "epoch": 1.8422712933753944, + "grad_norm": 0.46720201438034464, + "learning_rate": 1.1250687269007544e-05, + "loss": 0.3231, + "step": 9345 + }, + { + "epoch": 1.8424684542586751, + "grad_norm": 0.4809439170465585, + "learning_rate": 1.1249149398689912e-05, + "loss": 0.3359, + "step": 9346 + }, + { + "epoch": 1.8426656151419558, + "grad_norm": 0.46092655938380556, + "learning_rate": 1.1247611498360463e-05, + "loss": 0.3288, + "step": 9347 + }, + { + "epoch": 1.8428627760252367, + "grad_norm": 0.46354010282488706, + "learning_rate": 1.124607356805614e-05, + "loss": 0.3085, + "step": 9348 + }, + { + "epoch": 1.8430599369085172, + "grad_norm": 0.4580081566598378, + "learning_rate": 1.1244535607813898e-05, + "loss": 0.3361, + "step": 9349 + }, + { + "epoch": 1.8432570977917981, + "grad_norm": 0.49486686537167013, + "learning_rate": 1.1242997617670685e-05, + "loss": 0.3191, + "step": 9350 + }, + { + "epoch": 1.8434542586750788, + "grad_norm": 0.4738563965909639, + "learning_rate": 1.1241459597663453e-05, + "loss": 0.358, + "step": 9351 + }, + { + "epoch": 1.8436514195583595, + "grad_norm": 0.46422380906100263, + "learning_rate": 1.1239921547829156e-05, + "loss": 0.3178, + "step": 9352 + }, + { + "epoch": 1.8438485804416405, + "grad_norm": 0.4863236006835687, + "learning_rate": 1.1238383468204744e-05, + "loss": 0.3352, + "step": 9353 + }, + { + "epoch": 1.8440457413249212, + "grad_norm": 0.5448224040290974, + "learning_rate": 1.1236845358827174e-05, + "loss": 0.3508, + "step": 9354 + }, + { + "epoch": 1.8442429022082019, + "grad_norm": 0.5119358198476269, + "learning_rate": 1.1235307219733396e-05, + "loss": 0.3626, + "step": 9355 + }, + { + "epoch": 1.8444400630914828, + "grad_norm": 0.48695813930072723, + "learning_rate": 1.1233769050960366e-05, + "loss": 0.3276, + "step": 9356 + }, + { + "epoch": 1.8446372239747633, + "grad_norm": 0.45412864850612233, + "learning_rate": 1.1232230852545042e-05, + "loss": 0.3367, + "step": 9357 + }, + { + "epoch": 1.8448343848580442, + "grad_norm": 0.41684857678310244, + "learning_rate": 1.1230692624524379e-05, + "loss": 0.2956, + "step": 9358 + }, + { + "epoch": 1.8450315457413249, + "grad_norm": 0.4765022219720706, + "learning_rate": 1.1229154366935337e-05, + "loss": 0.3451, + "step": 9359 + }, + { + "epoch": 1.8452287066246056, + "grad_norm": 0.4451789441382438, + "learning_rate": 1.1227616079814869e-05, + "loss": 0.3078, + "step": 9360 + }, + { + "epoch": 1.8454258675078865, + "grad_norm": 0.4783197235034446, + "learning_rate": 1.1226077763199941e-05, + "loss": 0.3355, + "step": 9361 + }, + { + "epoch": 1.8456230283911672, + "grad_norm": 0.45356257014627016, + "learning_rate": 1.12245394171275e-05, + "loss": 0.3263, + "step": 9362 + }, + { + "epoch": 1.845820189274448, + "grad_norm": 0.6426674681188173, + "learning_rate": 1.1223001041634517e-05, + "loss": 0.3478, + "step": 9363 + }, + { + "epoch": 1.8460173501577288, + "grad_norm": 0.5256663639141104, + "learning_rate": 1.122146263675795e-05, + "loss": 0.3536, + "step": 9364 + }, + { + "epoch": 1.8462145110410093, + "grad_norm": 0.49322701169852573, + "learning_rate": 1.121992420253476e-05, + "loss": 0.3507, + "step": 9365 + }, + { + "epoch": 1.8464116719242902, + "grad_norm": 0.49133913419542524, + "learning_rate": 1.1218385739001908e-05, + "loss": 0.3297, + "step": 9366 + }, + { + "epoch": 1.846608832807571, + "grad_norm": 0.4936609617307454, + "learning_rate": 1.1216847246196356e-05, + "loss": 0.3569, + "step": 9367 + }, + { + "epoch": 1.8468059936908516, + "grad_norm": 0.4662341419677541, + "learning_rate": 1.121530872415507e-05, + "loss": 0.2912, + "step": 9368 + }, + { + "epoch": 1.8470031545741326, + "grad_norm": 0.4643026216620009, + "learning_rate": 1.1213770172915012e-05, + "loss": 0.3205, + "step": 9369 + }, + { + "epoch": 1.8472003154574133, + "grad_norm": 0.4620634065880125, + "learning_rate": 1.121223159251315e-05, + "loss": 0.3324, + "step": 9370 + }, + { + "epoch": 1.847397476340694, + "grad_norm": 0.45144218730120467, + "learning_rate": 1.1210692982986447e-05, + "loss": 0.3085, + "step": 9371 + }, + { + "epoch": 1.8475946372239749, + "grad_norm": 0.481324283568681, + "learning_rate": 1.120915434437187e-05, + "loss": 0.3384, + "step": 9372 + }, + { + "epoch": 1.8477917981072554, + "grad_norm": 0.48469219856799683, + "learning_rate": 1.1207615676706387e-05, + "loss": 0.3237, + "step": 9373 + }, + { + "epoch": 1.8479889589905363, + "grad_norm": 0.43868531841936526, + "learning_rate": 1.1206076980026963e-05, + "loss": 0.331, + "step": 9374 + }, + { + "epoch": 1.848186119873817, + "grad_norm": 0.49905037754094533, + "learning_rate": 1.120453825437057e-05, + "loss": 0.3504, + "step": 9375 + }, + { + "epoch": 1.8483832807570977, + "grad_norm": 0.47057721365819305, + "learning_rate": 1.1202999499774174e-05, + "loss": 0.3341, + "step": 9376 + }, + { + "epoch": 1.8485804416403786, + "grad_norm": 0.5115538854447896, + "learning_rate": 1.1201460716274745e-05, + "loss": 0.3557, + "step": 9377 + }, + { + "epoch": 1.8487776025236593, + "grad_norm": 0.49106393680197397, + "learning_rate": 1.1199921903909258e-05, + "loss": 0.3348, + "step": 9378 + }, + { + "epoch": 1.84897476340694, + "grad_norm": 0.4783550963001248, + "learning_rate": 1.119838306271468e-05, + "loss": 0.3447, + "step": 9379 + }, + { + "epoch": 1.849171924290221, + "grad_norm": 15.982642688070285, + "learning_rate": 1.1196844192727984e-05, + "loss": 0.3471, + "step": 9380 + }, + { + "epoch": 1.8493690851735016, + "grad_norm": 0.49460635668203545, + "learning_rate": 1.119530529398614e-05, + "loss": 0.3547, + "step": 9381 + }, + { + "epoch": 1.8495662460567823, + "grad_norm": 0.4767109854012262, + "learning_rate": 1.1193766366526128e-05, + "loss": 0.3347, + "step": 9382 + }, + { + "epoch": 1.8497634069400632, + "grad_norm": 0.4344678704925985, + "learning_rate": 1.1192227410384915e-05, + "loss": 0.2973, + "step": 9383 + }, + { + "epoch": 1.8499605678233437, + "grad_norm": 0.4942642287727857, + "learning_rate": 1.1190688425599478e-05, + "loss": 0.3555, + "step": 9384 + }, + { + "epoch": 1.8501577287066246, + "grad_norm": 0.45936308831216105, + "learning_rate": 1.1189149412206795e-05, + "loss": 0.3256, + "step": 9385 + }, + { + "epoch": 1.8503548895899053, + "grad_norm": 0.48263042540453904, + "learning_rate": 1.1187610370243837e-05, + "loss": 0.337, + "step": 9386 + }, + { + "epoch": 1.850552050473186, + "grad_norm": 0.48546115028194037, + "learning_rate": 1.1186071299747588e-05, + "loss": 0.3529, + "step": 9387 + }, + { + "epoch": 1.850749211356467, + "grad_norm": 1.2555124942050995, + "learning_rate": 1.1184532200755017e-05, + "loss": 0.3508, + "step": 9388 + }, + { + "epoch": 1.8509463722397477, + "grad_norm": 0.517094897963285, + "learning_rate": 1.1182993073303107e-05, + "loss": 0.3507, + "step": 9389 + }, + { + "epoch": 1.8511435331230284, + "grad_norm": 0.4931959158526763, + "learning_rate": 1.1181453917428835e-05, + "loss": 0.3404, + "step": 9390 + }, + { + "epoch": 1.8513406940063093, + "grad_norm": 0.4720258856465021, + "learning_rate": 1.117991473316918e-05, + "loss": 0.3271, + "step": 9391 + }, + { + "epoch": 1.8515378548895898, + "grad_norm": 0.4663809318007513, + "learning_rate": 1.1178375520561126e-05, + "loss": 0.3231, + "step": 9392 + }, + { + "epoch": 1.8517350157728707, + "grad_norm": 0.4791754199299042, + "learning_rate": 1.1176836279641649e-05, + "loss": 0.3495, + "step": 9393 + }, + { + "epoch": 1.8519321766561514, + "grad_norm": 0.44613918375908324, + "learning_rate": 1.1175297010447734e-05, + "loss": 0.3234, + "step": 9394 + }, + { + "epoch": 1.852129337539432, + "grad_norm": 0.5340353989096874, + "learning_rate": 1.1173757713016362e-05, + "loss": 0.3315, + "step": 9395 + }, + { + "epoch": 1.852326498422713, + "grad_norm": 0.46860519180264953, + "learning_rate": 1.1172218387384517e-05, + "loss": 0.3234, + "step": 9396 + }, + { + "epoch": 1.8525236593059937, + "grad_norm": 0.46923350765977884, + "learning_rate": 1.117067903358918e-05, + "loss": 0.3152, + "step": 9397 + }, + { + "epoch": 1.8527208201892744, + "grad_norm": 0.4989908576360559, + "learning_rate": 1.1169139651667334e-05, + "loss": 0.3334, + "step": 9398 + }, + { + "epoch": 1.8529179810725553, + "grad_norm": 0.48850368752631385, + "learning_rate": 1.1167600241655969e-05, + "loss": 0.3168, + "step": 9399 + }, + { + "epoch": 1.8531151419558358, + "grad_norm": 0.506462305386158, + "learning_rate": 1.116606080359207e-05, + "loss": 0.3354, + "step": 9400 + }, + { + "epoch": 1.8533123028391167, + "grad_norm": 0.4783276972037197, + "learning_rate": 1.1164521337512618e-05, + "loss": 0.3444, + "step": 9401 + }, + { + "epoch": 1.8535094637223974, + "grad_norm": 0.49486017910579383, + "learning_rate": 1.1162981843454603e-05, + "loss": 0.322, + "step": 9402 + }, + { + "epoch": 1.8537066246056781, + "grad_norm": 0.48872781997990056, + "learning_rate": 1.1161442321455013e-05, + "loss": 0.3376, + "step": 9403 + }, + { + "epoch": 1.853903785488959, + "grad_norm": 0.49644332416860143, + "learning_rate": 1.1159902771550836e-05, + "loss": 0.3282, + "step": 9404 + }, + { + "epoch": 1.8541009463722398, + "grad_norm": 0.483535918659441, + "learning_rate": 1.115836319377906e-05, + "loss": 0.3394, + "step": 9405 + }, + { + "epoch": 1.8542981072555205, + "grad_norm": 0.45628473409890424, + "learning_rate": 1.115682358817668e-05, + "loss": 0.3345, + "step": 9406 + }, + { + "epoch": 1.8544952681388014, + "grad_norm": 0.5002627802730854, + "learning_rate": 1.1155283954780676e-05, + "loss": 0.3342, + "step": 9407 + }, + { + "epoch": 1.8546924290220819, + "grad_norm": 0.46363502477326907, + "learning_rate": 1.1153744293628049e-05, + "loss": 0.341, + "step": 9408 + }, + { + "epoch": 1.8548895899053628, + "grad_norm": 0.45754159982180126, + "learning_rate": 1.115220460475578e-05, + "loss": 0.322, + "step": 9409 + }, + { + "epoch": 1.8550867507886435, + "grad_norm": 0.4811557071074235, + "learning_rate": 1.1150664888200874e-05, + "loss": 0.3448, + "step": 9410 + }, + { + "epoch": 1.8552839116719242, + "grad_norm": 0.45695254954143394, + "learning_rate": 1.1149125144000315e-05, + "loss": 0.3172, + "step": 9411 + }, + { + "epoch": 1.8554810725552051, + "grad_norm": 0.4611849986321203, + "learning_rate": 1.1147585372191099e-05, + "loss": 0.3324, + "step": 9412 + }, + { + "epoch": 1.8556782334384858, + "grad_norm": 0.4951261883208246, + "learning_rate": 1.114604557281022e-05, + "loss": 0.3467, + "step": 9413 + }, + { + "epoch": 1.8558753943217665, + "grad_norm": 0.45894758304595423, + "learning_rate": 1.1144505745894674e-05, + "loss": 0.3354, + "step": 9414 + }, + { + "epoch": 1.8560725552050474, + "grad_norm": 0.47361962578122113, + "learning_rate": 1.1142965891481456e-05, + "loss": 0.338, + "step": 9415 + }, + { + "epoch": 1.856269716088328, + "grad_norm": 0.46717084409335563, + "learning_rate": 1.1141426009607562e-05, + "loss": 0.3181, + "step": 9416 + }, + { + "epoch": 1.8564668769716088, + "grad_norm": 0.46910516420489423, + "learning_rate": 1.1139886100309987e-05, + "loss": 0.3114, + "step": 9417 + }, + { + "epoch": 1.8566640378548895, + "grad_norm": 0.45627899747872586, + "learning_rate": 1.1138346163625732e-05, + "loss": 0.3362, + "step": 9418 + }, + { + "epoch": 1.8568611987381702, + "grad_norm": 0.488764972259954, + "learning_rate": 1.1136806199591794e-05, + "loss": 0.318, + "step": 9419 + }, + { + "epoch": 1.8570583596214512, + "grad_norm": 0.48207706868844363, + "learning_rate": 1.1135266208245173e-05, + "loss": 0.3252, + "step": 9420 + }, + { + "epoch": 1.8572555205047319, + "grad_norm": 0.4533304622987771, + "learning_rate": 1.1133726189622865e-05, + "loss": 0.3134, + "step": 9421 + }, + { + "epoch": 1.8574526813880126, + "grad_norm": 0.5209819124401526, + "learning_rate": 1.1132186143761872e-05, + "loss": 0.3225, + "step": 9422 + }, + { + "epoch": 1.8576498422712935, + "grad_norm": 0.4764148070256464, + "learning_rate": 1.1130646070699196e-05, + "loss": 0.335, + "step": 9423 + }, + { + "epoch": 1.8578470031545742, + "grad_norm": 0.46903416952858623, + "learning_rate": 1.1129105970471836e-05, + "loss": 0.3372, + "step": 9424 + }, + { + "epoch": 1.8580441640378549, + "grad_norm": 0.4386714428559711, + "learning_rate": 1.1127565843116798e-05, + "loss": 0.2949, + "step": 9425 + }, + { + "epoch": 1.8582413249211358, + "grad_norm": 0.4453659320932465, + "learning_rate": 1.1126025688671081e-05, + "loss": 0.3091, + "step": 9426 + }, + { + "epoch": 1.8584384858044163, + "grad_norm": 0.6218845162516824, + "learning_rate": 1.1124485507171691e-05, + "loss": 0.367, + "step": 9427 + }, + { + "epoch": 1.8586356466876972, + "grad_norm": 0.5032362384746675, + "learning_rate": 1.112294529865563e-05, + "loss": 0.3386, + "step": 9428 + }, + { + "epoch": 1.858832807570978, + "grad_norm": 0.46700700293384795, + "learning_rate": 1.1121405063159906e-05, + "loss": 0.344, + "step": 9429 + }, + { + "epoch": 1.8590299684542586, + "grad_norm": 0.4650433144492055, + "learning_rate": 1.111986480072152e-05, + "loss": 0.3257, + "step": 9430 + }, + { + "epoch": 1.8592271293375395, + "grad_norm": 0.470873212417909, + "learning_rate": 1.1118324511377482e-05, + "loss": 0.3012, + "step": 9431 + }, + { + "epoch": 1.8594242902208202, + "grad_norm": 0.4694211550117268, + "learning_rate": 1.1116784195164797e-05, + "loss": 0.3423, + "step": 9432 + }, + { + "epoch": 1.859621451104101, + "grad_norm": 0.46797333442001665, + "learning_rate": 1.1115243852120472e-05, + "loss": 0.3196, + "step": 9433 + }, + { + "epoch": 1.8598186119873819, + "grad_norm": 0.4847132759132295, + "learning_rate": 1.1113703482281515e-05, + "loss": 0.3374, + "step": 9434 + }, + { + "epoch": 1.8600157728706623, + "grad_norm": 0.48758924974039297, + "learning_rate": 1.1112163085684935e-05, + "loss": 0.3437, + "step": 9435 + }, + { + "epoch": 1.8602129337539433, + "grad_norm": 0.48133455329307273, + "learning_rate": 1.111062266236774e-05, + "loss": 0.3175, + "step": 9436 + }, + { + "epoch": 1.860410094637224, + "grad_norm": 0.4906755288935251, + "learning_rate": 1.1109082212366944e-05, + "loss": 0.318, + "step": 9437 + }, + { + "epoch": 1.8606072555205047, + "grad_norm": 0.4633813435313994, + "learning_rate": 1.1107541735719554e-05, + "loss": 0.3103, + "step": 9438 + }, + { + "epoch": 1.8608044164037856, + "grad_norm": 0.5163900036338083, + "learning_rate": 1.110600123246258e-05, + "loss": 0.3263, + "step": 9439 + }, + { + "epoch": 1.8610015772870663, + "grad_norm": 0.5956374108250188, + "learning_rate": 1.1104460702633038e-05, + "loss": 0.3441, + "step": 9440 + }, + { + "epoch": 1.861198738170347, + "grad_norm": 0.4763548264026657, + "learning_rate": 1.1102920146267938e-05, + "loss": 0.3032, + "step": 9441 + }, + { + "epoch": 1.861395899053628, + "grad_norm": 0.4958429310254217, + "learning_rate": 1.1101379563404291e-05, + "loss": 0.3171, + "step": 9442 + }, + { + "epoch": 1.8615930599369084, + "grad_norm": 0.4894185337405908, + "learning_rate": 1.1099838954079117e-05, + "loss": 0.3349, + "step": 9443 + }, + { + "epoch": 1.8617902208201893, + "grad_norm": 0.48658157622850023, + "learning_rate": 1.1098298318329421e-05, + "loss": 0.3516, + "step": 9444 + }, + { + "epoch": 1.86198738170347, + "grad_norm": 0.477421882843568, + "learning_rate": 1.1096757656192226e-05, + "loss": 0.3442, + "step": 9445 + }, + { + "epoch": 1.8621845425867507, + "grad_norm": 0.4587814387324469, + "learning_rate": 1.1095216967704548e-05, + "loss": 0.3155, + "step": 9446 + }, + { + "epoch": 1.8623817034700316, + "grad_norm": 0.46345456592813855, + "learning_rate": 1.1093676252903395e-05, + "loss": 0.3245, + "step": 9447 + }, + { + "epoch": 1.8625788643533123, + "grad_norm": 0.49258102378140683, + "learning_rate": 1.1092135511825795e-05, + "loss": 0.3385, + "step": 9448 + }, + { + "epoch": 1.862776025236593, + "grad_norm": 0.4885376518923072, + "learning_rate": 1.1090594744508754e-05, + "loss": 0.3429, + "step": 9449 + }, + { + "epoch": 1.862973186119874, + "grad_norm": 0.5003966905526008, + "learning_rate": 1.1089053950989301e-05, + "loss": 0.3572, + "step": 9450 + }, + { + "epoch": 1.8631703470031544, + "grad_norm": 0.48522433245566304, + "learning_rate": 1.1087513131304446e-05, + "loss": 0.3174, + "step": 9451 + }, + { + "epoch": 1.8633675078864353, + "grad_norm": 0.5072726660997204, + "learning_rate": 1.1085972285491213e-05, + "loss": 0.3142, + "step": 9452 + }, + { + "epoch": 1.863564668769716, + "grad_norm": 0.4492624933232977, + "learning_rate": 1.1084431413586625e-05, + "loss": 0.3264, + "step": 9453 + }, + { + "epoch": 1.8637618296529967, + "grad_norm": 0.47440212617966615, + "learning_rate": 1.1082890515627696e-05, + "loss": 0.3221, + "step": 9454 + }, + { + "epoch": 1.8639589905362777, + "grad_norm": 0.4449072907644378, + "learning_rate": 1.108134959165145e-05, + "loss": 0.3093, + "step": 9455 + }, + { + "epoch": 1.8641561514195584, + "grad_norm": 0.4454938083744686, + "learning_rate": 1.1079808641694909e-05, + "loss": 0.3215, + "step": 9456 + }, + { + "epoch": 1.864353312302839, + "grad_norm": 0.472087185708321, + "learning_rate": 1.10782676657951e-05, + "loss": 0.3397, + "step": 9457 + }, + { + "epoch": 1.86455047318612, + "grad_norm": 0.4678074204749738, + "learning_rate": 1.1076726663989037e-05, + "loss": 0.3284, + "step": 9458 + }, + { + "epoch": 1.8647476340694005, + "grad_norm": 0.46656067923498784, + "learning_rate": 1.107518563631375e-05, + "loss": 0.3283, + "step": 9459 + }, + { + "epoch": 1.8649447949526814, + "grad_norm": 0.4635770298520586, + "learning_rate": 1.1073644582806263e-05, + "loss": 0.3187, + "step": 9460 + }, + { + "epoch": 1.865141955835962, + "grad_norm": 0.44301184098285906, + "learning_rate": 1.1072103503503599e-05, + "loss": 0.3063, + "step": 9461 + }, + { + "epoch": 1.8653391167192428, + "grad_norm": 0.4628837166980266, + "learning_rate": 1.1070562398442789e-05, + "loss": 0.3335, + "step": 9462 + }, + { + "epoch": 1.8655362776025237, + "grad_norm": 0.6943912518558549, + "learning_rate": 1.106902126766085e-05, + "loss": 0.3459, + "step": 9463 + }, + { + "epoch": 1.8657334384858044, + "grad_norm": 0.5689959265430884, + "learning_rate": 1.1067480111194817e-05, + "loss": 0.3639, + "step": 9464 + }, + { + "epoch": 1.8659305993690851, + "grad_norm": 0.478559291667436, + "learning_rate": 1.1065938929081714e-05, + "loss": 0.3357, + "step": 9465 + }, + { + "epoch": 1.866127760252366, + "grad_norm": 0.4676687566999173, + "learning_rate": 1.1064397721358571e-05, + "loss": 0.3022, + "step": 9466 + }, + { + "epoch": 1.8663249211356467, + "grad_norm": 0.46290577274262135, + "learning_rate": 1.1062856488062414e-05, + "loss": 0.3096, + "step": 9467 + }, + { + "epoch": 1.8665220820189274, + "grad_norm": 0.5040641995883025, + "learning_rate": 1.1061315229230276e-05, + "loss": 0.3394, + "step": 9468 + }, + { + "epoch": 1.8667192429022084, + "grad_norm": 0.45321672662297996, + "learning_rate": 1.1059773944899183e-05, + "loss": 0.3369, + "step": 9469 + }, + { + "epoch": 1.8669164037854888, + "grad_norm": 0.4487604982696012, + "learning_rate": 1.1058232635106167e-05, + "loss": 0.3296, + "step": 9470 + }, + { + "epoch": 1.8671135646687698, + "grad_norm": 0.5159266623498804, + "learning_rate": 1.1056691299888262e-05, + "loss": 0.3446, + "step": 9471 + }, + { + "epoch": 1.8673107255520505, + "grad_norm": 0.48017706589184767, + "learning_rate": 1.1055149939282497e-05, + "loss": 0.3296, + "step": 9472 + }, + { + "epoch": 1.8675078864353312, + "grad_norm": 0.47518034239431195, + "learning_rate": 1.1053608553325901e-05, + "loss": 0.3314, + "step": 9473 + }, + { + "epoch": 1.867705047318612, + "grad_norm": 0.4352311407076608, + "learning_rate": 1.1052067142055516e-05, + "loss": 0.3109, + "step": 9474 + }, + { + "epoch": 1.8679022082018928, + "grad_norm": 0.47370145361968125, + "learning_rate": 1.1050525705508369e-05, + "loss": 0.3371, + "step": 9475 + }, + { + "epoch": 1.8680993690851735, + "grad_norm": 0.5177165653942762, + "learning_rate": 1.1048984243721496e-05, + "loss": 0.353, + "step": 9476 + }, + { + "epoch": 1.8682965299684544, + "grad_norm": 0.4685075059578066, + "learning_rate": 1.104744275673193e-05, + "loss": 0.3392, + "step": 9477 + }, + { + "epoch": 1.868493690851735, + "grad_norm": 0.4547513508412821, + "learning_rate": 1.1045901244576713e-05, + "loss": 0.3096, + "step": 9478 + }, + { + "epoch": 1.8686908517350158, + "grad_norm": 0.4715660539717716, + "learning_rate": 1.104435970729287e-05, + "loss": 0.3337, + "step": 9479 + }, + { + "epoch": 1.8688880126182965, + "grad_norm": 0.5173676222111224, + "learning_rate": 1.1042818144917449e-05, + "loss": 0.3429, + "step": 9480 + }, + { + "epoch": 1.8690851735015772, + "grad_norm": 0.4748763088025496, + "learning_rate": 1.1041276557487482e-05, + "loss": 0.3285, + "step": 9481 + }, + { + "epoch": 1.8692823343848581, + "grad_norm": 0.4570571040060861, + "learning_rate": 1.1039734945040004e-05, + "loss": 0.3291, + "step": 9482 + }, + { + "epoch": 1.8694794952681388, + "grad_norm": 0.5076175558770766, + "learning_rate": 1.103819330761206e-05, + "loss": 0.3466, + "step": 9483 + }, + { + "epoch": 1.8696766561514195, + "grad_norm": 0.43967840426291815, + "learning_rate": 1.1036651645240683e-05, + "loss": 0.3072, + "step": 9484 + }, + { + "epoch": 1.8698738170347005, + "grad_norm": 0.488582403780876, + "learning_rate": 1.1035109957962918e-05, + "loss": 0.3156, + "step": 9485 + }, + { + "epoch": 1.870070977917981, + "grad_norm": 0.47909357015758164, + "learning_rate": 1.10335682458158e-05, + "loss": 0.3425, + "step": 9486 + }, + { + "epoch": 1.8702681388012619, + "grad_norm": 0.47912178648751735, + "learning_rate": 1.1032026508836376e-05, + "loss": 0.3053, + "step": 9487 + }, + { + "epoch": 1.8704652996845426, + "grad_norm": 0.480061395371812, + "learning_rate": 1.103048474706168e-05, + "loss": 0.364, + "step": 9488 + }, + { + "epoch": 1.8706624605678233, + "grad_norm": 0.46424946405738576, + "learning_rate": 1.102894296052876e-05, + "loss": 0.3152, + "step": 9489 + }, + { + "epoch": 1.8708596214511042, + "grad_norm": 0.46382382583955356, + "learning_rate": 1.1027401149274658e-05, + "loss": 0.331, + "step": 9490 + }, + { + "epoch": 1.8710567823343849, + "grad_norm": 0.4707474802471004, + "learning_rate": 1.1025859313336415e-05, + "loss": 0.3308, + "step": 9491 + }, + { + "epoch": 1.8712539432176656, + "grad_norm": 0.46377381491584824, + "learning_rate": 1.1024317452751076e-05, + "loss": 0.327, + "step": 9492 + }, + { + "epoch": 1.8714511041009465, + "grad_norm": 0.44780835687333587, + "learning_rate": 1.1022775567555686e-05, + "loss": 0.3347, + "step": 9493 + }, + { + "epoch": 1.871648264984227, + "grad_norm": 0.45613465092557437, + "learning_rate": 1.1021233657787285e-05, + "loss": 0.3223, + "step": 9494 + }, + { + "epoch": 1.871845425867508, + "grad_norm": 0.4667856099355164, + "learning_rate": 1.1019691723482928e-05, + "loss": 0.3262, + "step": 9495 + }, + { + "epoch": 1.8720425867507886, + "grad_norm": 0.5087693131824845, + "learning_rate": 1.1018149764679653e-05, + "loss": 0.3565, + "step": 9496 + }, + { + "epoch": 1.8722397476340693, + "grad_norm": 0.47261955625623386, + "learning_rate": 1.1016607781414514e-05, + "loss": 0.3387, + "step": 9497 + }, + { + "epoch": 1.8724369085173502, + "grad_norm": 0.4507061778033932, + "learning_rate": 1.101506577372455e-05, + "loss": 0.3378, + "step": 9498 + }, + { + "epoch": 1.872634069400631, + "grad_norm": 0.4914038805520105, + "learning_rate": 1.1013523741646817e-05, + "loss": 0.3635, + "step": 9499 + }, + { + "epoch": 1.8728312302839116, + "grad_norm": 0.46523334735695693, + "learning_rate": 1.1011981685218355e-05, + "loss": 0.3163, + "step": 9500 + }, + { + "epoch": 1.8730283911671926, + "grad_norm": 0.461046105271434, + "learning_rate": 1.1010439604476222e-05, + "loss": 0.3027, + "step": 9501 + }, + { + "epoch": 1.873225552050473, + "grad_norm": 0.4917301931134718, + "learning_rate": 1.1008897499457466e-05, + "loss": 0.3536, + "step": 9502 + }, + { + "epoch": 1.873422712933754, + "grad_norm": 0.4931865848981832, + "learning_rate": 1.100735537019913e-05, + "loss": 0.338, + "step": 9503 + }, + { + "epoch": 1.8736198738170347, + "grad_norm": 0.465842711087265, + "learning_rate": 1.1005813216738273e-05, + "loss": 0.3327, + "step": 9504 + }, + { + "epoch": 1.8738170347003154, + "grad_norm": 0.46721071233696176, + "learning_rate": 1.1004271039111943e-05, + "loss": 0.3349, + "step": 9505 + }, + { + "epoch": 1.8740141955835963, + "grad_norm": 0.47405778733750703, + "learning_rate": 1.1002728837357192e-05, + "loss": 0.3388, + "step": 9506 + }, + { + "epoch": 1.874211356466877, + "grad_norm": 0.4734967427166212, + "learning_rate": 1.1001186611511071e-05, + "loss": 0.3385, + "step": 9507 + }, + { + "epoch": 1.8744085173501577, + "grad_norm": 0.5255895043873378, + "learning_rate": 1.099964436161064e-05, + "loss": 0.3372, + "step": 9508 + }, + { + "epoch": 1.8746056782334386, + "grad_norm": 0.4787785522800639, + "learning_rate": 1.0998102087692946e-05, + "loss": 0.3355, + "step": 9509 + }, + { + "epoch": 1.874802839116719, + "grad_norm": 0.5023739258469361, + "learning_rate": 1.0996559789795045e-05, + "loss": 0.3478, + "step": 9510 + }, + { + "epoch": 1.875, + "grad_norm": 0.46464061305129783, + "learning_rate": 1.0995017467953994e-05, + "loss": 0.3331, + "step": 9511 + }, + { + "epoch": 1.875197160883281, + "grad_norm": 0.4995781916354229, + "learning_rate": 1.0993475122206846e-05, + "loss": 0.3407, + "step": 9512 + }, + { + "epoch": 1.8753943217665614, + "grad_norm": 0.4733291385390516, + "learning_rate": 1.0991932752590657e-05, + "loss": 0.3197, + "step": 9513 + }, + { + "epoch": 1.8755914826498423, + "grad_norm": 0.5105493418105221, + "learning_rate": 1.0990390359142488e-05, + "loss": 0.3507, + "step": 9514 + }, + { + "epoch": 1.875788643533123, + "grad_norm": 0.4942055666455541, + "learning_rate": 1.098884794189939e-05, + "loss": 0.3403, + "step": 9515 + }, + { + "epoch": 1.8759858044164037, + "grad_norm": 0.4900283583301513, + "learning_rate": 1.0987305500898427e-05, + "loss": 0.3239, + "step": 9516 + }, + { + "epoch": 1.8761829652996846, + "grad_norm": 0.4555388435909809, + "learning_rate": 1.0985763036176648e-05, + "loss": 0.3254, + "step": 9517 + }, + { + "epoch": 1.8763801261829653, + "grad_norm": 0.521257593570195, + "learning_rate": 1.0984220547771127e-05, + "loss": 0.3489, + "step": 9518 + }, + { + "epoch": 1.876577287066246, + "grad_norm": 0.45372277802554656, + "learning_rate": 1.098267803571891e-05, + "loss": 0.2968, + "step": 9519 + }, + { + "epoch": 1.876774447949527, + "grad_norm": 0.4731885774415591, + "learning_rate": 1.098113550005706e-05, + "loss": 0.348, + "step": 9520 + }, + { + "epoch": 1.8769716088328074, + "grad_norm": 0.4704727543410908, + "learning_rate": 1.0979592940822643e-05, + "loss": 0.3237, + "step": 9521 + }, + { + "epoch": 1.8771687697160884, + "grad_norm": 0.4716211417273583, + "learning_rate": 1.0978050358052715e-05, + "loss": 0.3332, + "step": 9522 + }, + { + "epoch": 1.877365930599369, + "grad_norm": 0.4656934460308149, + "learning_rate": 1.0976507751784343e-05, + "loss": 0.3253, + "step": 9523 + }, + { + "epoch": 1.8775630914826498, + "grad_norm": 0.45429803312317435, + "learning_rate": 1.097496512205458e-05, + "loss": 0.3213, + "step": 9524 + }, + { + "epoch": 1.8777602523659307, + "grad_norm": 0.4941603848877281, + "learning_rate": 1.0973422468900498e-05, + "loss": 0.3314, + "step": 9525 + }, + { + "epoch": 1.8779574132492114, + "grad_norm": 0.4921863038345052, + "learning_rate": 1.0971879792359154e-05, + "loss": 0.3161, + "step": 9526 + }, + { + "epoch": 1.878154574132492, + "grad_norm": 0.46662564704980725, + "learning_rate": 1.097033709246762e-05, + "loss": 0.3451, + "step": 9527 + }, + { + "epoch": 1.878351735015773, + "grad_norm": 0.4434045432752366, + "learning_rate": 1.0968794369262954e-05, + "loss": 0.3125, + "step": 9528 + }, + { + "epoch": 1.8785488958990535, + "grad_norm": 0.4809233182201705, + "learning_rate": 1.0967251622782223e-05, + "loss": 0.3559, + "step": 9529 + }, + { + "epoch": 1.8787460567823344, + "grad_norm": 0.4744798298419412, + "learning_rate": 1.0965708853062493e-05, + "loss": 0.3364, + "step": 9530 + }, + { + "epoch": 1.8789432176656151, + "grad_norm": 0.47368018073228213, + "learning_rate": 1.0964166060140831e-05, + "loss": 0.3171, + "step": 9531 + }, + { + "epoch": 1.8791403785488958, + "grad_norm": 0.5127652861424508, + "learning_rate": 1.0962623244054302e-05, + "loss": 0.3484, + "step": 9532 + }, + { + "epoch": 1.8793375394321767, + "grad_norm": 0.4858636735208188, + "learning_rate": 1.0961080404839974e-05, + "loss": 0.3506, + "step": 9533 + }, + { + "epoch": 1.8795347003154574, + "grad_norm": 0.4957252696098338, + "learning_rate": 1.0959537542534916e-05, + "loss": 0.3296, + "step": 9534 + }, + { + "epoch": 1.8797318611987381, + "grad_norm": 0.5147145847410192, + "learning_rate": 1.0957994657176197e-05, + "loss": 0.3467, + "step": 9535 + }, + { + "epoch": 1.879929022082019, + "grad_norm": 0.4335412701687705, + "learning_rate": 1.0956451748800883e-05, + "loss": 0.3149, + "step": 9536 + }, + { + "epoch": 1.8801261829652995, + "grad_norm": 0.46479740344238335, + "learning_rate": 1.0954908817446047e-05, + "loss": 0.3318, + "step": 9537 + }, + { + "epoch": 1.8803233438485805, + "grad_norm": 0.4683628747889899, + "learning_rate": 1.0953365863148757e-05, + "loss": 0.3287, + "step": 9538 + }, + { + "epoch": 1.8805205047318612, + "grad_norm": 0.472736232015758, + "learning_rate": 1.0951822885946084e-05, + "loss": 0.3136, + "step": 9539 + }, + { + "epoch": 1.8807176656151419, + "grad_norm": 0.4788330670775191, + "learning_rate": 1.0950279885875098e-05, + "loss": 0.3368, + "step": 9540 + }, + { + "epoch": 1.8809148264984228, + "grad_norm": 0.4760770871051563, + "learning_rate": 1.0948736862972873e-05, + "loss": 0.3262, + "step": 9541 + }, + { + "epoch": 1.8811119873817035, + "grad_norm": 0.456235107469069, + "learning_rate": 1.0947193817276485e-05, + "loss": 0.3283, + "step": 9542 + }, + { + "epoch": 1.8813091482649842, + "grad_norm": 0.4949947731416457, + "learning_rate": 1.0945650748822998e-05, + "loss": 0.3504, + "step": 9543 + }, + { + "epoch": 1.881506309148265, + "grad_norm": 0.5538356410729359, + "learning_rate": 1.0944107657649494e-05, + "loss": 0.319, + "step": 9544 + }, + { + "epoch": 1.8817034700315456, + "grad_norm": 0.5051031777149498, + "learning_rate": 1.0942564543793039e-05, + "loss": 0.3364, + "step": 9545 + }, + { + "epoch": 1.8819006309148265, + "grad_norm": 0.4953979151686171, + "learning_rate": 1.0941021407290717e-05, + "loss": 0.3856, + "step": 9546 + }, + { + "epoch": 1.8820977917981072, + "grad_norm": 0.4632524126131711, + "learning_rate": 1.0939478248179594e-05, + "loss": 0.3225, + "step": 9547 + }, + { + "epoch": 1.882294952681388, + "grad_norm": 0.46099411952055647, + "learning_rate": 1.093793506649675e-05, + "loss": 0.3221, + "step": 9548 + }, + { + "epoch": 1.8824921135646688, + "grad_norm": 0.48987986252693555, + "learning_rate": 1.093639186227926e-05, + "loss": 0.3414, + "step": 9549 + }, + { + "epoch": 1.8826892744479495, + "grad_norm": 0.5892412407244659, + "learning_rate": 1.0934848635564203e-05, + "loss": 0.3317, + "step": 9550 + }, + { + "epoch": 1.8828864353312302, + "grad_norm": 0.464294311971776, + "learning_rate": 1.0933305386388656e-05, + "loss": 0.3274, + "step": 9551 + }, + { + "epoch": 1.8830835962145112, + "grad_norm": 0.48576088684391133, + "learning_rate": 1.0931762114789695e-05, + "loss": 0.3357, + "step": 9552 + }, + { + "epoch": 1.8832807570977916, + "grad_norm": 0.44811752905575575, + "learning_rate": 1.0930218820804398e-05, + "loss": 0.3113, + "step": 9553 + }, + { + "epoch": 1.8834779179810726, + "grad_norm": 0.4927473006999295, + "learning_rate": 1.0928675504469843e-05, + "loss": 0.3513, + "step": 9554 + }, + { + "epoch": 1.8836750788643533, + "grad_norm": 0.45362869939341455, + "learning_rate": 1.0927132165823113e-05, + "loss": 0.3272, + "step": 9555 + }, + { + "epoch": 1.883872239747634, + "grad_norm": 0.45381446754200727, + "learning_rate": 1.0925588804901286e-05, + "loss": 0.3233, + "step": 9556 + }, + { + "epoch": 1.8840694006309149, + "grad_norm": 0.44466594957868893, + "learning_rate": 1.0924045421741442e-05, + "loss": 0.3331, + "step": 9557 + }, + { + "epoch": 1.8842665615141956, + "grad_norm": 0.5061065779984183, + "learning_rate": 1.0922502016380663e-05, + "loss": 0.3372, + "step": 9558 + }, + { + "epoch": 1.8844637223974763, + "grad_norm": 0.5421030190329682, + "learning_rate": 1.092095858885603e-05, + "loss": 0.3578, + "step": 9559 + }, + { + "epoch": 1.8846608832807572, + "grad_norm": 0.5002948397548355, + "learning_rate": 1.0919415139204625e-05, + "loss": 0.3225, + "step": 9560 + }, + { + "epoch": 1.884858044164038, + "grad_norm": 0.49365892934666894, + "learning_rate": 1.0917871667463533e-05, + "loss": 0.3349, + "step": 9561 + }, + { + "epoch": 1.8850552050473186, + "grad_norm": 0.4803469375107623, + "learning_rate": 1.091632817366983e-05, + "loss": 0.3515, + "step": 9562 + }, + { + "epoch": 1.8852523659305995, + "grad_norm": 0.49730327018911846, + "learning_rate": 1.091478465786061e-05, + "loss": 0.3453, + "step": 9563 + }, + { + "epoch": 1.88544952681388, + "grad_norm": 0.5162410884212086, + "learning_rate": 1.0913241120072947e-05, + "loss": 0.34, + "step": 9564 + }, + { + "epoch": 1.885646687697161, + "grad_norm": 0.4919385394499981, + "learning_rate": 1.0911697560343937e-05, + "loss": 0.347, + "step": 9565 + }, + { + "epoch": 1.8858438485804416, + "grad_norm": 0.48694966659079764, + "learning_rate": 1.0910153978710654e-05, + "loss": 0.3522, + "step": 9566 + }, + { + "epoch": 1.8860410094637223, + "grad_norm": 0.488293991846791, + "learning_rate": 1.0908610375210193e-05, + "loss": 0.3287, + "step": 9567 + }, + { + "epoch": 1.8862381703470033, + "grad_norm": 0.48621217336734124, + "learning_rate": 1.0907066749879632e-05, + "loss": 0.3401, + "step": 9568 + }, + { + "epoch": 1.886435331230284, + "grad_norm": 0.47778447085942694, + "learning_rate": 1.0905523102756061e-05, + "loss": 0.3199, + "step": 9569 + }, + { + "epoch": 1.8866324921135647, + "grad_norm": 0.47090656319170815, + "learning_rate": 1.0903979433876573e-05, + "loss": 0.3117, + "step": 9570 + }, + { + "epoch": 1.8868296529968456, + "grad_norm": 0.4595803156099776, + "learning_rate": 1.0902435743278248e-05, + "loss": 0.3295, + "step": 9571 + }, + { + "epoch": 1.887026813880126, + "grad_norm": 0.441046356319071, + "learning_rate": 1.0900892030998181e-05, + "loss": 0.2994, + "step": 9572 + }, + { + "epoch": 1.887223974763407, + "grad_norm": 0.46255381742264307, + "learning_rate": 1.089934829707345e-05, + "loss": 0.3106, + "step": 9573 + }, + { + "epoch": 1.8874211356466877, + "grad_norm": 0.48070659599098814, + "learning_rate": 1.0897804541541159e-05, + "loss": 0.3268, + "step": 9574 + }, + { + "epoch": 1.8876182965299684, + "grad_norm": 0.47064959203140777, + "learning_rate": 1.0896260764438387e-05, + "loss": 0.3443, + "step": 9575 + }, + { + "epoch": 1.8878154574132493, + "grad_norm": 0.48437387551779304, + "learning_rate": 1.089471696580223e-05, + "loss": 0.3274, + "step": 9576 + }, + { + "epoch": 1.88801261829653, + "grad_norm": 0.4936834752237707, + "learning_rate": 1.0893173145669777e-05, + "loss": 0.354, + "step": 9577 + }, + { + "epoch": 1.8882097791798107, + "grad_norm": 0.47759344988792984, + "learning_rate": 1.089162930407812e-05, + "loss": 0.3548, + "step": 9578 + }, + { + "epoch": 1.8884069400630916, + "grad_norm": 0.4673787356484052, + "learning_rate": 1.089008544106435e-05, + "loss": 0.3184, + "step": 9579 + }, + { + "epoch": 1.888604100946372, + "grad_norm": 5.043034026594802, + "learning_rate": 1.0888541556665562e-05, + "loss": 0.3314, + "step": 9580 + }, + { + "epoch": 1.888801261829653, + "grad_norm": 0.49514263389138563, + "learning_rate": 1.0886997650918848e-05, + "loss": 0.3212, + "step": 9581 + }, + { + "epoch": 1.8889984227129337, + "grad_norm": 0.45933092936924486, + "learning_rate": 1.08854537238613e-05, + "loss": 0.3102, + "step": 9582 + }, + { + "epoch": 1.8891955835962144, + "grad_norm": 0.4590562261492261, + "learning_rate": 1.0883909775530013e-05, + "loss": 0.3114, + "step": 9583 + }, + { + "epoch": 1.8893927444794953, + "grad_norm": 0.4834863213201024, + "learning_rate": 1.0882365805962083e-05, + "loss": 0.3437, + "step": 9584 + }, + { + "epoch": 1.889589905362776, + "grad_norm": 0.4659483454174267, + "learning_rate": 1.0880821815194602e-05, + "loss": 0.316, + "step": 9585 + }, + { + "epoch": 1.8897870662460567, + "grad_norm": 0.4768864743117075, + "learning_rate": 1.087927780326467e-05, + "loss": 0.3356, + "step": 9586 + }, + { + "epoch": 1.8899842271293377, + "grad_norm": 0.459781264277085, + "learning_rate": 1.087773377020938e-05, + "loss": 0.3245, + "step": 9587 + }, + { + "epoch": 1.8901813880126181, + "grad_norm": 0.4583408374138778, + "learning_rate": 1.0876189716065825e-05, + "loss": 0.323, + "step": 9588 + }, + { + "epoch": 1.890378548895899, + "grad_norm": 0.4504538132773227, + "learning_rate": 1.0874645640871114e-05, + "loss": 0.3059, + "step": 9589 + }, + { + "epoch": 1.8905757097791798, + "grad_norm": 0.47695654088042766, + "learning_rate": 1.087310154466233e-05, + "loss": 0.3227, + "step": 9590 + }, + { + "epoch": 1.8907728706624605, + "grad_norm": 0.4482267647409884, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.3133, + "step": 9591 + }, + { + "epoch": 1.8909700315457414, + "grad_norm": 0.46554460153463734, + "learning_rate": 1.0870013289350964e-05, + "loss": 0.3353, + "step": 9592 + }, + { + "epoch": 1.891167192429022, + "grad_norm": 0.4672677533066259, + "learning_rate": 1.0868469130322581e-05, + "loss": 0.314, + "step": 9593 + }, + { + "epoch": 1.8913643533123028, + "grad_norm": 0.6158071510944071, + "learning_rate": 1.086692495042852e-05, + "loss": 0.3526, + "step": 9594 + }, + { + "epoch": 1.8915615141955837, + "grad_norm": 0.43646887186476924, + "learning_rate": 1.0865380749705892e-05, + "loss": 0.3137, + "step": 9595 + }, + { + "epoch": 1.8917586750788642, + "grad_norm": 0.5134052948284018, + "learning_rate": 1.0863836528191795e-05, + "loss": 0.3521, + "step": 9596 + }, + { + "epoch": 1.8919558359621451, + "grad_norm": 0.4882031174942994, + "learning_rate": 1.0862292285923331e-05, + "loss": 0.3404, + "step": 9597 + }, + { + "epoch": 1.8921529968454258, + "grad_norm": 0.44241922335608497, + "learning_rate": 1.08607480229376e-05, + "loss": 0.3076, + "step": 9598 + }, + { + "epoch": 1.8923501577287065, + "grad_norm": 0.48941048003891413, + "learning_rate": 1.0859203739271702e-05, + "loss": 0.3379, + "step": 9599 + }, + { + "epoch": 1.8925473186119874, + "grad_norm": 0.5021082128605233, + "learning_rate": 1.0857659434962744e-05, + "loss": 0.3502, + "step": 9600 + }, + { + "epoch": 1.8927444794952681, + "grad_norm": 0.49462878367308655, + "learning_rate": 1.0856115110047829e-05, + "loss": 0.344, + "step": 9601 + }, + { + "epoch": 1.8929416403785488, + "grad_norm": 0.5710152300950999, + "learning_rate": 1.0854570764564057e-05, + "loss": 0.3439, + "step": 9602 + }, + { + "epoch": 1.8931388012618298, + "grad_norm": 0.4611516368668063, + "learning_rate": 1.0853026398548535e-05, + "loss": 0.3331, + "step": 9603 + }, + { + "epoch": 1.8933359621451105, + "grad_norm": 0.45617109895197894, + "learning_rate": 1.0851482012038366e-05, + "loss": 0.299, + "step": 9604 + }, + { + "epoch": 1.8935331230283912, + "grad_norm": 0.4710318513048329, + "learning_rate": 1.0849937605070658e-05, + "loss": 0.3069, + "step": 9605 + }, + { + "epoch": 1.893730283911672, + "grad_norm": 0.5515208008448891, + "learning_rate": 1.0848393177682513e-05, + "loss": 0.3783, + "step": 9606 + }, + { + "epoch": 1.8939274447949526, + "grad_norm": 0.48486227843023466, + "learning_rate": 1.0846848729911037e-05, + "loss": 0.337, + "step": 9607 + }, + { + "epoch": 1.8941246056782335, + "grad_norm": 0.4820429170759435, + "learning_rate": 1.084530426179334e-05, + "loss": 0.3312, + "step": 9608 + }, + { + "epoch": 1.8943217665615142, + "grad_norm": 0.5600154595642701, + "learning_rate": 1.0843759773366526e-05, + "loss": 0.3339, + "step": 9609 + }, + { + "epoch": 1.8945189274447949, + "grad_norm": 0.48414804671739764, + "learning_rate": 1.0842215264667708e-05, + "loss": 0.3515, + "step": 9610 + }, + { + "epoch": 1.8947160883280758, + "grad_norm": 0.46300778683793087, + "learning_rate": 1.0840670735733984e-05, + "loss": 0.3089, + "step": 9611 + }, + { + "epoch": 1.8949132492113565, + "grad_norm": 0.4933738100516299, + "learning_rate": 1.0839126186602475e-05, + "loss": 0.3337, + "step": 9612 + }, + { + "epoch": 1.8951104100946372, + "grad_norm": 0.4854372334064325, + "learning_rate": 1.0837581617310279e-05, + "loss": 0.3506, + "step": 9613 + }, + { + "epoch": 1.8953075709779181, + "grad_norm": 0.458378537894229, + "learning_rate": 1.0836037027894515e-05, + "loss": 0.3205, + "step": 9614 + }, + { + "epoch": 1.8955047318611986, + "grad_norm": 0.4564763917102619, + "learning_rate": 1.0834492418392281e-05, + "loss": 0.3117, + "step": 9615 + }, + { + "epoch": 1.8957018927444795, + "grad_norm": 0.508560600760082, + "learning_rate": 1.0832947788840699e-05, + "loss": 0.3454, + "step": 9616 + }, + { + "epoch": 1.8958990536277602, + "grad_norm": 0.4692816776007284, + "learning_rate": 1.0831403139276875e-05, + "loss": 0.3364, + "step": 9617 + }, + { + "epoch": 1.896096214511041, + "grad_norm": 0.4606352560271506, + "learning_rate": 1.0829858469737921e-05, + "loss": 0.3357, + "step": 9618 + }, + { + "epoch": 1.8962933753943219, + "grad_norm": 0.4547120875924028, + "learning_rate": 1.082831378026095e-05, + "loss": 0.3286, + "step": 9619 + }, + { + "epoch": 1.8964905362776026, + "grad_norm": 0.5162498136227341, + "learning_rate": 1.0826769070883073e-05, + "loss": 0.3436, + "step": 9620 + }, + { + "epoch": 1.8966876971608833, + "grad_norm": 0.4804924944515462, + "learning_rate": 1.0825224341641403e-05, + "loss": 0.3225, + "step": 9621 + }, + { + "epoch": 1.8968848580441642, + "grad_norm": 0.4755043731674942, + "learning_rate": 1.0823679592573052e-05, + "loss": 0.3484, + "step": 9622 + }, + { + "epoch": 1.8970820189274447, + "grad_norm": 0.47229749756388667, + "learning_rate": 1.0822134823715139e-05, + "loss": 0.3348, + "step": 9623 + }, + { + "epoch": 1.8972791798107256, + "grad_norm": 0.443728175982202, + "learning_rate": 1.0820590035104773e-05, + "loss": 0.3156, + "step": 9624 + }, + { + "epoch": 1.8974763406940063, + "grad_norm": 0.45201673400276804, + "learning_rate": 1.0819045226779071e-05, + "loss": 0.3132, + "step": 9625 + }, + { + "epoch": 1.897673501577287, + "grad_norm": 0.46420972816236206, + "learning_rate": 1.0817500398775147e-05, + "loss": 0.3285, + "step": 9626 + }, + { + "epoch": 1.897870662460568, + "grad_norm": 0.4878624846038825, + "learning_rate": 1.0815955551130117e-05, + "loss": 0.3621, + "step": 9627 + }, + { + "epoch": 1.8980678233438486, + "grad_norm": 0.46796009590591403, + "learning_rate": 1.0814410683881098e-05, + "loss": 0.3341, + "step": 9628 + }, + { + "epoch": 1.8982649842271293, + "grad_norm": 0.46658974852311247, + "learning_rate": 1.0812865797065209e-05, + "loss": 0.3476, + "step": 9629 + }, + { + "epoch": 1.8984621451104102, + "grad_norm": 0.48330294725808115, + "learning_rate": 1.0811320890719558e-05, + "loss": 0.3436, + "step": 9630 + }, + { + "epoch": 1.8986593059936907, + "grad_norm": 0.47789134303226777, + "learning_rate": 1.0809775964881278e-05, + "loss": 0.3399, + "step": 9631 + }, + { + "epoch": 1.8988564668769716, + "grad_norm": 0.462828839034776, + "learning_rate": 1.0808231019587472e-05, + "loss": 0.3406, + "step": 9632 + }, + { + "epoch": 1.8990536277602523, + "grad_norm": 0.4498197895663266, + "learning_rate": 1.0806686054875268e-05, + "loss": 0.3139, + "step": 9633 + }, + { + "epoch": 1.899250788643533, + "grad_norm": 0.5032026325434863, + "learning_rate": 1.080514107078178e-05, + "loss": 0.333, + "step": 9634 + }, + { + "epoch": 1.899447949526814, + "grad_norm": 0.4669772787243432, + "learning_rate": 1.0803596067344134e-05, + "loss": 0.319, + "step": 9635 + }, + { + "epoch": 1.8996451104100947, + "grad_norm": 0.4506661711957322, + "learning_rate": 1.0802051044599441e-05, + "loss": 0.3223, + "step": 9636 + }, + { + "epoch": 1.8998422712933754, + "grad_norm": 0.47399636404681555, + "learning_rate": 1.0800506002584825e-05, + "loss": 0.3244, + "step": 9637 + }, + { + "epoch": 1.9000394321766563, + "grad_norm": 0.4508527663354465, + "learning_rate": 1.0798960941337411e-05, + "loss": 0.3369, + "step": 9638 + }, + { + "epoch": 1.9002365930599368, + "grad_norm": 0.5017850647875128, + "learning_rate": 1.0797415860894313e-05, + "loss": 0.3421, + "step": 9639 + }, + { + "epoch": 1.9004337539432177, + "grad_norm": 0.4883584730902664, + "learning_rate": 1.0795870761292661e-05, + "loss": 0.3313, + "step": 9640 + }, + { + "epoch": 1.9006309148264984, + "grad_norm": 0.4761447190397066, + "learning_rate": 1.079432564256957e-05, + "loss": 0.3124, + "step": 9641 + }, + { + "epoch": 1.900828075709779, + "grad_norm": 0.44783562310151726, + "learning_rate": 1.0792780504762168e-05, + "loss": 0.2974, + "step": 9642 + }, + { + "epoch": 1.90102523659306, + "grad_norm": 0.47770652935997016, + "learning_rate": 1.0791235347907573e-05, + "loss": 0.3256, + "step": 9643 + }, + { + "epoch": 1.9012223974763407, + "grad_norm": 0.5877815822517798, + "learning_rate": 1.0789690172042912e-05, + "loss": 0.2918, + "step": 9644 + }, + { + "epoch": 1.9014195583596214, + "grad_norm": 0.4726070261592881, + "learning_rate": 1.078814497720531e-05, + "loss": 0.321, + "step": 9645 + }, + { + "epoch": 1.9016167192429023, + "grad_norm": 0.4907105853731056, + "learning_rate": 1.0786599763431891e-05, + "loss": 0.3557, + "step": 9646 + }, + { + "epoch": 1.901813880126183, + "grad_norm": 0.4597358486182623, + "learning_rate": 1.078505453075978e-05, + "loss": 0.3013, + "step": 9647 + }, + { + "epoch": 1.9020110410094637, + "grad_norm": 0.5048614261322799, + "learning_rate": 1.0783509279226099e-05, + "loss": 0.3458, + "step": 9648 + }, + { + "epoch": 1.9022082018927446, + "grad_norm": 0.4938247930425652, + "learning_rate": 1.0781964008867979e-05, + "loss": 0.3517, + "step": 9649 + }, + { + "epoch": 1.9024053627760251, + "grad_norm": 0.4798368494115209, + "learning_rate": 1.0780418719722544e-05, + "loss": 0.3387, + "step": 9650 + }, + { + "epoch": 1.902602523659306, + "grad_norm": 0.4594536938455119, + "learning_rate": 1.0778873411826918e-05, + "loss": 0.3178, + "step": 9651 + }, + { + "epoch": 1.9027996845425867, + "grad_norm": 0.4600549598776842, + "learning_rate": 1.0777328085218232e-05, + "loss": 0.3114, + "step": 9652 + }, + { + "epoch": 1.9029968454258674, + "grad_norm": 0.6555278594200854, + "learning_rate": 1.0775782739933614e-05, + "loss": 0.3139, + "step": 9653 + }, + { + "epoch": 1.9031940063091484, + "grad_norm": 0.47716562507209187, + "learning_rate": 1.077423737601019e-05, + "loss": 0.3301, + "step": 9654 + }, + { + "epoch": 1.903391167192429, + "grad_norm": 0.47582609005457543, + "learning_rate": 1.0772691993485091e-05, + "loss": 0.3196, + "step": 9655 + }, + { + "epoch": 1.9035883280757098, + "grad_norm": 0.4654364416610627, + "learning_rate": 1.0771146592395443e-05, + "loss": 0.3211, + "step": 9656 + }, + { + "epoch": 1.9037854889589907, + "grad_norm": 0.46468189188527265, + "learning_rate": 1.0769601172778379e-05, + "loss": 0.3355, + "step": 9657 + }, + { + "epoch": 1.9039826498422712, + "grad_norm": 0.45410022068436584, + "learning_rate": 1.0768055734671023e-05, + "loss": 0.3109, + "step": 9658 + }, + { + "epoch": 1.904179810725552, + "grad_norm": 0.48221225731467116, + "learning_rate": 1.0766510278110514e-05, + "loss": 0.3421, + "step": 9659 + }, + { + "epoch": 1.9043769716088328, + "grad_norm": 0.47483955562907587, + "learning_rate": 1.0764964803133975e-05, + "loss": 0.324, + "step": 9660 + }, + { + "epoch": 1.9045741324921135, + "grad_norm": 0.5016946982181942, + "learning_rate": 1.0763419309778544e-05, + "loss": 0.3586, + "step": 9661 + }, + { + "epoch": 1.9047712933753944, + "grad_norm": 0.4816207167555914, + "learning_rate": 1.0761873798081343e-05, + "loss": 0.3425, + "step": 9662 + }, + { + "epoch": 1.9049684542586751, + "grad_norm": 0.46646055121198043, + "learning_rate": 1.0760328268079517e-05, + "loss": 0.3387, + "step": 9663 + }, + { + "epoch": 1.9051656151419558, + "grad_norm": 0.4558246279347243, + "learning_rate": 1.075878271981019e-05, + "loss": 0.3035, + "step": 9664 + }, + { + "epoch": 1.9053627760252367, + "grad_norm": 0.49067026661890945, + "learning_rate": 1.0757237153310496e-05, + "loss": 0.3373, + "step": 9665 + }, + { + "epoch": 1.9055599369085172, + "grad_norm": 0.4654982629990961, + "learning_rate": 1.0755691568617573e-05, + "loss": 0.3214, + "step": 9666 + }, + { + "epoch": 1.9057570977917981, + "grad_norm": 0.48044344209508094, + "learning_rate": 1.0754145965768548e-05, + "loss": 0.3276, + "step": 9667 + }, + { + "epoch": 1.9059542586750788, + "grad_norm": 0.4639232573098766, + "learning_rate": 1.075260034480056e-05, + "loss": 0.3367, + "step": 9668 + }, + { + "epoch": 1.9061514195583595, + "grad_norm": 0.4610967414683846, + "learning_rate": 1.0751054705750744e-05, + "loss": 0.3195, + "step": 9669 + }, + { + "epoch": 1.9063485804416405, + "grad_norm": 0.4456558119416812, + "learning_rate": 1.0749509048656231e-05, + "loss": 0.3163, + "step": 9670 + }, + { + "epoch": 1.9065457413249212, + "grad_norm": 0.4679939791043013, + "learning_rate": 1.074796337355416e-05, + "loss": 0.3354, + "step": 9671 + }, + { + "epoch": 1.9067429022082019, + "grad_norm": 0.4886824586751215, + "learning_rate": 1.074641768048167e-05, + "loss": 0.3325, + "step": 9672 + }, + { + "epoch": 1.9069400630914828, + "grad_norm": 0.6283610239404007, + "learning_rate": 1.074487196947589e-05, + "loss": 0.3256, + "step": 9673 + }, + { + "epoch": 1.9071372239747633, + "grad_norm": 0.47479516642440406, + "learning_rate": 1.0743326240573964e-05, + "loss": 0.3345, + "step": 9674 + }, + { + "epoch": 1.9073343848580442, + "grad_norm": 0.48102011928532773, + "learning_rate": 1.0741780493813025e-05, + "loss": 0.3687, + "step": 9675 + }, + { + "epoch": 1.9075315457413249, + "grad_norm": 0.44787874825005314, + "learning_rate": 1.0740234729230213e-05, + "loss": 0.3302, + "step": 9676 + }, + { + "epoch": 1.9077287066246056, + "grad_norm": 0.4786425290897722, + "learning_rate": 1.0738688946862661e-05, + "loss": 0.3335, + "step": 9677 + }, + { + "epoch": 1.9079258675078865, + "grad_norm": 0.47312252556976, + "learning_rate": 1.073714314674752e-05, + "loss": 0.3166, + "step": 9678 + }, + { + "epoch": 1.9081230283911672, + "grad_norm": 0.4382653645073557, + "learning_rate": 1.0735597328921914e-05, + "loss": 0.3034, + "step": 9679 + }, + { + "epoch": 1.908320189274448, + "grad_norm": 0.44778499820386447, + "learning_rate": 1.0734051493422996e-05, + "loss": 0.3082, + "step": 9680 + }, + { + "epoch": 1.9085173501577288, + "grad_norm": 0.46994341815466767, + "learning_rate": 1.0732505640287895e-05, + "loss": 0.3271, + "step": 9681 + }, + { + "epoch": 1.9087145110410093, + "grad_norm": 0.5035019762587363, + "learning_rate": 1.0730959769553762e-05, + "loss": 0.3197, + "step": 9682 + }, + { + "epoch": 1.9089116719242902, + "grad_norm": 0.4658601369964348, + "learning_rate": 1.0729413881257725e-05, + "loss": 0.3325, + "step": 9683 + }, + { + "epoch": 1.909108832807571, + "grad_norm": 0.48330429263892694, + "learning_rate": 1.0727867975436936e-05, + "loss": 0.322, + "step": 9684 + }, + { + "epoch": 1.9093059936908516, + "grad_norm": 0.489211526633787, + "learning_rate": 1.072632205212853e-05, + "loss": 0.3395, + "step": 9685 + }, + { + "epoch": 1.9095031545741326, + "grad_norm": 0.49250668046451535, + "learning_rate": 1.0724776111369654e-05, + "loss": 0.3657, + "step": 9686 + }, + { + "epoch": 1.9097003154574133, + "grad_norm": 0.4740052263296321, + "learning_rate": 1.072323015319745e-05, + "loss": 0.3284, + "step": 9687 + }, + { + "epoch": 1.909897476340694, + "grad_norm": 0.46842386445090717, + "learning_rate": 1.0721684177649056e-05, + "loss": 0.3339, + "step": 9688 + }, + { + "epoch": 1.9100946372239749, + "grad_norm": 0.46625362508947077, + "learning_rate": 1.0720138184761621e-05, + "loss": 0.342, + "step": 9689 + }, + { + "epoch": 1.9102917981072554, + "grad_norm": 0.4848740749436997, + "learning_rate": 1.0718592174572285e-05, + "loss": 0.3375, + "step": 9690 + }, + { + "epoch": 1.9104889589905363, + "grad_norm": 0.48490516898151786, + "learning_rate": 1.0717046147118193e-05, + "loss": 0.3189, + "step": 9691 + }, + { + "epoch": 1.910686119873817, + "grad_norm": 0.46286777382849786, + "learning_rate": 1.071550010243649e-05, + "loss": 0.3207, + "step": 9692 + }, + { + "epoch": 1.9108832807570977, + "grad_norm": 0.4509100884863312, + "learning_rate": 1.071395404056432e-05, + "loss": 0.3283, + "step": 9693 + }, + { + "epoch": 1.9110804416403786, + "grad_norm": 0.4959185650773511, + "learning_rate": 1.071240796153883e-05, + "loss": 0.3451, + "step": 9694 + }, + { + "epoch": 1.9112776025236593, + "grad_norm": 0.4708104478931626, + "learning_rate": 1.0710861865397166e-05, + "loss": 0.348, + "step": 9695 + }, + { + "epoch": 1.91147476340694, + "grad_norm": 0.48697087265632166, + "learning_rate": 1.0709315752176472e-05, + "loss": 0.3357, + "step": 9696 + }, + { + "epoch": 1.911671924290221, + "grad_norm": 0.49229875756280933, + "learning_rate": 1.0707769621913897e-05, + "loss": 0.3522, + "step": 9697 + }, + { + "epoch": 1.9118690851735016, + "grad_norm": 0.4787942182967307, + "learning_rate": 1.0706223474646581e-05, + "loss": 0.3469, + "step": 9698 + }, + { + "epoch": 1.9120662460567823, + "grad_norm": 0.46221031697674925, + "learning_rate": 1.0704677310411686e-05, + "loss": 0.3215, + "step": 9699 + }, + { + "epoch": 1.9122634069400632, + "grad_norm": 0.49657974351122547, + "learning_rate": 1.0703131129246347e-05, + "loss": 0.3449, + "step": 9700 + }, + { + "epoch": 1.9124605678233437, + "grad_norm": 0.4700103591052916, + "learning_rate": 1.070158493118772e-05, + "loss": 0.3143, + "step": 9701 + }, + { + "epoch": 1.9126577287066246, + "grad_norm": 0.5045654021745932, + "learning_rate": 1.0700038716272944e-05, + "loss": 0.3246, + "step": 9702 + }, + { + "epoch": 1.9128548895899053, + "grad_norm": 0.4708499061368683, + "learning_rate": 1.0698492484539178e-05, + "loss": 0.3436, + "step": 9703 + }, + { + "epoch": 1.913052050473186, + "grad_norm": 0.4808287774080232, + "learning_rate": 1.0696946236023566e-05, + "loss": 0.3474, + "step": 9704 + }, + { + "epoch": 1.913249211356467, + "grad_norm": 0.4919946738804181, + "learning_rate": 1.0695399970763258e-05, + "loss": 0.331, + "step": 9705 + }, + { + "epoch": 1.9134463722397477, + "grad_norm": 0.48323751424237105, + "learning_rate": 1.069385368879541e-05, + "loss": 0.3436, + "step": 9706 + }, + { + "epoch": 1.9136435331230284, + "grad_norm": 0.4694330210445866, + "learning_rate": 1.0692307390157164e-05, + "loss": 0.3144, + "step": 9707 + }, + { + "epoch": 1.9138406940063093, + "grad_norm": 0.4689921139270199, + "learning_rate": 1.069076107488568e-05, + "loss": 0.3196, + "step": 9708 + }, + { + "epoch": 1.9140378548895898, + "grad_norm": 0.46415715567906024, + "learning_rate": 1.0689214743018102e-05, + "loss": 0.3288, + "step": 9709 + }, + { + "epoch": 1.9142350157728707, + "grad_norm": 0.515132913417703, + "learning_rate": 1.0687668394591586e-05, + "loss": 0.3284, + "step": 9710 + }, + { + "epoch": 1.9144321766561514, + "grad_norm": 0.4513787042971108, + "learning_rate": 1.068612202964328e-05, + "loss": 0.311, + "step": 9711 + }, + { + "epoch": 1.914629337539432, + "grad_norm": 0.502933214732867, + "learning_rate": 1.0684575648210343e-05, + "loss": 0.3317, + "step": 9712 + }, + { + "epoch": 1.914826498422713, + "grad_norm": 0.45248819432377035, + "learning_rate": 1.0683029250329924e-05, + "loss": 0.3317, + "step": 9713 + }, + { + "epoch": 1.9150236593059937, + "grad_norm": 0.46741383909252043, + "learning_rate": 1.0681482836039176e-05, + "loss": 0.3067, + "step": 9714 + }, + { + "epoch": 1.9152208201892744, + "grad_norm": 0.47201782201403564, + "learning_rate": 1.0679936405375255e-05, + "loss": 0.3541, + "step": 9715 + }, + { + "epoch": 1.9154179810725553, + "grad_norm": 0.4945029413774007, + "learning_rate": 1.0678389958375316e-05, + "loss": 0.3423, + "step": 9716 + }, + { + "epoch": 1.9156151419558358, + "grad_norm": 0.5418610271930345, + "learning_rate": 1.067684349507651e-05, + "loss": 0.3696, + "step": 9717 + }, + { + "epoch": 1.9158123028391167, + "grad_norm": 0.5060226118055992, + "learning_rate": 1.0675297015515993e-05, + "loss": 0.3504, + "step": 9718 + }, + { + "epoch": 1.9160094637223974, + "grad_norm": 0.44874325788032515, + "learning_rate": 1.0673750519730923e-05, + "loss": 0.2836, + "step": 9719 + }, + { + "epoch": 1.9162066246056781, + "grad_norm": 0.4762929005302375, + "learning_rate": 1.0672204007758453e-05, + "loss": 0.3362, + "step": 9720 + }, + { + "epoch": 1.916403785488959, + "grad_norm": 0.4643277801877159, + "learning_rate": 1.0670657479635742e-05, + "loss": 0.3236, + "step": 9721 + }, + { + "epoch": 1.9166009463722398, + "grad_norm": 0.49866980334934874, + "learning_rate": 1.0669110935399944e-05, + "loss": 0.3474, + "step": 9722 + }, + { + "epoch": 1.9167981072555205, + "grad_norm": 0.46636862046761957, + "learning_rate": 1.0667564375088218e-05, + "loss": 0.3371, + "step": 9723 + }, + { + "epoch": 1.9169952681388014, + "grad_norm": 0.4624733590198138, + "learning_rate": 1.066601779873772e-05, + "loss": 0.3149, + "step": 9724 + }, + { + "epoch": 1.9171924290220819, + "grad_norm": 0.4477193303841462, + "learning_rate": 1.0664471206385607e-05, + "loss": 0.2978, + "step": 9725 + }, + { + "epoch": 1.9173895899053628, + "grad_norm": 0.4436196530416114, + "learning_rate": 1.0662924598069035e-05, + "loss": 0.302, + "step": 9726 + }, + { + "epoch": 1.9175867507886435, + "grad_norm": 0.4980769406665461, + "learning_rate": 1.0661377973825173e-05, + "loss": 0.355, + "step": 9727 + }, + { + "epoch": 1.9177839116719242, + "grad_norm": 0.4692953470690748, + "learning_rate": 1.0659831333691166e-05, + "loss": 0.3222, + "step": 9728 + }, + { + "epoch": 1.9179810725552051, + "grad_norm": 0.4855708808826588, + "learning_rate": 1.0658284677704187e-05, + "loss": 0.3387, + "step": 9729 + }, + { + "epoch": 1.9181782334384858, + "grad_norm": 0.4926783442889073, + "learning_rate": 1.0656738005901382e-05, + "loss": 0.3263, + "step": 9730 + }, + { + "epoch": 1.9183753943217665, + "grad_norm": 0.49008026881045197, + "learning_rate": 1.0655191318319921e-05, + "loss": 0.3788, + "step": 9731 + }, + { + "epoch": 1.9185725552050474, + "grad_norm": 0.47996998660573686, + "learning_rate": 1.0653644614996958e-05, + "loss": 0.3424, + "step": 9732 + }, + { + "epoch": 1.918769716088328, + "grad_norm": 0.6127908630549663, + "learning_rate": 1.0652097895969657e-05, + "loss": 0.3558, + "step": 9733 + }, + { + "epoch": 1.9189668769716088, + "grad_norm": 0.4627932710115664, + "learning_rate": 1.0650551161275182e-05, + "loss": 0.3155, + "step": 9734 + }, + { + "epoch": 1.9191640378548895, + "grad_norm": 0.45413739024853905, + "learning_rate": 1.064900441095069e-05, + "loss": 0.3031, + "step": 9735 + }, + { + "epoch": 1.9193611987381702, + "grad_norm": 0.4495957982119002, + "learning_rate": 1.0647457645033343e-05, + "loss": 0.3198, + "step": 9736 + }, + { + "epoch": 1.9195583596214512, + "grad_norm": 0.44672878440578423, + "learning_rate": 1.0645910863560306e-05, + "loss": 0.3065, + "step": 9737 + }, + { + "epoch": 1.9197555205047319, + "grad_norm": 0.4540120569214132, + "learning_rate": 1.0644364066568742e-05, + "loss": 0.316, + "step": 9738 + }, + { + "epoch": 1.9199526813880126, + "grad_norm": 1.8452643662313766, + "learning_rate": 1.0642817254095809e-05, + "loss": 0.3175, + "step": 9739 + }, + { + "epoch": 1.9201498422712935, + "grad_norm": 0.5187412295956796, + "learning_rate": 1.0641270426178677e-05, + "loss": 0.3445, + "step": 9740 + }, + { + "epoch": 1.9203470031545742, + "grad_norm": 0.4665160701342525, + "learning_rate": 1.0639723582854505e-05, + "loss": 0.3309, + "step": 9741 + }, + { + "epoch": 1.9205441640378549, + "grad_norm": 0.4551439613909609, + "learning_rate": 1.0638176724160458e-05, + "loss": 0.3291, + "step": 9742 + }, + { + "epoch": 1.9207413249211358, + "grad_norm": 0.5160208376705452, + "learning_rate": 1.0636629850133705e-05, + "loss": 0.3391, + "step": 9743 + }, + { + "epoch": 1.9209384858044163, + "grad_norm": 0.49989041787707916, + "learning_rate": 1.0635082960811403e-05, + "loss": 0.3444, + "step": 9744 + }, + { + "epoch": 1.9211356466876972, + "grad_norm": 0.4964849127683154, + "learning_rate": 1.063353605623072e-05, + "loss": 0.3492, + "step": 9745 + }, + { + "epoch": 1.921332807570978, + "grad_norm": 0.6049573542199426, + "learning_rate": 1.0631989136428828e-05, + "loss": 0.3631, + "step": 9746 + }, + { + "epoch": 1.9215299684542586, + "grad_norm": 0.4756893237472388, + "learning_rate": 1.0630442201442884e-05, + "loss": 0.3283, + "step": 9747 + }, + { + "epoch": 1.9217271293375395, + "grad_norm": 0.6792395870038784, + "learning_rate": 1.0628895251310063e-05, + "loss": 0.3448, + "step": 9748 + }, + { + "epoch": 1.9219242902208202, + "grad_norm": 0.4749099206929661, + "learning_rate": 1.0627348286067521e-05, + "loss": 0.3263, + "step": 9749 + }, + { + "epoch": 1.922121451104101, + "grad_norm": 0.46009586758698623, + "learning_rate": 1.0625801305752436e-05, + "loss": 0.3141, + "step": 9750 + }, + { + "epoch": 1.9223186119873819, + "grad_norm": 0.46687228371318157, + "learning_rate": 1.062425431040197e-05, + "loss": 0.3375, + "step": 9751 + }, + { + "epoch": 1.9225157728706623, + "grad_norm": 0.5157651333274518, + "learning_rate": 1.062270730005329e-05, + "loss": 0.3194, + "step": 9752 + }, + { + "epoch": 1.9227129337539433, + "grad_norm": 0.5008517739542978, + "learning_rate": 1.0621160274743564e-05, + "loss": 0.3596, + "step": 9753 + }, + { + "epoch": 1.922910094637224, + "grad_norm": 0.44671709049249914, + "learning_rate": 1.0619613234509967e-05, + "loss": 0.3119, + "step": 9754 + }, + { + "epoch": 1.9231072555205047, + "grad_norm": 0.4313404095262679, + "learning_rate": 1.0618066179389663e-05, + "loss": 0.2903, + "step": 9755 + }, + { + "epoch": 1.9233044164037856, + "grad_norm": 0.44712186986789765, + "learning_rate": 1.0616519109419815e-05, + "loss": 0.31, + "step": 9756 + }, + { + "epoch": 1.9235015772870663, + "grad_norm": 0.48107891788084883, + "learning_rate": 1.0614972024637606e-05, + "loss": 0.3371, + "step": 9757 + }, + { + "epoch": 1.923698738170347, + "grad_norm": 0.4302137346468184, + "learning_rate": 1.0613424925080194e-05, + "loss": 0.2886, + "step": 9758 + }, + { + "epoch": 1.923895899053628, + "grad_norm": 0.4463000909131128, + "learning_rate": 1.0611877810784756e-05, + "loss": 0.3051, + "step": 9759 + }, + { + "epoch": 1.9240930599369084, + "grad_norm": 0.46877366951404054, + "learning_rate": 1.061033068178846e-05, + "loss": 0.3319, + "step": 9760 + }, + { + "epoch": 1.9242902208201893, + "grad_norm": 0.45058760214666865, + "learning_rate": 1.0608783538128479e-05, + "loss": 0.3294, + "step": 9761 + }, + { + "epoch": 1.92448738170347, + "grad_norm": 0.4673506711450003, + "learning_rate": 1.0607236379841984e-05, + "loss": 0.3131, + "step": 9762 + }, + { + "epoch": 1.9246845425867507, + "grad_norm": 0.4448969256754839, + "learning_rate": 1.0605689206966145e-05, + "loss": 0.3046, + "step": 9763 + }, + { + "epoch": 1.9248817034700316, + "grad_norm": 0.44398749817741634, + "learning_rate": 1.0604142019538135e-05, + "loss": 0.3036, + "step": 9764 + }, + { + "epoch": 1.9250788643533123, + "grad_norm": 0.4662281069913097, + "learning_rate": 1.0602594817595126e-05, + "loss": 0.3327, + "step": 9765 + }, + { + "epoch": 1.925276025236593, + "grad_norm": 0.6123896643925617, + "learning_rate": 1.060104760117429e-05, + "loss": 0.2994, + "step": 9766 + }, + { + "epoch": 1.925473186119874, + "grad_norm": 0.4800059909800257, + "learning_rate": 1.0599500370312805e-05, + "loss": 0.332, + "step": 9767 + }, + { + "epoch": 1.9256703470031544, + "grad_norm": 0.47073997673558676, + "learning_rate": 1.0597953125047839e-05, + "loss": 0.3173, + "step": 9768 + }, + { + "epoch": 1.9258675078864353, + "grad_norm": 0.486613027526376, + "learning_rate": 1.0596405865416569e-05, + "loss": 0.3397, + "step": 9769 + }, + { + "epoch": 1.926064668769716, + "grad_norm": 0.4412020035394467, + "learning_rate": 1.0594858591456166e-05, + "loss": 0.3088, + "step": 9770 + }, + { + "epoch": 1.9262618296529967, + "grad_norm": 0.4645088549635914, + "learning_rate": 1.0593311303203806e-05, + "loss": 0.3399, + "step": 9771 + }, + { + "epoch": 1.9264589905362777, + "grad_norm": 0.5045226039115558, + "learning_rate": 1.0591764000696665e-05, + "loss": 0.3695, + "step": 9772 + }, + { + "epoch": 1.9266561514195584, + "grad_norm": 0.4783611486444141, + "learning_rate": 1.0590216683971915e-05, + "loss": 0.334, + "step": 9773 + }, + { + "epoch": 1.926853312302839, + "grad_norm": 0.47330458094671435, + "learning_rate": 1.0588669353066739e-05, + "loss": 0.3499, + "step": 9774 + }, + { + "epoch": 1.92705047318612, + "grad_norm": 0.4625886872385459, + "learning_rate": 1.0587122008018303e-05, + "loss": 0.3267, + "step": 9775 + }, + { + "epoch": 1.9272476340694005, + "grad_norm": 0.4856730238683575, + "learning_rate": 1.058557464886379e-05, + "loss": 0.3312, + "step": 9776 + }, + { + "epoch": 1.9274447949526814, + "grad_norm": 0.46470351748252176, + "learning_rate": 1.0584027275640372e-05, + "loss": 0.3281, + "step": 9777 + }, + { + "epoch": 1.927641955835962, + "grad_norm": 0.4805088560192612, + "learning_rate": 1.0582479888385233e-05, + "loss": 0.3373, + "step": 9778 + }, + { + "epoch": 1.9278391167192428, + "grad_norm": 0.4876012478543451, + "learning_rate": 1.0580932487135541e-05, + "loss": 0.3425, + "step": 9779 + }, + { + "epoch": 1.9280362776025237, + "grad_norm": 0.5473660280771162, + "learning_rate": 1.057938507192848e-05, + "loss": 0.351, + "step": 9780 + }, + { + "epoch": 1.9282334384858044, + "grad_norm": 0.4684135144874953, + "learning_rate": 1.0577837642801227e-05, + "loss": 0.338, + "step": 9781 + }, + { + "epoch": 1.9284305993690851, + "grad_norm": 0.48445875938977345, + "learning_rate": 1.0576290199790959e-05, + "loss": 0.3315, + "step": 9782 + }, + { + "epoch": 1.928627760252366, + "grad_norm": 0.5000034658687103, + "learning_rate": 1.0574742742934853e-05, + "loss": 0.3623, + "step": 9783 + }, + { + "epoch": 1.9288249211356467, + "grad_norm": 0.4536757092944629, + "learning_rate": 1.0573195272270091e-05, + "loss": 0.3083, + "step": 9784 + }, + { + "epoch": 1.9290220820189274, + "grad_norm": 0.47051487862294444, + "learning_rate": 1.0571647787833853e-05, + "loss": 0.3294, + "step": 9785 + }, + { + "epoch": 1.9292192429022084, + "grad_norm": 0.48487850881869465, + "learning_rate": 1.0570100289663314e-05, + "loss": 0.3444, + "step": 9786 + }, + { + "epoch": 1.9294164037854888, + "grad_norm": 0.4587817967805745, + "learning_rate": 1.0568552777795657e-05, + "loss": 0.3369, + "step": 9787 + }, + { + "epoch": 1.9296135646687698, + "grad_norm": 0.4489966707521766, + "learning_rate": 1.0567005252268063e-05, + "loss": 0.3023, + "step": 9788 + }, + { + "epoch": 1.9298107255520505, + "grad_norm": 0.5396452051256785, + "learning_rate": 1.056545771311771e-05, + "loss": 0.3587, + "step": 9789 + }, + { + "epoch": 1.9300078864353312, + "grad_norm": 0.45895425892398584, + "learning_rate": 1.056391016038178e-05, + "loss": 0.3135, + "step": 9790 + }, + { + "epoch": 1.930205047318612, + "grad_norm": 0.45902487475066067, + "learning_rate": 1.0562362594097456e-05, + "loss": 0.3256, + "step": 9791 + }, + { + "epoch": 1.9304022082018928, + "grad_norm": 0.48459698795680317, + "learning_rate": 1.0560815014301916e-05, + "loss": 0.3315, + "step": 9792 + }, + { + "epoch": 1.9305993690851735, + "grad_norm": 0.48790563977946905, + "learning_rate": 1.0559267421032345e-05, + "loss": 0.3464, + "step": 9793 + }, + { + "epoch": 1.9307965299684544, + "grad_norm": 0.45850742759324065, + "learning_rate": 1.055771981432592e-05, + "loss": 0.3014, + "step": 9794 + }, + { + "epoch": 1.930993690851735, + "grad_norm": 0.5039083825372765, + "learning_rate": 1.0556172194219831e-05, + "loss": 0.3541, + "step": 9795 + }, + { + "epoch": 1.9311908517350158, + "grad_norm": 0.48911414228696054, + "learning_rate": 1.0554624560751254e-05, + "loss": 0.3467, + "step": 9796 + }, + { + "epoch": 1.9313880126182965, + "grad_norm": 0.4929694442801919, + "learning_rate": 1.0553076913957381e-05, + "loss": 0.3387, + "step": 9797 + }, + { + "epoch": 1.9315851735015772, + "grad_norm": 0.47700645459297586, + "learning_rate": 1.0551529253875383e-05, + "loss": 0.3645, + "step": 9798 + }, + { + "epoch": 1.9317823343848581, + "grad_norm": 0.45061951666064487, + "learning_rate": 1.0549981580542457e-05, + "loss": 0.2962, + "step": 9799 + }, + { + "epoch": 1.9319794952681388, + "grad_norm": 0.47286203795175763, + "learning_rate": 1.0548433893995775e-05, + "loss": 0.3213, + "step": 9800 + }, + { + "epoch": 1.9321766561514195, + "grad_norm": 0.486492356786951, + "learning_rate": 1.054688619427253e-05, + "loss": 0.3414, + "step": 9801 + }, + { + "epoch": 1.9323738170347005, + "grad_norm": 0.48260028863528787, + "learning_rate": 1.0545338481409903e-05, + "loss": 0.3489, + "step": 9802 + }, + { + "epoch": 1.932570977917981, + "grad_norm": 0.6276466016632191, + "learning_rate": 1.054379075544508e-05, + "loss": 0.3597, + "step": 9803 + }, + { + "epoch": 1.9327681388012619, + "grad_norm": 0.44166593615676375, + "learning_rate": 1.0542243016415248e-05, + "loss": 0.3116, + "step": 9804 + }, + { + "epoch": 1.9329652996845426, + "grad_norm": 6.057606271885772, + "learning_rate": 1.0540695264357587e-05, + "loss": 0.3798, + "step": 9805 + }, + { + "epoch": 1.9331624605678233, + "grad_norm": 0.4682081351995988, + "learning_rate": 1.053914749930929e-05, + "loss": 0.3357, + "step": 9806 + }, + { + "epoch": 1.9333596214511042, + "grad_norm": 0.45711029196728215, + "learning_rate": 1.0537599721307538e-05, + "loss": 0.3182, + "step": 9807 + }, + { + "epoch": 1.9335567823343849, + "grad_norm": 0.477630148881728, + "learning_rate": 1.0536051930389522e-05, + "loss": 0.3151, + "step": 9808 + }, + { + "epoch": 1.9337539432176656, + "grad_norm": 0.45025864365539675, + "learning_rate": 1.0534504126592426e-05, + "loss": 0.321, + "step": 9809 + }, + { + "epoch": 1.9339511041009465, + "grad_norm": 0.45003521165597976, + "learning_rate": 1.0532956309953437e-05, + "loss": 0.3104, + "step": 9810 + }, + { + "epoch": 1.934148264984227, + "grad_norm": 0.5519090372934141, + "learning_rate": 1.0531408480509744e-05, + "loss": 0.3511, + "step": 9811 + }, + { + "epoch": 1.934345425867508, + "grad_norm": 0.45552387144099366, + "learning_rate": 1.0529860638298535e-05, + "loss": 0.3169, + "step": 9812 + }, + { + "epoch": 1.9345425867507886, + "grad_norm": 0.48604649269003025, + "learning_rate": 1.0528312783356998e-05, + "loss": 0.3348, + "step": 9813 + }, + { + "epoch": 1.9347397476340693, + "grad_norm": 0.48778714572885906, + "learning_rate": 1.0526764915722319e-05, + "loss": 0.3564, + "step": 9814 + }, + { + "epoch": 1.9349369085173502, + "grad_norm": 0.46707720374812933, + "learning_rate": 1.0525217035431687e-05, + "loss": 0.3261, + "step": 9815 + }, + { + "epoch": 1.935134069400631, + "grad_norm": 0.5104817179313064, + "learning_rate": 1.0523669142522296e-05, + "loss": 0.3757, + "step": 9816 + }, + { + "epoch": 1.9353312302839116, + "grad_norm": 0.4760419839422579, + "learning_rate": 1.0522121237031331e-05, + "loss": 0.3202, + "step": 9817 + }, + { + "epoch": 1.9355283911671926, + "grad_norm": 0.44494383811455845, + "learning_rate": 1.0520573318995986e-05, + "loss": 0.32, + "step": 9818 + }, + { + "epoch": 1.935725552050473, + "grad_norm": 0.46156901553051594, + "learning_rate": 1.051902538845344e-05, + "loss": 0.3212, + "step": 9819 + }, + { + "epoch": 1.935922712933754, + "grad_norm": 0.4658238149819304, + "learning_rate": 1.0517477445440898e-05, + "loss": 0.321, + "step": 9820 + }, + { + "epoch": 1.9361198738170347, + "grad_norm": 0.44452316579492956, + "learning_rate": 1.0515929489995544e-05, + "loss": 0.3043, + "step": 9821 + }, + { + "epoch": 1.9363170347003154, + "grad_norm": 0.45081267801444497, + "learning_rate": 1.0514381522154563e-05, + "loss": 0.3149, + "step": 9822 + }, + { + "epoch": 1.9365141955835963, + "grad_norm": 0.5245331794257556, + "learning_rate": 1.0512833541955158e-05, + "loss": 0.3809, + "step": 9823 + }, + { + "epoch": 1.936711356466877, + "grad_norm": 0.4930837487343131, + "learning_rate": 1.0511285549434509e-05, + "loss": 0.3181, + "step": 9824 + }, + { + "epoch": 1.9369085173501577, + "grad_norm": 0.4545151283658732, + "learning_rate": 1.0509737544629817e-05, + "loss": 0.3117, + "step": 9825 + }, + { + "epoch": 1.9371056782334386, + "grad_norm": 0.47452766613882447, + "learning_rate": 1.0508189527578268e-05, + "loss": 0.3187, + "step": 9826 + }, + { + "epoch": 1.937302839116719, + "grad_norm": 0.44914864022298373, + "learning_rate": 1.0506641498317056e-05, + "loss": 0.3108, + "step": 9827 + }, + { + "epoch": 1.9375, + "grad_norm": 6.606550108751509, + "learning_rate": 1.0505093456883373e-05, + "loss": 0.3925, + "step": 9828 + }, + { + "epoch": 1.937697160883281, + "grad_norm": 0.48300218271218115, + "learning_rate": 1.0503545403314414e-05, + "loss": 0.3362, + "step": 9829 + }, + { + "epoch": 1.9378943217665614, + "grad_norm": 0.4709845468244718, + "learning_rate": 1.0501997337647372e-05, + "loss": 0.3292, + "step": 9830 + }, + { + "epoch": 1.9380914826498423, + "grad_norm": 0.4845184212039963, + "learning_rate": 1.050044925991944e-05, + "loss": 0.3554, + "step": 9831 + }, + { + "epoch": 1.938288643533123, + "grad_norm": 0.4879693795170068, + "learning_rate": 1.049890117016781e-05, + "loss": 0.3615, + "step": 9832 + }, + { + "epoch": 1.9384858044164037, + "grad_norm": 0.46307402255271723, + "learning_rate": 1.0497353068429678e-05, + "loss": 0.3401, + "step": 9833 + }, + { + "epoch": 1.9386829652996846, + "grad_norm": 0.5531223341102023, + "learning_rate": 1.049580495474224e-05, + "loss": 0.3652, + "step": 9834 + }, + { + "epoch": 1.9388801261829653, + "grad_norm": 0.4687758624727122, + "learning_rate": 1.0494256829142687e-05, + "loss": 0.3233, + "step": 9835 + }, + { + "epoch": 1.939077287066246, + "grad_norm": 0.4923244502736517, + "learning_rate": 1.0492708691668216e-05, + "loss": 0.3246, + "step": 9836 + }, + { + "epoch": 1.939274447949527, + "grad_norm": 0.44535709840076704, + "learning_rate": 1.049116054235602e-05, + "loss": 0.3239, + "step": 9837 + }, + { + "epoch": 1.9394716088328074, + "grad_norm": 0.473698052381602, + "learning_rate": 1.0489612381243299e-05, + "loss": 0.3464, + "step": 9838 + }, + { + "epoch": 1.9396687697160884, + "grad_norm": 0.4778541195055896, + "learning_rate": 1.0488064208367246e-05, + "loss": 0.3427, + "step": 9839 + }, + { + "epoch": 1.939865930599369, + "grad_norm": 0.46660859837931956, + "learning_rate": 1.0486516023765057e-05, + "loss": 0.3334, + "step": 9840 + }, + { + "epoch": 1.9400630914826498, + "grad_norm": 0.4609030291063781, + "learning_rate": 1.0484967827473927e-05, + "loss": 0.3233, + "step": 9841 + }, + { + "epoch": 1.9402602523659307, + "grad_norm": 0.46978565070370426, + "learning_rate": 1.0483419619531057e-05, + "loss": 0.3397, + "step": 9842 + }, + { + "epoch": 1.9404574132492114, + "grad_norm": 0.446991858909842, + "learning_rate": 1.0481871399973638e-05, + "loss": 0.3048, + "step": 9843 + }, + { + "epoch": 1.940654574132492, + "grad_norm": 0.4538700492840836, + "learning_rate": 1.0480323168838876e-05, + "loss": 0.3104, + "step": 9844 + }, + { + "epoch": 1.940851735015773, + "grad_norm": 0.4799559035841734, + "learning_rate": 1.0478774926163957e-05, + "loss": 0.3292, + "step": 9845 + }, + { + "epoch": 1.9410488958990535, + "grad_norm": 0.49610549652225705, + "learning_rate": 1.0477226671986089e-05, + "loss": 0.3407, + "step": 9846 + }, + { + "epoch": 1.9412460567823344, + "grad_norm": 0.4691565602936203, + "learning_rate": 1.0475678406342462e-05, + "loss": 0.3249, + "step": 9847 + }, + { + "epoch": 1.9414432176656151, + "grad_norm": 0.4548964647773779, + "learning_rate": 1.0474130129270281e-05, + "loss": 0.3158, + "step": 9848 + }, + { + "epoch": 1.9416403785488958, + "grad_norm": 0.46442569328638555, + "learning_rate": 1.0472581840806742e-05, + "loss": 0.3275, + "step": 9849 + }, + { + "epoch": 1.9418375394321767, + "grad_norm": 0.47409430097779603, + "learning_rate": 1.0471033540989044e-05, + "loss": 0.3487, + "step": 9850 + }, + { + "epoch": 1.9420347003154574, + "grad_norm": 0.4714341339076567, + "learning_rate": 1.0469485229854383e-05, + "loss": 0.3247, + "step": 9851 + }, + { + "epoch": 1.9422318611987381, + "grad_norm": 0.4471157743782902, + "learning_rate": 1.0467936907439966e-05, + "loss": 0.3133, + "step": 9852 + }, + { + "epoch": 1.942429022082019, + "grad_norm": 0.5320875040440245, + "learning_rate": 1.0466388573782984e-05, + "loss": 0.3648, + "step": 9853 + }, + { + "epoch": 1.9426261829652995, + "grad_norm": 0.4714349073262386, + "learning_rate": 1.0464840228920643e-05, + "loss": 0.3159, + "step": 9854 + }, + { + "epoch": 1.9428233438485805, + "grad_norm": 0.4907557101702487, + "learning_rate": 1.046329187289014e-05, + "loss": 0.3466, + "step": 9855 + }, + { + "epoch": 1.9430205047318612, + "grad_norm": 0.4838478550153892, + "learning_rate": 1.046174350572868e-05, + "loss": 0.3141, + "step": 9856 + }, + { + "epoch": 1.9432176656151419, + "grad_norm": 0.49718799827485166, + "learning_rate": 1.0460195127473456e-05, + "loss": 0.3363, + "step": 9857 + }, + { + "epoch": 1.9434148264984228, + "grad_norm": 0.4538672203134272, + "learning_rate": 1.0458646738161676e-05, + "loss": 0.318, + "step": 9858 + }, + { + "epoch": 1.9436119873817035, + "grad_norm": 0.4718009026198612, + "learning_rate": 1.0457098337830536e-05, + "loss": 0.3121, + "step": 9859 + }, + { + "epoch": 1.9438091482649842, + "grad_norm": 0.4863092292989189, + "learning_rate": 1.0455549926517243e-05, + "loss": 0.3575, + "step": 9860 + }, + { + "epoch": 1.944006309148265, + "grad_norm": 0.45207391928372037, + "learning_rate": 1.0454001504258994e-05, + "loss": 0.3208, + "step": 9861 + }, + { + "epoch": 1.9442034700315456, + "grad_norm": 0.49540411046222155, + "learning_rate": 1.0452453071092993e-05, + "loss": 0.3148, + "step": 9862 + }, + { + "epoch": 1.9444006309148265, + "grad_norm": 0.4902146796712335, + "learning_rate": 1.0450904627056446e-05, + "loss": 0.354, + "step": 9863 + }, + { + "epoch": 1.9445977917981072, + "grad_norm": 0.4302996025002257, + "learning_rate": 1.0449356172186548e-05, + "loss": 0.2963, + "step": 9864 + }, + { + "epoch": 1.944794952681388, + "grad_norm": 0.47578474259947834, + "learning_rate": 1.0447807706520513e-05, + "loss": 0.3117, + "step": 9865 + }, + { + "epoch": 1.9449921135646688, + "grad_norm": 0.4504023483184343, + "learning_rate": 1.0446259230095531e-05, + "loss": 0.3401, + "step": 9866 + }, + { + "epoch": 1.9451892744479495, + "grad_norm": 0.4757062498349321, + "learning_rate": 1.0444710742948814e-05, + "loss": 0.3366, + "step": 9867 + }, + { + "epoch": 1.9453864353312302, + "grad_norm": 0.4819233484036111, + "learning_rate": 1.0443162245117562e-05, + "loss": 0.3342, + "step": 9868 + }, + { + "epoch": 1.9455835962145112, + "grad_norm": 0.5213874575567724, + "learning_rate": 1.044161373663898e-05, + "loss": 0.3172, + "step": 9869 + }, + { + "epoch": 1.9457807570977916, + "grad_norm": 0.44488837497482253, + "learning_rate": 1.0440065217550273e-05, + "loss": 0.3125, + "step": 9870 + }, + { + "epoch": 1.9459779179810726, + "grad_norm": 0.460970808844026, + "learning_rate": 1.0438516687888645e-05, + "loss": 0.3258, + "step": 9871 + }, + { + "epoch": 1.9461750788643533, + "grad_norm": 0.4810006442638309, + "learning_rate": 1.04369681476913e-05, + "loss": 0.3219, + "step": 9872 + }, + { + "epoch": 1.946372239747634, + "grad_norm": 0.4838450735864227, + "learning_rate": 1.0435419596995444e-05, + "loss": 0.3655, + "step": 9873 + }, + { + "epoch": 1.9465694006309149, + "grad_norm": 0.47505078585700605, + "learning_rate": 1.0433871035838283e-05, + "loss": 0.3257, + "step": 9874 + }, + { + "epoch": 1.9467665615141956, + "grad_norm": 0.47990929946296423, + "learning_rate": 1.0432322464257019e-05, + "loss": 0.3165, + "step": 9875 + }, + { + "epoch": 1.9469637223974763, + "grad_norm": 0.4824430346339747, + "learning_rate": 1.0430773882288859e-05, + "loss": 0.3493, + "step": 9876 + }, + { + "epoch": 1.9471608832807572, + "grad_norm": 0.47127696634444705, + "learning_rate": 1.042922528997101e-05, + "loss": 0.3265, + "step": 9877 + }, + { + "epoch": 1.947358044164038, + "grad_norm": 0.45182006106255146, + "learning_rate": 1.0427676687340678e-05, + "loss": 0.3219, + "step": 9878 + }, + { + "epoch": 1.9475552050473186, + "grad_norm": 0.46337044334398714, + "learning_rate": 1.0426128074435068e-05, + "loss": 0.3319, + "step": 9879 + }, + { + "epoch": 1.9477523659305995, + "grad_norm": 0.4555312238661632, + "learning_rate": 1.0424579451291393e-05, + "loss": 0.3119, + "step": 9880 + }, + { + "epoch": 1.94794952681388, + "grad_norm": 0.5027627443388722, + "learning_rate": 1.042303081794685e-05, + "loss": 0.3372, + "step": 9881 + }, + { + "epoch": 1.948146687697161, + "grad_norm": 0.46830494008681434, + "learning_rate": 1.042148217443865e-05, + "loss": 0.3491, + "step": 9882 + }, + { + "epoch": 1.9483438485804416, + "grad_norm": 0.4738873885085205, + "learning_rate": 1.0419933520804002e-05, + "loss": 0.3493, + "step": 9883 + }, + { + "epoch": 1.9485410094637223, + "grad_norm": 0.4965510696033795, + "learning_rate": 1.0418384857080118e-05, + "loss": 0.336, + "step": 9884 + }, + { + "epoch": 1.9487381703470033, + "grad_norm": 0.4779020902575847, + "learning_rate": 1.0416836183304198e-05, + "loss": 0.3348, + "step": 9885 + }, + { + "epoch": 1.948935331230284, + "grad_norm": 0.4533557633530483, + "learning_rate": 1.0415287499513452e-05, + "loss": 0.3072, + "step": 9886 + }, + { + "epoch": 1.9491324921135647, + "grad_norm": 0.46478354483577006, + "learning_rate": 1.0413738805745089e-05, + "loss": 0.3269, + "step": 9887 + }, + { + "epoch": 1.9493296529968456, + "grad_norm": 0.4553262713575033, + "learning_rate": 1.0412190102036317e-05, + "loss": 0.329, + "step": 9888 + }, + { + "epoch": 1.949526813880126, + "grad_norm": 0.5236120156557359, + "learning_rate": 1.041064138842435e-05, + "loss": 0.3589, + "step": 9889 + }, + { + "epoch": 1.949723974763407, + "grad_norm": 0.46404257823966893, + "learning_rate": 1.0409092664946388e-05, + "loss": 0.3165, + "step": 9890 + }, + { + "epoch": 1.9499211356466877, + "grad_norm": 0.46119544702732324, + "learning_rate": 1.040754393163965e-05, + "loss": 0.3198, + "step": 9891 + }, + { + "epoch": 1.9501182965299684, + "grad_norm": 0.48159542350926215, + "learning_rate": 1.0405995188541336e-05, + "loss": 0.3373, + "step": 9892 + }, + { + "epoch": 1.9503154574132493, + "grad_norm": 0.4397391980222927, + "learning_rate": 1.0404446435688665e-05, + "loss": 0.3109, + "step": 9893 + }, + { + "epoch": 1.95051261829653, + "grad_norm": 0.4668066514467705, + "learning_rate": 1.040289767311884e-05, + "loss": 0.345, + "step": 9894 + }, + { + "epoch": 1.9507097791798107, + "grad_norm": 0.4880251587331396, + "learning_rate": 1.0401348900869073e-05, + "loss": 0.3402, + "step": 9895 + }, + { + "epoch": 1.9509069400630916, + "grad_norm": 0.4909340031989673, + "learning_rate": 1.0399800118976577e-05, + "loss": 0.3561, + "step": 9896 + }, + { + "epoch": 1.951104100946372, + "grad_norm": 0.46507313249997145, + "learning_rate": 1.0398251327478561e-05, + "loss": 0.3226, + "step": 9897 + }, + { + "epoch": 1.951301261829653, + "grad_norm": 0.45585618774916764, + "learning_rate": 1.0396702526412237e-05, + "loss": 0.2961, + "step": 9898 + }, + { + "epoch": 1.9514984227129337, + "grad_norm": 0.5000463843480156, + "learning_rate": 1.0395153715814816e-05, + "loss": 0.3413, + "step": 9899 + }, + { + "epoch": 1.9516955835962144, + "grad_norm": 0.45225941628046795, + "learning_rate": 1.0393604895723509e-05, + "loss": 0.3243, + "step": 9900 + }, + { + "epoch": 1.9518927444794953, + "grad_norm": 0.4853464256835092, + "learning_rate": 1.0392056066175524e-05, + "loss": 0.3294, + "step": 9901 + }, + { + "epoch": 1.952089905362776, + "grad_norm": 0.48505362928524964, + "learning_rate": 1.039050722720808e-05, + "loss": 0.3592, + "step": 9902 + }, + { + "epoch": 1.9522870662460567, + "grad_norm": 0.48062154733831464, + "learning_rate": 1.0388958378858383e-05, + "loss": 0.3211, + "step": 9903 + }, + { + "epoch": 1.9524842271293377, + "grad_norm": 0.5171535755363212, + "learning_rate": 1.038740952116365e-05, + "loss": 0.3546, + "step": 9904 + }, + { + "epoch": 1.9526813880126181, + "grad_norm": 0.46818506771167984, + "learning_rate": 1.0385860654161088e-05, + "loss": 0.3284, + "step": 9905 + }, + { + "epoch": 1.952878548895899, + "grad_norm": 0.4915580603272551, + "learning_rate": 1.0384311777887916e-05, + "loss": 0.3658, + "step": 9906 + }, + { + "epoch": 1.9530757097791798, + "grad_norm": 0.46935304552685453, + "learning_rate": 1.0382762892381342e-05, + "loss": 0.3332, + "step": 9907 + }, + { + "epoch": 1.9532728706624605, + "grad_norm": 0.45363492810935857, + "learning_rate": 1.0381213997678582e-05, + "loss": 0.3388, + "step": 9908 + }, + { + "epoch": 1.9534700315457414, + "grad_norm": 0.44641220154213385, + "learning_rate": 1.0379665093816848e-05, + "loss": 0.3035, + "step": 9909 + }, + { + "epoch": 1.953667192429022, + "grad_norm": 0.45206940818234764, + "learning_rate": 1.0378116180833357e-05, + "loss": 0.3222, + "step": 9910 + }, + { + "epoch": 1.9538643533123028, + "grad_norm": 0.4575547617482853, + "learning_rate": 1.0376567258765316e-05, + "loss": 0.3193, + "step": 9911 + }, + { + "epoch": 1.9540615141955837, + "grad_norm": 0.4590418732960005, + "learning_rate": 1.0375018327649948e-05, + "loss": 0.3023, + "step": 9912 + }, + { + "epoch": 1.9542586750788642, + "grad_norm": 0.4543796513395991, + "learning_rate": 1.037346938752446e-05, + "loss": 0.2872, + "step": 9913 + }, + { + "epoch": 1.9544558359621451, + "grad_norm": 0.4805196349855143, + "learning_rate": 1.037192043842607e-05, + "loss": 0.325, + "step": 9914 + }, + { + "epoch": 1.9546529968454258, + "grad_norm": 0.46446662799687205, + "learning_rate": 1.037037148039199e-05, + "loss": 0.3449, + "step": 9915 + }, + { + "epoch": 1.9548501577287065, + "grad_norm": 0.7024562863012095, + "learning_rate": 1.036882251345944e-05, + "loss": 0.3346, + "step": 9916 + }, + { + "epoch": 1.9550473186119874, + "grad_norm": 0.4792762668817157, + "learning_rate": 1.036727353766563e-05, + "loss": 0.3535, + "step": 9917 + }, + { + "epoch": 1.9552444794952681, + "grad_norm": 0.46573884017043454, + "learning_rate": 1.0365724553047778e-05, + "loss": 0.3114, + "step": 9918 + }, + { + "epoch": 1.9554416403785488, + "grad_norm": 0.45556820668060394, + "learning_rate": 1.03641755596431e-05, + "loss": 0.3101, + "step": 9919 + }, + { + "epoch": 1.9556388012618298, + "grad_norm": 0.5061371270375018, + "learning_rate": 1.0362626557488811e-05, + "loss": 0.3451, + "step": 9920 + }, + { + "epoch": 1.9558359621451105, + "grad_norm": 0.45544222128631295, + "learning_rate": 1.0361077546622125e-05, + "loss": 0.3414, + "step": 9921 + }, + { + "epoch": 1.9560331230283912, + "grad_norm": 0.49708322112827474, + "learning_rate": 1.0359528527080263e-05, + "loss": 0.3501, + "step": 9922 + }, + { + "epoch": 1.956230283911672, + "grad_norm": 0.4777705719722766, + "learning_rate": 1.0357979498900436e-05, + "loss": 0.3216, + "step": 9923 + }, + { + "epoch": 1.9564274447949526, + "grad_norm": 0.4774834671431984, + "learning_rate": 1.0356430462119865e-05, + "loss": 0.3045, + "step": 9924 + }, + { + "epoch": 1.9566246056782335, + "grad_norm": 0.47057412024018935, + "learning_rate": 1.0354881416775765e-05, + "loss": 0.3218, + "step": 9925 + }, + { + "epoch": 1.9568217665615142, + "grad_norm": 0.4991382437521349, + "learning_rate": 1.0353332362905351e-05, + "loss": 0.3485, + "step": 9926 + }, + { + "epoch": 1.9570189274447949, + "grad_norm": 0.4561117034939663, + "learning_rate": 1.0351783300545843e-05, + "loss": 0.333, + "step": 9927 + }, + { + "epoch": 1.9572160883280758, + "grad_norm": 0.47236910408393734, + "learning_rate": 1.0350234229734459e-05, + "loss": 0.341, + "step": 9928 + }, + { + "epoch": 1.9574132492113565, + "grad_norm": 0.46739069562356134, + "learning_rate": 1.0348685150508417e-05, + "loss": 0.316, + "step": 9929 + }, + { + "epoch": 1.9576104100946372, + "grad_norm": 0.4726019967619267, + "learning_rate": 1.034713606290493e-05, + "loss": 0.3164, + "step": 9930 + }, + { + "epoch": 1.9578075709779181, + "grad_norm": 0.46230340086353033, + "learning_rate": 1.0345586966961223e-05, + "loss": 0.3286, + "step": 9931 + }, + { + "epoch": 1.9580047318611986, + "grad_norm": 0.4729940247692541, + "learning_rate": 1.0344037862714506e-05, + "loss": 0.3364, + "step": 9932 + }, + { + "epoch": 1.9582018927444795, + "grad_norm": 0.49856429925552365, + "learning_rate": 1.034248875020201e-05, + "loss": 0.3196, + "step": 9933 + }, + { + "epoch": 1.9583990536277602, + "grad_norm": 0.4701007265414603, + "learning_rate": 1.0340939629460938e-05, + "loss": 0.3181, + "step": 9934 + }, + { + "epoch": 1.958596214511041, + "grad_norm": 0.4658639064265051, + "learning_rate": 1.0339390500528523e-05, + "loss": 0.3145, + "step": 9935 + }, + { + "epoch": 1.9587933753943219, + "grad_norm": 0.49004170327794266, + "learning_rate": 1.0337841363441973e-05, + "loss": 0.336, + "step": 9936 + }, + { + "epoch": 1.9589905362776026, + "grad_norm": 0.44921738293967783, + "learning_rate": 1.0336292218238514e-05, + "loss": 0.3133, + "step": 9937 + }, + { + "epoch": 1.9591876971608833, + "grad_norm": 0.4823452316355001, + "learning_rate": 1.0334743064955367e-05, + "loss": 0.3471, + "step": 9938 + }, + { + "epoch": 1.9593848580441642, + "grad_norm": 0.4507059368884933, + "learning_rate": 1.0333193903629743e-05, + "loss": 0.295, + "step": 9939 + }, + { + "epoch": 1.9595820189274447, + "grad_norm": 0.46683494943121584, + "learning_rate": 1.0331644734298874e-05, + "loss": 0.3325, + "step": 9940 + }, + { + "epoch": 1.9597791798107256, + "grad_norm": 0.4744162384642641, + "learning_rate": 1.0330095556999966e-05, + "loss": 0.32, + "step": 9941 + }, + { + "epoch": 1.9599763406940063, + "grad_norm": 0.4617197712445625, + "learning_rate": 1.0328546371770249e-05, + "loss": 0.3618, + "step": 9942 + }, + { + "epoch": 1.960173501577287, + "grad_norm": 0.5244555576526214, + "learning_rate": 1.0326997178646941e-05, + "loss": 0.3248, + "step": 9943 + }, + { + "epoch": 1.960370662460568, + "grad_norm": 0.4799968339049716, + "learning_rate": 1.0325447977667262e-05, + "loss": 0.3388, + "step": 9944 + }, + { + "epoch": 1.9605678233438486, + "grad_norm": 0.44066352020859745, + "learning_rate": 1.0323898768868434e-05, + "loss": 0.3199, + "step": 9945 + }, + { + "epoch": 1.9607649842271293, + "grad_norm": 0.5245090776857998, + "learning_rate": 1.0322349552287676e-05, + "loss": 0.3169, + "step": 9946 + }, + { + "epoch": 1.9609621451104102, + "grad_norm": 0.4716152433206715, + "learning_rate": 1.0320800327962212e-05, + "loss": 0.3217, + "step": 9947 + }, + { + "epoch": 1.9611593059936907, + "grad_norm": 0.4712223481043858, + "learning_rate": 1.0319251095929262e-05, + "loss": 0.3397, + "step": 9948 + }, + { + "epoch": 1.9613564668769716, + "grad_norm": 0.43783005573851125, + "learning_rate": 1.0317701856226045e-05, + "loss": 0.3187, + "step": 9949 + }, + { + "epoch": 1.9615536277602523, + "grad_norm": 0.4490127963028393, + "learning_rate": 1.0316152608889787e-05, + "loss": 0.3312, + "step": 9950 + }, + { + "epoch": 1.961750788643533, + "grad_norm": 0.47018160199287945, + "learning_rate": 1.0314603353957709e-05, + "loss": 0.3378, + "step": 9951 + }, + { + "epoch": 1.961947949526814, + "grad_norm": 0.45982647728871295, + "learning_rate": 1.031305409146703e-05, + "loss": 0.3053, + "step": 9952 + }, + { + "epoch": 1.9621451104100947, + "grad_norm": 0.47793598964818645, + "learning_rate": 1.0311504821454973e-05, + "loss": 0.3528, + "step": 9953 + }, + { + "epoch": 1.9623422712933754, + "grad_norm": 0.5116582725353143, + "learning_rate": 1.0309955543958765e-05, + "loss": 0.3343, + "step": 9954 + }, + { + "epoch": 1.9625394321766563, + "grad_norm": 0.4623753000651675, + "learning_rate": 1.0308406259015624e-05, + "loss": 0.345, + "step": 9955 + }, + { + "epoch": 1.9627365930599368, + "grad_norm": 0.45920787227072735, + "learning_rate": 1.0306856966662776e-05, + "loss": 0.3179, + "step": 9956 + }, + { + "epoch": 1.9629337539432177, + "grad_norm": 0.48812471718237876, + "learning_rate": 1.0305307666937441e-05, + "loss": 0.3162, + "step": 9957 + }, + { + "epoch": 1.9631309148264984, + "grad_norm": 0.4525568703068494, + "learning_rate": 1.0303758359876841e-05, + "loss": 0.3335, + "step": 9958 + }, + { + "epoch": 1.963328075709779, + "grad_norm": 0.4797900640643521, + "learning_rate": 1.0302209045518206e-05, + "loss": 0.3557, + "step": 9959 + }, + { + "epoch": 1.96352523659306, + "grad_norm": 0.46653885535240164, + "learning_rate": 1.0300659723898752e-05, + "loss": 0.3313, + "step": 9960 + }, + { + "epoch": 1.9637223974763407, + "grad_norm": 0.5002009677410206, + "learning_rate": 1.029911039505571e-05, + "loss": 0.3303, + "step": 9961 + }, + { + "epoch": 1.9639195583596214, + "grad_norm": 0.4853690102944126, + "learning_rate": 1.0297561059026293e-05, + "loss": 0.3165, + "step": 9962 + }, + { + "epoch": 1.9641167192429023, + "grad_norm": 0.4855838797298784, + "learning_rate": 1.0296011715847738e-05, + "loss": 0.3383, + "step": 9963 + }, + { + "epoch": 1.964313880126183, + "grad_norm": 0.48686860386638087, + "learning_rate": 1.029446236555726e-05, + "loss": 0.3265, + "step": 9964 + }, + { + "epoch": 1.9645110410094637, + "grad_norm": 0.519576709109696, + "learning_rate": 1.0292913008192088e-05, + "loss": 0.3623, + "step": 9965 + }, + { + "epoch": 1.9647082018927446, + "grad_norm": 0.5048904976816195, + "learning_rate": 1.0291363643789445e-05, + "loss": 0.3572, + "step": 9966 + }, + { + "epoch": 1.9649053627760251, + "grad_norm": 0.42587004814801377, + "learning_rate": 1.0289814272386556e-05, + "loss": 0.292, + "step": 9967 + }, + { + "epoch": 1.965102523659306, + "grad_norm": 0.500466873158178, + "learning_rate": 1.0288264894020646e-05, + "loss": 0.333, + "step": 9968 + }, + { + "epoch": 1.9652996845425867, + "grad_norm": 0.465182567762671, + "learning_rate": 1.0286715508728937e-05, + "loss": 0.3157, + "step": 9969 + }, + { + "epoch": 1.9654968454258674, + "grad_norm": 0.4617031591725659, + "learning_rate": 1.0285166116548662e-05, + "loss": 0.3202, + "step": 9970 + }, + { + "epoch": 1.9656940063091484, + "grad_norm": 0.47956736857907367, + "learning_rate": 1.0283616717517037e-05, + "loss": 0.3549, + "step": 9971 + }, + { + "epoch": 1.965891167192429, + "grad_norm": 0.45146021950685644, + "learning_rate": 1.0282067311671293e-05, + "loss": 0.3241, + "step": 9972 + }, + { + "epoch": 1.9660883280757098, + "grad_norm": 0.4674887113548766, + "learning_rate": 1.0280517899048657e-05, + "loss": 0.3395, + "step": 9973 + }, + { + "epoch": 1.9662854889589907, + "grad_norm": 0.47145112139932266, + "learning_rate": 1.027896847968635e-05, + "loss": 0.3287, + "step": 9974 + }, + { + "epoch": 1.9664826498422712, + "grad_norm": 0.4565740445948791, + "learning_rate": 1.0277419053621602e-05, + "loss": 0.3196, + "step": 9975 + }, + { + "epoch": 1.966679810725552, + "grad_norm": 0.48138321162232844, + "learning_rate": 1.0275869620891637e-05, + "loss": 0.3403, + "step": 9976 + }, + { + "epoch": 1.9668769716088328, + "grad_norm": 0.484975445459746, + "learning_rate": 1.0274320181533681e-05, + "loss": 0.3283, + "step": 9977 + }, + { + "epoch": 1.9670741324921135, + "grad_norm": 0.4387256747272943, + "learning_rate": 1.0272770735584966e-05, + "loss": 0.3262, + "step": 9978 + }, + { + "epoch": 1.9672712933753944, + "grad_norm": 0.471596315685171, + "learning_rate": 1.0271221283082709e-05, + "loss": 0.3276, + "step": 9979 + }, + { + "epoch": 1.9674684542586751, + "grad_norm": 0.4545253122593284, + "learning_rate": 1.0269671824064146e-05, + "loss": 0.3234, + "step": 9980 + }, + { + "epoch": 1.9676656151419558, + "grad_norm": 0.45641111253411415, + "learning_rate": 1.0268122358566496e-05, + "loss": 0.3175, + "step": 9981 + }, + { + "epoch": 1.9678627760252367, + "grad_norm": 0.44841650898946805, + "learning_rate": 1.0266572886626997e-05, + "loss": 0.2985, + "step": 9982 + }, + { + "epoch": 1.9680599369085172, + "grad_norm": 0.43192291651712367, + "learning_rate": 1.0265023408282866e-05, + "loss": 0.2932, + "step": 9983 + }, + { + "epoch": 1.9682570977917981, + "grad_norm": 0.5012210582692301, + "learning_rate": 1.0263473923571334e-05, + "loss": 0.329, + "step": 9984 + }, + { + "epoch": 1.9684542586750788, + "grad_norm": 1.1443330318609166, + "learning_rate": 1.0261924432529629e-05, + "loss": 0.3461, + "step": 9985 + }, + { + "epoch": 1.9686514195583595, + "grad_norm": 0.5004253208337783, + "learning_rate": 1.0260374935194979e-05, + "loss": 0.3069, + "step": 9986 + }, + { + "epoch": 1.9688485804416405, + "grad_norm": 0.4696360012631041, + "learning_rate": 1.025882543160461e-05, + "loss": 0.3157, + "step": 9987 + }, + { + "epoch": 1.9690457413249212, + "grad_norm": 0.4826746914030759, + "learning_rate": 1.0257275921795756e-05, + "loss": 0.326, + "step": 9988 + }, + { + "epoch": 1.9692429022082019, + "grad_norm": 1.3597036785857965, + "learning_rate": 1.0255726405805637e-05, + "loss": 0.3169, + "step": 9989 + }, + { + "epoch": 1.9694400630914828, + "grad_norm": 0.4967882690519761, + "learning_rate": 1.0254176883671485e-05, + "loss": 0.335, + "step": 9990 + }, + { + "epoch": 1.9696372239747633, + "grad_norm": 0.502620609744957, + "learning_rate": 1.0252627355430532e-05, + "loss": 0.3477, + "step": 9991 + }, + { + "epoch": 1.9698343848580442, + "grad_norm": 0.4484287142620489, + "learning_rate": 1.0251077821119998e-05, + "loss": 0.3114, + "step": 9992 + }, + { + "epoch": 1.9700315457413249, + "grad_norm": 0.4747578081774963, + "learning_rate": 1.0249528280777121e-05, + "loss": 0.3072, + "step": 9993 + }, + { + "epoch": 1.9702287066246056, + "grad_norm": 0.4973932701653613, + "learning_rate": 1.0247978734439127e-05, + "loss": 0.3364, + "step": 9994 + }, + { + "epoch": 1.9704258675078865, + "grad_norm": 0.5356785346512383, + "learning_rate": 1.0246429182143241e-05, + "loss": 0.325, + "step": 9995 + }, + { + "epoch": 1.9706230283911672, + "grad_norm": 0.46472104839260125, + "learning_rate": 1.0244879623926698e-05, + "loss": 0.3337, + "step": 9996 + }, + { + "epoch": 1.970820189274448, + "grad_norm": 0.47337341572225305, + "learning_rate": 1.0243330059826724e-05, + "loss": 0.3333, + "step": 9997 + }, + { + "epoch": 1.9710173501577288, + "grad_norm": 0.48176261552874655, + "learning_rate": 1.0241780489880546e-05, + "loss": 0.338, + "step": 9998 + }, + { + "epoch": 1.9712145110410093, + "grad_norm": 0.4816284206757202, + "learning_rate": 1.0240230914125401e-05, + "loss": 0.3312, + "step": 9999 + }, + { + "epoch": 1.9714116719242902, + "grad_norm": 0.4632673744732919, + "learning_rate": 1.0238681332598512e-05, + "loss": 0.3249, + "step": 10000 + }, + { + "epoch": 1.971608832807571, + "grad_norm": 0.48018702662774365, + "learning_rate": 1.0237131745337117e-05, + "loss": 0.3549, + "step": 10001 + }, + { + "epoch": 1.9718059936908516, + "grad_norm": 0.468079428043231, + "learning_rate": 1.0235582152378435e-05, + "loss": 0.3089, + "step": 10002 + }, + { + "epoch": 1.9720031545741326, + "grad_norm": 0.4966800301788038, + "learning_rate": 1.0234032553759707e-05, + "loss": 0.3486, + "step": 10003 + }, + { + "epoch": 1.9722003154574133, + "grad_norm": 0.44884608245777396, + "learning_rate": 1.0232482949518157e-05, + "loss": 0.2966, + "step": 10004 + }, + { + "epoch": 1.972397476340694, + "grad_norm": 8.64090741552038, + "learning_rate": 1.0230933339691014e-05, + "loss": 0.3888, + "step": 10005 + }, + { + "epoch": 1.9725946372239749, + "grad_norm": 0.48715703305183966, + "learning_rate": 1.0229383724315516e-05, + "loss": 0.3216, + "step": 10006 + }, + { + "epoch": 1.9727917981072554, + "grad_norm": 0.44760735526085277, + "learning_rate": 1.0227834103428884e-05, + "loss": 0.3165, + "step": 10007 + }, + { + "epoch": 1.9729889589905363, + "grad_norm": 0.46751672698359104, + "learning_rate": 1.022628447706836e-05, + "loss": 0.3247, + "step": 10008 + }, + { + "epoch": 1.973186119873817, + "grad_norm": 0.5038928323160687, + "learning_rate": 1.0224734845271163e-05, + "loss": 0.3713, + "step": 10009 + }, + { + "epoch": 1.9733832807570977, + "grad_norm": 0.7157608221471825, + "learning_rate": 1.0223185208074538e-05, + "loss": 0.3501, + "step": 10010 + }, + { + "epoch": 1.9735804416403786, + "grad_norm": 0.46267508667900453, + "learning_rate": 1.0221635565515699e-05, + "loss": 0.3268, + "step": 10011 + }, + { + "epoch": 1.9737776025236593, + "grad_norm": 0.47469784854807734, + "learning_rate": 1.0220085917631894e-05, + "loss": 0.3323, + "step": 10012 + }, + { + "epoch": 1.97397476340694, + "grad_norm": 0.4807980595553942, + "learning_rate": 1.0218536264460346e-05, + "loss": 0.3215, + "step": 10013 + }, + { + "epoch": 1.974171924290221, + "grad_norm": 0.4659523222268142, + "learning_rate": 1.0216986606038288e-05, + "loss": 0.3288, + "step": 10014 + }, + { + "epoch": 1.9743690851735016, + "grad_norm": 0.46518600281925643, + "learning_rate": 1.0215436942402952e-05, + "loss": 0.3337, + "step": 10015 + }, + { + "epoch": 1.9745662460567823, + "grad_norm": 0.49142919292843934, + "learning_rate": 1.0213887273591573e-05, + "loss": 0.3318, + "step": 10016 + }, + { + "epoch": 1.9747634069400632, + "grad_norm": 0.46949418334164145, + "learning_rate": 1.0212337599641376e-05, + "loss": 0.3245, + "step": 10017 + }, + { + "epoch": 1.9749605678233437, + "grad_norm": 0.4508227196884441, + "learning_rate": 1.0210787920589598e-05, + "loss": 0.3187, + "step": 10018 + }, + { + "epoch": 1.9751577287066246, + "grad_norm": 0.48656630600989037, + "learning_rate": 1.0209238236473472e-05, + "loss": 0.3362, + "step": 10019 + }, + { + "epoch": 1.9753548895899053, + "grad_norm": 0.4759349272246813, + "learning_rate": 1.0207688547330225e-05, + "loss": 0.3247, + "step": 10020 + }, + { + "epoch": 1.975552050473186, + "grad_norm": 0.5742771214058191, + "learning_rate": 1.0206138853197098e-05, + "loss": 0.3278, + "step": 10021 + }, + { + "epoch": 1.975749211356467, + "grad_norm": 0.5031639152587618, + "learning_rate": 1.0204589154111318e-05, + "loss": 0.3604, + "step": 10022 + }, + { + "epoch": 1.9759463722397477, + "grad_norm": 0.48247181846875936, + "learning_rate": 1.0203039450110117e-05, + "loss": 0.3441, + "step": 10023 + }, + { + "epoch": 1.9761435331230284, + "grad_norm": 0.43458711087785906, + "learning_rate": 1.020148974123073e-05, + "loss": 0.2844, + "step": 10024 + }, + { + "epoch": 1.9763406940063093, + "grad_norm": 0.47496966219956804, + "learning_rate": 1.0199940027510392e-05, + "loss": 0.3473, + "step": 10025 + }, + { + "epoch": 1.9765378548895898, + "grad_norm": 0.5215472264487935, + "learning_rate": 1.0198390308986328e-05, + "loss": 0.3459, + "step": 10026 + }, + { + "epoch": 1.9767350157728707, + "grad_norm": 0.44281471612530954, + "learning_rate": 1.0196840585695785e-05, + "loss": 0.3057, + "step": 10027 + }, + { + "epoch": 1.9769321766561514, + "grad_norm": 0.4818228906154598, + "learning_rate": 1.0195290857675982e-05, + "loss": 0.3293, + "step": 10028 + }, + { + "epoch": 1.977129337539432, + "grad_norm": 0.460334518749255, + "learning_rate": 1.0193741124964164e-05, + "loss": 0.3286, + "step": 10029 + }, + { + "epoch": 1.977326498422713, + "grad_norm": 0.46235057626123255, + "learning_rate": 1.0192191387597554e-05, + "loss": 0.3202, + "step": 10030 + }, + { + "epoch": 1.9775236593059937, + "grad_norm": 0.44454174849526384, + "learning_rate": 1.0190641645613397e-05, + "loss": 0.3184, + "step": 10031 + }, + { + "epoch": 1.9777208201892744, + "grad_norm": 0.46226996398858433, + "learning_rate": 1.0189091899048914e-05, + "loss": 0.3438, + "step": 10032 + }, + { + "epoch": 1.9779179810725553, + "grad_norm": 0.5082875526155569, + "learning_rate": 1.0187542147941352e-05, + "loss": 0.3517, + "step": 10033 + }, + { + "epoch": 1.9781151419558358, + "grad_norm": 0.43241743937186994, + "learning_rate": 1.0185992392327936e-05, + "loss": 0.3188, + "step": 10034 + }, + { + "epoch": 1.9783123028391167, + "grad_norm": 0.4476815196740319, + "learning_rate": 1.0184442632245905e-05, + "loss": 0.3252, + "step": 10035 + }, + { + "epoch": 1.9785094637223974, + "grad_norm": 0.45862822746483506, + "learning_rate": 1.018289286773249e-05, + "loss": 0.325, + "step": 10036 + }, + { + "epoch": 1.9787066246056781, + "grad_norm": 0.4483776581846178, + "learning_rate": 1.0181343098824928e-05, + "loss": 0.3299, + "step": 10037 + }, + { + "epoch": 1.978903785488959, + "grad_norm": 0.46878007068368455, + "learning_rate": 1.017979332556045e-05, + "loss": 0.3291, + "step": 10038 + }, + { + "epoch": 1.9791009463722398, + "grad_norm": 0.4916705031341869, + "learning_rate": 1.0178243547976293e-05, + "loss": 0.3253, + "step": 10039 + }, + { + "epoch": 1.9792981072555205, + "grad_norm": 0.5025149474174093, + "learning_rate": 1.017669376610969e-05, + "loss": 0.3469, + "step": 10040 + }, + { + "epoch": 1.9794952681388014, + "grad_norm": 0.4592700151321055, + "learning_rate": 1.0175143979997878e-05, + "loss": 0.3341, + "step": 10041 + }, + { + "epoch": 1.9796924290220819, + "grad_norm": 0.4918000568041209, + "learning_rate": 1.0173594189678093e-05, + "loss": 0.3598, + "step": 10042 + }, + { + "epoch": 1.9798895899053628, + "grad_norm": 0.4479364044121577, + "learning_rate": 1.0172044395187566e-05, + "loss": 0.3027, + "step": 10043 + }, + { + "epoch": 1.9800867507886435, + "grad_norm": 0.47799619307194813, + "learning_rate": 1.0170494596563533e-05, + "loss": 0.3433, + "step": 10044 + }, + { + "epoch": 1.9802839116719242, + "grad_norm": 0.4523062427056764, + "learning_rate": 1.016894479384323e-05, + "loss": 0.305, + "step": 10045 + }, + { + "epoch": 1.9804810725552051, + "grad_norm": 0.4759998481341186, + "learning_rate": 1.0167394987063894e-05, + "loss": 0.3446, + "step": 10046 + }, + { + "epoch": 1.9806782334384858, + "grad_norm": 0.4869635858753714, + "learning_rate": 1.0165845176262757e-05, + "loss": 0.3478, + "step": 10047 + }, + { + "epoch": 1.9808753943217665, + "grad_norm": 0.45645594907618603, + "learning_rate": 1.016429536147706e-05, + "loss": 0.3201, + "step": 10048 + }, + { + "epoch": 1.9810725552050474, + "grad_norm": 0.46823112725914234, + "learning_rate": 1.0162745542744028e-05, + "loss": 0.305, + "step": 10049 + }, + { + "epoch": 1.981269716088328, + "grad_norm": 0.46841106024358004, + "learning_rate": 1.016119572010091e-05, + "loss": 0.3323, + "step": 10050 + }, + { + "epoch": 1.9814668769716088, + "grad_norm": 0.44805736427909854, + "learning_rate": 1.015964589358493e-05, + "loss": 0.3241, + "step": 10051 + }, + { + "epoch": 1.9816640378548895, + "grad_norm": 0.46092527228178226, + "learning_rate": 1.015809606323333e-05, + "loss": 0.325, + "step": 10052 + }, + { + "epoch": 1.9818611987381702, + "grad_norm": 0.4813889305251375, + "learning_rate": 1.0156546229083346e-05, + "loss": 0.3346, + "step": 10053 + }, + { + "epoch": 1.9820583596214512, + "grad_norm": 0.48424701481984905, + "learning_rate": 1.0154996391172211e-05, + "loss": 0.3176, + "step": 10054 + }, + { + "epoch": 1.9822555205047319, + "grad_norm": 0.5052195601955368, + "learning_rate": 1.0153446549537164e-05, + "loss": 0.3738, + "step": 10055 + }, + { + "epoch": 1.9824526813880126, + "grad_norm": 0.5264958898669171, + "learning_rate": 1.0151896704215441e-05, + "loss": 0.3509, + "step": 10056 + }, + { + "epoch": 1.9826498422712935, + "grad_norm": 0.4424729969934259, + "learning_rate": 1.015034685524428e-05, + "loss": 0.3016, + "step": 10057 + }, + { + "epoch": 1.9828470031545742, + "grad_norm": 0.45230293654878495, + "learning_rate": 1.0148797002660909e-05, + "loss": 0.3415, + "step": 10058 + }, + { + "epoch": 1.9830441640378549, + "grad_norm": 0.4296621647967024, + "learning_rate": 1.0147247146502573e-05, + "loss": 0.3229, + "step": 10059 + }, + { + "epoch": 1.9832413249211358, + "grad_norm": 0.4736201795007952, + "learning_rate": 1.0145697286806505e-05, + "loss": 0.3193, + "step": 10060 + }, + { + "epoch": 1.9834384858044163, + "grad_norm": 0.4483471743167527, + "learning_rate": 1.0144147423609942e-05, + "loss": 0.3078, + "step": 10061 + }, + { + "epoch": 1.9836356466876972, + "grad_norm": 0.4795452273103116, + "learning_rate": 1.0142597556950123e-05, + "loss": 0.314, + "step": 10062 + }, + { + "epoch": 1.983832807570978, + "grad_norm": 0.4651739245868516, + "learning_rate": 1.014104768686428e-05, + "loss": 0.3245, + "step": 10063 + }, + { + "epoch": 1.9840299684542586, + "grad_norm": 0.4678656723303721, + "learning_rate": 1.0139497813389654e-05, + "loss": 0.3262, + "step": 10064 + }, + { + "epoch": 1.9842271293375395, + "grad_norm": 0.4663884375595269, + "learning_rate": 1.0137947936563481e-05, + "loss": 0.3314, + "step": 10065 + }, + { + "epoch": 1.9844242902208202, + "grad_norm": 0.4991990370561572, + "learning_rate": 1.0136398056422995e-05, + "loss": 0.3662, + "step": 10066 + }, + { + "epoch": 1.984621451104101, + "grad_norm": 0.4630237349751292, + "learning_rate": 1.013484817300544e-05, + "loss": 0.3131, + "step": 10067 + }, + { + "epoch": 1.9848186119873819, + "grad_norm": 0.4760005186639651, + "learning_rate": 1.0133298286348046e-05, + "loss": 0.3309, + "step": 10068 + }, + { + "epoch": 1.9850157728706623, + "grad_norm": 0.47490621093107355, + "learning_rate": 1.0131748396488057e-05, + "loss": 0.3426, + "step": 10069 + }, + { + "epoch": 1.9852129337539433, + "grad_norm": 0.4490036193130323, + "learning_rate": 1.0130198503462705e-05, + "loss": 0.3244, + "step": 10070 + }, + { + "epoch": 1.985410094637224, + "grad_norm": 0.43921214974034223, + "learning_rate": 1.0128648607309228e-05, + "loss": 0.3108, + "step": 10071 + }, + { + "epoch": 1.9856072555205047, + "grad_norm": 0.47746525475723234, + "learning_rate": 1.0127098708064866e-05, + "loss": 0.3631, + "step": 10072 + }, + { + "epoch": 1.9858044164037856, + "grad_norm": 0.44863839819394496, + "learning_rate": 1.0125548805766852e-05, + "loss": 0.3203, + "step": 10073 + }, + { + "epoch": 1.9860015772870663, + "grad_norm": 0.43643215846559164, + "learning_rate": 1.0123998900452431e-05, + "loss": 0.3055, + "step": 10074 + }, + { + "epoch": 1.986198738170347, + "grad_norm": 0.4722620454489846, + "learning_rate": 1.0122448992158834e-05, + "loss": 0.3485, + "step": 10075 + }, + { + "epoch": 1.986395899053628, + "grad_norm": 0.4686910381456083, + "learning_rate": 1.0120899080923306e-05, + "loss": 0.3494, + "step": 10076 + }, + { + "epoch": 1.9865930599369084, + "grad_norm": 0.5235470374582382, + "learning_rate": 1.0119349166783073e-05, + "loss": 0.3158, + "step": 10077 + }, + { + "epoch": 1.9867902208201893, + "grad_norm": 0.47775678927346366, + "learning_rate": 1.0117799249775387e-05, + "loss": 0.3274, + "step": 10078 + }, + { + "epoch": 1.98698738170347, + "grad_norm": 0.4703367415865138, + "learning_rate": 1.0116249329937474e-05, + "loss": 0.3429, + "step": 10079 + }, + { + "epoch": 1.9871845425867507, + "grad_norm": 0.4475956538792856, + "learning_rate": 1.0114699407306576e-05, + "loss": 0.3144, + "step": 10080 + }, + { + "epoch": 1.9873817034700316, + "grad_norm": 0.4401013553494955, + "learning_rate": 1.0113149481919938e-05, + "loss": 0.3181, + "step": 10081 + }, + { + "epoch": 1.9875788643533123, + "grad_norm": 0.4383130137426036, + "learning_rate": 1.0111599553814788e-05, + "loss": 0.3046, + "step": 10082 + }, + { + "epoch": 1.987776025236593, + "grad_norm": 0.4776571024905826, + "learning_rate": 1.0110049623028371e-05, + "loss": 0.3395, + "step": 10083 + }, + { + "epoch": 1.987973186119874, + "grad_norm": 0.4672666220627933, + "learning_rate": 1.0108499689597924e-05, + "loss": 0.3152, + "step": 10084 + }, + { + "epoch": 1.9881703470031544, + "grad_norm": 0.45203351381336665, + "learning_rate": 1.0106949753560682e-05, + "loss": 0.312, + "step": 10085 + }, + { + "epoch": 1.9883675078864353, + "grad_norm": 0.4774637834527684, + "learning_rate": 1.0105399814953889e-05, + "loss": 0.3212, + "step": 10086 + }, + { + "epoch": 1.988564668769716, + "grad_norm": 0.4517907516822674, + "learning_rate": 1.010384987381478e-05, + "loss": 0.2959, + "step": 10087 + }, + { + "epoch": 1.9887618296529967, + "grad_norm": 0.47886943448952884, + "learning_rate": 1.0102299930180592e-05, + "loss": 0.3485, + "step": 10088 + }, + { + "epoch": 1.9889589905362777, + "grad_norm": 0.6789498721589519, + "learning_rate": 1.0100749984088567e-05, + "loss": 0.3312, + "step": 10089 + }, + { + "epoch": 1.9891561514195584, + "grad_norm": 0.48039651499274616, + "learning_rate": 1.0099200035575943e-05, + "loss": 0.3376, + "step": 10090 + }, + { + "epoch": 1.989353312302839, + "grad_norm": 0.49344206103676497, + "learning_rate": 1.0097650084679957e-05, + "loss": 0.3566, + "step": 10091 + }, + { + "epoch": 1.98955047318612, + "grad_norm": 0.455626086749079, + "learning_rate": 1.0096100131437851e-05, + "loss": 0.3167, + "step": 10092 + }, + { + "epoch": 1.9897476340694005, + "grad_norm": 0.471900481186439, + "learning_rate": 1.009455017588686e-05, + "loss": 0.3215, + "step": 10093 + }, + { + "epoch": 1.9899447949526814, + "grad_norm": 0.43095008957712944, + "learning_rate": 1.0093000218064224e-05, + "loss": 0.3152, + "step": 10094 + }, + { + "epoch": 1.990141955835962, + "grad_norm": 0.47850763704179666, + "learning_rate": 1.0091450258007188e-05, + "loss": 0.3285, + "step": 10095 + }, + { + "epoch": 1.9903391167192428, + "grad_norm": 0.46902329428111766, + "learning_rate": 1.008990029575298e-05, + "loss": 0.327, + "step": 10096 + }, + { + "epoch": 1.9905362776025237, + "grad_norm": 0.4635116114920501, + "learning_rate": 1.008835033133885e-05, + "loss": 0.3245, + "step": 10097 + }, + { + "epoch": 1.9907334384858044, + "grad_norm": 0.4899631136805948, + "learning_rate": 1.0086800364802028e-05, + "loss": 0.3454, + "step": 10098 + }, + { + "epoch": 1.9909305993690851, + "grad_norm": 0.4424518951849392, + "learning_rate": 1.008525039617976e-05, + "loss": 0.2982, + "step": 10099 + }, + { + "epoch": 1.991127760252366, + "grad_norm": 0.4843796306436317, + "learning_rate": 1.008370042550928e-05, + "loss": 0.3226, + "step": 10100 + }, + { + "epoch": 1.9913249211356467, + "grad_norm": 0.47515439701906687, + "learning_rate": 1.0082150452827832e-05, + "loss": 0.3647, + "step": 10101 + }, + { + "epoch": 1.9915220820189274, + "grad_norm": 0.4984861179086406, + "learning_rate": 1.0080600478172653e-05, + "loss": 0.3689, + "step": 10102 + }, + { + "epoch": 1.9917192429022084, + "grad_norm": 0.46711955198863625, + "learning_rate": 1.0079050501580983e-05, + "loss": 0.3347, + "step": 10103 + }, + { + "epoch": 1.9919164037854888, + "grad_norm": 0.5227568346359791, + "learning_rate": 1.0077500523090058e-05, + "loss": 0.3125, + "step": 10104 + }, + { + "epoch": 1.9921135646687698, + "grad_norm": 0.4491627976049431, + "learning_rate": 1.0075950542737123e-05, + "loss": 0.3253, + "step": 10105 + }, + { + "epoch": 1.9923107255520505, + "grad_norm": 0.47884782845375357, + "learning_rate": 1.0074400560559416e-05, + "loss": 0.3402, + "step": 10106 + }, + { + "epoch": 1.9925078864353312, + "grad_norm": 0.45076597168659827, + "learning_rate": 1.0072850576594175e-05, + "loss": 0.321, + "step": 10107 + }, + { + "epoch": 1.992705047318612, + "grad_norm": 0.45577225933101717, + "learning_rate": 1.0071300590878639e-05, + "loss": 0.3216, + "step": 10108 + }, + { + "epoch": 1.9929022082018928, + "grad_norm": 0.4654018460101875, + "learning_rate": 1.006975060345005e-05, + "loss": 0.3336, + "step": 10109 + }, + { + "epoch": 1.9930993690851735, + "grad_norm": 0.4833827723808277, + "learning_rate": 1.0068200614345647e-05, + "loss": 0.3326, + "step": 10110 + }, + { + "epoch": 1.9932965299684544, + "grad_norm": 0.45506045092199265, + "learning_rate": 1.0066650623602667e-05, + "loss": 0.2972, + "step": 10111 + }, + { + "epoch": 1.993493690851735, + "grad_norm": 0.47117744890482555, + "learning_rate": 1.0065100631258356e-05, + "loss": 0.3359, + "step": 10112 + }, + { + "epoch": 1.9936908517350158, + "grad_norm": 0.4155693551302444, + "learning_rate": 1.0063550637349946e-05, + "loss": 0.3016, + "step": 10113 + }, + { + "epoch": 1.9938880126182965, + "grad_norm": 0.4796712431778576, + "learning_rate": 1.0062000641914683e-05, + "loss": 0.3038, + "step": 10114 + }, + { + "epoch": 1.9940851735015772, + "grad_norm": 0.4460608776935147, + "learning_rate": 1.00604506449898e-05, + "loss": 0.3156, + "step": 10115 + }, + { + "epoch": 1.9942823343848581, + "grad_norm": 0.4636295632848529, + "learning_rate": 1.0058900646612548e-05, + "loss": 0.34, + "step": 10116 + }, + { + "epoch": 1.9944794952681388, + "grad_norm": 0.4419540655820689, + "learning_rate": 1.0057350646820157e-05, + "loss": 0.3153, + "step": 10117 + }, + { + "epoch": 1.9946766561514195, + "grad_norm": 0.4595608276786376, + "learning_rate": 1.0055800645649874e-05, + "loss": 0.3308, + "step": 10118 + }, + { + "epoch": 1.9948738170347005, + "grad_norm": 0.46822641410388405, + "learning_rate": 1.0054250643138931e-05, + "loss": 0.3351, + "step": 10119 + }, + { + "epoch": 1.995070977917981, + "grad_norm": 0.4571949677373604, + "learning_rate": 1.0052700639324574e-05, + "loss": 0.3409, + "step": 10120 + }, + { + "epoch": 1.9952681388012619, + "grad_norm": 0.4565357948126349, + "learning_rate": 1.0051150634244042e-05, + "loss": 0.3246, + "step": 10121 + }, + { + "epoch": 1.9954652996845426, + "grad_norm": 0.4715824788374287, + "learning_rate": 1.0049600627934576e-05, + "loss": 0.3266, + "step": 10122 + }, + { + "epoch": 1.9956624605678233, + "grad_norm": 36.0684477965843, + "learning_rate": 1.0048050620433415e-05, + "loss": 0.53, + "step": 10123 + }, + { + "epoch": 1.9958596214511042, + "grad_norm": 0.4920636243774622, + "learning_rate": 1.0046500611777799e-05, + "loss": 0.3162, + "step": 10124 + }, + { + "epoch": 1.9960567823343849, + "grad_norm": 0.4865650474889048, + "learning_rate": 1.004495060200497e-05, + "loss": 0.3395, + "step": 10125 + }, + { + "epoch": 1.9962539432176656, + "grad_norm": 0.4730171891025844, + "learning_rate": 1.0043400591152162e-05, + "loss": 0.3264, + "step": 10126 + }, + { + "epoch": 1.9964511041009465, + "grad_norm": 0.49951458846201335, + "learning_rate": 1.0041850579256623e-05, + "loss": 0.3605, + "step": 10127 + }, + { + "epoch": 1.996648264984227, + "grad_norm": 0.4595162916836803, + "learning_rate": 1.0040300566355588e-05, + "loss": 0.3143, + "step": 10128 + }, + { + "epoch": 1.996845425867508, + "grad_norm": 0.48676470289418966, + "learning_rate": 1.00387505524863e-05, + "loss": 0.3381, + "step": 10129 + }, + { + "epoch": 1.9970425867507886, + "grad_norm": 0.4953287911408293, + "learning_rate": 1.0037200537686001e-05, + "loss": 0.3395, + "step": 10130 + }, + { + "epoch": 1.9972397476340693, + "grad_norm": 0.4507924402166945, + "learning_rate": 1.0035650521991927e-05, + "loss": 0.3162, + "step": 10131 + }, + { + "epoch": 1.9974369085173502, + "grad_norm": 0.4651561295109272, + "learning_rate": 1.0034100505441322e-05, + "loss": 0.3225, + "step": 10132 + }, + { + "epoch": 1.997634069400631, + "grad_norm": 0.45225944154125214, + "learning_rate": 1.0032550488071424e-05, + "loss": 0.3214, + "step": 10133 + }, + { + "epoch": 1.9978312302839116, + "grad_norm": 0.46650198815431315, + "learning_rate": 1.0031000469919474e-05, + "loss": 0.3524, + "step": 10134 + }, + { + "epoch": 1.9980283911671926, + "grad_norm": 0.4685090100993953, + "learning_rate": 1.0029450451022713e-05, + "loss": 0.3366, + "step": 10135 + }, + { + "epoch": 1.998225552050473, + "grad_norm": 0.46959485471009443, + "learning_rate": 1.002790043141838e-05, + "loss": 0.3227, + "step": 10136 + }, + { + "epoch": 1.998422712933754, + "grad_norm": 0.4875933386597677, + "learning_rate": 1.0026350411143719e-05, + "loss": 0.337, + "step": 10137 + }, + { + "epoch": 1.9986198738170347, + "grad_norm": 0.4640353105289818, + "learning_rate": 1.0024800390235967e-05, + "loss": 0.3296, + "step": 10138 + }, + { + "epoch": 1.9988170347003154, + "grad_norm": 0.4365597644010865, + "learning_rate": 1.0023250368732367e-05, + "loss": 0.3246, + "step": 10139 + }, + { + "epoch": 1.9990141955835963, + "grad_norm": 0.463634992887384, + "learning_rate": 1.0021700346670156e-05, + "loss": 0.3476, + "step": 10140 + }, + { + "epoch": 1.999211356466877, + "grad_norm": 0.4488592770922772, + "learning_rate": 1.0020150324086575e-05, + "loss": 0.3288, + "step": 10141 + }, + { + "epoch": 1.9994085173501577, + "grad_norm": 0.4474284368558747, + "learning_rate": 1.0018600301018873e-05, + "loss": 0.3236, + "step": 10142 + }, + { + "epoch": 1.9996056782334386, + "grad_norm": 0.47845080896837267, + "learning_rate": 1.0017050277504276e-05, + "loss": 0.3214, + "step": 10143 + }, + { + "epoch": 1.999802839116719, + "grad_norm": 0.47724597287669274, + "learning_rate": 1.0015500253580039e-05, + "loss": 0.3339, + "step": 10144 + }, + { + "epoch": 1.999802839116719, + "eval_loss": 0.41522932052612305, + "eval_runtime": 344.5021, + "eval_samples_per_second": 23.599, + "eval_steps_per_second": 1.477, + "step": 10144 + } + ], + "logging_steps": 1, + "max_steps": 20288, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 5072, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.323382503112704e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}