{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 300000, "global_step": 1125165, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013331378064550532, "grad_norm": 48.62757873535156, "learning_rate": 1.447398491658414e-07, "loss": 7.9433, "step": 100 }, { "epoch": 0.0026662756129101064, "grad_norm": 63.5054931640625, "learning_rate": 2.97097585129885e-07, "loss": 6.6101, "step": 200 }, { "epoch": 0.00399941341936516, "grad_norm": 28.41693687438965, "learning_rate": 4.494553210939285e-07, "loss": 5.0121, "step": 300 }, { "epoch": 0.005332551225820213, "grad_norm": 29.294654846191406, "learning_rate": 6.018130570579721e-07, "loss": 3.6115, "step": 400 }, { "epoch": 0.0066656890322752665, "grad_norm": 31.856111526489258, "learning_rate": 7.526472156623753e-07, "loss": 3.353, "step": 500 }, { "epoch": 0.00799882683873032, "grad_norm": 21.679893493652344, "learning_rate": 9.050049516264189e-07, "loss": 3.0315, "step": 600 }, { "epoch": 0.009331964645185372, "grad_norm": 15.496798515319824, "learning_rate": 1.0573626875904623e-06, "loss": 2.8661, "step": 700 }, { "epoch": 0.010665102451640426, "grad_norm": 23.380613327026367, "learning_rate": 1.209720423554506e-06, "loss": 2.5809, "step": 800 }, { "epoch": 0.01199824025809548, "grad_norm": 24.171405792236328, "learning_rate": 1.3620781595185497e-06, "loss": 2.599, "step": 900 }, { "epoch": 0.013331378064550533, "grad_norm": 39.315853118896484, "learning_rate": 1.5144358954825931e-06, "loss": 2.4091, "step": 1000 }, { "epoch": 0.014664515871005587, "grad_norm": 27.648170471191406, "learning_rate": 1.6667936314466368e-06, "loss": 2.4422, "step": 1100 }, { "epoch": 0.01599765367746064, "grad_norm": 15.394720077514648, "learning_rate": 1.8191513674106803e-06, "loss": 2.3096, "step": 1200 }, { "epoch": 0.017330791483915694, "grad_norm": 23.057252883911133, "learning_rate": 1.9715091033747238e-06, "loss": 2.2327, 
"step": 1300 }, { "epoch": 0.018663929290370744, "grad_norm": 21.54753303527832, "learning_rate": 2.1238668393387676e-06, "loss": 2.0939, "step": 1400 }, { "epoch": 0.019997067096825798, "grad_norm": 24.023548126220703, "learning_rate": 2.276224575302811e-06, "loss": 2.2677, "step": 1500 }, { "epoch": 0.02133020490328085, "grad_norm": 22.988351821899414, "learning_rate": 2.428582311266855e-06, "loss": 2.0709, "step": 1600 }, { "epoch": 0.022663342709735905, "grad_norm": 17.036029815673828, "learning_rate": 2.580940047230898e-06, "loss": 2.0102, "step": 1700 }, { "epoch": 0.02399648051619096, "grad_norm": 18.221567153930664, "learning_rate": 2.7332977831949415e-06, "loss": 1.9705, "step": 1800 }, { "epoch": 0.025329618322646012, "grad_norm": 25.236820220947266, "learning_rate": 2.8856555191589854e-06, "loss": 2.042, "step": 1900 }, { "epoch": 0.026662756129101066, "grad_norm": 18.677886962890625, "learning_rate": 3.038013255123029e-06, "loss": 2.1647, "step": 2000 }, { "epoch": 0.02799589393555612, "grad_norm": 18.681873321533203, "learning_rate": 3.1903709910870728e-06, "loss": 1.9771, "step": 2100 }, { "epoch": 0.029329031742011173, "grad_norm": 17.565290451049805, "learning_rate": 3.3427287270511162e-06, "loss": 1.8713, "step": 2200 }, { "epoch": 0.030662169548466223, "grad_norm": 27.260120391845703, "learning_rate": 3.4950864630151597e-06, "loss": 1.7865, "step": 2300 }, { "epoch": 0.03199530735492128, "grad_norm": 20.9194393157959, "learning_rate": 3.647444198979203e-06, "loss": 1.7126, "step": 2400 }, { "epoch": 0.033328445161376334, "grad_norm": 25.235183715820312, "learning_rate": 3.799801934943247e-06, "loss": 1.7669, "step": 2500 }, { "epoch": 0.03466158296783139, "grad_norm": 21.713947296142578, "learning_rate": 3.9521596709072905e-06, "loss": 1.657, "step": 2600 }, { "epoch": 0.03599472077428644, "grad_norm": 20.2688045501709, "learning_rate": 4.102993829511693e-06, "loss": 1.7223, "step": 2700 }, { "epoch": 0.03732785858074149, "grad_norm": 
12.779273986816406, "learning_rate": 4.255351565475737e-06, "loss": 1.7237, "step": 2800 }, { "epoch": 0.03866099638719654, "grad_norm": 29.376150131225586, "learning_rate": 4.40770930143978e-06, "loss": 1.9161, "step": 2900 }, { "epoch": 0.039994134193651595, "grad_norm": 21.00773811340332, "learning_rate": 4.560067037403824e-06, "loss": 1.6736, "step": 3000 }, { "epoch": 0.04132727200010665, "grad_norm": 22.627586364746094, "learning_rate": 4.712424773367868e-06, "loss": 1.5514, "step": 3100 }, { "epoch": 0.0426604098065617, "grad_norm": 17.85454559326172, "learning_rate": 4.864782509331911e-06, "loss": 1.458, "step": 3200 }, { "epoch": 0.043993547613016756, "grad_norm": 13.928373336791992, "learning_rate": 5.017140245295955e-06, "loss": 1.5452, "step": 3300 }, { "epoch": 0.04532668541947181, "grad_norm": 21.45450210571289, "learning_rate": 5.169497981259999e-06, "loss": 1.4685, "step": 3400 }, { "epoch": 0.046659823225926864, "grad_norm": 34.083961486816406, "learning_rate": 5.321855717224043e-06, "loss": 1.6222, "step": 3500 }, { "epoch": 0.04799296103238192, "grad_norm": 55.6972770690918, "learning_rate": 5.474213453188086e-06, "loss": 1.4822, "step": 3600 }, { "epoch": 0.04932609883883697, "grad_norm": 19.22019386291504, "learning_rate": 5.626571189152129e-06, "loss": 1.4493, "step": 3700 }, { "epoch": 0.050659236645292025, "grad_norm": 17.8115291595459, "learning_rate": 5.778928925116173e-06, "loss": 1.6283, "step": 3800 }, { "epoch": 0.05199237445174708, "grad_norm": 35.747703552246094, "learning_rate": 5.931286661080217e-06, "loss": 1.4171, "step": 3900 }, { "epoch": 0.05332551225820213, "grad_norm": 19.620441436767578, "learning_rate": 6.0836443970442605e-06, "loss": 1.5073, "step": 4000 }, { "epoch": 0.054658650064657185, "grad_norm": 12.912127494812012, "learning_rate": 6.2360021330083036e-06, "loss": 1.5513, "step": 4100 }, { "epoch": 0.05599178787111224, "grad_norm": 30.24208641052246, "learning_rate": 6.388359868972347e-06, "loss": 1.5101, "step": 
4200 }, { "epoch": 0.05732492567756729, "grad_norm": 48.571590423583984, "learning_rate": 6.540717604936391e-06, "loss": 1.6273, "step": 4300 }, { "epoch": 0.058658063484022346, "grad_norm": 14.590188980102539, "learning_rate": 6.693075340900434e-06, "loss": 1.3831, "step": 4400 }, { "epoch": 0.0599912012904774, "grad_norm": 17.75982093811035, "learning_rate": 6.845433076864478e-06, "loss": 1.3821, "step": 4500 }, { "epoch": 0.06132433909693245, "grad_norm": 16.415212631225586, "learning_rate": 6.997790812828521e-06, "loss": 1.3892, "step": 4600 }, { "epoch": 0.06265747690338751, "grad_norm": 42.8680419921875, "learning_rate": 7.150148548792565e-06, "loss": 1.4241, "step": 4700 }, { "epoch": 0.06399061470984256, "grad_norm": 19.037755966186523, "learning_rate": 7.302506284756609e-06, "loss": 1.316, "step": 4800 }, { "epoch": 0.06532375251629761, "grad_norm": 25.890825271606445, "learning_rate": 7.454864020720652e-06, "loss": 1.2999, "step": 4900 }, { "epoch": 0.06665689032275267, "grad_norm": 10.623682022094727, "learning_rate": 7.607221756684695e-06, "loss": 1.399, "step": 5000 }, { "epoch": 0.06799002812920772, "grad_norm": 17.25834846496582, "learning_rate": 7.75957949264874e-06, "loss": 1.4151, "step": 5100 }, { "epoch": 0.06932316593566278, "grad_norm": 14.895671844482422, "learning_rate": 7.911937228612784e-06, "loss": 1.3543, "step": 5200 }, { "epoch": 0.07065630374211783, "grad_norm": 12.48409366607666, "learning_rate": 8.064294964576826e-06, "loss": 1.4719, "step": 5300 }, { "epoch": 0.07198944154857288, "grad_norm": 17.75235939025879, "learning_rate": 8.21665270054087e-06, "loss": 1.3447, "step": 5400 }, { "epoch": 0.07332257935502792, "grad_norm": 6.568273544311523, "learning_rate": 8.369010436504914e-06, "loss": 1.3517, "step": 5500 }, { "epoch": 0.07465571716148298, "grad_norm": 14.745866775512695, "learning_rate": 8.521368172468958e-06, "loss": 1.3219, "step": 5600 }, { "epoch": 0.07598885496793803, "grad_norm": 18.130077362060547, "learning_rate": 
8.673725908433002e-06, "loss": 1.2975, "step": 5700 }, { "epoch": 0.07732199277439308, "grad_norm": 22.177501678466797, "learning_rate": 8.826083644397044e-06, "loss": 1.2784, "step": 5800 }, { "epoch": 0.07865513058084814, "grad_norm": 22.109601974487305, "learning_rate": 8.978441380361088e-06, "loss": 1.2234, "step": 5900 }, { "epoch": 0.07998826838730319, "grad_norm": 16.332239151000977, "learning_rate": 9.130799116325132e-06, "loss": 1.3506, "step": 6000 }, { "epoch": 0.08132140619375824, "grad_norm": 15.126394271850586, "learning_rate": 9.283156852289175e-06, "loss": 1.3464, "step": 6100 }, { "epoch": 0.0826545440002133, "grad_norm": 10.728598594665527, "learning_rate": 9.43551458825322e-06, "loss": 1.3076, "step": 6200 }, { "epoch": 0.08398768180666835, "grad_norm": 18.38832664489746, "learning_rate": 9.587872324217262e-06, "loss": 1.2603, "step": 6300 }, { "epoch": 0.0853208196131234, "grad_norm": 22.932697296142578, "learning_rate": 9.740230060181305e-06, "loss": 1.3272, "step": 6400 }, { "epoch": 0.08665395741957846, "grad_norm": 29.621030807495117, "learning_rate": 9.892587796145351e-06, "loss": 1.3011, "step": 6500 }, { "epoch": 0.08798709522603351, "grad_norm": 15.129687309265137, "learning_rate": 1.0044945532109393e-05, "loss": 1.3012, "step": 6600 }, { "epoch": 0.08932023303248857, "grad_norm": 19.68153190612793, "learning_rate": 1.0195779690713795e-05, "loss": 1.3359, "step": 6700 }, { "epoch": 0.09065337083894362, "grad_norm": 10.530654907226562, "learning_rate": 1.034813742667784e-05, "loss": 1.3565, "step": 6800 }, { "epoch": 0.09198650864539867, "grad_norm": 21.431621551513672, "learning_rate": 1.0500495162641885e-05, "loss": 1.3282, "step": 6900 }, { "epoch": 0.09331964645185373, "grad_norm": 9.689653396606445, "learning_rate": 1.0652852898605927e-05, "loss": 1.2813, "step": 7000 }, { "epoch": 0.09465278425830878, "grad_norm": 11.680956840515137, "learning_rate": 1.080521063456997e-05, "loss": 1.3225, "step": 7100 }, { "epoch": 
0.09598592206476383, "grad_norm": 51.08759689331055, "learning_rate": 1.0957568370534013e-05, "loss": 1.3099, "step": 7200 }, { "epoch": 0.09731905987121889, "grad_norm": 12.349186897277832, "learning_rate": 1.1109926106498059e-05, "loss": 1.3677, "step": 7300 }, { "epoch": 0.09865219767767394, "grad_norm": 11.336565971374512, "learning_rate": 1.1262283842462102e-05, "loss": 1.2945, "step": 7400 }, { "epoch": 0.099985335484129, "grad_norm": 22.537038803100586, "learning_rate": 1.1414641578426145e-05, "loss": 1.2762, "step": 7500 }, { "epoch": 0.10131847329058405, "grad_norm": 14.839827537536621, "learning_rate": 1.1566999314390188e-05, "loss": 1.2577, "step": 7600 }, { "epoch": 0.1026516110970391, "grad_norm": 20.88617515563965, "learning_rate": 1.171935705035423e-05, "loss": 1.2336, "step": 7700 }, { "epoch": 0.10398474890349416, "grad_norm": 15.363484382629395, "learning_rate": 1.1871714786318276e-05, "loss": 1.3189, "step": 7800 }, { "epoch": 0.10531788670994921, "grad_norm": 15.221962928771973, "learning_rate": 1.202407252228232e-05, "loss": 1.1953, "step": 7900 }, { "epoch": 0.10665102451640426, "grad_norm": 16.448259353637695, "learning_rate": 1.2176430258246362e-05, "loss": 1.2209, "step": 8000 }, { "epoch": 0.10798416232285932, "grad_norm": 25.986299514770508, "learning_rate": 1.2328787994210406e-05, "loss": 1.2431, "step": 8100 }, { "epoch": 0.10931730012931437, "grad_norm": 16.5120906829834, "learning_rate": 1.2481145730174448e-05, "loss": 1.2337, "step": 8200 }, { "epoch": 0.11065043793576942, "grad_norm": 36.10545349121094, "learning_rate": 1.2633503466138494e-05, "loss": 1.2051, "step": 8300 }, { "epoch": 0.11198357574222448, "grad_norm": 8.288851737976074, "learning_rate": 1.2785861202102538e-05, "loss": 1.3947, "step": 8400 }, { "epoch": 0.11331671354867953, "grad_norm": 8.871662139892578, "learning_rate": 1.293821893806658e-05, "loss": 1.3009, "step": 8500 }, { "epoch": 0.11464985135513459, "grad_norm": 17.308908462524414, "learning_rate": 
1.3090576674030624e-05, "loss": 1.2073, "step": 8600 }, { "epoch": 0.11598298916158964, "grad_norm": 7.278769016265869, "learning_rate": 1.3242934409994668e-05, "loss": 1.1842, "step": 8700 }, { "epoch": 0.11731612696804469, "grad_norm": 47.840576171875, "learning_rate": 1.3395292145958712e-05, "loss": 1.3531, "step": 8800 }, { "epoch": 0.11864926477449975, "grad_norm": 24.470443725585938, "learning_rate": 1.3547649881922756e-05, "loss": 1.3015, "step": 8900 }, { "epoch": 0.1199824025809548, "grad_norm": 8.26863956451416, "learning_rate": 1.3700007617886798e-05, "loss": 1.3005, "step": 9000 }, { "epoch": 0.12131554038740984, "grad_norm": 17.496753692626953, "learning_rate": 1.3852365353850842e-05, "loss": 1.3463, "step": 9100 }, { "epoch": 0.1226486781938649, "grad_norm": 21.325342178344727, "learning_rate": 1.4004723089814886e-05, "loss": 1.1968, "step": 9200 }, { "epoch": 0.12398181600031995, "grad_norm": 22.53859519958496, "learning_rate": 1.415708082577893e-05, "loss": 1.3186, "step": 9300 }, { "epoch": 0.12531495380677501, "grad_norm": 11.75147819519043, "learning_rate": 1.4309438561742973e-05, "loss": 1.2582, "step": 9400 }, { "epoch": 0.12664809161323007, "grad_norm": 11.544166564941406, "learning_rate": 1.4461796297707016e-05, "loss": 1.3282, "step": 9500 }, { "epoch": 0.12798122941968512, "grad_norm": 9.322103500366211, "learning_rate": 1.461415403367106e-05, "loss": 1.2283, "step": 9600 }, { "epoch": 0.12931436722614018, "grad_norm": 7.91118860244751, "learning_rate": 1.4766511769635105e-05, "loss": 1.2184, "step": 9700 }, { "epoch": 0.13064750503259523, "grad_norm": 5.996368885040283, "learning_rate": 1.4918869505599147e-05, "loss": 1.3688, "step": 9800 }, { "epoch": 0.13198064283905028, "grad_norm": 8.645076751708984, "learning_rate": 1.507122724156319e-05, "loss": 1.1556, "step": 9900 }, { "epoch": 0.13331378064550534, "grad_norm": 15.709826469421387, "learning_rate": 1.5223584977527233e-05, "loss": 1.2622, "step": 10000 }, { "epoch": 
0.1346469184519604, "grad_norm": 11.15367317199707, "learning_rate": 1.537594271349128e-05, "loss": 1.1092, "step": 10100 }, { "epoch": 0.13598005625841544, "grad_norm": 8.455020904541016, "learning_rate": 1.552830044945532e-05, "loss": 1.1032, "step": 10200 }, { "epoch": 0.1373131940648705, "grad_norm": 28.911344528198242, "learning_rate": 1.5680658185419367e-05, "loss": 1.2377, "step": 10300 }, { "epoch": 0.13864633187132555, "grad_norm": 9.72524356842041, "learning_rate": 1.583301592138341e-05, "loss": 1.1404, "step": 10400 }, { "epoch": 0.1399794696777806, "grad_norm": 22.322458267211914, "learning_rate": 1.598537365734745e-05, "loss": 1.2709, "step": 10500 }, { "epoch": 0.14131260748423566, "grad_norm": 17.362812042236328, "learning_rate": 1.6137731393311497e-05, "loss": 1.1408, "step": 10600 }, { "epoch": 0.1426457452906907, "grad_norm": 9.3365478515625, "learning_rate": 1.629008912927554e-05, "loss": 1.1962, "step": 10700 }, { "epoch": 0.14397888309714577, "grad_norm": 11.509284973144531, "learning_rate": 1.6442446865239585e-05, "loss": 1.2002, "step": 10800 }, { "epoch": 0.1453120209036008, "grad_norm": 22.724319458007812, "learning_rate": 1.6594804601203627e-05, "loss": 1.2339, "step": 10900 }, { "epoch": 0.14664515871005585, "grad_norm": 6.827927112579346, "learning_rate": 1.674716233716767e-05, "loss": 1.2671, "step": 11000 }, { "epoch": 0.1479782965165109, "grad_norm": 17.569862365722656, "learning_rate": 1.6899520073131714e-05, "loss": 1.1371, "step": 11100 }, { "epoch": 0.14931143432296595, "grad_norm": 18.67312240600586, "learning_rate": 1.7051877809095757e-05, "loss": 1.1835, "step": 11200 }, { "epoch": 0.150644572129421, "grad_norm": 3.237417697906494, "learning_rate": 1.7204235545059802e-05, "loss": 1.2318, "step": 11300 }, { "epoch": 0.15197770993587606, "grad_norm": 11.685154914855957, "learning_rate": 1.7356593281023844e-05, "loss": 1.2314, "step": 11400 }, { "epoch": 0.1533108477423311, "grad_norm": 20.69179344177246, "learning_rate": 
1.7508951016987887e-05, "loss": 1.3132, "step": 11500 }, { "epoch": 0.15464398554878617, "grad_norm": 21.22089195251465, "learning_rate": 1.7661308752951932e-05, "loss": 1.2097, "step": 11600 }, { "epoch": 0.15597712335524122, "grad_norm": 12.935917854309082, "learning_rate": 1.7813666488915974e-05, "loss": 1.2295, "step": 11700 }, { "epoch": 0.15731026116169627, "grad_norm": 24.15074920654297, "learning_rate": 1.796602422488002e-05, "loss": 1.1744, "step": 11800 }, { "epoch": 0.15864339896815133, "grad_norm": 12.920533180236816, "learning_rate": 1.8118381960844062e-05, "loss": 1.2465, "step": 11900 }, { "epoch": 0.15997653677460638, "grad_norm": 19.24158477783203, "learning_rate": 1.8270739696808104e-05, "loss": 1.221, "step": 12000 }, { "epoch": 0.16130967458106144, "grad_norm": 22.08138084411621, "learning_rate": 1.842309743277215e-05, "loss": 1.2385, "step": 12100 }, { "epoch": 0.1626428123875165, "grad_norm": 14.95280933380127, "learning_rate": 1.8575455168736192e-05, "loss": 1.2322, "step": 12200 }, { "epoch": 0.16397595019397154, "grad_norm": 18.447845458984375, "learning_rate": 1.8727812904700238e-05, "loss": 1.1289, "step": 12300 }, { "epoch": 0.1653090880004266, "grad_norm": 12.489338874816895, "learning_rate": 1.888017064066428e-05, "loss": 1.0938, "step": 12400 }, { "epoch": 0.16664222580688165, "grad_norm": 14.297622680664062, "learning_rate": 1.9032528376628322e-05, "loss": 1.2019, "step": 12500 }, { "epoch": 0.1679753636133367, "grad_norm": 38.17632293701172, "learning_rate": 1.9183362535232726e-05, "loss": 1.2035, "step": 12600 }, { "epoch": 0.16930850141979176, "grad_norm": 16.18353843688965, "learning_rate": 1.933572027119677e-05, "loss": 1.251, "step": 12700 }, { "epoch": 0.1706416392262468, "grad_norm": 10.895003318786621, "learning_rate": 1.9488078007160814e-05, "loss": 1.2347, "step": 12800 }, { "epoch": 0.17197477703270186, "grad_norm": 14.233658790588379, "learning_rate": 1.964043574312486e-05, "loss": 1.3228, "step": 12900 }, { "epoch": 
0.17330791483915692, "grad_norm": 10.391095161437988, "learning_rate": 1.97927934790889e-05, "loss": 1.1204, "step": 13000 }, { "epoch": 0.17464105264561197, "grad_norm": 11.41118335723877, "learning_rate": 1.9945151215052944e-05, "loss": 1.343, "step": 13100 }, { "epoch": 0.17597419045206703, "grad_norm": 13.362486839294434, "learning_rate": 2.009750895101699e-05, "loss": 1.268, "step": 13200 }, { "epoch": 0.17730732825852208, "grad_norm": 12.438713073730469, "learning_rate": 2.024986668698103e-05, "loss": 1.1474, "step": 13300 }, { "epoch": 0.17864046606497713, "grad_norm": 13.487771987915039, "learning_rate": 2.0402224422945077e-05, "loss": 1.1365, "step": 13400 }, { "epoch": 0.1799736038714322, "grad_norm": 6.871811866760254, "learning_rate": 2.055458215890912e-05, "loss": 1.2296, "step": 13500 }, { "epoch": 0.18130674167788724, "grad_norm": 20.80048370361328, "learning_rate": 2.070693989487316e-05, "loss": 1.283, "step": 13600 }, { "epoch": 0.1826398794843423, "grad_norm": 16.09881019592285, "learning_rate": 2.0859297630837207e-05, "loss": 1.1789, "step": 13700 }, { "epoch": 0.18397301729079735, "grad_norm": 15.140607833862305, "learning_rate": 2.101165536680125e-05, "loss": 1.1714, "step": 13800 }, { "epoch": 0.1853061550972524, "grad_norm": 7.161064147949219, "learning_rate": 2.1164013102765295e-05, "loss": 1.148, "step": 13900 }, { "epoch": 0.18663929290370745, "grad_norm": 9.653188705444336, "learning_rate": 2.1316370838729337e-05, "loss": 1.1741, "step": 14000 }, { "epoch": 0.1879724307101625, "grad_norm": 14.273736953735352, "learning_rate": 2.146872857469338e-05, "loss": 1.2164, "step": 14100 }, { "epoch": 0.18930556851661756, "grad_norm": 8.113191604614258, "learning_rate": 2.1621086310657425e-05, "loss": 1.1719, "step": 14200 }, { "epoch": 0.19063870632307262, "grad_norm": 6.882025241851807, "learning_rate": 2.1773444046621467e-05, "loss": 1.1334, "step": 14300 }, { "epoch": 0.19197184412952767, "grad_norm": 21.5977725982666, "learning_rate": 
2.1925801782585512e-05, "loss": 1.2622, "step": 14400 }, { "epoch": 0.19330498193598272, "grad_norm": 46.230125427246094, "learning_rate": 2.2078159518549555e-05, "loss": 1.2079, "step": 14500 }, { "epoch": 0.19463811974243778, "grad_norm": 19.497507095336914, "learning_rate": 2.2230517254513597e-05, "loss": 1.2298, "step": 14600 }, { "epoch": 0.19597125754889283, "grad_norm": 12.560151100158691, "learning_rate": 2.2382874990477642e-05, "loss": 1.1804, "step": 14700 }, { "epoch": 0.19730439535534788, "grad_norm": 20.856603622436523, "learning_rate": 2.2535232726441685e-05, "loss": 1.1791, "step": 14800 }, { "epoch": 0.19863753316180294, "grad_norm": 16.704465866088867, "learning_rate": 2.268759046240573e-05, "loss": 1.2124, "step": 14900 }, { "epoch": 0.199970670968258, "grad_norm": 9.750483512878418, "learning_rate": 2.2839948198369776e-05, "loss": 1.2249, "step": 15000 }, { "epoch": 0.20130380877471304, "grad_norm": 7.600627422332764, "learning_rate": 2.2992305934333815e-05, "loss": 1.2409, "step": 15100 }, { "epoch": 0.2026369465811681, "grad_norm": 33.99422836303711, "learning_rate": 2.314466367029786e-05, "loss": 1.2728, "step": 15200 }, { "epoch": 0.20397008438762315, "grad_norm": 14.20000171661377, "learning_rate": 2.3297021406261902e-05, "loss": 1.0914, "step": 15300 }, { "epoch": 0.2053032221940782, "grad_norm": 10.18528938293457, "learning_rate": 2.3449379142225948e-05, "loss": 1.242, "step": 15400 }, { "epoch": 0.20663636000053326, "grad_norm": 30.62076187133789, "learning_rate": 2.3601736878189994e-05, "loss": 1.2025, "step": 15500 }, { "epoch": 0.2079694978069883, "grad_norm": 27.396848678588867, "learning_rate": 2.3754094614154032e-05, "loss": 1.1442, "step": 15600 }, { "epoch": 0.20930263561344337, "grad_norm": 18.30869483947754, "learning_rate": 2.3906452350118078e-05, "loss": 1.2605, "step": 15700 }, { "epoch": 0.21063577341989842, "grad_norm": 12.99421501159668, "learning_rate": 2.405881008608212e-05, "loss": 1.1484, "step": 15800 }, { "epoch": 
0.21196891122635347, "grad_norm": 9.586811065673828, "learning_rate": 2.4211167822046166e-05, "loss": 1.1704, "step": 15900 }, { "epoch": 0.21330204903280853, "grad_norm": 9.95820140838623, "learning_rate": 2.436352555801021e-05, "loss": 1.1198, "step": 16000 }, { "epoch": 0.21463518683926358, "grad_norm": 15.190288543701172, "learning_rate": 2.451588329397425e-05, "loss": 1.2225, "step": 16100 }, { "epoch": 0.21596832464571863, "grad_norm": 14.05370044708252, "learning_rate": 2.4668241029938296e-05, "loss": 1.2019, "step": 16200 }, { "epoch": 0.2173014624521737, "grad_norm": 44.88285827636719, "learning_rate": 2.4820598765902338e-05, "loss": 1.2461, "step": 16300 }, { "epoch": 0.21863460025862874, "grad_norm": 16.069385528564453, "learning_rate": 2.4972956501866384e-05, "loss": 1.3377, "step": 16400 }, { "epoch": 0.2199677380650838, "grad_norm": 13.041169166564941, "learning_rate": 2.512531423783043e-05, "loss": 1.0649, "step": 16500 }, { "epoch": 0.22130087587153885, "grad_norm": 15.148236274719238, "learning_rate": 2.5277671973794468e-05, "loss": 1.1981, "step": 16600 }, { "epoch": 0.2226340136779939, "grad_norm": 8.328532218933105, "learning_rate": 2.5430029709758513e-05, "loss": 1.252, "step": 16700 }, { "epoch": 0.22396715148444896, "grad_norm": 15.253068923950195, "learning_rate": 2.5582387445722556e-05, "loss": 1.228, "step": 16800 }, { "epoch": 0.225300289290904, "grad_norm": 14.996654510498047, "learning_rate": 2.57347451816866e-05, "loss": 1.3157, "step": 16900 }, { "epoch": 0.22663342709735906, "grad_norm": 7.6138715744018555, "learning_rate": 2.5887102917650647e-05, "loss": 1.1341, "step": 17000 }, { "epoch": 0.22796656490381412, "grad_norm": 19.87457275390625, "learning_rate": 2.6037937076255047e-05, "loss": 1.1554, "step": 17100 }, { "epoch": 0.22929970271026917, "grad_norm": 29.767147064208984, "learning_rate": 2.6190294812219093e-05, "loss": 1.3142, "step": 17200 }, { "epoch": 0.23063284051672422, "grad_norm": 21.5781307220459, "learning_rate": 
2.6342652548183135e-05, "loss": 1.1348, "step": 17300 }, { "epoch": 0.23196597832317928, "grad_norm": 55.95188522338867, "learning_rate": 2.6495010284147177e-05, "loss": 1.1673, "step": 17400 }, { "epoch": 0.23329911612963433, "grad_norm": 12.587220191955566, "learning_rate": 2.6647368020111223e-05, "loss": 1.2857, "step": 17500 }, { "epoch": 0.23463225393608939, "grad_norm": 15.643916130065918, "learning_rate": 2.6799725756075265e-05, "loss": 1.2935, "step": 17600 }, { "epoch": 0.23596539174254444, "grad_norm": 23.042165756225586, "learning_rate": 2.695208349203931e-05, "loss": 1.1582, "step": 17700 }, { "epoch": 0.2372985295489995, "grad_norm": 7.61171817779541, "learning_rate": 2.7104441228003353e-05, "loss": 1.1805, "step": 17800 }, { "epoch": 0.23863166735545455, "grad_norm": 8.766804695129395, "learning_rate": 2.7256798963967395e-05, "loss": 1.2207, "step": 17900 }, { "epoch": 0.2399648051619096, "grad_norm": 20.35345458984375, "learning_rate": 2.740915669993144e-05, "loss": 1.1717, "step": 18000 }, { "epoch": 0.24129794296836465, "grad_norm": 11.304494857788086, "learning_rate": 2.7561514435895483e-05, "loss": 1.1543, "step": 18100 }, { "epoch": 0.24263108077481968, "grad_norm": 19.132272720336914, "learning_rate": 2.7713872171859528e-05, "loss": 1.1658, "step": 18200 }, { "epoch": 0.24396421858127473, "grad_norm": 10.874183654785156, "learning_rate": 2.786622990782357e-05, "loss": 1.2176, "step": 18300 }, { "epoch": 0.2452973563877298, "grad_norm": 7.204709529876709, "learning_rate": 2.8018587643787613e-05, "loss": 1.2411, "step": 18400 }, { "epoch": 0.24663049419418484, "grad_norm": 10.241418838500977, "learning_rate": 2.8170945379751658e-05, "loss": 1.1726, "step": 18500 }, { "epoch": 0.2479636320006399, "grad_norm": 25.872150421142578, "learning_rate": 2.83233031157157e-05, "loss": 1.1105, "step": 18600 }, { "epoch": 0.24929676980709495, "grad_norm": 5.569912910461426, "learning_rate": 2.8475660851679746e-05, "loss": 1.1818, "step": 18700 }, { "epoch": 
0.25062990761355003, "grad_norm": 10.390409469604492, "learning_rate": 2.8628018587643788e-05, "loss": 1.194, "step": 18800 }, { "epoch": 0.2519630454200051, "grad_norm": 9.97793197631836, "learning_rate": 2.878037632360783e-05, "loss": 1.3648, "step": 18900 }, { "epoch": 0.25329618322646014, "grad_norm": 6.428100109100342, "learning_rate": 2.8932734059571876e-05, "loss": 1.1224, "step": 19000 }, { "epoch": 0.2546293210329152, "grad_norm": 9.965995788574219, "learning_rate": 2.9085091795535918e-05, "loss": 1.171, "step": 19100 }, { "epoch": 0.25596245883937024, "grad_norm": 12.338706970214844, "learning_rate": 2.9237449531499964e-05, "loss": 1.2639, "step": 19200 }, { "epoch": 0.2572955966458253, "grad_norm": 9.033255577087402, "learning_rate": 2.9388283690104367e-05, "loss": 1.1921, "step": 19300 }, { "epoch": 0.25862873445228035, "grad_norm": 11.28254508972168, "learning_rate": 2.954064142606841e-05, "loss": 1.277, "step": 19400 }, { "epoch": 0.2599618722587354, "grad_norm": 37.26377868652344, "learning_rate": 2.9692999162032452e-05, "loss": 1.131, "step": 19500 }, { "epoch": 0.26129501006519046, "grad_norm": 29.9090576171875, "learning_rate": 2.9845356897996497e-05, "loss": 1.2951, "step": 19600 }, { "epoch": 0.2626281478716455, "grad_norm": 20.856468200683594, "learning_rate": 2.999771463396054e-05, "loss": 1.2807, "step": 19700 }, { "epoch": 0.26396128567810057, "grad_norm": 3.655729293823242, "learning_rate": 3.0150072369924585e-05, "loss": 1.1182, "step": 19800 }, { "epoch": 0.2652944234845556, "grad_norm": 7.873777389526367, "learning_rate": 3.0302430105888624e-05, "loss": 1.1194, "step": 19900 }, { "epoch": 0.2666275612910107, "grad_norm": 7.779646873474121, "learning_rate": 3.0454787841852673e-05, "loss": 1.2125, "step": 20000 }, { "epoch": 0.2679606990974657, "grad_norm": 12.739456176757812, "learning_rate": 3.0607145577816715e-05, "loss": 1.2223, "step": 20100 }, { "epoch": 0.2692938369039208, "grad_norm": 14.48033618927002, "learning_rate": 
3.075950331378076e-05, "loss": 1.1885, "step": 20200 }, { "epoch": 0.27062697471037583, "grad_norm": 14.283453941345215, "learning_rate": 3.09118610497448e-05, "loss": 1.2008, "step": 20300 }, { "epoch": 0.2719601125168309, "grad_norm": 14.55647087097168, "learning_rate": 3.106421878570885e-05, "loss": 1.1906, "step": 20400 }, { "epoch": 0.27329325032328594, "grad_norm": 9.568668365478516, "learning_rate": 3.121657652167289e-05, "loss": 1.1847, "step": 20500 }, { "epoch": 0.274626388129741, "grad_norm": 9.719205856323242, "learning_rate": 3.136893425763693e-05, "loss": 1.2125, "step": 20600 }, { "epoch": 0.27595952593619605, "grad_norm": 8.21892261505127, "learning_rate": 3.1521291993600975e-05, "loss": 1.1992, "step": 20700 }, { "epoch": 0.2772926637426511, "grad_norm": 12.22874927520752, "learning_rate": 3.167364972956502e-05, "loss": 1.1496, "step": 20800 }, { "epoch": 0.27862580154910616, "grad_norm": 14.49831485748291, "learning_rate": 3.182600746552906e-05, "loss": 1.2013, "step": 20900 }, { "epoch": 0.2799589393555612, "grad_norm": 7.3931379318237305, "learning_rate": 3.197836520149311e-05, "loss": 1.193, "step": 21000 }, { "epoch": 0.28129207716201626, "grad_norm": 15.12801742553711, "learning_rate": 3.213072293745715e-05, "loss": 1.209, "step": 21100 }, { "epoch": 0.2826252149684713, "grad_norm": 26.489347457885742, "learning_rate": 3.228308067342119e-05, "loss": 1.194, "step": 21200 }, { "epoch": 0.28395835277492637, "grad_norm": 30.90116310119629, "learning_rate": 3.2435438409385235e-05, "loss": 1.2905, "step": 21300 }, { "epoch": 0.2852914905813814, "grad_norm": 5.639130115509033, "learning_rate": 3.2587796145349284e-05, "loss": 1.2569, "step": 21400 }, { "epoch": 0.2866246283878365, "grad_norm": 3.282135009765625, "learning_rate": 3.2740153881313326e-05, "loss": 1.2288, "step": 21500 }, { "epoch": 0.28795776619429153, "grad_norm": 13.050392150878906, "learning_rate": 3.289251161727737e-05, "loss": 1.2652, "step": 21600 }, { "epoch": 0.2892909040007466, 
"grad_norm": 11.476880073547363, "learning_rate": 3.304486935324141e-05, "loss": 1.1219, "step": 21700 }, { "epoch": 0.2906240418072016, "grad_norm": 16.305477142333984, "learning_rate": 3.319722708920545e-05, "loss": 1.2424, "step": 21800 }, { "epoch": 0.29195717961365664, "grad_norm": 16.452320098876953, "learning_rate": 3.3349584825169495e-05, "loss": 1.1994, "step": 21900 }, { "epoch": 0.2932903174201117, "grad_norm": 8.222269058227539, "learning_rate": 3.3501942561133544e-05, "loss": 1.1493, "step": 22000 }, { "epoch": 0.29462345522656674, "grad_norm": 14.022012710571289, "learning_rate": 3.3654300297097586e-05, "loss": 1.11, "step": 22100 }, { "epoch": 0.2959565930330218, "grad_norm": 16.044740676879883, "learning_rate": 3.380665803306163e-05, "loss": 1.1324, "step": 22200 }, { "epoch": 0.29728973083947685, "grad_norm": 20.89525032043457, "learning_rate": 3.395901576902568e-05, "loss": 1.2241, "step": 22300 }, { "epoch": 0.2986228686459319, "grad_norm": 64.70343017578125, "learning_rate": 3.411137350498972e-05, "loss": 1.28, "step": 22400 }, { "epoch": 0.29995600645238696, "grad_norm": 63.12675094604492, "learning_rate": 3.4262207663594116e-05, "loss": 1.2269, "step": 22500 }, { "epoch": 0.301289144258842, "grad_norm": 23.980283737182617, "learning_rate": 3.4414565399558165e-05, "loss": 1.2873, "step": 22600 }, { "epoch": 0.30262228206529707, "grad_norm": 34.06816482543945, "learning_rate": 3.456692313552221e-05, "loss": 1.2127, "step": 22700 }, { "epoch": 0.3039554198717521, "grad_norm": 15.259536743164062, "learning_rate": 3.471928087148625e-05, "loss": 1.2137, "step": 22800 }, { "epoch": 0.3052885576782072, "grad_norm": 11.128557205200195, "learning_rate": 3.487163860745029e-05, "loss": 1.2855, "step": 22900 }, { "epoch": 0.3066216954846622, "grad_norm": 10.14976692199707, "learning_rate": 3.502399634341434e-05, "loss": 1.2443, "step": 23000 }, { "epoch": 0.3079548332911173, "grad_norm": 11.269165992736816, "learning_rate": 3.517635407937838e-05, "loss": 
1.2139, "step": 23100 }, { "epoch": 0.30928797109757233, "grad_norm": 16.02304458618164, "learning_rate": 3.5328711815342425e-05, "loss": 1.1829, "step": 23200 }, { "epoch": 0.3106211089040274, "grad_norm": 19.672222137451172, "learning_rate": 3.548106955130647e-05, "loss": 1.2742, "step": 23300 }, { "epoch": 0.31195424671048244, "grad_norm": 23.10077667236328, "learning_rate": 3.563342728727051e-05, "loss": 1.2845, "step": 23400 }, { "epoch": 0.3132873845169375, "grad_norm": 7.3322553634643555, "learning_rate": 3.578578502323455e-05, "loss": 1.2324, "step": 23500 }, { "epoch": 0.31462052232339255, "grad_norm": 7.258352756500244, "learning_rate": 3.59381427591986e-05, "loss": 1.2608, "step": 23600 }, { "epoch": 0.3159536601298476, "grad_norm": 16.36671257019043, "learning_rate": 3.609050049516264e-05, "loss": 1.2066, "step": 23700 }, { "epoch": 0.31728679793630266, "grad_norm": 9.553563117980957, "learning_rate": 3.6242858231126685e-05, "loss": 1.2212, "step": 23800 }, { "epoch": 0.3186199357427577, "grad_norm": 7.491336345672607, "learning_rate": 3.639521596709073e-05, "loss": 1.2378, "step": 23900 }, { "epoch": 0.31995307354921276, "grad_norm": 12.043441772460938, "learning_rate": 3.6547573703054776e-05, "loss": 1.1526, "step": 24000 }, { "epoch": 0.3212862113556678, "grad_norm": 12.958592414855957, "learning_rate": 3.669993143901882e-05, "loss": 1.1449, "step": 24100 }, { "epoch": 0.32261934916212287, "grad_norm": 7.323258876800537, "learning_rate": 3.685228917498286e-05, "loss": 1.1233, "step": 24200 }, { "epoch": 0.3239524869685779, "grad_norm": 4.984567642211914, "learning_rate": 3.700464691094691e-05, "loss": 1.2274, "step": 24300 }, { "epoch": 0.325285624775033, "grad_norm": 14.493622779846191, "learning_rate": 3.7157004646910945e-05, "loss": 1.1625, "step": 24400 }, { "epoch": 0.32661876258148803, "grad_norm": 7.492556571960449, "learning_rate": 3.730936238287499e-05, "loss": 1.1338, "step": 24500 }, { "epoch": 0.3279519003879431, "grad_norm": 
20.187728881835938, "learning_rate": 3.7461720118839036e-05, "loss": 1.2394, "step": 24600 }, { "epoch": 0.32928503819439814, "grad_norm": 10.571295738220215, "learning_rate": 3.761407785480308e-05, "loss": 1.1667, "step": 24700 }, { "epoch": 0.3306181760008532, "grad_norm": 24.140941619873047, "learning_rate": 3.776643559076712e-05, "loss": 1.2836, "step": 24800 }, { "epoch": 0.33195131380730825, "grad_norm": 18.39565658569336, "learning_rate": 3.791879332673116e-05, "loss": 1.1419, "step": 24900 }, { "epoch": 0.3332844516137633, "grad_norm": 12.406903266906738, "learning_rate": 3.807115106269521e-05, "loss": 1.2973, "step": 25000 }, { "epoch": 0.33461758942021835, "grad_norm": 7.893985748291016, "learning_rate": 3.8223508798659254e-05, "loss": 1.1725, "step": 25100 }, { "epoch": 0.3359507272266734, "grad_norm": 14.83206558227539, "learning_rate": 3.8375866534623296e-05, "loss": 1.2492, "step": 25200 }, { "epoch": 0.33728386503312846, "grad_norm": 7.520331382751465, "learning_rate": 3.8528224270587345e-05, "loss": 1.132, "step": 25300 }, { "epoch": 0.3386170028395835, "grad_norm": 4.178710460662842, "learning_rate": 3.868058200655138e-05, "loss": 1.2785, "step": 25400 }, { "epoch": 0.33995014064603857, "grad_norm": 15.66732406616211, "learning_rate": 3.883293974251542e-05, "loss": 1.357, "step": 25500 }, { "epoch": 0.3412832784524936, "grad_norm": 33.815250396728516, "learning_rate": 3.898529747847947e-05, "loss": 1.1845, "step": 25600 }, { "epoch": 0.3426164162589487, "grad_norm": 5.291676044464111, "learning_rate": 3.9137655214443514e-05, "loss": 1.1657, "step": 25700 }, { "epoch": 0.34394955406540373, "grad_norm": 12.740199089050293, "learning_rate": 3.9290012950407556e-05, "loss": 1.238, "step": 25800 }, { "epoch": 0.3452826918718588, "grad_norm": 11.138826370239258, "learning_rate": 3.94423706863716e-05, "loss": 1.1821, "step": 25900 }, { "epoch": 0.34661582967831384, "grad_norm": 16.28569793701172, "learning_rate": 3.959472842233565e-05, "loss": 1.2304, 
"step": 26000 }, { "epoch": 0.3479489674847689, "grad_norm": 13.212550163269043, "learning_rate": 3.974708615829969e-05, "loss": 1.1552, "step": 26100 }, { "epoch": 0.34928210529122394, "grad_norm": 15.506020545959473, "learning_rate": 3.989944389426373e-05, "loss": 1.2927, "step": 26200 }, { "epoch": 0.350615243097679, "grad_norm": 26.39496421813965, "learning_rate": 4.005180163022778e-05, "loss": 1.1996, "step": 26300 }, { "epoch": 0.35194838090413405, "grad_norm": 6.945533752441406, "learning_rate": 4.020263578883218e-05, "loss": 1.2023, "step": 26400 }, { "epoch": 0.3532815187105891, "grad_norm": 20.365005493164062, "learning_rate": 4.035499352479622e-05, "loss": 1.1375, "step": 26500 }, { "epoch": 0.35461465651704416, "grad_norm": 17.62857437133789, "learning_rate": 4.050735126076027e-05, "loss": 1.3561, "step": 26600 }, { "epoch": 0.3559477943234992, "grad_norm": 17.22665023803711, "learning_rate": 4.065970899672431e-05, "loss": 1.2067, "step": 26700 }, { "epoch": 0.35728093212995427, "grad_norm": 11.894963264465332, "learning_rate": 4.081206673268835e-05, "loss": 1.3035, "step": 26800 }, { "epoch": 0.3586140699364093, "grad_norm": 15.139612197875977, "learning_rate": 4.09644244686524e-05, "loss": 1.3366, "step": 26900 }, { "epoch": 0.3599472077428644, "grad_norm": 13.213565826416016, "learning_rate": 4.111678220461644e-05, "loss": 1.3802, "step": 27000 }, { "epoch": 0.3612803455493194, "grad_norm": 5.927510738372803, "learning_rate": 4.126913994058048e-05, "loss": 1.2121, "step": 27100 }, { "epoch": 0.3626134833557745, "grad_norm": 16.2600040435791, "learning_rate": 4.142149767654453e-05, "loss": 1.2725, "step": 27200 }, { "epoch": 0.36394662116222953, "grad_norm": 8.108368873596191, "learning_rate": 4.157385541250857e-05, "loss": 1.2357, "step": 27300 }, { "epoch": 0.3652797589686846, "grad_norm": 18.87217903137207, "learning_rate": 4.172621314847261e-05, "loss": 1.2754, "step": 27400 }, { "epoch": 0.36661289677513964, "grad_norm": 6.51332426071167, 
"learning_rate": 4.1878570884436655e-05, "loss": 1.2553, "step": 27500 }, { "epoch": 0.3679460345815947, "grad_norm": 12.500776290893555, "learning_rate": 4.2030928620400704e-05, "loss": 1.3602, "step": 27600 }, { "epoch": 0.36927917238804975, "grad_norm": 12.991177558898926, "learning_rate": 4.218328635636475e-05, "loss": 1.1459, "step": 27700 }, { "epoch": 0.3706123101945048, "grad_norm": 14.9971284866333, "learning_rate": 4.233564409232879e-05, "loss": 1.2469, "step": 27800 }, { "epoch": 0.37194544800095986, "grad_norm": 16.038131713867188, "learning_rate": 4.248800182829284e-05, "loss": 1.2239, "step": 27900 }, { "epoch": 0.3732785858074149, "grad_norm": 10.284772872924805, "learning_rate": 4.264035956425688e-05, "loss": 1.2397, "step": 28000 }, { "epoch": 0.37461172361386996, "grad_norm": 4.482223987579346, "learning_rate": 4.2792717300220915e-05, "loss": 1.2015, "step": 28100 }, { "epoch": 0.375944861420325, "grad_norm": 11.958584785461426, "learning_rate": 4.2945075036184964e-05, "loss": 1.1999, "step": 28200 }, { "epoch": 0.37727799922678007, "grad_norm": 7.895247936248779, "learning_rate": 4.3097432772149007e-05, "loss": 1.3191, "step": 28300 }, { "epoch": 0.3786111370332351, "grad_norm": 13.85595417022705, "learning_rate": 4.324979050811305e-05, "loss": 1.2127, "step": 28400 }, { "epoch": 0.3799442748396902, "grad_norm": 7.486861705780029, "learning_rate": 4.340214824407709e-05, "loss": 1.2426, "step": 28500 }, { "epoch": 0.38127741264614523, "grad_norm": 13.184840202331543, "learning_rate": 4.355450598004114e-05, "loss": 1.1409, "step": 28600 }, { "epoch": 0.3826105504526003, "grad_norm": 16.825237274169922, "learning_rate": 4.370686371600518e-05, "loss": 1.2723, "step": 28700 }, { "epoch": 0.38394368825905534, "grad_norm": 10.859269142150879, "learning_rate": 4.3859221451969224e-05, "loss": 1.3355, "step": 28800 }, { "epoch": 0.3852768260655104, "grad_norm": 3.810872793197632, "learning_rate": 4.401157918793327e-05, "loss": 1.3212, "step": 28900 }, { 
"epoch": 0.38660996387196545, "grad_norm": 14.156526565551758, "learning_rate": 4.416241334653767e-05, "loss": 1.327, "step": 29000 }, { "epoch": 0.3879431016784205, "grad_norm": 7.9193196296691895, "learning_rate": 4.431477108250171e-05, "loss": 1.3282, "step": 29100 }, { "epoch": 0.38927623948487555, "grad_norm": 5.17882776260376, "learning_rate": 4.446712881846576e-05, "loss": 1.2246, "step": 29200 }, { "epoch": 0.3906093772913306, "grad_norm": 21.39368438720703, "learning_rate": 4.4619486554429804e-05, "loss": 1.1934, "step": 29300 }, { "epoch": 0.39194251509778566, "grad_norm": 22.791284561157227, "learning_rate": 4.4771844290393846e-05, "loss": 1.3055, "step": 29400 }, { "epoch": 0.3932756529042407, "grad_norm": 12.004083633422852, "learning_rate": 4.4924202026357895e-05, "loss": 1.3823, "step": 29500 }, { "epoch": 0.39460879071069577, "grad_norm": 18.269126892089844, "learning_rate": 4.507655976232194e-05, "loss": 1.2982, "step": 29600 }, { "epoch": 0.3959419285171508, "grad_norm": 21.394935607910156, "learning_rate": 4.522891749828597e-05, "loss": 1.2622, "step": 29700 }, { "epoch": 0.3972750663236059, "grad_norm": 111.48169708251953, "learning_rate": 4.5381275234250015e-05, "loss": 1.2729, "step": 29800 }, { "epoch": 0.39860820413006093, "grad_norm": 12.26659870147705, "learning_rate": 4.5533632970214064e-05, "loss": 1.2762, "step": 29900 }, { "epoch": 0.399941341936516, "grad_norm": 7.627758026123047, "learning_rate": 4.5685990706178106e-05, "loss": 1.2745, "step": 30000 }, { "epoch": 0.40127447974297104, "grad_norm": 9.28622817993164, "learning_rate": 4.583834844214215e-05, "loss": 1.3286, "step": 30100 }, { "epoch": 0.4026076175494261, "grad_norm": 6.923389434814453, "learning_rate": 4.59907061781062e-05, "loss": 1.2115, "step": 30200 }, { "epoch": 0.40394075535588114, "grad_norm": 27.003978729248047, "learning_rate": 4.614306391407024e-05, "loss": 1.3399, "step": 30300 }, { "epoch": 0.4052738931623362, "grad_norm": 9.637168884277344, "learning_rate": 
4.629542165003428e-05, "loss": 1.2151, "step": 30400 }, { "epoch": 0.40660703096879125, "grad_norm": 34.17839431762695, "learning_rate": 4.644777938599833e-05, "loss": 1.3214, "step": 30500 }, { "epoch": 0.4079401687752463, "grad_norm": 8.091509819030762, "learning_rate": 4.660013712196237e-05, "loss": 1.2868, "step": 30600 }, { "epoch": 0.40927330658170136, "grad_norm": 10.198869705200195, "learning_rate": 4.675249485792641e-05, "loss": 1.1483, "step": 30700 }, { "epoch": 0.4106064443881564, "grad_norm": 11.684032440185547, "learning_rate": 4.690485259389045e-05, "loss": 1.245, "step": 30800 }, { "epoch": 0.41193958219461146, "grad_norm": 7.1316423416137695, "learning_rate": 4.70572103298545e-05, "loss": 1.3232, "step": 30900 }, { "epoch": 0.4132727200010665, "grad_norm": 7.505682945251465, "learning_rate": 4.720956806581854e-05, "loss": 1.2189, "step": 31000 }, { "epoch": 0.41460585780752157, "grad_norm": 7.079606533050537, "learning_rate": 4.7361925801782583e-05, "loss": 1.243, "step": 31100 }, { "epoch": 0.4159389956139766, "grad_norm": 23.02866554260254, "learning_rate": 4.7512759960386994e-05, "loss": 1.3092, "step": 31200 }, { "epoch": 0.4172721334204317, "grad_norm": 10.16215705871582, "learning_rate": 4.766511769635103e-05, "loss": 1.2673, "step": 31300 }, { "epoch": 0.41860527122688673, "grad_norm": 24.121414184570312, "learning_rate": 4.781747543231507e-05, "loss": 1.3242, "step": 31400 }, { "epoch": 0.4199384090333418, "grad_norm": 14.515036582946777, "learning_rate": 4.796983316827912e-05, "loss": 1.2335, "step": 31500 }, { "epoch": 0.42127154683979684, "grad_norm": 11.118270874023438, "learning_rate": 4.812219090424316e-05, "loss": 1.1807, "step": 31600 }, { "epoch": 0.4226046846462519, "grad_norm": 6.6718597412109375, "learning_rate": 4.8274548640207205e-05, "loss": 1.2922, "step": 31700 }, { "epoch": 0.42393782245270695, "grad_norm": 7.875730037689209, "learning_rate": 4.8426906376171254e-05, "loss": 1.3898, "step": 31800 }, { "epoch": 
0.425270960259162, "grad_norm": 15.03526782989502, "learning_rate": 4.8579264112135296e-05, "loss": 1.1763, "step": 31900 }, { "epoch": 0.42660409806561705, "grad_norm": 8.172929763793945, "learning_rate": 4.873162184809934e-05, "loss": 1.3529, "step": 32000 }, { "epoch": 0.4279372358720721, "grad_norm": 12.484905242919922, "learning_rate": 4.888397958406339e-05, "loss": 1.2159, "step": 32100 }, { "epoch": 0.42927037367852716, "grad_norm": 8.00546932220459, "learning_rate": 4.903633732002743e-05, "loss": 1.2044, "step": 32200 }, { "epoch": 0.4306035114849822, "grad_norm": 9.175239562988281, "learning_rate": 4.918869505599147e-05, "loss": 1.2032, "step": 32300 }, { "epoch": 0.43193664929143727, "grad_norm": 51.8287239074707, "learning_rate": 4.934105279195551e-05, "loss": 1.2073, "step": 32400 }, { "epoch": 0.4332697870978923, "grad_norm": 8.223674774169922, "learning_rate": 4.9493410527919556e-05, "loss": 1.3485, "step": 32500 }, { "epoch": 0.4346029249043474, "grad_norm": 7.687358856201172, "learning_rate": 4.96457682638836e-05, "loss": 1.3592, "step": 32600 }, { "epoch": 0.43593606271080243, "grad_norm": 5.703846454620361, "learning_rate": 4.979812599984764e-05, "loss": 1.3624, "step": 32700 }, { "epoch": 0.4372692005172575, "grad_norm": 7.999549388885498, "learning_rate": 4.994896015845205e-05, "loss": 1.2829, "step": 32800 }, { "epoch": 0.43860233832371254, "grad_norm": 11.251028060913086, "learning_rate": 5.010131789441609e-05, "loss": 1.3089, "step": 32900 }, { "epoch": 0.4399354761301676, "grad_norm": 21.458017349243164, "learning_rate": 5.025367563038013e-05, "loss": 1.2058, "step": 33000 }, { "epoch": 0.44126861393662264, "grad_norm": 8.2625732421875, "learning_rate": 5.040603336634418e-05, "loss": 1.354, "step": 33100 }, { "epoch": 0.4426017517430777, "grad_norm": 9.192827224731445, "learning_rate": 5.055839110230822e-05, "loss": 1.3402, "step": 33200 }, { "epoch": 0.44393488954953275, "grad_norm": 7.578523635864258, "learning_rate": 
5.071074883827226e-05, "loss": 1.2431, "step": 33300 }, { "epoch": 0.4452680273559878, "grad_norm": 13.1193208694458, "learning_rate": 5.086310657423631e-05, "loss": 1.285, "step": 33400 }, { "epoch": 0.44660116516244286, "grad_norm": 8.734386444091797, "learning_rate": 5.101546431020035e-05, "loss": 1.3133, "step": 33500 }, { "epoch": 0.4479343029688979, "grad_norm": 29.639982223510742, "learning_rate": 5.1167822046164395e-05, "loss": 1.3497, "step": 33600 }, { "epoch": 0.44926744077535297, "grad_norm": 17.9084415435791, "learning_rate": 5.132017978212844e-05, "loss": 1.3797, "step": 33700 }, { "epoch": 0.450600578581808, "grad_norm": 8.754751205444336, "learning_rate": 5.1472537518092486e-05, "loss": 1.3058, "step": 33800 }, { "epoch": 0.4519337163882631, "grad_norm": 7.53584623336792, "learning_rate": 5.162489525405653e-05, "loss": 1.1797, "step": 33900 }, { "epoch": 0.4532668541947181, "grad_norm": 12.5559663772583, "learning_rate": 5.1777252990020564e-05, "loss": 1.3435, "step": 34000 }, { "epoch": 0.4545999920011732, "grad_norm": 48.5744743347168, "learning_rate": 5.192961072598461e-05, "loss": 1.3098, "step": 34100 }, { "epoch": 0.45593312980762823, "grad_norm": 20.458942413330078, "learning_rate": 5.2081968461948655e-05, "loss": 1.2874, "step": 34200 }, { "epoch": 0.4572662676140833, "grad_norm": 21.392860412597656, "learning_rate": 5.22343261979127e-05, "loss": 1.4397, "step": 34300 }, { "epoch": 0.45859940542053834, "grad_norm": 16.01394271850586, "learning_rate": 5.2386683933876746e-05, "loss": 1.3491, "step": 34400 }, { "epoch": 0.4599325432269934, "grad_norm": 28.018455505371094, "learning_rate": 5.253904166984079e-05, "loss": 1.2804, "step": 34500 }, { "epoch": 0.46126568103344845, "grad_norm": 12.526464462280273, "learning_rate": 5.269139940580483e-05, "loss": 1.427, "step": 34600 }, { "epoch": 0.4625988188399035, "grad_norm": 10.681849479675293, "learning_rate": 5.284375714176887e-05, "loss": 1.2229, "step": 34700 }, { "epoch": 0.46393195664635856, 
"grad_norm": 23.190549850463867, "learning_rate": 5.299611487773292e-05, "loss": 1.3209, "step": 34800 }, { "epoch": 0.4652650944528136, "grad_norm": 10.226524353027344, "learning_rate": 5.3148472613696964e-05, "loss": 1.273, "step": 34900 }, { "epoch": 0.46659823225926866, "grad_norm": 18.497314453125, "learning_rate": 5.3300830349661e-05, "loss": 1.2961, "step": 35000 }, { "epoch": 0.4679313700657237, "grad_norm": 7.95158576965332, "learning_rate": 5.345318808562505e-05, "loss": 1.2876, "step": 35100 }, { "epoch": 0.46926450787217877, "grad_norm": 23.072799682617188, "learning_rate": 5.360554582158909e-05, "loss": 1.2651, "step": 35200 }, { "epoch": 0.4705976456786338, "grad_norm": 12.676689147949219, "learning_rate": 5.375790355755313e-05, "loss": 1.4441, "step": 35300 }, { "epoch": 0.4719307834850889, "grad_norm": 13.948905944824219, "learning_rate": 5.391026129351718e-05, "loss": 1.3156, "step": 35400 }, { "epoch": 0.47326392129154393, "grad_norm": 13.157307624816895, "learning_rate": 5.4062619029481224e-05, "loss": 1.3887, "step": 35500 }, { "epoch": 0.474597059097999, "grad_norm": 11.105155944824219, "learning_rate": 5.4214976765445266e-05, "loss": 1.427, "step": 35600 }, { "epoch": 0.47593019690445404, "grad_norm": 14.916239738464355, "learning_rate": 5.4367334501409315e-05, "loss": 1.3492, "step": 35700 }, { "epoch": 0.4772633347109091, "grad_norm": 6.535190582275391, "learning_rate": 5.451969223737336e-05, "loss": 1.2582, "step": 35800 }, { "epoch": 0.47859647251736415, "grad_norm": 18.27303695678711, "learning_rate": 5.46720499733374e-05, "loss": 1.2758, "step": 35900 }, { "epoch": 0.4799296103238192, "grad_norm": 9.056538581848145, "learning_rate": 5.48228841319418e-05, "loss": 1.3436, "step": 36000 }, { "epoch": 0.48126274813027425, "grad_norm": 7.964415073394775, "learning_rate": 5.4975241867905845e-05, "loss": 1.2447, "step": 36100 }, { "epoch": 0.4825958859367293, "grad_norm": 10.005159378051758, "learning_rate": 5.512759960386989e-05, "loss": 
1.2959, "step": 36200 }, { "epoch": 0.4839290237431843, "grad_norm": 11.74345588684082, "learning_rate": 5.527995733983393e-05, "loss": 1.3722, "step": 36300 }, { "epoch": 0.48526216154963936, "grad_norm": 6.152487754821777, "learning_rate": 5.543231507579798e-05, "loss": 1.2752, "step": 36400 }, { "epoch": 0.4865952993560944, "grad_norm": 15.276147842407227, "learning_rate": 5.558467281176202e-05, "loss": 1.3229, "step": 36500 }, { "epoch": 0.48792843716254947, "grad_norm": 10.067095756530762, "learning_rate": 5.573703054772606e-05, "loss": 1.3472, "step": 36600 }, { "epoch": 0.4892615749690045, "grad_norm": 17.995410919189453, "learning_rate": 5.5889388283690105e-05, "loss": 1.2696, "step": 36700 }, { "epoch": 0.4905947127754596, "grad_norm": 12.359143257141113, "learning_rate": 5.604174601965415e-05, "loss": 1.2989, "step": 36800 }, { "epoch": 0.4919278505819146, "grad_norm": 5.553112983703613, "learning_rate": 5.619410375561819e-05, "loss": 1.3723, "step": 36900 }, { "epoch": 0.4932609883883697, "grad_norm": 9.624715805053711, "learning_rate": 5.634646149158224e-05, "loss": 1.3483, "step": 37000 }, { "epoch": 0.49459412619482473, "grad_norm": 8.208943367004395, "learning_rate": 5.649881922754628e-05, "loss": 1.2304, "step": 37100 }, { "epoch": 0.4959272640012798, "grad_norm": 10.545738220214844, "learning_rate": 5.665117696351032e-05, "loss": 1.3381, "step": 37200 }, { "epoch": 0.49726040180773484, "grad_norm": 7.3529558181762695, "learning_rate": 5.6803534699474365e-05, "loss": 1.2832, "step": 37300 }, { "epoch": 0.4985935396141899, "grad_norm": 11.9561128616333, "learning_rate": 5.6955892435438414e-05, "loss": 1.2858, "step": 37400 }, { "epoch": 0.49992667742064495, "grad_norm": 10.537229537963867, "learning_rate": 5.7108250171402457e-05, "loss": 1.3592, "step": 37500 }, { "epoch": 0.5012598152271001, "grad_norm": 23.016809463500977, "learning_rate": 5.72606079073665e-05, "loss": 1.28, "step": 37600 }, { "epoch": 0.5025929530335551, "grad_norm": 
9.565058708190918, "learning_rate": 5.741296564333054e-05, "loss": 1.4054, "step": 37700 }, { "epoch": 0.5039260908400102, "grad_norm": 9.67759895324707, "learning_rate": 5.756532337929458e-05, "loss": 1.3378, "step": 37800 }, { "epoch": 0.5052592286464652, "grad_norm": 9.902351379394531, "learning_rate": 5.7717681115258625e-05, "loss": 1.3643, "step": 37900 }, { "epoch": 0.5065923664529203, "grad_norm": 8.46121883392334, "learning_rate": 5.7870038851222674e-05, "loss": 1.3404, "step": 38000 }, { "epoch": 0.5079255042593753, "grad_norm": 7.357236862182617, "learning_rate": 5.8022396587186716e-05, "loss": 1.2585, "step": 38100 }, { "epoch": 0.5092586420658304, "grad_norm": 4.434939384460449, "learning_rate": 5.817475432315076e-05, "loss": 1.2789, "step": 38200 }, { "epoch": 0.5105917798722854, "grad_norm": 10.350858688354492, "learning_rate": 5.83271120591148e-05, "loss": 1.2818, "step": 38300 }, { "epoch": 0.5119249176787405, "grad_norm": 35.593204498291016, "learning_rate": 5.8477946217719205e-05, "loss": 1.3442, "step": 38400 }, { "epoch": 0.5132580554851955, "grad_norm": 32.05377960205078, "learning_rate": 5.863030395368325e-05, "loss": 1.2427, "step": 38500 }, { "epoch": 0.5145911932916506, "grad_norm": 12.365283966064453, "learning_rate": 5.878266168964729e-05, "loss": 1.3033, "step": 38600 }, { "epoch": 0.5159243310981056, "grad_norm": 8.098183631896973, "learning_rate": 5.893501942561134e-05, "loss": 1.3639, "step": 38700 }, { "epoch": 0.5172574689045607, "grad_norm": 10.024205207824707, "learning_rate": 5.908737716157538e-05, "loss": 1.3507, "step": 38800 }, { "epoch": 0.5185906067110158, "grad_norm": 13.642242431640625, "learning_rate": 5.923973489753942e-05, "loss": 1.3106, "step": 38900 }, { "epoch": 0.5199237445174708, "grad_norm": 12.285043716430664, "learning_rate": 5.939209263350347e-05, "loss": 1.397, "step": 39000 }, { "epoch": 0.5212568823239259, "grad_norm": 26.97838020324707, "learning_rate": 5.9544450369467513e-05, "loss": 1.3581, "step": 39100 
}, { "epoch": 0.5225900201303809, "grad_norm": 9.388677597045898, "learning_rate": 5.9696808105431556e-05, "loss": 1.2654, "step": 39200 }, { "epoch": 0.523923157936836, "grad_norm": 13.721781730651855, "learning_rate": 5.98491658413956e-05, "loss": 1.3373, "step": 39300 }, { "epoch": 0.525256295743291, "grad_norm": 15.618371963500977, "learning_rate": 5.9999999999874424e-05, "loss": 1.2688, "step": 39400 }, { "epoch": 0.5265894335497461, "grad_norm": 8.335700988769531, "learning_rate": 5.999999871900666e-05, "loss": 1.3613, "step": 39500 }, { "epoch": 0.5279225713562011, "grad_norm": 16.63068199157715, "learning_rate": 5.999999492663356e-05, "loss": 1.3044, "step": 39600 }, { "epoch": 0.5292557091626562, "grad_norm": 6.350239276885986, "learning_rate": 5.999998862275546e-05, "loss": 1.335, "step": 39700 }, { "epoch": 0.5305888469691112, "grad_norm": 4.959856986999512, "learning_rate": 5.9999979807372877e-05, "loss": 1.3545, "step": 39800 }, { "epoch": 0.5319219847755663, "grad_norm": 7.4758501052856445, "learning_rate": 5.9999968480486555e-05, "loss": 1.1933, "step": 39900 }, { "epoch": 0.5332551225820213, "grad_norm": 7.734913349151611, "learning_rate": 5.999995464209744e-05, "loss": 1.2952, "step": 40000 }, { "epoch": 0.5345882603884764, "grad_norm": 8.593339920043945, "learning_rate": 5.999993829220669e-05, "loss": 1.3326, "step": 40100 }, { "epoch": 0.5359213981949315, "grad_norm": 7.086404800415039, "learning_rate": 5.9999919430815675e-05, "loss": 1.2896, "step": 40200 }, { "epoch": 0.5372545360013865, "grad_norm": 20.135164260864258, "learning_rate": 5.999989805792597e-05, "loss": 1.3082, "step": 40300 }, { "epoch": 0.5385876738078416, "grad_norm": 12.929563522338867, "learning_rate": 5.999987417353937e-05, "loss": 1.3381, "step": 40400 }, { "epoch": 0.5399208116142966, "grad_norm": 8.300307273864746, "learning_rate": 5.999984805404858e-05, "loss": 1.4163, "step": 40500 }, { "epoch": 0.5412539494207517, "grad_norm": 17.959304809570312, "learning_rate": 
5.9999819171789306e-05, "loss": 1.3329, "step": 40600 }, { "epoch": 0.5425870872272067, "grad_norm": 7.324941635131836, "learning_rate": 5.999978777803974e-05, "loss": 1.3323, "step": 40700 }, { "epoch": 0.5439202250336618, "grad_norm": 14.08326244354248, "learning_rate": 5.9999753872802514e-05, "loss": 1.335, "step": 40800 }, { "epoch": 0.5452533628401168, "grad_norm": 8.803863525390625, "learning_rate": 5.999971745608045e-05, "loss": 1.4561, "step": 40900 }, { "epoch": 0.5465865006465719, "grad_norm": 9.109027862548828, "learning_rate": 5.999967852787662e-05, "loss": 1.3992, "step": 41000 }, { "epoch": 0.5479196384530269, "grad_norm": 46.815269470214844, "learning_rate": 5.999963708819426e-05, "loss": 1.273, "step": 41100 }, { "epoch": 0.549252776259482, "grad_norm": 21.403419494628906, "learning_rate": 5.999959313703686e-05, "loss": 1.3162, "step": 41200 }, { "epoch": 0.550585914065937, "grad_norm": 14.300725936889648, "learning_rate": 5.9999546674408086e-05, "loss": 1.3714, "step": 41300 }, { "epoch": 0.5519190518723921, "grad_norm": 24.372669219970703, "learning_rate": 5.9999497700311823e-05, "loss": 1.3279, "step": 41400 }, { "epoch": 0.5532521896788472, "grad_norm": 13.517022132873535, "learning_rate": 5.999944621475219e-05, "loss": 1.2674, "step": 41500 }, { "epoch": 0.5545853274853022, "grad_norm": 13.194217681884766, "learning_rate": 5.999939221773348e-05, "loss": 1.3597, "step": 41600 }, { "epoch": 0.5559184652917573, "grad_norm": 31.31551742553711, "learning_rate": 5.999933570926023e-05, "loss": 1.4277, "step": 41700 }, { "epoch": 0.5572516030982123, "grad_norm": 22.99677085876465, "learning_rate": 5.999927668933715e-05, "loss": 1.2764, "step": 41800 }, { "epoch": 0.5585847409046674, "grad_norm": 7.096306324005127, "learning_rate": 5.99992151579692e-05, "loss": 1.2568, "step": 41900 }, { "epoch": 0.5599178787111224, "grad_norm": 33.336299896240234, "learning_rate": 5.9999151115161514e-05, "loss": 1.3746, "step": 42000 }, { "epoch": 0.5612510165175775, 
"grad_norm": 28.866811752319336, "learning_rate": 5.9999084560919464e-05, "loss": 1.3235, "step": 42100 }, { "epoch": 0.5625841543240325, "grad_norm": 8.361762046813965, "learning_rate": 5.999901549524862e-05, "loss": 1.4542, "step": 42200 }, { "epoch": 0.5639172921304876, "grad_norm": 7.606881141662598, "learning_rate": 5.999894391815476e-05, "loss": 1.3669, "step": 42300 }, { "epoch": 0.5652504299369426, "grad_norm": 21.38722801208496, "learning_rate": 5.999886982964389e-05, "loss": 1.3876, "step": 42400 }, { "epoch": 0.5665835677433977, "grad_norm": 8.899712562561035, "learning_rate": 5.999879322972219e-05, "loss": 1.3902, "step": 42500 }, { "epoch": 0.5679167055498527, "grad_norm": 24.345924377441406, "learning_rate": 5.99987141183961e-05, "loss": 1.238, "step": 42600 }, { "epoch": 0.5692498433563078, "grad_norm": 36.01554870605469, "learning_rate": 5.999863249567222e-05, "loss": 1.2752, "step": 42700 }, { "epoch": 0.5705829811627628, "grad_norm": 10.678837776184082, "learning_rate": 5.999854836155739e-05, "loss": 1.336, "step": 42800 }, { "epoch": 0.5719161189692179, "grad_norm": 6.1797003746032715, "learning_rate": 5.999846171605865e-05, "loss": 1.4152, "step": 42900 }, { "epoch": 0.573249256775673, "grad_norm": 4.393489360809326, "learning_rate": 5.999837255918327e-05, "loss": 1.3446, "step": 43000 }, { "epoch": 0.574582394582128, "grad_norm": 27.451656341552734, "learning_rate": 5.99982818200524e-05, "loss": 1.3522, "step": 43100 }, { "epoch": 0.5759155323885831, "grad_norm": 7.281174182891846, "learning_rate": 5.999818766555989e-05, "loss": 1.3848, "step": 43200 }, { "epoch": 0.5772486701950381, "grad_norm": 21.500255584716797, "learning_rate": 5.999809099971367e-05, "loss": 1.3104, "step": 43300 }, { "epoch": 0.5785818080014932, "grad_norm": 13.734038352966309, "learning_rate": 5.999799182252183e-05, "loss": 1.2392, "step": 43400 }, { "epoch": 0.5799149458079481, "grad_norm": 22.133398056030273, "learning_rate": 5.999789013399268e-05, "loss": 1.2684, 
"step": 43500 }, { "epoch": 0.5812480836144032, "grad_norm": 10.31998348236084, "learning_rate": 5.999778593413474e-05, "loss": 1.3589, "step": 43600 }, { "epoch": 0.5825812214208582, "grad_norm": 9.781447410583496, "learning_rate": 5.999767922295671e-05, "loss": 1.2845, "step": 43700 }, { "epoch": 0.5839143592273133, "grad_norm": 15.449861526489258, "learning_rate": 5.999757000046754e-05, "loss": 1.478, "step": 43800 }, { "epoch": 0.5852474970337683, "grad_norm": 12.148350715637207, "learning_rate": 5.999745826667637e-05, "loss": 1.4243, "step": 43900 }, { "epoch": 0.5865806348402234, "grad_norm": 11.196051597595215, "learning_rate": 5.999734402159255e-05, "loss": 1.3786, "step": 44000 }, { "epoch": 0.5879137726466784, "grad_norm": 10.769905090332031, "learning_rate": 5.9997227265225656e-05, "loss": 1.2979, "step": 44100 }, { "epoch": 0.5892469104531335, "grad_norm": 6.754711627960205, "learning_rate": 5.999710799758545e-05, "loss": 1.4186, "step": 44200 }, { "epoch": 0.5905800482595885, "grad_norm": 12.977498054504395, "learning_rate": 5.999698621868192e-05, "loss": 1.3053, "step": 44300 }, { "epoch": 0.5919131860660436, "grad_norm": 38.46126937866211, "learning_rate": 5.999686192852527e-05, "loss": 1.3453, "step": 44400 }, { "epoch": 0.5932463238724986, "grad_norm": 14.83212947845459, "learning_rate": 5.999673512712589e-05, "loss": 1.2516, "step": 44500 }, { "epoch": 0.5945794616789537, "grad_norm": 23.734724044799805, "learning_rate": 5.999660581449441e-05, "loss": 1.2794, "step": 44600 }, { "epoch": 0.5959125994854088, "grad_norm": 14.392127990722656, "learning_rate": 5.9996473990641644e-05, "loss": 1.3359, "step": 44700 }, { "epoch": 0.5972457372918638, "grad_norm": 19.07663345336914, "learning_rate": 5.9996339655578626e-05, "loss": 1.2699, "step": 44800 }, { "epoch": 0.5985788750983189, "grad_norm": 11.392196655273438, "learning_rate": 5.9996204190209635e-05, "loss": 1.381, "step": 44900 }, { "epoch": 0.5999120129047739, "grad_norm": 21.036901473999023, 
"learning_rate": 5.9996066263625615e-05, "loss": 1.4379, "step": 45000 }, { "epoch": 0.601245150711229, "grad_norm": 10.03642749786377, "learning_rate": 5.9995924445223595e-05, "loss": 1.3164, "step": 45100 }, { "epoch": 0.602578288517684, "grad_norm": 10.187807083129883, "learning_rate": 5.999578011565732e-05, "loss": 1.303, "step": 45200 }, { "epoch": 0.6039114263241391, "grad_norm": 7.434815406799316, "learning_rate": 5.9995633274938894e-05, "loss": 1.1709, "step": 45300 }, { "epoch": 0.6052445641305941, "grad_norm": 9.762859344482422, "learning_rate": 5.9995483923080614e-05, "loss": 1.3679, "step": 45400 }, { "epoch": 0.6065777019370492, "grad_norm": 11.107604026794434, "learning_rate": 5.999533206009496e-05, "loss": 1.3572, "step": 45500 }, { "epoch": 0.6079108397435042, "grad_norm": 8.890416145324707, "learning_rate": 5.999517768599467e-05, "loss": 1.1929, "step": 45600 }, { "epoch": 0.6092439775499593, "grad_norm": 10.935742378234863, "learning_rate": 5.999502080079266e-05, "loss": 1.2834, "step": 45700 }, { "epoch": 0.6105771153564143, "grad_norm": 7.0402445793151855, "learning_rate": 5.999486140450205e-05, "loss": 1.2611, "step": 45800 }, { "epoch": 0.6119102531628694, "grad_norm": 7.542079448699951, "learning_rate": 5.99946994971362e-05, "loss": 1.3264, "step": 45900 }, { "epoch": 0.6132433909693245, "grad_norm": 26.186691284179688, "learning_rate": 5.9994535078708665e-05, "loss": 1.3491, "step": 46000 }, { "epoch": 0.6145765287757795, "grad_norm": 29.799230575561523, "learning_rate": 5.9994368149233194e-05, "loss": 1.3453, "step": 46100 }, { "epoch": 0.6159096665822346, "grad_norm": 21.0030517578125, "learning_rate": 5.999419870872377e-05, "loss": 1.2404, "step": 46200 }, { "epoch": 0.6172428043886896, "grad_norm": 11.73917293548584, "learning_rate": 5.9994026757194586e-05, "loss": 1.1905, "step": 46300 }, { "epoch": 0.6185759421951447, "grad_norm": 10.508408546447754, "learning_rate": 5.999385229466004e-05, "loss": 1.2671, "step": 46400 }, { "epoch": 
0.6199090800015997, "grad_norm": 18.016828536987305, "learning_rate": 5.999367532113471e-05, "loss": 1.2903, "step": 46500 }, { "epoch": 0.6212422178080548, "grad_norm": 8.398161888122559, "learning_rate": 5.999349583663345e-05, "loss": 1.3829, "step": 46600 }, { "epoch": 0.6225753556145098, "grad_norm": 5.148888111114502, "learning_rate": 5.999331384117125e-05, "loss": 1.2646, "step": 46700 }, { "epoch": 0.6239084934209649, "grad_norm": 15.298406600952148, "learning_rate": 5.999312933476337e-05, "loss": 1.3156, "step": 46800 }, { "epoch": 0.6252416312274199, "grad_norm": 9.473503112792969, "learning_rate": 5.999294231742525e-05, "loss": 1.2809, "step": 46900 }, { "epoch": 0.626574769033875, "grad_norm": 11.671876907348633, "learning_rate": 5.999275278917254e-05, "loss": 1.2203, "step": 47000 }, { "epoch": 0.62790790684033, "grad_norm": 34.20414733886719, "learning_rate": 5.9992560750021115e-05, "loss": 1.248, "step": 47100 }, { "epoch": 0.6292410446467851, "grad_norm": 10.008123397827148, "learning_rate": 5.999236619998705e-05, "loss": 1.4208, "step": 47200 }, { "epoch": 0.6305741824532402, "grad_norm": 12.692756652832031, "learning_rate": 5.999216913908663e-05, "loss": 1.263, "step": 47300 }, { "epoch": 0.6319073202596952, "grad_norm": 20.906978607177734, "learning_rate": 5.9991969567336345e-05, "loss": 1.2534, "step": 47400 }, { "epoch": 0.6332404580661503, "grad_norm": 18.825881958007812, "learning_rate": 5.999176748475292e-05, "loss": 1.3693, "step": 47500 }, { "epoch": 0.6345735958726053, "grad_norm": 31.788925170898438, "learning_rate": 5.999156289135326e-05, "loss": 1.2563, "step": 47600 }, { "epoch": 0.6359067336790604, "grad_norm": 9.973587989807129, "learning_rate": 5.99913557871545e-05, "loss": 1.2707, "step": 47700 }, { "epoch": 0.6372398714855154, "grad_norm": 11.438252449035645, "learning_rate": 5.9991146172173976e-05, "loss": 1.1746, "step": 47800 }, { "epoch": 0.6385730092919705, "grad_norm": 12.495111465454102, "learning_rate": 
5.999093404642923e-05, "loss": 1.2835, "step": 47900 }, { "epoch": 0.6399061470984255, "grad_norm": 12.485495567321777, "learning_rate": 5.999071940993803e-05, "loss": 1.2855, "step": 48000 }, { "epoch": 0.6412392849048806, "grad_norm": 11.922140121459961, "learning_rate": 5.999050226271833e-05, "loss": 1.1845, "step": 48100 }, { "epoch": 0.6425724227113356, "grad_norm": 19.591997146606445, "learning_rate": 5.999028260478832e-05, "loss": 1.3546, "step": 48200 }, { "epoch": 0.6439055605177907, "grad_norm": 7.580881595611572, "learning_rate": 5.999006043616639e-05, "loss": 1.2313, "step": 48300 }, { "epoch": 0.6452386983242457, "grad_norm": 13.019157409667969, "learning_rate": 5.9989835756871144e-05, "loss": 1.2475, "step": 48400 }, { "epoch": 0.6465718361307008, "grad_norm": 5.613773345947266, "learning_rate": 5.998960856692139e-05, "loss": 1.304, "step": 48500 }, { "epoch": 0.6479049739371558, "grad_norm": 8.87348747253418, "learning_rate": 5.998937886633613e-05, "loss": 1.3272, "step": 48600 }, { "epoch": 0.6492381117436109, "grad_norm": 7.045472145080566, "learning_rate": 5.998914665513461e-05, "loss": 1.2599, "step": 48700 }, { "epoch": 0.650571249550066, "grad_norm": 16.687519073486328, "learning_rate": 5.9988911933336254e-05, "loss": 1.3161, "step": 48800 }, { "epoch": 0.651904387356521, "grad_norm": 18.241466522216797, "learning_rate": 5.998867470096073e-05, "loss": 1.3542, "step": 48900 }, { "epoch": 0.6532375251629761, "grad_norm": 5.193562984466553, "learning_rate": 5.9988434958027904e-05, "loss": 1.4059, "step": 49000 }, { "epoch": 0.6545706629694311, "grad_norm": 12.596224784851074, "learning_rate": 5.9988192704557825e-05, "loss": 1.3611, "step": 49100 }, { "epoch": 0.6559038007758862, "grad_norm": 10.191391944885254, "learning_rate": 5.998795040063765e-05, "loss": 1.3506, "step": 49200 }, { "epoch": 0.6572369385823412, "grad_norm": 12.75635051727295, "learning_rate": 5.9987703151259e-05, "loss": 1.3403, "step": 49300 }, { "epoch": 0.6585700763887963, 
"grad_norm": 12.724204063415527, "learning_rate": 5.9987453391404376e-05, "loss": 1.3248, "step": 49400 }, { "epoch": 0.6599032141952513, "grad_norm": 13.20604133605957, "learning_rate": 5.998720112109468e-05, "loss": 1.2863, "step": 49500 }, { "epoch": 0.6612363520017064, "grad_norm": 10.58996295928955, "learning_rate": 5.998694634035103e-05, "loss": 1.251, "step": 49600 }, { "epoch": 0.6625694898081614, "grad_norm": 15.703396797180176, "learning_rate": 5.9986689049194775e-05, "loss": 1.2885, "step": 49700 }, { "epoch": 0.6639026276146165, "grad_norm": 16.25677490234375, "learning_rate": 5.998642924764742e-05, "loss": 1.3522, "step": 49800 }, { "epoch": 0.6652357654210715, "grad_norm": 7.3817315101623535, "learning_rate": 5.9986166935730746e-05, "loss": 1.2778, "step": 49900 }, { "epoch": 0.6665689032275266, "grad_norm": 8.617868423461914, "learning_rate": 5.99859021134667e-05, "loss": 1.3567, "step": 50000 }, { "epoch": 0.6679020410339817, "grad_norm": 7.9203362464904785, "learning_rate": 5.998563478087745e-05, "loss": 1.2908, "step": 50100 }, { "epoch": 0.6692351788404367, "grad_norm": 17.823514938354492, "learning_rate": 5.998536493798537e-05, "loss": 1.3427, "step": 50200 }, { "epoch": 0.6705683166468918, "grad_norm": 24.45656967163086, "learning_rate": 5.9985092584813076e-05, "loss": 1.2179, "step": 50300 }, { "epoch": 0.6719014544533468, "grad_norm": 10.135210990905762, "learning_rate": 5.998481772138334e-05, "loss": 1.3498, "step": 50400 }, { "epoch": 0.6732345922598019, "grad_norm": 12.289299011230469, "learning_rate": 5.9984540347719186e-05, "loss": 1.3105, "step": 50500 }, { "epoch": 0.6745677300662569, "grad_norm": 6.584577560424805, "learning_rate": 5.9984260463843846e-05, "loss": 1.1754, "step": 50600 }, { "epoch": 0.675900867872712, "grad_norm": 73.39702606201172, "learning_rate": 5.998397806978072e-05, "loss": 1.2936, "step": 50700 }, { "epoch": 0.677234005679167, "grad_norm": 11.965841293334961, "learning_rate": 5.998369316555348e-05, "loss": 
1.2658, "step": 50800 }, { "epoch": 0.6785671434856221, "grad_norm": 14.384963989257812, "learning_rate": 5.998340575118596e-05, "loss": 1.3714, "step": 50900 }, { "epoch": 0.6799002812920771, "grad_norm": 7.187297821044922, "learning_rate": 5.9983115826702225e-05, "loss": 1.2549, "step": 51000 }, { "epoch": 0.6812334190985322, "grad_norm": 8.20315933227539, "learning_rate": 5.998282339212655e-05, "loss": 1.3665, "step": 51100 }, { "epoch": 0.6825665569049872, "grad_norm": 9.35573959350586, "learning_rate": 5.9982528447483414e-05, "loss": 1.2239, "step": 51200 }, { "epoch": 0.6838996947114423, "grad_norm": 11.09323501586914, "learning_rate": 5.998223099279751e-05, "loss": 1.3061, "step": 51300 }, { "epoch": 0.6852328325178974, "grad_norm": 12.573692321777344, "learning_rate": 5.9981931028093746e-05, "loss": 1.3053, "step": 51400 }, { "epoch": 0.6865659703243524, "grad_norm": 14.43775749206543, "learning_rate": 5.998162855339722e-05, "loss": 1.1957, "step": 51500 }, { "epoch": 0.6878991081308075, "grad_norm": 14.865554809570312, "learning_rate": 5.998132356873326e-05, "loss": 1.2288, "step": 51600 }, { "epoch": 0.6892322459372625, "grad_norm": 8.374665260314941, "learning_rate": 5.998101607412741e-05, "loss": 1.2216, "step": 51700 }, { "epoch": 0.6905653837437176, "grad_norm": 4.0364532470703125, "learning_rate": 5.9980706069605396e-05, "loss": 1.27, "step": 51800 }, { "epoch": 0.6918985215501726, "grad_norm": 23.91683006286621, "learning_rate": 5.998039355519317e-05, "loss": 1.2471, "step": 51900 }, { "epoch": 0.6932316593566277, "grad_norm": 10.430766105651855, "learning_rate": 5.9980078530916916e-05, "loss": 1.3468, "step": 52000 }, { "epoch": 0.6945647971630827, "grad_norm": 6.2454352378845215, "learning_rate": 5.9979760996802984e-05, "loss": 1.234, "step": 52100 }, { "epoch": 0.6958979349695378, "grad_norm": 11.811392784118652, "learning_rate": 5.997944095287797e-05, "loss": 1.1369, "step": 52200 }, { "epoch": 0.6972310727759928, "grad_norm": 
12.949581146240234, "learning_rate": 5.997911839916865e-05, "loss": 1.3461, "step": 52300 }, { "epoch": 0.6985642105824479, "grad_norm": 8.653752326965332, "learning_rate": 5.997879333570206e-05, "loss": 1.3897, "step": 52400 }, { "epoch": 0.6998973483889029, "grad_norm": 7.392680644989014, "learning_rate": 5.997846576250538e-05, "loss": 1.2792, "step": 52500 }, { "epoch": 0.701230486195358, "grad_norm": 8.5979585647583, "learning_rate": 5.997813899285798e-05, "loss": 1.2335, "step": 52600 }, { "epoch": 0.702563624001813, "grad_norm": 19.37425422668457, "learning_rate": 5.997780642538024e-05, "loss": 1.3156, "step": 52700 }, { "epoch": 0.7038967618082681, "grad_norm": 37.801971435546875, "learning_rate": 5.997747134825505e-05, "loss": 1.3512, "step": 52800 }, { "epoch": 0.7052298996147232, "grad_norm": 11.554051399230957, "learning_rate": 5.997713376151045e-05, "loss": 1.2641, "step": 52900 }, { "epoch": 0.7065630374211782, "grad_norm": 20.78178596496582, "learning_rate": 5.997679366517472e-05, "loss": 1.215, "step": 53000 }, { "epoch": 0.7078961752276333, "grad_norm": 12.549651145935059, "learning_rate": 5.9976451059276304e-05, "loss": 1.2193, "step": 53100 }, { "epoch": 0.7092293130340883, "grad_norm": 7.253143310546875, "learning_rate": 5.997610594384391e-05, "loss": 1.1858, "step": 53200 }, { "epoch": 0.7105624508405434, "grad_norm": 10.900320053100586, "learning_rate": 5.9975758318906415e-05, "loss": 1.2581, "step": 53300 }, { "epoch": 0.7118955886469984, "grad_norm": 10.935173034667969, "learning_rate": 5.997540818449292e-05, "loss": 1.1674, "step": 53400 }, { "epoch": 0.7132287264534535, "grad_norm": 10.749198913574219, "learning_rate": 5.997505554063275e-05, "loss": 1.3501, "step": 53500 }, { "epoch": 0.7145618642599085, "grad_norm": 16.905561447143555, "learning_rate": 5.997470038735542e-05, "loss": 1.196, "step": 53600 }, { "epoch": 0.7158950020663636, "grad_norm": 11.074957847595215, "learning_rate": 5.997434272469066e-05, "loss": 1.1425, "step": 53700 
}, { "epoch": 0.7172281398728186, "grad_norm": 13.230973243713379, "learning_rate": 5.9973982552668415e-05, "loss": 1.2008, "step": 53800 }, { "epoch": 0.7185612776792737, "grad_norm": 43.86670684814453, "learning_rate": 5.9973619871318835e-05, "loss": 1.2837, "step": 53900 }, { "epoch": 0.7198944154857287, "grad_norm": 15.078032493591309, "learning_rate": 5.997325468067228e-05, "loss": 1.1943, "step": 54000 }, { "epoch": 0.7212275532921838, "grad_norm": 23.69322395324707, "learning_rate": 5.9972890670179226e-05, "loss": 1.2407, "step": 54100 }, { "epoch": 0.7225606910986389, "grad_norm": 7.454013347625732, "learning_rate": 5.9972520486122864e-05, "loss": 1.1731, "step": 54200 }, { "epoch": 0.7238938289050939, "grad_norm": 10.2823486328125, "learning_rate": 5.9972147792861575e-05, "loss": 1.2082, "step": 54300 }, { "epoch": 0.725226966711549, "grad_norm": 9.504066467285156, "learning_rate": 5.997177259042654e-05, "loss": 1.3055, "step": 54400 }, { "epoch": 0.726560104518004, "grad_norm": 38.84284591674805, "learning_rate": 5.9971394878849195e-05, "loss": 1.1623, "step": 54500 }, { "epoch": 0.7278932423244591, "grad_norm": 21.924182891845703, "learning_rate": 5.997101465816115e-05, "loss": 1.3391, "step": 54600 }, { "epoch": 0.7292263801309141, "grad_norm": 14.00391674041748, "learning_rate": 5.9970631928394226e-05, "loss": 1.2144, "step": 54700 }, { "epoch": 0.7305595179373692, "grad_norm": 7.2553181648254395, "learning_rate": 5.9970246689580476e-05, "loss": 1.2505, "step": 54800 }, { "epoch": 0.7318926557438242, "grad_norm": 20.9500732421875, "learning_rate": 5.996985894175215e-05, "loss": 1.2575, "step": 54900 }, { "epoch": 0.7332257935502793, "grad_norm": 10.053970336914062, "learning_rate": 5.9969468684941695e-05, "loss": 1.0449, "step": 55000 }, { "epoch": 0.7345589313567343, "grad_norm": 18.690462112426758, "learning_rate": 5.9969075919181804e-05, "loss": 1.1786, "step": 55100 }, { "epoch": 0.7358920691631894, "grad_norm": 8.11339282989502, "learning_rate": 
5.996868064450535e-05, "loss": 1.2262, "step": 55200 }, { "epoch": 0.7372252069696444, "grad_norm": 8.34908676147461, "learning_rate": 5.996828286094542e-05, "loss": 1.1407, "step": 55300 }, { "epoch": 0.7385583447760995, "grad_norm": 8.763638496398926, "learning_rate": 5.9967882568535314e-05, "loss": 1.2237, "step": 55400 }, { "epoch": 0.7398914825825545, "grad_norm": 10.491510391235352, "learning_rate": 5.996747976730855e-05, "loss": 1.2564, "step": 55500 }, { "epoch": 0.7412246203890096, "grad_norm": 11.493358612060547, "learning_rate": 5.996707445729884e-05, "loss": 1.311, "step": 55600 }, { "epoch": 0.7425577581954647, "grad_norm": 9.324875831604004, "learning_rate": 5.996666663854012e-05, "loss": 1.236, "step": 55700 }, { "epoch": 0.7438908960019197, "grad_norm": 7.667178153991699, "learning_rate": 5.9966256311066536e-05, "loss": 1.1753, "step": 55800 }, { "epoch": 0.7452240338083748, "grad_norm": 11.318379402160645, "learning_rate": 5.996584347491243e-05, "loss": 1.1928, "step": 55900 }, { "epoch": 0.7465571716148298, "grad_norm": 23.366519927978516, "learning_rate": 5.996542813011238e-05, "loss": 1.3084, "step": 56000 }, { "epoch": 0.7478903094212849, "grad_norm": 6.912543296813965, "learning_rate": 5.996501027670114e-05, "loss": 1.2115, "step": 56100 }, { "epoch": 0.7492234472277399, "grad_norm": 25.378494262695312, "learning_rate": 5.9964589914713694e-05, "loss": 1.2196, "step": 56200 }, { "epoch": 0.750556585034195, "grad_norm": 8.835494041442871, "learning_rate": 5.9964167044185244e-05, "loss": 1.2492, "step": 56300 }, { "epoch": 0.75188972284065, "grad_norm": 8.243240356445312, "learning_rate": 5.996374166515118e-05, "loss": 1.175, "step": 56400 }, { "epoch": 0.7532228606471051, "grad_norm": 10.198135375976562, "learning_rate": 5.9963313777647116e-05, "loss": 1.1928, "step": 56500 }, { "epoch": 0.7545559984535601, "grad_norm": 8.14645004272461, "learning_rate": 5.996288338170888e-05, "loss": 1.2476, "step": 56600 }, { "epoch": 0.7558891362600152, 
"grad_norm": 16.538536071777344, "learning_rate": 5.99624504773725e-05, "loss": 1.2664, "step": 56700 }, { "epoch": 0.7572222740664702, "grad_norm": 9.398180961608887, "learning_rate": 5.9962019431217465e-05, "loss": 1.2582, "step": 56800 }, { "epoch": 0.7585554118729253, "grad_norm": 11.678868293762207, "learning_rate": 5.99615815352768e-05, "loss": 1.2593, "step": 56900 }, { "epoch": 0.7598885496793804, "grad_norm": 9.743160247802734, "learning_rate": 5.996114113104697e-05, "loss": 1.2942, "step": 57000 }, { "epoch": 0.7612216874858354, "grad_norm": 11.049501419067383, "learning_rate": 5.996069821856485e-05, "loss": 1.1478, "step": 57100 }, { "epoch": 0.7625548252922905, "grad_norm": 33.0406608581543, "learning_rate": 5.996025279786753e-05, "loss": 1.1138, "step": 57200 }, { "epoch": 0.7638879630987455, "grad_norm": 19.820329666137695, "learning_rate": 5.9959804868992275e-05, "loss": 1.1997, "step": 57300 }, { "epoch": 0.7652211009052006, "grad_norm": 10.833277702331543, "learning_rate": 5.9959354431976606e-05, "loss": 1.3754, "step": 57400 }, { "epoch": 0.7665542387116556, "grad_norm": 6.459231853485107, "learning_rate": 5.995890148685822e-05, "loss": 1.251, "step": 57500 }, { "epoch": 0.7678873765181107, "grad_norm": 9.464316368103027, "learning_rate": 5.995844603367504e-05, "loss": 1.2808, "step": 57600 }, { "epoch": 0.7692205143245657, "grad_norm": 5.59986686706543, "learning_rate": 5.99579880724652e-05, "loss": 1.1848, "step": 57700 }, { "epoch": 0.7705536521310208, "grad_norm": 17.06837272644043, "learning_rate": 5.995752760326703e-05, "loss": 1.2681, "step": 57800 }, { "epoch": 0.7718867899374758, "grad_norm": 13.600661277770996, "learning_rate": 5.995706462611909e-05, "loss": 1.2119, "step": 57900 }, { "epoch": 0.7732199277439309, "grad_norm": 6.7140092849731445, "learning_rate": 5.995659914106012e-05, "loss": 1.1343, "step": 58000 }, { "epoch": 0.7745530655503859, "grad_norm": 10.38271713256836, "learning_rate": 5.9956131148129106e-05, "loss": 1.3437, 
"step": 58100 }, { "epoch": 0.775886203356841, "grad_norm": 8.25704288482666, "learning_rate": 5.9955660647365235e-05, "loss": 1.1234, "step": 58200 }, { "epoch": 0.777219341163296, "grad_norm": 10.54246711730957, "learning_rate": 5.995518763880787e-05, "loss": 1.2082, "step": 58300 }, { "epoch": 0.7785524789697511, "grad_norm": 8.126418113708496, "learning_rate": 5.9954712122496634e-05, "loss": 1.3545, "step": 58400 }, { "epoch": 0.7798856167762062, "grad_norm": 8.615402221679688, "learning_rate": 5.995423409847131e-05, "loss": 1.0853, "step": 58500 }, { "epoch": 0.7812187545826612, "grad_norm": 8.23779582977295, "learning_rate": 5.995375356677194e-05, "loss": 1.2042, "step": 58600 }, { "epoch": 0.7825518923891163, "grad_norm": 9.512518882751465, "learning_rate": 5.9953270527438735e-05, "loss": 1.2647, "step": 58700 }, { "epoch": 0.7838850301955713, "grad_norm": 10.957072257995605, "learning_rate": 5.995278498051215e-05, "loss": 1.2608, "step": 58800 }, { "epoch": 0.7852181680020264, "grad_norm": 17.66162109375, "learning_rate": 5.9952296926032825e-05, "loss": 1.2732, "step": 58900 }, { "epoch": 0.7865513058084814, "grad_norm": 94.13514709472656, "learning_rate": 5.995180636404162e-05, "loss": 1.2961, "step": 59000 }, { "epoch": 0.7878844436149365, "grad_norm": 11.095081329345703, "learning_rate": 5.99513132945796e-05, "loss": 1.2849, "step": 59100 }, { "epoch": 0.7892175814213915, "grad_norm": 13.91390609741211, "learning_rate": 5.995081771768805e-05, "loss": 1.3024, "step": 59200 }, { "epoch": 0.7905507192278466, "grad_norm": 9.26659107208252, "learning_rate": 5.995031963340844e-05, "loss": 1.2547, "step": 59300 }, { "epoch": 0.7918838570343016, "grad_norm": 11.258423805236816, "learning_rate": 5.99498190417825e-05, "loss": 1.3258, "step": 59400 }, { "epoch": 0.7932169948407567, "grad_norm": 20.11414337158203, "learning_rate": 5.9949315942852106e-05, "loss": 1.1794, "step": 59500 }, { "epoch": 0.7945501326472117, "grad_norm": 15.176568031311035, "learning_rate": 
5.99488103366594e-05, "loss": 1.3382, "step": 59600 }, { "epoch": 0.7958832704536668, "grad_norm": 12.417036056518555, "learning_rate": 5.9948302223246696e-05, "loss": 1.1819, "step": 59700 }, { "epoch": 0.7972164082601219, "grad_norm": 13.691413879394531, "learning_rate": 5.994779160265653e-05, "loss": 1.2694, "step": 59800 }, { "epoch": 0.7985495460665769, "grad_norm": 6.697714805603027, "learning_rate": 5.994727847493166e-05, "loss": 1.1822, "step": 59900 }, { "epoch": 0.799882683873032, "grad_norm": 11.71452808380127, "learning_rate": 5.9946762840115035e-05, "loss": 1.3278, "step": 60000 }, { "epoch": 0.801215821679487, "grad_norm": 12.379654884338379, "learning_rate": 5.994624469824983e-05, "loss": 1.2421, "step": 60100 }, { "epoch": 0.8025489594859421, "grad_norm": 91.89208984375, "learning_rate": 5.9945724049379415e-05, "loss": 1.2044, "step": 60200 }, { "epoch": 0.8038820972923971, "grad_norm": 14.701720237731934, "learning_rate": 5.994520089354738e-05, "loss": 1.251, "step": 60300 }, { "epoch": 0.8052152350988522, "grad_norm": 18.508026123046875, "learning_rate": 5.9944675230797525e-05, "loss": 1.1918, "step": 60400 }, { "epoch": 0.8065483729053072, "grad_norm": 35.94780349731445, "learning_rate": 5.994414706117385e-05, "loss": 1.2745, "step": 60500 }, { "epoch": 0.8078815107117623, "grad_norm": 15.806722640991211, "learning_rate": 5.994361638472057e-05, "loss": 1.1746, "step": 60600 }, { "epoch": 0.8092146485182173, "grad_norm": 124.15242004394531, "learning_rate": 5.994308320148212e-05, "loss": 1.2318, "step": 60700 }, { "epoch": 0.8105477863246724, "grad_norm": 15.878678321838379, "learning_rate": 5.994254751150313e-05, "loss": 1.3072, "step": 60800 }, { "epoch": 0.8118809241311274, "grad_norm": 9.722448348999023, "learning_rate": 5.994200931482846e-05, "loss": 1.2823, "step": 60900 }, { "epoch": 0.8132140619375825, "grad_norm": 10.717652320861816, "learning_rate": 5.994146861150315e-05, "loss": 1.2654, "step": 61000 }, { "epoch": 0.8145471997440376, 
"grad_norm": 15.704151153564453, "learning_rate": 5.994092540157247e-05, "loss": 1.2446, "step": 61100 }, { "epoch": 0.8158803375504926, "grad_norm": 16.65939712524414, "learning_rate": 5.9940379685081897e-05, "loss": 1.1919, "step": 61200 }, { "epoch": 0.8172134753569477, "grad_norm": 18.23529815673828, "learning_rate": 5.9939831462077116e-05, "loss": 1.2319, "step": 61300 }, { "epoch": 0.8185466131634027, "grad_norm": 7.180056095123291, "learning_rate": 5.993928073260402e-05, "loss": 1.1929, "step": 61400 }, { "epoch": 0.8198797509698578, "grad_norm": 7.980491638183594, "learning_rate": 5.993872749670873e-05, "loss": 1.2807, "step": 61500 }, { "epoch": 0.8212128887763128, "grad_norm": 12.967564582824707, "learning_rate": 5.9938171754437544e-05, "loss": 1.2734, "step": 61600 }, { "epoch": 0.8225460265827679, "grad_norm": 7.137316703796387, "learning_rate": 5.9937613505836986e-05, "loss": 1.2796, "step": 61700 }, { "epoch": 0.8238791643892229, "grad_norm": 12.427287101745605, "learning_rate": 5.9937052750953805e-05, "loss": 1.2422, "step": 61800 }, { "epoch": 0.825212302195678, "grad_norm": 9.61901569366455, "learning_rate": 5.9936489489834926e-05, "loss": 1.3336, "step": 61900 }, { "epoch": 0.826545440002133, "grad_norm": 19.16749382019043, "learning_rate": 5.9935923722527526e-05, "loss": 1.2903, "step": 62000 }, { "epoch": 0.8278785778085881, "grad_norm": 21.164390563964844, "learning_rate": 5.993535544907896e-05, "loss": 1.1333, "step": 62100 }, { "epoch": 0.8292117156150431, "grad_norm": 5.4536519050598145, "learning_rate": 5.99347846695368e-05, "loss": 1.2592, "step": 62200 }, { "epoch": 0.8305448534214982, "grad_norm": 13.070548057556152, "learning_rate": 5.993421712920946e-05, "loss": 1.3004, "step": 62300 }, { "epoch": 0.8318779912279533, "grad_norm": 11.288627624511719, "learning_rate": 5.993364136268342e-05, "loss": 1.2242, "step": 62400 }, { "epoch": 0.8332111290344083, "grad_norm": 12.732458114624023, "learning_rate": 5.9933068885336315e-05, "loss": 
1.3262, "step": 62500 }, { "epoch": 0.8345442668408634, "grad_norm": 7.170271873474121, "learning_rate": 5.993248813201727e-05, "loss": 1.3955, "step": 62600 }, { "epoch": 0.8358774046473184, "grad_norm": 24.625259399414062, "learning_rate": 5.9931904872844656e-05, "loss": 1.2602, "step": 62700 }, { "epoch": 0.8372105424537735, "grad_norm": 11.399343490600586, "learning_rate": 5.9931319107867334e-05, "loss": 1.3034, "step": 62800 }, { "epoch": 0.8385436802602285, "grad_norm": 10.434297561645508, "learning_rate": 5.993073083713431e-05, "loss": 1.2677, "step": 62900 }, { "epoch": 0.8398768180666836, "grad_norm": 32.53263854980469, "learning_rate": 5.9930140060694864e-05, "loss": 1.3018, "step": 63000 }, { "epoch": 0.8412099558731386, "grad_norm": 6.004155158996582, "learning_rate": 5.992954677859844e-05, "loss": 1.2902, "step": 63100 }, { "epoch": 0.8425430936795937, "grad_norm": 11.689565658569336, "learning_rate": 5.99289509908947e-05, "loss": 1.1443, "step": 63200 }, { "epoch": 0.8438762314860487, "grad_norm": 15.929191589355469, "learning_rate": 5.9928352697633536e-05, "loss": 1.3377, "step": 63300 }, { "epoch": 0.8452093692925038, "grad_norm": 5.0186357498168945, "learning_rate": 5.992775189886501e-05, "loss": 1.3275, "step": 63400 }, { "epoch": 0.8465425070989588, "grad_norm": 32.35592269897461, "learning_rate": 5.992715464008355e-05, "loss": 1.3227, "step": 63500 }, { "epoch": 0.8478756449054139, "grad_norm": 11.108904838562012, "learning_rate": 5.992654885550527e-05, "loss": 1.3125, "step": 63600 }, { "epoch": 0.849208782711869, "grad_norm": 8.805634498596191, "learning_rate": 5.992594056557064e-05, "loss": 1.2356, "step": 63700 }, { "epoch": 0.850541920518324, "grad_norm": 17.968122482299805, "learning_rate": 5.9925329770330603e-05, "loss": 1.1956, "step": 63800 }, { "epoch": 0.851875058324779, "grad_norm": 3.672020196914673, "learning_rate": 5.99247164698363e-05, "loss": 1.1302, "step": 63900 }, { "epoch": 0.8532081961312341, "grad_norm": 
166.76686096191406, "learning_rate": 5.992410066413905e-05, "loss": 1.2296, "step": 64000 }, { "epoch": 0.8545413339376892, "grad_norm": 32.736045837402344, "learning_rate": 5.9923482353290436e-05, "loss": 1.2912, "step": 64100 }, { "epoch": 0.8558744717441442, "grad_norm": 9.806450843811035, "learning_rate": 5.99228615373422e-05, "loss": 1.1772, "step": 64200 }, { "epoch": 0.8572076095505993, "grad_norm": 7.256938457489014, "learning_rate": 5.9922238216346314e-05, "loss": 1.1321, "step": 64300 }, { "epoch": 0.8585407473570543, "grad_norm": 7.859914302825928, "learning_rate": 5.9921612390354976e-05, "loss": 1.2662, "step": 64400 }, { "epoch": 0.8598738851635094, "grad_norm": 7.61277961730957, "learning_rate": 5.992098405942057e-05, "loss": 1.2087, "step": 64500 }, { "epoch": 0.8612070229699644, "grad_norm": 23.956418991088867, "learning_rate": 5.992035322359569e-05, "loss": 1.1775, "step": 64600 }, { "epoch": 0.8625401607764195, "grad_norm": 15.108598709106445, "learning_rate": 5.991971988293316e-05, "loss": 1.1838, "step": 64700 }, { "epoch": 0.8638732985828745, "grad_norm": 36.4500617980957, "learning_rate": 5.9919084037486e-05, "loss": 1.1374, "step": 64800 }, { "epoch": 0.8652064363893296, "grad_norm": 8.214080810546875, "learning_rate": 5.991844568730744e-05, "loss": 1.1295, "step": 64900 }, { "epoch": 0.8665395741957846, "grad_norm": 7.913750648498535, "learning_rate": 5.991780483245091e-05, "loss": 1.2211, "step": 65000 }, { "epoch": 0.8678727120022397, "grad_norm": 10.037924766540527, "learning_rate": 5.9917161472970075e-05, "loss": 1.2271, "step": 65100 }, { "epoch": 0.8692058498086948, "grad_norm": 10.798456192016602, "learning_rate": 5.991651560891879e-05, "loss": 1.2244, "step": 65200 }, { "epoch": 0.8705389876151498, "grad_norm": 8.095571517944336, "learning_rate": 5.991586724035111e-05, "loss": 1.3505, "step": 65300 }, { "epoch": 0.8718721254216049, "grad_norm": 7.92722225189209, "learning_rate": 5.991521636732133e-05, "loss": 1.1276, "step": 65400 }, 
{ "epoch": 0.8732052632280599, "grad_norm": 15.808429718017578, "learning_rate": 5.991456298988395e-05, "loss": 1.2721, "step": 65500 }, { "epoch": 0.874538401034515, "grad_norm": 7.752704620361328, "learning_rate": 5.9913907108093644e-05, "loss": 1.2578, "step": 65600 }, { "epoch": 0.87587153884097, "grad_norm": 11.361353874206543, "learning_rate": 5.991324872200533e-05, "loss": 1.174, "step": 65700 }, { "epoch": 0.8772046766474251, "grad_norm": 19.054170608520508, "learning_rate": 5.991258783167414e-05, "loss": 1.2297, "step": 65800 }, { "epoch": 0.8785378144538801, "grad_norm": 8.19528865814209, "learning_rate": 5.991192443715538e-05, "loss": 1.2764, "step": 65900 }, { "epoch": 0.8798709522603352, "grad_norm": 8.855433464050293, "learning_rate": 5.991125853850459e-05, "loss": 1.2374, "step": 66000 }, { "epoch": 0.8812040900667902, "grad_norm": 8.035970687866211, "learning_rate": 5.9910590135777546e-05, "loss": 1.2937, "step": 66100 }, { "epoch": 0.8825372278732453, "grad_norm": 17.42535400390625, "learning_rate": 5.9909919229030166e-05, "loss": 1.1496, "step": 66200 }, { "epoch": 0.8838703656797003, "grad_norm": 18.43121910095215, "learning_rate": 5.990924581831864e-05, "loss": 1.1537, "step": 66300 }, { "epoch": 0.8852035034861554, "grad_norm": 5.792033672332764, "learning_rate": 5.990856990369932e-05, "loss": 1.1601, "step": 66400 }, { "epoch": 0.8865366412926105, "grad_norm": 21.136398315429688, "learning_rate": 5.990789148522883e-05, "loss": 1.1675, "step": 66500 }, { "epoch": 0.8878697790990655, "grad_norm": 4.407113552093506, "learning_rate": 5.990721056296392e-05, "loss": 1.1116, "step": 66600 }, { "epoch": 0.8892029169055206, "grad_norm": 40.766151428222656, "learning_rate": 5.9906527136961636e-05, "loss": 1.2535, "step": 66700 }, { "epoch": 0.8905360547119756, "grad_norm": 10.010986328125, "learning_rate": 5.9905841207279164e-05, "loss": 1.1888, "step": 66800 }, { "epoch": 0.8918691925184307, "grad_norm": 7.588259696960449, "learning_rate": 
5.990515277397394e-05, "loss": 1.2641, "step": 66900 }, { "epoch": 0.8932023303248857, "grad_norm": 12.582446098327637, "learning_rate": 5.99044618371036e-05, "loss": 1.1864, "step": 67000 }, { "epoch": 0.8945354681313408, "grad_norm": 9.988810539245605, "learning_rate": 5.990376839672597e-05, "loss": 1.3173, "step": 67100 }, { "epoch": 0.8958686059377958, "grad_norm": 12.044880867004395, "learning_rate": 5.990307245289913e-05, "loss": 1.2605, "step": 67200 }, { "epoch": 0.8972017437442509, "grad_norm": 21.958425521850586, "learning_rate": 5.9902374005681314e-05, "loss": 1.2352, "step": 67300 }, { "epoch": 0.8985348815507059, "grad_norm": 43.04095458984375, "learning_rate": 5.9901673055131005e-05, "loss": 1.2242, "step": 67400 }, { "epoch": 0.899868019357161, "grad_norm": 22.055557250976562, "learning_rate": 5.99009696013069e-05, "loss": 1.2956, "step": 67500 }, { "epoch": 0.901201157163616, "grad_norm": 34.293209075927734, "learning_rate": 5.990026364426787e-05, "loss": 1.315, "step": 67600 }, { "epoch": 0.9025342949700711, "grad_norm": 10.863748550415039, "learning_rate": 5.9899555184073014e-05, "loss": 1.2313, "step": 67700 }, { "epoch": 0.9038674327765261, "grad_norm": 16.644046783447266, "learning_rate": 5.9898844220781655e-05, "loss": 1.2412, "step": 67800 }, { "epoch": 0.9052005705829812, "grad_norm": 19.618083953857422, "learning_rate": 5.9898130754453314e-05, "loss": 1.2524, "step": 67900 }, { "epoch": 0.9065337083894363, "grad_norm": 9.237844467163086, "learning_rate": 5.98974147851477e-05, "loss": 1.3057, "step": 68000 }, { "epoch": 0.9078668461958913, "grad_norm": 9.927729606628418, "learning_rate": 5.989670351003625e-05, "loss": 1.224, "step": 68100 }, { "epoch": 0.9091999840023464, "grad_norm": 8.207894325256348, "learning_rate": 5.989598255998442e-05, "loss": 1.2377, "step": 68200 }, { "epoch": 0.9105331218088014, "grad_norm": 27.68171501159668, "learning_rate": 5.989525910713517e-05, "loss": 1.1744, "step": 68300 }, { "epoch": 0.9118662596152565, 
"grad_norm": 18.748010635375977, "learning_rate": 5.989453315154907e-05, "loss": 1.1595, "step": 68400 }, { "epoch": 0.9131993974217115, "grad_norm": 65.15042877197266, "learning_rate": 5.989380469328689e-05, "loss": 1.2144, "step": 68500 }, { "epoch": 0.9145325352281666, "grad_norm": 15.320661544799805, "learning_rate": 5.989307373240961e-05, "loss": 1.1038, "step": 68600 }, { "epoch": 0.9158656730346216, "grad_norm": 34.1961555480957, "learning_rate": 5.9892340268978435e-05, "loss": 1.1617, "step": 68700 }, { "epoch": 0.9171988108410767, "grad_norm": 23.074356079101562, "learning_rate": 5.989160430305475e-05, "loss": 1.1821, "step": 68800 }, { "epoch": 0.9185319486475317, "grad_norm": 5.347226619720459, "learning_rate": 5.989086583470019e-05, "loss": 1.2323, "step": 68900 }, { "epoch": 0.9198650864539868, "grad_norm": 10.734393119812012, "learning_rate": 5.989012486397656e-05, "loss": 1.1454, "step": 69000 }, { "epoch": 0.9211982242604418, "grad_norm": 11.214410781860352, "learning_rate": 5.9889381390945906e-05, "loss": 1.1875, "step": 69100 }, { "epoch": 0.9225313620668969, "grad_norm": 33.327632904052734, "learning_rate": 5.988863541567045e-05, "loss": 1.2258, "step": 69200 }, { "epoch": 0.923864499873352, "grad_norm": 11.935234069824219, "learning_rate": 5.988788693821267e-05, "loss": 1.1676, "step": 69300 }, { "epoch": 0.925197637679807, "grad_norm": 9.289461135864258, "learning_rate": 5.988714348081626e-05, "loss": 1.1373, "step": 69400 }, { "epoch": 0.9265307754862621, "grad_norm": 8.93590259552002, "learning_rate": 5.9886390024202234e-05, "loss": 1.2591, "step": 69500 }, { "epoch": 0.9278639132927171, "grad_norm": 5.849639892578125, "learning_rate": 5.988563406559385e-05, "loss": 1.2081, "step": 69600 }, { "epoch": 0.9291970510991722, "grad_norm": 7.280407428741455, "learning_rate": 5.988487560505438e-05, "loss": 1.2555, "step": 69700 }, { "epoch": 0.9305301889056272, "grad_norm": 5.298121929168701, "learning_rate": 5.988411464264734e-05, "loss": 1.2169, 
"step": 69800 }, { "epoch": 0.9318633267120823, "grad_norm": 3.3890349864959717, "learning_rate": 5.988335117843643e-05, "loss": 1.1672, "step": 69900 }, { "epoch": 0.9331964645185373, "grad_norm": 26.262561798095703, "learning_rate": 5.9882585212485554e-05, "loss": 1.1808, "step": 70000 }, { "epoch": 0.9345296023249924, "grad_norm": 47.78841781616211, "learning_rate": 5.9881816744858836e-05, "loss": 1.121, "step": 70100 }, { "epoch": 0.9358627401314474, "grad_norm": 12.056443214416504, "learning_rate": 5.9881045775620624e-05, "loss": 1.226, "step": 70200 }, { "epoch": 0.9371958779379025, "grad_norm": 12.062369346618652, "learning_rate": 5.988027230483544e-05, "loss": 1.2758, "step": 70300 }, { "epoch": 0.9385290157443575, "grad_norm": 7.46888542175293, "learning_rate": 5.987949633256807e-05, "loss": 1.1547, "step": 70400 }, { "epoch": 0.9398621535508126, "grad_norm": 10.78403091430664, "learning_rate": 5.987871785888344e-05, "loss": 1.2471, "step": 70500 }, { "epoch": 0.9411952913572676, "grad_norm": 10.418928146362305, "learning_rate": 5.987793688384674e-05, "loss": 1.2649, "step": 70600 }, { "epoch": 0.9425284291637227, "grad_norm": 7.946994781494141, "learning_rate": 5.987715340752335e-05, "loss": 1.145, "step": 70700 }, { "epoch": 0.9438615669701778, "grad_norm": 7.327362537384033, "learning_rate": 5.987636742997885e-05, "loss": 1.1369, "step": 70800 }, { "epoch": 0.9451947047766328, "grad_norm": 7.959192752838135, "learning_rate": 5.9875578951279054e-05, "loss": 1.256, "step": 70900 }, { "epoch": 0.9465278425830879, "grad_norm": 13.971134185791016, "learning_rate": 5.987478797148997e-05, "loss": 1.2379, "step": 71000 }, { "epoch": 0.9478609803895429, "grad_norm": 1.5159504413604736, "learning_rate": 5.98739944906778e-05, "loss": 1.1239, "step": 71100 }, { "epoch": 0.949194118195998, "grad_norm": 6.576629161834717, "learning_rate": 5.9873198508908985e-05, "loss": 1.1743, "step": 71200 }, { "epoch": 0.950527256002453, "grad_norm": 15.255958557128906, 
"learning_rate": 5.9872400026250154e-05, "loss": 1.153, "step": 71300 }, { "epoch": 0.9518603938089081, "grad_norm": 12.639915466308594, "learning_rate": 5.9871599042768166e-05, "loss": 1.1822, "step": 71400 }, { "epoch": 0.9531935316153631, "grad_norm": 17.46959114074707, "learning_rate": 5.987079555853007e-05, "loss": 1.1245, "step": 71500 }, { "epoch": 0.9545266694218182, "grad_norm": 7.706136703491211, "learning_rate": 5.986998957360312e-05, "loss": 1.1335, "step": 71600 }, { "epoch": 0.9558598072282732, "grad_norm": 39.25468444824219, "learning_rate": 5.9869181088054815e-05, "loss": 1.3075, "step": 71700 }, { "epoch": 0.9571929450347283, "grad_norm": 13.006002426147461, "learning_rate": 5.986837010195282e-05, "loss": 1.198, "step": 71800 }, { "epoch": 0.9585260828411833, "grad_norm": 6.020803928375244, "learning_rate": 5.9867556615365034e-05, "loss": 1.2591, "step": 71900 }, { "epoch": 0.9598592206476384, "grad_norm": 19.46979522705078, "learning_rate": 5.9866740628359564e-05, "loss": 1.2132, "step": 72000 }, { "epoch": 0.9611923584540935, "grad_norm": 12.634291648864746, "learning_rate": 5.986592214100471e-05, "loss": 1.2584, "step": 72100 }, { "epoch": 0.9625254962605485, "grad_norm": 9.080987930297852, "learning_rate": 5.9865101153368996e-05, "loss": 1.2276, "step": 72200 }, { "epoch": 0.9638586340670036, "grad_norm": 14.256789207458496, "learning_rate": 5.9864277665521165e-05, "loss": 1.1851, "step": 72300 }, { "epoch": 0.9651917718734586, "grad_norm": 11.18542194366455, "learning_rate": 5.986345167753015e-05, "loss": 1.1733, "step": 72400 }, { "epoch": 0.9665249096799136, "grad_norm": 24.035905838012695, "learning_rate": 5.986262318946509e-05, "loss": 1.2244, "step": 72500 }, { "epoch": 0.9678580474863686, "grad_norm": 5.771480083465576, "learning_rate": 5.9861792201395346e-05, "loss": 1.2005, "step": 72600 }, { "epoch": 0.9691911852928237, "grad_norm": 10.688034057617188, "learning_rate": 5.986095871339051e-05, "loss": 1.1399, "step": 72700 }, { "epoch": 
0.9705243230992787, "grad_norm": 6.106632709503174, "learning_rate": 5.9860122725520325e-05, "loss": 1.1619, "step": 72800 }, { "epoch": 0.9718574609057338, "grad_norm": 6.642432689666748, "learning_rate": 5.9859284237854794e-05, "loss": 1.2483, "step": 72900 }, { "epoch": 0.9731905987121888, "grad_norm": 23.3453311920166, "learning_rate": 5.985844325046412e-05, "loss": 1.2183, "step": 73000 }, { "epoch": 0.9745237365186439, "grad_norm": 52.03811264038086, "learning_rate": 5.9857599763418684e-05, "loss": 1.2105, "step": 73100 }, { "epoch": 0.9758568743250989, "grad_norm": 12.850321769714355, "learning_rate": 5.985675377678913e-05, "loss": 1.1617, "step": 73200 }, { "epoch": 0.977190012131554, "grad_norm": 5.467399597167969, "learning_rate": 5.9855913787880055e-05, "loss": 1.1336, "step": 73300 }, { "epoch": 0.978523149938009, "grad_norm": 6.518427848815918, "learning_rate": 5.9855062827288973e-05, "loss": 1.2184, "step": 73400 }, { "epoch": 0.9798562877444641, "grad_norm": 13.939553260803223, "learning_rate": 5.985420936732615e-05, "loss": 1.1211, "step": 73500 }, { "epoch": 0.9811894255509191, "grad_norm": 13.831701278686523, "learning_rate": 5.9853353408063025e-05, "loss": 1.2824, "step": 73600 }, { "epoch": 0.9825225633573742, "grad_norm": 40.75484085083008, "learning_rate": 5.9852494949571266e-05, "loss": 1.2371, "step": 73700 }, { "epoch": 0.9838557011638293, "grad_norm": 43.923187255859375, "learning_rate": 5.985163399192272e-05, "loss": 1.157, "step": 73800 }, { "epoch": 0.9851888389702843, "grad_norm": 49.986480712890625, "learning_rate": 5.9850770535189485e-05, "loss": 1.2449, "step": 73900 }, { "epoch": 0.9865219767767394, "grad_norm": 35.75630569458008, "learning_rate": 5.9849904579443844e-05, "loss": 1.226, "step": 74000 }, { "epoch": 0.9878551145831944, "grad_norm": 17.33039665222168, "learning_rate": 5.984903612475828e-05, "loss": 1.3473, "step": 74100 }, { "epoch": 0.9891882523896495, "grad_norm": 13.50650691986084, "learning_rate": 
5.98481651712055e-05, "loss": 1.2459, "step": 74200 }, { "epoch": 0.9905213901961045, "grad_norm": 9.92557144165039, "learning_rate": 5.9847291718858425e-05, "loss": 1.2908, "step": 74300 }, { "epoch": 0.9918545280025596, "grad_norm": 7.2754106521606445, "learning_rate": 5.9846415767790174e-05, "loss": 1.2303, "step": 74400 }, { "epoch": 0.9931876658090146, "grad_norm": 10.590252876281738, "learning_rate": 5.984553731807408e-05, "loss": 1.2779, "step": 74500 }, { "epoch": 0.9945208036154697, "grad_norm": 6.410673141479492, "learning_rate": 5.9844656369783684e-05, "loss": 1.1268, "step": 74600 }, { "epoch": 0.9958539414219247, "grad_norm": 8.85578441619873, "learning_rate": 5.984377292299273e-05, "loss": 1.1553, "step": 74700 }, { "epoch": 0.9971870792283798, "grad_norm": 4.457735061645508, "learning_rate": 5.984288697777519e-05, "loss": 1.2773, "step": 74800 }, { "epoch": 0.9985202170348348, "grad_norm": 10.811666488647461, "learning_rate": 5.984199853420522e-05, "loss": 1.3586, "step": 74900 }, { "epoch": 0.9998533548412899, "grad_norm": 8.510993003845215, "learning_rate": 5.98411075923572e-05, "loss": 1.2479, "step": 75000 }, { "epoch": 1.001186492647745, "grad_norm": 11.043274879455566, "learning_rate": 5.984021415230573e-05, "loss": 1.2188, "step": 75100 }, { "epoch": 1.0025196304542001, "grad_norm": 18.26079559326172, "learning_rate": 5.983931821412558e-05, "loss": 1.1047, "step": 75200 }, { "epoch": 1.003852768260655, "grad_norm": 12.116172790527344, "learning_rate": 5.983841977789179e-05, "loss": 1.1388, "step": 75300 }, { "epoch": 1.0051859060671102, "grad_norm": 7.292959213256836, "learning_rate": 5.983751884367954e-05, "loss": 1.1702, "step": 75400 }, { "epoch": 1.0065190438735652, "grad_norm": 14.203654289245605, "learning_rate": 5.983661541156427e-05, "loss": 1.2158, "step": 75500 }, { "epoch": 1.0078521816800203, "grad_norm": 14.431889533996582, "learning_rate": 5.983570948162162e-05, "loss": 1.1645, "step": 75600 }, { "epoch": 1.0091853194864753, 
"grad_norm": 13.229927062988281, "learning_rate": 5.983480105392742e-05, "loss": 1.2121, "step": 75700 }, { "epoch": 1.0105184572929304, "grad_norm": 9.136079788208008, "learning_rate": 5.9833890128557717e-05, "loss": 1.2008, "step": 75800 }, { "epoch": 1.0118515950993854, "grad_norm": 16.414794921875, "learning_rate": 5.983297670558878e-05, "loss": 1.0985, "step": 75900 }, { "epoch": 1.0131847329058405, "grad_norm": 5.387160778045654, "learning_rate": 5.9832060785097075e-05, "loss": 1.1573, "step": 76000 }, { "epoch": 1.0145178707122955, "grad_norm": 27.029399871826172, "learning_rate": 5.983114236715928e-05, "loss": 1.0958, "step": 76100 }, { "epoch": 1.0158510085187507, "grad_norm": 20.360342025756836, "learning_rate": 5.983022145185228e-05, "loss": 1.0955, "step": 76200 }, { "epoch": 1.0171841463252056, "grad_norm": 10.14236831665039, "learning_rate": 5.9829298039253175e-05, "loss": 1.0436, "step": 76300 }, { "epoch": 1.0185172841316608, "grad_norm": 8.58442211151123, "learning_rate": 5.9828372129439273e-05, "loss": 1.1735, "step": 76400 }, { "epoch": 1.0198504219381157, "grad_norm": 18.077312469482422, "learning_rate": 5.9827443722488076e-05, "loss": 1.0877, "step": 76500 }, { "epoch": 1.0211835597445709, "grad_norm": 4.568294048309326, "learning_rate": 5.982651281847732e-05, "loss": 1.1923, "step": 76600 }, { "epoch": 1.0225166975510258, "grad_norm": 7.134129524230957, "learning_rate": 5.982557941748492e-05, "loss": 1.0453, "step": 76700 }, { "epoch": 1.023849835357481, "grad_norm": 8.096660614013672, "learning_rate": 5.9824643519589045e-05, "loss": 1.2296, "step": 76800 }, { "epoch": 1.025182973163936, "grad_norm": 43.836124420166016, "learning_rate": 5.982370512486802e-05, "loss": 1.0807, "step": 76900 }, { "epoch": 1.026516110970391, "grad_norm": 4.485092639923096, "learning_rate": 5.9822764233400424e-05, "loss": 1.2019, "step": 77000 }, { "epoch": 1.027849248776846, "grad_norm": 22.090463638305664, "learning_rate": 5.9821820845265005e-05, "loss": 1.1344, 
"step": 77100 }, { "epoch": 1.0291823865833012, "grad_norm": 39.70230484008789, "learning_rate": 5.982087496054076e-05, "loss": 1.2073, "step": 77200 }, { "epoch": 1.0305155243897561, "grad_norm": 12.33920955657959, "learning_rate": 5.981992657930686e-05, "loss": 1.0715, "step": 77300 }, { "epoch": 1.0318486621962113, "grad_norm": 8.832989692687988, "learning_rate": 5.981897570164271e-05, "loss": 1.0974, "step": 77400 }, { "epoch": 1.0331818000026662, "grad_norm": 6.981178283691406, "learning_rate": 5.9818022327627915e-05, "loss": 1.0758, "step": 77500 }, { "epoch": 1.0345149378091214, "grad_norm": 7.889525413513184, "learning_rate": 5.981706645734228e-05, "loss": 1.1621, "step": 77600 }, { "epoch": 1.0358480756155763, "grad_norm": 6.384875774383545, "learning_rate": 5.981610809086583e-05, "loss": 1.1755, "step": 77700 }, { "epoch": 1.0371812134220315, "grad_norm": 12.435880661010742, "learning_rate": 5.981514722827881e-05, "loss": 1.093, "step": 77800 }, { "epoch": 1.0385143512284865, "grad_norm": 10.348038673400879, "learning_rate": 5.9814183869661637e-05, "loss": 1.2996, "step": 77900 }, { "epoch": 1.0398474890349416, "grad_norm": 49.08681869506836, "learning_rate": 5.981321801509499e-05, "loss": 1.2362, "step": 78000 }, { "epoch": 1.0411806268413966, "grad_norm": 6.475674152374268, "learning_rate": 5.98122496646597e-05, "loss": 1.0717, "step": 78100 }, { "epoch": 1.0425137646478517, "grad_norm": 6.125776767730713, "learning_rate": 5.981127881843683e-05, "loss": 1.0976, "step": 78200 }, { "epoch": 1.0438469024543067, "grad_norm": 7.008230686187744, "learning_rate": 5.9810305476507694e-05, "loss": 1.0572, "step": 78300 }, { "epoch": 1.0451800402607618, "grad_norm": 14.468332290649414, "learning_rate": 5.980932963895374e-05, "loss": 1.1047, "step": 78400 }, { "epoch": 1.0465131780672168, "grad_norm": 8.05910587310791, "learning_rate": 5.980835130585668e-05, "loss": 1.1505, "step": 78500 }, { "epoch": 1.047846315873672, "grad_norm": 14.195695877075195, 
"learning_rate": 5.9807370477298414e-05, "loss": 1.1586, "step": 78600 }, { "epoch": 1.0491794536801269, "grad_norm": 6.8311991691589355, "learning_rate": 5.9806387153361056e-05, "loss": 1.103, "step": 78700 }, { "epoch": 1.050512591486582, "grad_norm": 9.17585277557373, "learning_rate": 5.980540133412692e-05, "loss": 1.1246, "step": 78800 }, { "epoch": 1.051845729293037, "grad_norm": 5.913827896118164, "learning_rate": 5.9804413019678535e-05, "loss": 1.1552, "step": 78900 }, { "epoch": 1.0531788670994922, "grad_norm": 8.89026927947998, "learning_rate": 5.9803422210098646e-05, "loss": 1.0671, "step": 79000 }, { "epoch": 1.054512004905947, "grad_norm": 35.286048889160156, "learning_rate": 5.9802428905470196e-05, "loss": 1.2117, "step": 79100 }, { "epoch": 1.0558451427124023, "grad_norm": 19.25956916809082, "learning_rate": 5.9801433105876344e-05, "loss": 1.0006, "step": 79200 }, { "epoch": 1.0571782805188572, "grad_norm": 59.562278747558594, "learning_rate": 5.9800434811400464e-05, "loss": 1.0775, "step": 79300 }, { "epoch": 1.0585114183253124, "grad_norm": 8.845512390136719, "learning_rate": 5.979943402212611e-05, "loss": 1.248, "step": 79400 }, { "epoch": 1.0598445561317673, "grad_norm": 3.4655189514160156, "learning_rate": 5.979843073813707e-05, "loss": 1.0545, "step": 79500 }, { "epoch": 1.0611776939382225, "grad_norm": 26.739925384521484, "learning_rate": 5.979742495951736e-05, "loss": 1.1393, "step": 79600 }, { "epoch": 1.0625108317446774, "grad_norm": 10.953495025634766, "learning_rate": 5.979642678143054e-05, "loss": 1.1376, "step": 79700 }, { "epoch": 1.0638439695511326, "grad_norm": 6.133086681365967, "learning_rate": 5.9795416038746455e-05, "loss": 1.0403, "step": 79800 }, { "epoch": 1.0651771073575875, "grad_norm": 6.895145893096924, "learning_rate": 5.979440280168407e-05, "loss": 0.9479, "step": 79900 }, { "epoch": 1.0665102451640427, "grad_norm": 5.534389495849609, "learning_rate": 5.979338707032819e-05, "loss": 1.0262, "step": 80000 }, { "epoch": 
1.0678433829704976, "grad_norm": 10.9481782913208, "learning_rate": 5.979236884476386e-05, "loss": 1.1373, "step": 80100 }, { "epoch": 1.0691765207769528, "grad_norm": 14.924960136413574, "learning_rate": 5.979134812507634e-05, "loss": 1.1344, "step": 80200 }, { "epoch": 1.0705096585834077, "grad_norm": 23.271244049072266, "learning_rate": 5.9790324911351054e-05, "loss": 1.21, "step": 80300 }, { "epoch": 1.071842796389863, "grad_norm": 12.470232009887695, "learning_rate": 5.978929920367368e-05, "loss": 1.151, "step": 80400 }, { "epoch": 1.0731759341963178, "grad_norm": 5.916227340698242, "learning_rate": 5.978827100213007e-05, "loss": 1.0633, "step": 80500 }, { "epoch": 1.074509072002773, "grad_norm": 11.781140327453613, "learning_rate": 5.978724030680632e-05, "loss": 1.1243, "step": 80600 }, { "epoch": 1.075842209809228, "grad_norm": 6.117124557495117, "learning_rate": 5.978620711778871e-05, "loss": 1.0663, "step": 80700 }, { "epoch": 1.0771753476156831, "grad_norm": 4.603142738342285, "learning_rate": 5.978517143516374e-05, "loss": 1.217, "step": 80800 }, { "epoch": 1.078508485422138, "grad_norm": 18.69881248474121, "learning_rate": 5.9784133259018094e-05, "loss": 1.1374, "step": 80900 }, { "epoch": 1.0798416232285932, "grad_norm": 5.884377479553223, "learning_rate": 5.978309258943871e-05, "loss": 1.1373, "step": 81000 }, { "epoch": 1.0811747610350482, "grad_norm": 4.927627086639404, "learning_rate": 5.978204942651269e-05, "loss": 1.1145, "step": 81100 }, { "epoch": 1.0825078988415033, "grad_norm": 4.394540309906006, "learning_rate": 5.9781003770327375e-05, "loss": 1.1863, "step": 81200 }, { "epoch": 1.0838410366479583, "grad_norm": 7.741486072540283, "learning_rate": 5.9779955620970306e-05, "loss": 1.1936, "step": 81300 }, { "epoch": 1.0851741744544134, "grad_norm": 62.55707931518555, "learning_rate": 5.9778904978529224e-05, "loss": 1.1778, "step": 81400 }, { "epoch": 1.0865073122608684, "grad_norm": 120.63585662841797, "learning_rate": 5.977786238678651e-05, 
"loss": 1.1825, "step": 81500 }, { "epoch": 1.0878404500673235, "grad_norm": 6.895974159240723, "learning_rate": 5.9776806783370123e-05, "loss": 1.227, "step": 81600 }, { "epoch": 1.0891735878737785, "grad_norm": 39.79244613647461, "learning_rate": 5.977574868713335e-05, "loss": 1.1472, "step": 81700 }, { "epoch": 1.0905067256802337, "grad_norm": 14.568811416625977, "learning_rate": 5.977468809816475e-05, "loss": 1.1618, "step": 81800 }, { "epoch": 1.0918398634866886, "grad_norm": 14.992047309875488, "learning_rate": 5.977362501655312e-05, "loss": 1.1696, "step": 81900 }, { "epoch": 1.0931730012931438, "grad_norm": 86.61161804199219, "learning_rate": 5.9772559442387465e-05, "loss": 1.1109, "step": 82000 }, { "epoch": 1.0945061390995987, "grad_norm": 6.827703952789307, "learning_rate": 5.977149137575698e-05, "loss": 1.0806, "step": 82100 }, { "epoch": 1.0958392769060539, "grad_norm": 7.775423049926758, "learning_rate": 5.977042081675109e-05, "loss": 1.1765, "step": 82200 }, { "epoch": 1.0971724147125088, "grad_norm": 8.117273330688477, "learning_rate": 5.97693477654594e-05, "loss": 1.2164, "step": 82300 }, { "epoch": 1.098505552518964, "grad_norm": 15.901642799377441, "learning_rate": 5.9768272221971765e-05, "loss": 1.1548, "step": 82400 }, { "epoch": 1.099838690325419, "grad_norm": 5.394028186798096, "learning_rate": 5.976719418637822e-05, "loss": 1.0764, "step": 82500 }, { "epoch": 1.101171828131874, "grad_norm": 23.029502868652344, "learning_rate": 5.976611365876901e-05, "loss": 1.0354, "step": 82600 }, { "epoch": 1.102504965938329, "grad_norm": 12.254613876342773, "learning_rate": 5.976503063923459e-05, "loss": 1.1558, "step": 82700 }, { "epoch": 1.1038381037447842, "grad_norm": 7.172773838043213, "learning_rate": 5.9763945127865636e-05, "loss": 1.1655, "step": 82800 }, { "epoch": 1.1051712415512391, "grad_norm": 9.597220420837402, "learning_rate": 5.9762857124753024e-05, "loss": 1.045, "step": 82900 }, { "epoch": 1.1065043793576943, "grad_norm": 
13.557044982910156, "learning_rate": 5.976176662998783e-05, "loss": 1.1193, "step": 83000 }, { "epoch": 1.1078375171641492, "grad_norm": 13.980875968933105, "learning_rate": 5.9760673643661344e-05, "loss": 1.2255, "step": 83100 }, { "epoch": 1.1091706549706044, "grad_norm": 64.94055938720703, "learning_rate": 5.9759578165865086e-05, "loss": 1.0811, "step": 83200 }, { "epoch": 1.1105037927770594, "grad_norm": 12.979415893554688, "learning_rate": 5.975848019669075e-05, "loss": 1.0881, "step": 83300 }, { "epoch": 1.1118369305835145, "grad_norm": 26.180320739746094, "learning_rate": 5.9757379736230264e-05, "loss": 1.1035, "step": 83400 }, { "epoch": 1.1131700683899695, "grad_norm": 8.298270225524902, "learning_rate": 5.9756287826423396e-05, "loss": 1.1983, "step": 83500 }, { "epoch": 1.1145032061964246, "grad_norm": 14.052399635314941, "learning_rate": 5.9755182408577744e-05, "loss": 1.1543, "step": 83600 }, { "epoch": 1.1158363440028796, "grad_norm": 17.602996826171875, "learning_rate": 5.975407449972202e-05, "loss": 1.1258, "step": 83700 }, { "epoch": 1.1171694818093347, "grad_norm": 5.507196426391602, "learning_rate": 5.975296409994896e-05, "loss": 1.018, "step": 83800 }, { "epoch": 1.1185026196157897, "grad_norm": 85.54991149902344, "learning_rate": 5.9751851209351544e-05, "loss": 1.151, "step": 83900 }, { "epoch": 1.1198357574222448, "grad_norm": 22.850526809692383, "learning_rate": 5.975073582802294e-05, "loss": 1.2172, "step": 84000 }, { "epoch": 1.1211688952286998, "grad_norm": 8.01451301574707, "learning_rate": 5.97496179560565e-05, "loss": 1.3195, "step": 84100 }, { "epoch": 1.122502033035155, "grad_norm": 7.175127029418945, "learning_rate": 5.9748497593545826e-05, "loss": 1.3277, "step": 84200 }, { "epoch": 1.1238351708416099, "grad_norm": 11.433351516723633, "learning_rate": 5.974737474058471e-05, "loss": 1.2127, "step": 84300 }, { "epoch": 1.1251683086480648, "grad_norm": 20.592527389526367, "learning_rate": 5.974624939726716e-05, "loss": 1.2005, "step": 
84400 }, { "epoch": 1.12650144645452, "grad_norm": 11.988605499267578, "learning_rate": 5.9745121563687364e-05, "loss": 1.126, "step": 84500 }, { "epoch": 1.1278345842609752, "grad_norm": 21.23265838623047, "learning_rate": 5.9743991239939766e-05, "loss": 1.3371, "step": 84600 }, { "epoch": 1.12916772206743, "grad_norm": 7.660402297973633, "learning_rate": 5.9742858426118984e-05, "loss": 1.1938, "step": 84700 }, { "epoch": 1.130500859873885, "grad_norm": 13.586594581604004, "learning_rate": 5.9741723122319845e-05, "loss": 1.2218, "step": 84800 }, { "epoch": 1.1318339976803402, "grad_norm": 10.450994491577148, "learning_rate": 5.974058532863741e-05, "loss": 1.1283, "step": 84900 }, { "epoch": 1.1331671354867954, "grad_norm": 8.403241157531738, "learning_rate": 5.9739445045166916e-05, "loss": 1.1461, "step": 85000 }, { "epoch": 1.1345002732932503, "grad_norm": 14.940104484558105, "learning_rate": 5.973830227200383e-05, "loss": 1.1463, "step": 85100 }, { "epoch": 1.1358334110997053, "grad_norm": 12.222570419311523, "learning_rate": 5.9737157009243817e-05, "loss": 0.9977, "step": 85200 }, { "epoch": 1.1371665489061604, "grad_norm": 7.444983005523682, "learning_rate": 5.9736009256982766e-05, "loss": 1.1017, "step": 85300 }, { "epoch": 1.1384996867126156, "grad_norm": 10.13521957397461, "learning_rate": 5.973485901531675e-05, "loss": 1.1652, "step": 85400 }, { "epoch": 1.1398328245190705, "grad_norm": 5.480467319488525, "learning_rate": 5.973370628434207e-05, "loss": 1.2374, "step": 85500 }, { "epoch": 1.1411659623255255, "grad_norm": 19.68604850769043, "learning_rate": 5.9732551064155235e-05, "loss": 1.1153, "step": 85600 }, { "epoch": 1.1424991001319806, "grad_norm": 25.908260345458984, "learning_rate": 5.973139335485294e-05, "loss": 1.2232, "step": 85700 }, { "epoch": 1.1438322379384358, "grad_norm": 11.241397857666016, "learning_rate": 5.9730233156532125e-05, "loss": 1.0847, "step": 85800 }, { "epoch": 1.1451653757448907, "grad_norm": 17.148719787597656, 
"learning_rate": 5.97290704692899e-05, "loss": 1.2133, "step": 85900 }, { "epoch": 1.1464985135513457, "grad_norm": 49.80592346191406, "learning_rate": 5.972790529322361e-05, "loss": 1.1002, "step": 86000 }, { "epoch": 1.1478316513578009, "grad_norm": 79.86223602294922, "learning_rate": 5.9726737628430806e-05, "loss": 1.1112, "step": 86100 }, { "epoch": 1.149164789164256, "grad_norm": 5.998776435852051, "learning_rate": 5.9725567475009225e-05, "loss": 1.2046, "step": 86200 }, { "epoch": 1.150497926970711, "grad_norm": 19.046226501464844, "learning_rate": 5.9724394833056847e-05, "loss": 1.1943, "step": 86300 }, { "epoch": 1.151831064777166, "grad_norm": 5.2076215744018555, "learning_rate": 5.9723219702671824e-05, "loss": 1.1504, "step": 86400 }, { "epoch": 1.153164202583621, "grad_norm": 6.994994640350342, "learning_rate": 5.9722042083952556e-05, "loss": 1.1449, "step": 86500 }, { "epoch": 1.1544973403900762, "grad_norm": 8.151749610900879, "learning_rate": 5.972086197699761e-05, "loss": 1.1437, "step": 86600 }, { "epoch": 1.1558304781965312, "grad_norm": 10.373266220092773, "learning_rate": 5.971967938190578e-05, "loss": 1.1739, "step": 86700 }, { "epoch": 1.1571636160029861, "grad_norm": 24.712182998657227, "learning_rate": 5.971850616192285e-05, "loss": 1.1091, "step": 86800 }, { "epoch": 1.1584967538094413, "grad_norm": 64.45585632324219, "learning_rate": 5.971733050351025e-05, "loss": 1.1105, "step": 86900 }, { "epoch": 1.1598298916158964, "grad_norm": 11.484139442443848, "learning_rate": 5.971614049435846e-05, "loss": 1.2912, "step": 87000 }, { "epoch": 1.1611630294223514, "grad_norm": 12.35147476196289, "learning_rate": 5.971494799746506e-05, "loss": 1.23, "step": 87100 }, { "epoch": 1.1624961672288063, "grad_norm": 60.69480895996094, "learning_rate": 5.9713753012929886e-05, "loss": 1.1051, "step": 87200 }, { "epoch": 1.1638293050352615, "grad_norm": 30.60993766784668, "learning_rate": 5.9712555540852965e-05, "loss": 1.2039, "step": 87300 }, { "epoch": 
1.1651624428417167, "grad_norm": 39.225765228271484, "learning_rate": 5.9711355581334564e-05, "loss": 1.1696, "step": 87400 }, { "epoch": 1.1664955806481716, "grad_norm": 24.108261108398438, "learning_rate": 5.9710153134475135e-05, "loss": 1.5256, "step": 87500 }, { "epoch": 1.1678287184546265, "grad_norm": 89.19821166992188, "learning_rate": 5.970894820037534e-05, "loss": 1.3537, "step": 87600 }, { "epoch": 1.1691618562610817, "grad_norm": 69.13079833984375, "learning_rate": 5.970774077913605e-05, "loss": 1.3992, "step": 87700 }, { "epoch": 1.1704949940675369, "grad_norm": 175.64361572265625, "learning_rate": 5.970653087085835e-05, "loss": 1.2896, "step": 87800 }, { "epoch": 1.1718281318739918, "grad_norm": 12.462745666503906, "learning_rate": 5.970531847564353e-05, "loss": 1.218, "step": 87900 }, { "epoch": 1.1731612696804468, "grad_norm": 7.118845462799072, "learning_rate": 5.970410359359307e-05, "loss": 1.1729, "step": 88000 }, { "epoch": 1.174494407486902, "grad_norm": 15.6696195602417, "learning_rate": 5.9702886224808706e-05, "loss": 1.391, "step": 88100 }, { "epoch": 1.175827545293357, "grad_norm": 8.417475700378418, "learning_rate": 5.970166636939234e-05, "loss": 1.0272, "step": 88200 }, { "epoch": 1.177160683099812, "grad_norm": 9.447205543518066, "learning_rate": 5.970044402744609e-05, "loss": 1.1155, "step": 88300 }, { "epoch": 1.178493820906267, "grad_norm": 8.31303596496582, "learning_rate": 5.96992191990723e-05, "loss": 1.2065, "step": 88400 }, { "epoch": 1.1798269587127221, "grad_norm": 8.019611358642578, "learning_rate": 5.9697991884373485e-05, "loss": 1.2684, "step": 88500 }, { "epoch": 1.1811600965191773, "grad_norm": 15.953227996826172, "learning_rate": 5.969676208345241e-05, "loss": 1.1435, "step": 88600 }, { "epoch": 1.1824932343256322, "grad_norm": 9.730003356933594, "learning_rate": 5.9695529796412026e-05, "loss": 1.0785, "step": 88700 }, { "epoch": 1.1838263721320872, "grad_norm": 14.623044967651367, "learning_rate": 5.96942950233555e-05, 
"loss": 1.1901, "step": 88800 }, { "epoch": 1.1851595099385424, "grad_norm": 13.701156616210938, "learning_rate": 5.9693057764386196e-05, "loss": 1.2522, "step": 88900 }, { "epoch": 1.1864926477449975, "grad_norm": 21.134876251220703, "learning_rate": 5.9691818019607694e-05, "loss": 1.1854, "step": 89000 }, { "epoch": 1.1878257855514525, "grad_norm": 8.649664878845215, "learning_rate": 5.9690588223732526e-05, "loss": 1.1092, "step": 89100 }, { "epoch": 1.1891589233579074, "grad_norm": 27.974180221557617, "learning_rate": 5.96893435325027e-05, "loss": 1.1193, "step": 89200 }, { "epoch": 1.1904920611643626, "grad_norm": 19.492082595825195, "learning_rate": 5.9688096355774625e-05, "loss": 1.0835, "step": 89300 }, { "epoch": 1.1918251989708177, "grad_norm": 6.332712650299072, "learning_rate": 5.9686846693652714e-05, "loss": 1.0741, "step": 89400 }, { "epoch": 1.1931583367772727, "grad_norm": 9.501975059509277, "learning_rate": 5.968559454624157e-05, "loss": 1.181, "step": 89500 }, { "epoch": 1.1944914745837276, "grad_norm": 24.967876434326172, "learning_rate": 5.968433991364604e-05, "loss": 1.0162, "step": 89600 }, { "epoch": 1.1958246123901828, "grad_norm": 9.475807189941406, "learning_rate": 5.968308279597113e-05, "loss": 1.2507, "step": 89700 }, { "epoch": 1.197157750196638, "grad_norm": 19.312448501586914, "learning_rate": 5.968182319332211e-05, "loss": 1.1709, "step": 89800 }, { "epoch": 1.198490888003093, "grad_norm": 5.007102012634277, "learning_rate": 5.9680561105804416e-05, "loss": 1.1828, "step": 89900 }, { "epoch": 1.1998240258095478, "grad_norm": 10.835165023803711, "learning_rate": 5.96792965335237e-05, "loss": 1.1555, "step": 90000 }, { "epoch": 1.201157163616003, "grad_norm": 87.05762481689453, "learning_rate": 5.967802947658584e-05, "loss": 1.1548, "step": 90100 }, { "epoch": 1.2024903014224582, "grad_norm": 10.361042022705078, "learning_rate": 5.96767599350969e-05, "loss": 1.2191, "step": 90200 }, { "epoch": 1.203823439228913, "grad_norm": 
13.174924850463867, "learning_rate": 5.967548790916318e-05, "loss": 1.192, "step": 90300 }, { "epoch": 1.205156577035368, "grad_norm": 10.095010757446289, "learning_rate": 5.967421339889114e-05, "loss": 1.1073, "step": 90400 }, { "epoch": 1.2064897148418232, "grad_norm": 28.42295265197754, "learning_rate": 5.967293640438751e-05, "loss": 1.0767, "step": 90500 }, { "epoch": 1.2078228526482782, "grad_norm": 5.021069526672363, "learning_rate": 5.9671656925759184e-05, "loss": 1.175, "step": 90600 }, { "epoch": 1.2091559904547333, "grad_norm": 5.682706832885742, "learning_rate": 5.967037496311326e-05, "loss": 1.0716, "step": 90700 }, { "epoch": 1.2104891282611883, "grad_norm": 8.482327461242676, "learning_rate": 5.966909051655708e-05, "loss": 1.0824, "step": 90800 }, { "epoch": 1.2118222660676434, "grad_norm": 17.09185028076172, "learning_rate": 5.966780358619816e-05, "loss": 1.1872, "step": 90900 }, { "epoch": 1.2131554038740984, "grad_norm": 14.372496604919434, "learning_rate": 5.966651417214426e-05, "loss": 1.0806, "step": 91000 }, { "epoch": 1.2144885416805535, "grad_norm": 8.512094497680664, "learning_rate": 5.96652222745033e-05, "loss": 1.0649, "step": 91100 }, { "epoch": 1.2158216794870085, "grad_norm": 257.15118408203125, "learning_rate": 5.966392789338344e-05, "loss": 1.2166, "step": 91200 }, { "epoch": 1.2171548172934636, "grad_norm": 7.6355299949646, "learning_rate": 5.966263102889306e-05, "loss": 1.1661, "step": 91300 }, { "epoch": 1.2184879550999186, "grad_norm": 9.234601020812988, "learning_rate": 5.966133168114071e-05, "loss": 1.0463, "step": 91400 }, { "epoch": 1.2198210929063737, "grad_norm": 9.163500785827637, "learning_rate": 5.966002985023516e-05, "loss": 1.1113, "step": 91500 }, { "epoch": 1.2211542307128287, "grad_norm": 16.569820404052734, "learning_rate": 5.965872553628541e-05, "loss": 1.125, "step": 91600 }, { "epoch": 1.2224873685192839, "grad_norm": 12.858329772949219, "learning_rate": 5.965741873940066e-05, "loss": 1.1246, "step": 91700 }, { 
"epoch": 1.2238205063257388, "grad_norm": 11.406340599060059, "learning_rate": 5.96561094596903e-05, "loss": 1.2298, "step": 91800 }, { "epoch": 1.225153644132194, "grad_norm": 11.301819801330566, "learning_rate": 5.965479769726394e-05, "loss": 1.0913, "step": 91900 }, { "epoch": 1.226486781938649, "grad_norm": 14.881637573242188, "learning_rate": 5.9653483452231386e-05, "loss": 1.1278, "step": 92000 }, { "epoch": 1.227819919745104, "grad_norm": 6.801451683044434, "learning_rate": 5.9652166724702685e-05, "loss": 1.1943, "step": 92100 }, { "epoch": 1.229153057551559, "grad_norm": 14.308677673339844, "learning_rate": 5.965086071917465e-05, "loss": 1.0761, "step": 92200 }, { "epoch": 1.2304861953580142, "grad_norm": 10.600700378417969, "learning_rate": 5.9649539051806736e-05, "loss": 1.1678, "step": 92300 }, { "epoch": 1.2318193331644691, "grad_norm": 8.916555404663086, "learning_rate": 5.9648214902272886e-05, "loss": 1.0823, "step": 92400 }, { "epoch": 1.2331524709709243, "grad_norm": 43.76609420776367, "learning_rate": 5.9646888270683934e-05, "loss": 1.1437, "step": 92500 }, { "epoch": 1.2344856087773792, "grad_norm": 55.19985580444336, "learning_rate": 5.9645559157150956e-05, "loss": 1.0651, "step": 92600 }, { "epoch": 1.2358187465838344, "grad_norm": 52.70887756347656, "learning_rate": 5.964422756178522e-05, "loss": 1.2368, "step": 92700 }, { "epoch": 1.2371518843902893, "grad_norm": 4.696473598480225, "learning_rate": 5.964289348469819e-05, "loss": 1.1394, "step": 92800 }, { "epoch": 1.2384850221967445, "grad_norm": 10.68298625946045, "learning_rate": 5.9641570303872134e-05, "loss": 0.9723, "step": 92900 }, { "epoch": 1.2398181600031994, "grad_norm": 6.748388767242432, "learning_rate": 5.964023128849222e-05, "loss": 1.0326, "step": 93000 }, { "epoch": 1.2411512978096546, "grad_norm": 38.602176666259766, "learning_rate": 5.9638889791725584e-05, "loss": 0.9343, "step": 93100 }, { "epoch": 1.2424844356161096, "grad_norm": 13.165366172790527, "learning_rate": 
5.963754581368451e-05, "loss": 1.1277, "step": 93200 }, { "epoch": 1.2438175734225647, "grad_norm": 8.655121803283691, "learning_rate": 5.963619935448154e-05, "loss": 1.0844, "step": 93300 }, { "epoch": 1.2451507112290197, "grad_norm": 7.568240642547607, "learning_rate": 5.963485041422938e-05, "loss": 1.0711, "step": 93400 }, { "epoch": 1.2464838490354748, "grad_norm": 18.06538963317871, "learning_rate": 5.9633498993040955e-05, "loss": 1.1184, "step": 93500 }, { "epoch": 1.2478169868419298, "grad_norm": 17.788007736206055, "learning_rate": 5.9632145091029415e-05, "loss": 1.0776, "step": 93600 }, { "epoch": 1.249150124648385, "grad_norm": 2.112546682357788, "learning_rate": 5.963078870830809e-05, "loss": 1.0985, "step": 93700 }, { "epoch": 1.25048326245484, "grad_norm": 4.808931827545166, "learning_rate": 5.962942984499054e-05, "loss": 1.1194, "step": 93800 }, { "epoch": 1.251816400261295, "grad_norm": 10.860662460327148, "learning_rate": 5.9628068501190525e-05, "loss": 1.0984, "step": 93900 }, { "epoch": 1.25314953806775, "grad_norm": 10.913837432861328, "learning_rate": 5.9626704677022004e-05, "loss": 1.098, "step": 94000 }, { "epoch": 1.2544826758742051, "grad_norm": 8.450738906860352, "learning_rate": 5.962533837259917e-05, "loss": 1.2049, "step": 94100 }, { "epoch": 1.25581581368066, "grad_norm": 11.124272346496582, "learning_rate": 5.9623969588036387e-05, "loss": 1.0541, "step": 94200 }, { "epoch": 1.2571489514871153, "grad_norm": 36.3824462890625, "learning_rate": 5.9622598323448254e-05, "loss": 1.1485, "step": 94300 }, { "epoch": 1.2584820892935702, "grad_norm": 7.29681396484375, "learning_rate": 5.9621224578949564e-05, "loss": 1.0146, "step": 94400 }, { "epoch": 1.2598152271000254, "grad_norm": 41.95053482055664, "learning_rate": 5.961984835465534e-05, "loss": 1.1078, "step": 94500 }, { "epoch": 1.2611483649064803, "grad_norm": 6.588904857635498, "learning_rate": 5.9618469650680776e-05, "loss": 1.1371, "step": 94600 }, { "epoch": 1.2624815027129355, 
"grad_norm": 5.675909996032715, "learning_rate": 5.961708846714129e-05, "loss": 1.1163, "step": 94700 }, { "epoch": 1.2638146405193904, "grad_norm": 12.247416496276855, "learning_rate": 5.9615704804152524e-05, "loss": 1.1426, "step": 94800 }, { "epoch": 1.2651477783258456, "grad_norm": 20.34516716003418, "learning_rate": 5.961431866183031e-05, "loss": 1.0999, "step": 94900 }, { "epoch": 1.2664809161323005, "grad_norm": 15.410736083984375, "learning_rate": 5.961293004029069e-05, "loss": 1.15, "step": 95000 }, { "epoch": 1.2678140539387557, "grad_norm": 21.05791473388672, "learning_rate": 5.961153893964991e-05, "loss": 1.0859, "step": 95100 }, { "epoch": 1.2691471917452106, "grad_norm": 5.355621337890625, "learning_rate": 5.961014536002443e-05, "loss": 1.1336, "step": 95200 }, { "epoch": 1.2704803295516658, "grad_norm": 6.569826126098633, "learning_rate": 5.9608749301530934e-05, "loss": 1.0631, "step": 95300 }, { "epoch": 1.2718134673581207, "grad_norm": 25.660701751708984, "learning_rate": 5.9607350764286276e-05, "loss": 1.0433, "step": 95400 }, { "epoch": 1.273146605164576, "grad_norm": 6.251875400543213, "learning_rate": 5.960594974840754e-05, "loss": 0.9898, "step": 95500 }, { "epoch": 1.2744797429710308, "grad_norm": 12.132789611816406, "learning_rate": 5.9604546254012015e-05, "loss": 1.077, "step": 95600 }, { "epoch": 1.275812880777486, "grad_norm": 1.1855781078338623, "learning_rate": 5.96031402812172e-05, "loss": 1.0905, "step": 95700 }, { "epoch": 1.277146018583941, "grad_norm": 29.6900577545166, "learning_rate": 5.960173183014081e-05, "loss": 1.0314, "step": 95800 }, { "epoch": 1.278479156390396, "grad_norm": 5.994139194488525, "learning_rate": 5.9600320900900726e-05, "loss": 1.0915, "step": 95900 }, { "epoch": 1.279812294196851, "grad_norm": 4.595454216003418, "learning_rate": 5.9598907493615086e-05, "loss": 1.007, "step": 96000 }, { "epoch": 1.2811454320033062, "grad_norm": 4.081498146057129, "learning_rate": 5.9597491608402226e-05, "loss": 1.0308, 
"step": 96100 }, { "epoch": 1.2824785698097612, "grad_norm": 7.857297897338867, "learning_rate": 5.959607324538066e-05, "loss": 1.0676, "step": 96200 }, { "epoch": 1.2838117076162163, "grad_norm": 55.12851333618164, "learning_rate": 5.959465240466914e-05, "loss": 1.151, "step": 96300 }, { "epoch": 1.2851448454226713, "grad_norm": 16.752140045166016, "learning_rate": 5.959322908638661e-05, "loss": 1.0872, "step": 96400 }, { "epoch": 1.2864779832291264, "grad_norm": 5.453320026397705, "learning_rate": 5.959180329065223e-05, "loss": 1.059, "step": 96500 }, { "epoch": 1.2878111210355814, "grad_norm": 5.895717144012451, "learning_rate": 5.9590375017585356e-05, "loss": 0.9952, "step": 96600 }, { "epoch": 1.2891442588420365, "grad_norm": 9.95903205871582, "learning_rate": 5.958894426730556e-05, "loss": 1.0826, "step": 96700 }, { "epoch": 1.2904773966484915, "grad_norm": 18.055761337280273, "learning_rate": 5.9587511039932625e-05, "loss": 1.0814, "step": 96800 }, { "epoch": 1.2918105344549466, "grad_norm": 7.015773773193359, "learning_rate": 5.958607533558654e-05, "loss": 1.0695, "step": 96900 }, { "epoch": 1.2931436722614016, "grad_norm": 9.599479675292969, "learning_rate": 5.958463715438748e-05, "loss": 1.051, "step": 97000 }, { "epoch": 1.2944768100678568, "grad_norm": 7.115015983581543, "learning_rate": 5.958319649645587e-05, "loss": 1.0673, "step": 97100 }, { "epoch": 1.2958099478743117, "grad_norm": 12.266501426696777, "learning_rate": 5.9581753361912287e-05, "loss": 0.9936, "step": 97200 }, { "epoch": 1.2971430856807669, "grad_norm": 9.466108322143555, "learning_rate": 5.958030775087758e-05, "loss": 1.1882, "step": 97300 }, { "epoch": 1.2984762234872218, "grad_norm": 9.455041885375977, "learning_rate": 5.9578859663472736e-05, "loss": 1.1034, "step": 97400 }, { "epoch": 1.299809361293677, "grad_norm": 10.310709953308105, "learning_rate": 5.957740909981901e-05, "loss": 1.0699, "step": 97500 }, { "epoch": 1.301142499100132, "grad_norm": 35.62505340576172, 
"learning_rate": 5.957595606003783e-05, "loss": 1.1762, "step": 97600 }, { "epoch": 1.302475636906587, "grad_norm": 9.3591947555542, "learning_rate": 5.9574500544250846e-05, "loss": 1.1159, "step": 97700 }, { "epoch": 1.303808774713042, "grad_norm": 7.033812999725342, "learning_rate": 5.957304255257989e-05, "loss": 1.111, "step": 97800 }, { "epoch": 1.3051419125194972, "grad_norm": 36.7170295715332, "learning_rate": 5.9571582085147047e-05, "loss": 1.1416, "step": 97900 }, { "epoch": 1.3064750503259521, "grad_norm": 6.488825798034668, "learning_rate": 5.9570119142074564e-05, "loss": 1.0889, "step": 98000 }, { "epoch": 1.3078081881324073, "grad_norm": 5.459531307220459, "learning_rate": 5.9568653723484914e-05, "loss": 1.0607, "step": 98100 }, { "epoch": 1.3091413259388622, "grad_norm": 6.747074127197266, "learning_rate": 5.956718582950079e-05, "loss": 1.1602, "step": 98200 }, { "epoch": 1.3104744637453174, "grad_norm": 4.92739725112915, "learning_rate": 5.956571546024507e-05, "loss": 1.0752, "step": 98300 }, { "epoch": 1.3118076015517723, "grad_norm": 8.583707809448242, "learning_rate": 5.956424261584085e-05, "loss": 1.1244, "step": 98400 }, { "epoch": 1.3131407393582275, "grad_norm": 12.54875659942627, "learning_rate": 5.956276729641143e-05, "loss": 1.1041, "step": 98500 }, { "epoch": 1.3144738771646824, "grad_norm": 18.31597328186035, "learning_rate": 5.956128950208032e-05, "loss": 1.138, "step": 98600 }, { "epoch": 1.3158070149711376, "grad_norm": 16.37530517578125, "learning_rate": 5.9559824047912084e-05, "loss": 1.0047, "step": 98700 }, { "epoch": 1.3171401527775926, "grad_norm": 14.326208114624023, "learning_rate": 5.955834132889488e-05, "loss": 1.0532, "step": 98800 }, { "epoch": 1.3184732905840477, "grad_norm": 6.43242883682251, "learning_rate": 5.955685613534652e-05, "loss": 1.1611, "step": 98900 }, { "epoch": 1.3198064283905027, "grad_norm": 14.197293281555176, "learning_rate": 5.9555368467391345e-05, "loss": 1.0645, "step": 99000 }, { "epoch": 
1.3211395661969578, "grad_norm": 4.43160343170166, "learning_rate": 5.955387832515388e-05, "loss": 1.1049, "step": 99100 }, { "epoch": 1.3224727040034128, "grad_norm": 11.253840446472168, "learning_rate": 5.955238570875888e-05, "loss": 1.1079, "step": 99200 }, { "epoch": 1.323805841809868, "grad_norm": 19.723247528076172, "learning_rate": 5.9550890618331306e-05, "loss": 1.2108, "step": 99300 }, { "epoch": 1.3251389796163229, "grad_norm": 15.948836326599121, "learning_rate": 5.954939305399633e-05, "loss": 1.1553, "step": 99400 }, { "epoch": 1.326472117422778, "grad_norm": 7.728938579559326, "learning_rate": 5.954789301587931e-05, "loss": 1.0125, "step": 99500 }, { "epoch": 1.327805255229233, "grad_norm": 34.5762825012207, "learning_rate": 5.9546390504105827e-05, "loss": 1.101, "step": 99600 }, { "epoch": 1.3291383930356881, "grad_norm": 10.199736595153809, "learning_rate": 5.954488551880167e-05, "loss": 1.0391, "step": 99700 }, { "epoch": 1.330471530842143, "grad_norm": 16.531017303466797, "learning_rate": 5.954337806009284e-05, "loss": 1.0418, "step": 99800 }, { "epoch": 1.3318046686485983, "grad_norm": 7.74006462097168, "learning_rate": 5.954186812810551e-05, "loss": 1.0412, "step": 99900 }, { "epoch": 1.3331378064550532, "grad_norm": 6.747781276702881, "learning_rate": 5.954035572296612e-05, "loss": 1.0914, "step": 100000 }, { "epoch": 1.3344709442615084, "grad_norm": 5.482439041137695, "learning_rate": 5.953884084480127e-05, "loss": 1.0737, "step": 100100 }, { "epoch": 1.3358040820679633, "grad_norm": 4.91019344329834, "learning_rate": 5.953732349373776e-05, "loss": 1.0709, "step": 100200 }, { "epoch": 1.3371372198744185, "grad_norm": 12.846415519714355, "learning_rate": 5.953580366990266e-05, "loss": 1.1289, "step": 100300 }, { "epoch": 1.3384703576808734, "grad_norm": 6.251044750213623, "learning_rate": 5.953428137342317e-05, "loss": 1.0432, "step": 100400 }, { "epoch": 1.3398034954873286, "grad_norm": 18.414371490478516, "learning_rate": 
5.953275660442674e-05, "loss": 1.0549, "step": 100500 }, { "epoch": 1.3411366332937835, "grad_norm": 15.416671752929688, "learning_rate": 5.953122936304103e-05, "loss": 1.0576, "step": 100600 }, { "epoch": 1.3424697711002387, "grad_norm": 10.332579612731934, "learning_rate": 5.952969964939388e-05, "loss": 1.0431, "step": 100700 }, { "epoch": 1.3438029089066936, "grad_norm": 13.414596557617188, "learning_rate": 5.952816746361336e-05, "loss": 1.0754, "step": 100800 }, { "epoch": 1.3451360467131488, "grad_norm": 7.047237873077393, "learning_rate": 5.952663280582775e-05, "loss": 1.0553, "step": 100900 }, { "epoch": 1.3464691845196037, "grad_norm": 6.142723560333252, "learning_rate": 5.952509567616552e-05, "loss": 1.0158, "step": 101000 }, { "epoch": 1.347802322326059, "grad_norm": 14.631631851196289, "learning_rate": 5.952355607475534e-05, "loss": 1.0108, "step": 101100 }, { "epoch": 1.3491354601325138, "grad_norm": 20.227874755859375, "learning_rate": 5.952201400172612e-05, "loss": 1.0293, "step": 101200 }, { "epoch": 1.350468597938969, "grad_norm": 7.2135844230651855, "learning_rate": 5.952046945720695e-05, "loss": 1.0706, "step": 101300 }, { "epoch": 1.351801735745424, "grad_norm": 4.825830459594727, "learning_rate": 5.951892244132712e-05, "loss": 1.0697, "step": 101400 }, { "epoch": 1.3531348735518791, "grad_norm": 5.542803764343262, "learning_rate": 5.951737295421617e-05, "loss": 1.061, "step": 101500 }, { "epoch": 1.354468011358334, "grad_norm": 5.583561897277832, "learning_rate": 5.9515820996003795e-05, "loss": 1.0115, "step": 101600 }, { "epoch": 1.3558011491647892, "grad_norm": 12.75886058807373, "learning_rate": 5.9514266566819934e-05, "loss": 0.9828, "step": 101700 }, { "epoch": 1.3571342869712442, "grad_norm": 10.263659477233887, "learning_rate": 5.9512709666794705e-05, "loss": 0.9562, "step": 101800 }, { "epoch": 1.3584674247776993, "grad_norm": 3.6471455097198486, "learning_rate": 5.951115029605846e-05, "loss": 1.0454, "step": 101900 }, { "epoch": 
1.3598005625841543, "grad_norm": 23.046833038330078, "learning_rate": 5.9509588454741744e-05, "loss": 1.0531, "step": 102000 }, { "epoch": 1.3611337003906094, "grad_norm": 12.010133743286133, "learning_rate": 5.950802414297529e-05, "loss": 0.9612, "step": 102100 }, { "epoch": 1.3624668381970644, "grad_norm": 13.14553165435791, "learning_rate": 5.950645736089008e-05, "loss": 1.0065, "step": 102200 }, { "epoch": 1.3637999760035195, "grad_norm": 27.47277069091797, "learning_rate": 5.950488810861728e-05, "loss": 1.0111, "step": 102300 }, { "epoch": 1.3651331138099745, "grad_norm": 36.932945251464844, "learning_rate": 5.950331638628825e-05, "loss": 0.9374, "step": 102400 }, { "epoch": 1.3664662516164297, "grad_norm": 16.070438385009766, "learning_rate": 5.9501742194034575e-05, "loss": 1.1139, "step": 102500 }, { "epoch": 1.3677993894228846, "grad_norm": 10.032145500183105, "learning_rate": 5.950016553198804e-05, "loss": 0.926, "step": 102600 }, { "epoch": 1.3691325272293398, "grad_norm": 6.505300045013428, "learning_rate": 5.9498586400280634e-05, "loss": 0.9361, "step": 102700 }, { "epoch": 1.3704656650357947, "grad_norm": 26.68906593322754, "learning_rate": 5.9497004799044574e-05, "loss": 0.9939, "step": 102800 }, { "epoch": 1.3717988028422499, "grad_norm": 10.683196067810059, "learning_rate": 5.949542072841224e-05, "loss": 1.1002, "step": 102900 }, { "epoch": 1.3731319406487048, "grad_norm": 9.935720443725586, "learning_rate": 5.949383418851627e-05, "loss": 1.0392, "step": 103000 }, { "epoch": 1.37446507845516, "grad_norm": 48.03205108642578, "learning_rate": 5.9492245179489476e-05, "loss": 1.0051, "step": 103100 }, { "epoch": 1.375798216261615, "grad_norm": 11.364152908325195, "learning_rate": 5.9490653701464875e-05, "loss": 1.0637, "step": 103200 }, { "epoch": 1.37713135406807, "grad_norm": 11.402976989746094, "learning_rate": 5.948905975457571e-05, "loss": 1.0095, "step": 103300 }, { "epoch": 1.378464491874525, "grad_norm": 32.750091552734375, "learning_rate": 
5.948746333895543e-05, "loss": 1.0515, "step": 103400 }, { "epoch": 1.3797976296809802, "grad_norm": 13.945265769958496, "learning_rate": 5.9485864454737655e-05, "loss": 0.9521, "step": 103500 }, { "epoch": 1.3811307674874351, "grad_norm": 7.47422981262207, "learning_rate": 5.948426310205626e-05, "loss": 1.1099, "step": 103600 }, { "epoch": 1.3824639052938903, "grad_norm": 13.826478004455566, "learning_rate": 5.9482659281045315e-05, "loss": 1.0749, "step": 103700 }, { "epoch": 1.3837970431003452, "grad_norm": 3.900627613067627, "learning_rate": 5.948105299183906e-05, "loss": 0.9359, "step": 103800 }, { "epoch": 1.3851301809068004, "grad_norm": 10.384955406188965, "learning_rate": 5.9479444234571984e-05, "loss": 0.9183, "step": 103900 }, { "epoch": 1.3864633187132553, "grad_norm": 10.176738739013672, "learning_rate": 5.947783300937877e-05, "loss": 1.0107, "step": 104000 }, { "epoch": 1.3877964565197105, "grad_norm": 7.854725360870361, "learning_rate": 5.9476219316394294e-05, "loss": 0.9843, "step": 104100 }, { "epoch": 1.3891295943261655, "grad_norm": 4.834753513336182, "learning_rate": 5.947460315575364e-05, "loss": 1.0575, "step": 104200 }, { "epoch": 1.3904627321326206, "grad_norm": 8.290459632873535, "learning_rate": 5.947298452759214e-05, "loss": 0.9931, "step": 104300 }, { "epoch": 1.3917958699390756, "grad_norm": 7.256165981292725, "learning_rate": 5.947136343204527e-05, "loss": 1.0032, "step": 104400 }, { "epoch": 1.3931290077455307, "grad_norm": 9.241236686706543, "learning_rate": 5.946973986924877e-05, "loss": 1.1175, "step": 104500 }, { "epoch": 1.3944621455519857, "grad_norm": 7.445105075836182, "learning_rate": 5.946811383933854e-05, "loss": 1.0531, "step": 104600 }, { "epoch": 1.3957952833584408, "grad_norm": 17.064208984375, "learning_rate": 5.94664853424507e-05, "loss": 1.0677, "step": 104700 }, { "epoch": 1.3971284211648958, "grad_norm": 12.49902629852295, "learning_rate": 5.946487070056931e-05, "loss": 0.9475, "step": 104800 }, { "epoch": 
1.398461558971351, "grad_norm": 11.316535949707031, "learning_rate": 5.9463237294801846e-05, "loss": 1.0323, "step": 104900 }, { "epoch": 1.3997946967778059, "grad_norm": 7.919010162353516, "learning_rate": 5.9461601422465044e-05, "loss": 1.0925, "step": 105000 }, { "epoch": 1.401127834584261, "grad_norm": 8.572054862976074, "learning_rate": 5.945997947929191e-05, "loss": 1.0464, "step": 105100 }, { "epoch": 1.402460972390716, "grad_norm": 19.22196388244629, "learning_rate": 5.945833869888974e-05, "loss": 1.1285, "step": 105200 }, { "epoch": 1.4037941101971712, "grad_norm": 6.494006156921387, "learning_rate": 5.945669545232831e-05, "loss": 1.0668, "step": 105300 }, { "epoch": 1.405127248003626, "grad_norm": 27.938323974609375, "learning_rate": 5.945504973974519e-05, "loss": 1.0761, "step": 105400 }, { "epoch": 1.4064603858100813, "grad_norm": 46.21061706542969, "learning_rate": 5.9453401561278155e-05, "loss": 1.0621, "step": 105500 }, { "epoch": 1.4077935236165362, "grad_norm": 46.19841003417969, "learning_rate": 5.9451750917065186e-05, "loss": 1.0265, "step": 105600 }, { "epoch": 1.4091266614229914, "grad_norm": 2.1678552627563477, "learning_rate": 5.945009780724447e-05, "loss": 1.1049, "step": 105700 }, { "epoch": 1.4104597992294463, "grad_norm": 12.385648727416992, "learning_rate": 5.94484422319544e-05, "loss": 1.1363, "step": 105800 }, { "epoch": 1.4117929370359015, "grad_norm": 11.583638191223145, "learning_rate": 5.944678419133359e-05, "loss": 1.1209, "step": 105900 }, { "epoch": 1.4131260748423564, "grad_norm": 13.737213134765625, "learning_rate": 5.9445140302781184e-05, "loss": 1.0362, "step": 106000 }, { "epoch": 1.4144592126488116, "grad_norm": 9.168383598327637, "learning_rate": 5.944347735656532e-05, "loss": 1.0752, "step": 106100 }, { "epoch": 1.4157923504552665, "grad_norm": 24.02765655517578, "learning_rate": 5.9441811945434344e-05, "loss": 1.1591, "step": 106200 }, { "epoch": 1.4171254882617217, "grad_norm": 8.052775382995605, "learning_rate": 
5.944014406952768e-05, "loss": 1.0787, "step": 106300 }, { "epoch": 1.4184586260681766, "grad_norm": 8.447908401489258, "learning_rate": 5.943847372898496e-05, "loss": 1.1458, "step": 106400 }, { "epoch": 1.4197917638746318, "grad_norm": 6.011082649230957, "learning_rate": 5.9436800923946005e-05, "loss": 0.963, "step": 106500 }, { "epoch": 1.4211249016810867, "grad_norm": 6.499388694763184, "learning_rate": 5.943512565455089e-05, "loss": 1.1028, "step": 106600 }, { "epoch": 1.422458039487542, "grad_norm": 3.1753487586975098, "learning_rate": 5.9433447920939826e-05, "loss": 1.0096, "step": 106700 }, { "epoch": 1.4237911772939968, "grad_norm": 6.0038580894470215, "learning_rate": 5.94317677232533e-05, "loss": 0.9726, "step": 106800 }, { "epoch": 1.425124315100452, "grad_norm": 5.237049102783203, "learning_rate": 5.943008506163194e-05, "loss": 1.0095, "step": 106900 }, { "epoch": 1.426457452906907, "grad_norm": 9.17286205291748, "learning_rate": 5.942839993621663e-05, "loss": 1.0187, "step": 107000 }, { "epoch": 1.4277905907133621, "grad_norm": 2.5712249279022217, "learning_rate": 5.942671234714846e-05, "loss": 1.0707, "step": 107100 }, { "epoch": 1.429123728519817, "grad_norm": 12.270254135131836, "learning_rate": 5.942502229456869e-05, "loss": 0.9729, "step": 107200 }, { "epoch": 1.4304568663262722, "grad_norm": 31.013320922851562, "learning_rate": 5.94233297786188e-05, "loss": 1.1013, "step": 107300 }, { "epoch": 1.4317900041327272, "grad_norm": 11.448019981384277, "learning_rate": 5.94216347994405e-05, "loss": 1.1038, "step": 107400 }, { "epoch": 1.4331231419391823, "grad_norm": 9.818227767944336, "learning_rate": 5.941993735717567e-05, "loss": 1.1272, "step": 107500 }, { "epoch": 1.4344562797456373, "grad_norm": 14.170889854431152, "learning_rate": 5.941823745196643e-05, "loss": 1.0778, "step": 107600 }, { "epoch": 1.4357894175520924, "grad_norm": 32.21810531616211, "learning_rate": 5.9416535083955085e-05, "loss": 1.0801, "step": 107700 }, { "epoch": 
1.4371225553585474, "grad_norm": 7.739402770996094, "learning_rate": 5.941483025328415e-05, "loss": 0.959, "step": 107800 }, { "epoch": 1.4384556931650025, "grad_norm": 8.357564926147461, "learning_rate": 5.941312296009635e-05, "loss": 1.0038, "step": 107900 }, { "epoch": 1.4397888309714575, "grad_norm": 6.557303428649902, "learning_rate": 5.941141320453461e-05, "loss": 0.9619, "step": 108000 }, { "epoch": 1.4411219687779127, "grad_norm": 11.983556747436523, "learning_rate": 5.940970098674208e-05, "loss": 0.9894, "step": 108100 }, { "epoch": 1.4424551065843676, "grad_norm": 29.802654266357422, "learning_rate": 5.9407986306862086e-05, "loss": 1.0703, "step": 108200 }, { "epoch": 1.4437882443908228, "grad_norm": 12.99390983581543, "learning_rate": 5.940626916503818e-05, "loss": 1.0136, "step": 108300 }, { "epoch": 1.4451213821972777, "grad_norm": 9.472550392150879, "learning_rate": 5.940454956141411e-05, "loss": 1.0192, "step": 108400 }, { "epoch": 1.4464545200037329, "grad_norm": 43.41335678100586, "learning_rate": 5.940282749613385e-05, "loss": 1.0397, "step": 108500 }, { "epoch": 1.4477876578101878, "grad_norm": 9.609247207641602, "learning_rate": 5.940110296934155e-05, "loss": 0.9629, "step": 108600 }, { "epoch": 1.449120795616643, "grad_norm": 16.279809951782227, "learning_rate": 5.93993759811816e-05, "loss": 1.0284, "step": 108700 }, { "epoch": 1.450453933423098, "grad_norm": 17.665515899658203, "learning_rate": 5.9397663838474975e-05, "loss": 1.0533, "step": 108800 }, { "epoch": 1.4517870712295529, "grad_norm": 11.412491798400879, "learning_rate": 5.939593195262369e-05, "loss": 1.0371, "step": 108900 }, { "epoch": 1.453120209036008, "grad_norm": 6.680787563323975, "learning_rate": 5.9394197605837666e-05, "loss": 0.9765, "step": 109000 }, { "epoch": 1.4544533468424632, "grad_norm": 4.241756439208984, "learning_rate": 5.939246079826207e-05, "loss": 1.0515, "step": 109100 }, { "epoch": 1.4557864846489181, "grad_norm": 3.358441114425659, "learning_rate": 
5.939073893490422e-05, "loss": 1.1082, "step": 109200 }, { "epoch": 1.457119622455373, "grad_norm": 11.756570816040039, "learning_rate": 5.938899723079017e-05, "loss": 1.0674, "step": 109300 }, { "epoch": 1.4584527602618282, "grad_norm": 17.21695899963379, "learning_rate": 5.9387253066321905e-05, "loss": 1.1222, "step": 109400 }, { "epoch": 1.4597858980682834, "grad_norm": 6.199847221374512, "learning_rate": 5.938550644164546e-05, "loss": 1.1232, "step": 109500 }, { "epoch": 1.4611190358747383, "grad_norm": 8.668452262878418, "learning_rate": 5.938375735690704e-05, "loss": 1.2362, "step": 109600 }, { "epoch": 1.4624521736811933, "grad_norm": 37.31698989868164, "learning_rate": 5.938200581225308e-05, "loss": 1.051, "step": 109700 }, { "epoch": 1.4637853114876485, "grad_norm": 14.700597763061523, "learning_rate": 5.938025180783021e-05, "loss": 0.9401, "step": 109800 }, { "epoch": 1.4651184492941036, "grad_norm": 22.791461944580078, "learning_rate": 5.9378495343785276e-05, "loss": 1.1171, "step": 109900 }, { "epoch": 1.4664515871005586, "grad_norm": 5.426826000213623, "learning_rate": 5.9376736420265326e-05, "loss": 1.0133, "step": 110000 }, { "epoch": 1.4677847249070135, "grad_norm": 6.6110968589782715, "learning_rate": 5.9374975037417604e-05, "loss": 1.0546, "step": 110100 }, { "epoch": 1.4691178627134687, "grad_norm": 11.130767822265625, "learning_rate": 5.937321119538957e-05, "loss": 1.0875, "step": 110200 }, { "epoch": 1.4704510005199238, "grad_norm": 52.09043502807617, "learning_rate": 5.937144489432888e-05, "loss": 1.0107, "step": 110300 }, { "epoch": 1.4717841383263788, "grad_norm": 7.2424187660217285, "learning_rate": 5.936967613438341e-05, "loss": 1.0496, "step": 110400 }, { "epoch": 1.4731172761328337, "grad_norm": 8.891135215759277, "learning_rate": 5.936790491570125e-05, "loss": 1.0397, "step": 110500 }, { "epoch": 1.4744504139392889, "grad_norm": 266.9293212890625, "learning_rate": 5.936613123843065e-05, "loss": 1.1948, "step": 110600 }, { "epoch": 
1.475783551745744, "grad_norm": 4.275050163269043, "learning_rate": 5.9364355102720127e-05, "loss": 1.1732, "step": 110700 }, { "epoch": 1.477116689552199, "grad_norm": 5.300197124481201, "learning_rate": 5.9362576508718346e-05, "loss": 1.0814, "step": 110800 }, { "epoch": 1.478449827358654, "grad_norm": 33.179222106933594, "learning_rate": 5.936079545657423e-05, "loss": 1.0741, "step": 110900 }, { "epoch": 1.479782965165109, "grad_norm": 20.252885818481445, "learning_rate": 5.935901194643687e-05, "loss": 1.0465, "step": 111000 }, { "epoch": 1.4811161029715643, "grad_norm": 21.4619197845459, "learning_rate": 5.935722597845557e-05, "loss": 1.0736, "step": 111100 }, { "epoch": 1.4824492407780192, "grad_norm": 8.79926872253418, "learning_rate": 5.935543755277985e-05, "loss": 1.1706, "step": 111200 }, { "epoch": 1.4837823785844741, "grad_norm": 4.494112014770508, "learning_rate": 5.9353646669559446e-05, "loss": 1.2276, "step": 111300 }, { "epoch": 1.4851155163909293, "grad_norm": 9.307448387145996, "learning_rate": 5.9351853328944264e-05, "loss": 1.1154, "step": 111400 }, { "epoch": 1.4864486541973845, "grad_norm": 3.7440741062164307, "learning_rate": 5.9350057531084456e-05, "loss": 1.0605, "step": 111500 }, { "epoch": 1.4877817920038394, "grad_norm": 58.2625732421875, "learning_rate": 5.9348259276130346e-05, "loss": 1.1034, "step": 111600 }, { "epoch": 1.4891149298102944, "grad_norm": 10.819257736206055, "learning_rate": 5.934645856423249e-05, "loss": 1.0323, "step": 111700 }, { "epoch": 1.4904480676167495, "grad_norm": 8.776371002197266, "learning_rate": 5.9344655395541616e-05, "loss": 1.069, "step": 111800 }, { "epoch": 1.4917812054232047, "grad_norm": 10.403406143188477, "learning_rate": 5.93428497702087e-05, "loss": 1.0767, "step": 111900 }, { "epoch": 1.4931143432296596, "grad_norm": 14.795417785644531, "learning_rate": 5.934104168838491e-05, "loss": 1.0136, "step": 112000 }, { "epoch": 1.4944474810361146, "grad_norm": 29.92347526550293, "learning_rate": 
5.9339231150221585e-05, "loss": 1.0505, "step": 112100 }, { "epoch": 1.4957806188425697, "grad_norm": 15.607181549072266, "learning_rate": 5.933741815587032e-05, "loss": 1.1097, "step": 112200 }, { "epoch": 1.497113756649025, "grad_norm": 53.77953338623047, "learning_rate": 5.933560270548288e-05, "loss": 1.0388, "step": 112300 }, { "epoch": 1.4984468944554798, "grad_norm": 35.36404800415039, "learning_rate": 5.933378479921126e-05, "loss": 1.1266, "step": 112400 }, { "epoch": 1.4997800322619348, "grad_norm": 18.93492317199707, "learning_rate": 5.933196443720765e-05, "loss": 1.0772, "step": 112500 }, { "epoch": 1.50111317006839, "grad_norm": 14.855520248413086, "learning_rate": 5.933014161962444e-05, "loss": 1.0777, "step": 112600 }, { "epoch": 1.5024463078748451, "grad_norm": 5.935585021972656, "learning_rate": 5.932831634661422e-05, "loss": 1.1058, "step": 112700 }, { "epoch": 1.5037794456813, "grad_norm": 65.22245788574219, "learning_rate": 5.932648861832981e-05, "loss": 1.0534, "step": 112800 }, { "epoch": 1.505112583487755, "grad_norm": 14.856438636779785, "learning_rate": 5.932465843492422e-05, "loss": 1.1816, "step": 112900 }, { "epoch": 1.5064457212942102, "grad_norm": 22.556903839111328, "learning_rate": 5.9322825796550674e-05, "loss": 1.0447, "step": 113000 }, { "epoch": 1.5077788591006653, "grad_norm": 11.026520729064941, "learning_rate": 5.932099070336257e-05, "loss": 1.1296, "step": 113100 }, { "epoch": 1.5091119969071203, "grad_norm": 4.067707061767578, "learning_rate": 5.931915315551356e-05, "loss": 1.1286, "step": 113200 }, { "epoch": 1.5104451347135752, "grad_norm": 125.00704956054688, "learning_rate": 5.9317313153157466e-05, "loss": 1.1555, "step": 113300 }, { "epoch": 1.5117782725200304, "grad_norm": 45.90623474121094, "learning_rate": 5.931548913316397e-05, "loss": 1.3372, "step": 113400 }, { "epoch": 1.5131114103264856, "grad_norm": 9.612715721130371, "learning_rate": 5.931364424679726e-05, "loss": 1.3643, "step": 113500 }, { "epoch": 
1.5144445481329405, "grad_norm": 15.230896949768066, "learning_rate": 5.9311796906384674e-05, "loss": 1.1887, "step": 113600 }, { "epoch": 1.5157776859393954, "grad_norm": 36.09800338745117, "learning_rate": 5.930994711208083e-05, "loss": 1.1534, "step": 113700 }, { "epoch": 1.5171108237458506, "grad_norm": 18.2763671875, "learning_rate": 5.930809486404062e-05, "loss": 1.1388, "step": 113800 }, { "epoch": 1.5184439615523058, "grad_norm": 19.269102096557617, "learning_rate": 5.930624016241909e-05, "loss": 1.0216, "step": 113900 }, { "epoch": 1.5197770993587607, "grad_norm": 7.571469783782959, "learning_rate": 5.930438300737152e-05, "loss": 1.0866, "step": 114000 }, { "epoch": 1.5211102371652157, "grad_norm": 14.208659172058105, "learning_rate": 5.9302523399053376e-05, "loss": 1.2066, "step": 114100 }, { "epoch": 1.5224433749716708, "grad_norm": 4.1547136306762695, "learning_rate": 5.930066133762034e-05, "loss": 1.1729, "step": 114200 }, { "epoch": 1.523776512778126, "grad_norm": 6.028252601623535, "learning_rate": 5.9298796823228304e-05, "loss": 1.0318, "step": 114300 }, { "epoch": 1.525109650584581, "grad_norm": 56.826255798339844, "learning_rate": 5.929692985603335e-05, "loss": 1.1589, "step": 114400 }, { "epoch": 1.5264427883910359, "grad_norm": 9.10191535949707, "learning_rate": 5.929506043619179e-05, "loss": 1.0081, "step": 114500 }, { "epoch": 1.527775926197491, "grad_norm": 7.186291217803955, "learning_rate": 5.9293188563860106e-05, "loss": 1.1235, "step": 114600 }, { "epoch": 1.5291090640039462, "grad_norm": 6.332291126251221, "learning_rate": 5.929131423919502e-05, "loss": 1.0972, "step": 114700 }, { "epoch": 1.5304422018104011, "grad_norm": 11.8353271484375, "learning_rate": 5.928943746235344e-05, "loss": 1.1692, "step": 114800 }, { "epoch": 1.531775339616856, "grad_norm": 72.98390197753906, "learning_rate": 5.928755823349249e-05, "loss": 1.0895, "step": 114900 }, { "epoch": 1.5331084774233112, "grad_norm": 20.125207901000977, "learning_rate": 
5.928567655276948e-05, "loss": 1.0501, "step": 115000 }, { "epoch": 1.5344416152297664, "grad_norm": 66.22362518310547, "learning_rate": 5.928379242034195e-05, "loss": 1.1586, "step": 115100 }, { "epoch": 1.5357747530362214, "grad_norm": 16.348438262939453, "learning_rate": 5.928190583636762e-05, "loss": 1.1278, "step": 115200 }, { "epoch": 1.5371078908426763, "grad_norm": 49.16584396362305, "learning_rate": 5.928001680100445e-05, "loss": 1.2092, "step": 115300 }, { "epoch": 1.5384410286491315, "grad_norm": 24.80498695373535, "learning_rate": 5.9278125314410565e-05, "loss": 1.123, "step": 115400 }, { "epoch": 1.5397741664555866, "grad_norm": 53.775428771972656, "learning_rate": 5.927625032825328e-05, "loss": 1.1465, "step": 115500 }, { "epoch": 1.5411073042620416, "grad_norm": 11.201776504516602, "learning_rate": 5.92743729399538e-05, "loss": 1.1967, "step": 115600 }, { "epoch": 1.5424404420684965, "grad_norm": 99.4305191040039, "learning_rate": 5.9272474149632254e-05, "loss": 1.2119, "step": 115700 }, { "epoch": 1.5437735798749517, "grad_norm": 43.01657485961914, "learning_rate": 5.9270572908711445e-05, "loss": 1.1907, "step": 115800 }, { "epoch": 1.5451067176814068, "grad_norm": 29.924097061157227, "learning_rate": 5.9268669217350544e-05, "loss": 1.1605, "step": 115900 }, { "epoch": 1.5464398554878618, "grad_norm": 10.90344524383545, "learning_rate": 5.9266763075708915e-05, "loss": 1.2621, "step": 116000 }, { "epoch": 1.5477729932943167, "grad_norm": 26.9315128326416, "learning_rate": 5.926485448394614e-05, "loss": 1.1777, "step": 116100 }, { "epoch": 1.5491061311007719, "grad_norm": 8.644767761230469, "learning_rate": 5.9262943442222e-05, "loss": 1.2986, "step": 116200 }, { "epoch": 1.550439268907227, "grad_norm": 12.919650077819824, "learning_rate": 5.926102995069648e-05, "loss": 1.3031, "step": 116300 }, { "epoch": 1.551772406713682, "grad_norm": 12.833759307861328, "learning_rate": 5.925911400952976e-05, "loss": 1.1729, "step": 116400 }, { "epoch": 
1.553105544520137, "grad_norm": 27.438026428222656, "learning_rate": 5.9257195618882263e-05, "loss": 1.2286, "step": 116500 }, { "epoch": 1.554438682326592, "grad_norm": 8.261297225952148, "learning_rate": 5.925527477891457e-05, "loss": 1.1881, "step": 116600 }, { "epoch": 1.5557718201330473, "grad_norm": 9.564327239990234, "learning_rate": 5.925335148978749e-05, "loss": 1.1674, "step": 116700 }, { "epoch": 1.5571049579395022, "grad_norm": 4.54689884185791, "learning_rate": 5.925142575166204e-05, "loss": 1.1897, "step": 116800 }, { "epoch": 1.5584380957459572, "grad_norm": 8.339677810668945, "learning_rate": 5.924949756469943e-05, "loss": 1.158, "step": 116900 }, { "epoch": 1.5597712335524123, "grad_norm": 33.63465118408203, "learning_rate": 5.9247566929061094e-05, "loss": 1.2217, "step": 117000 }, { "epoch": 1.5611043713588675, "grad_norm": 7.913755416870117, "learning_rate": 5.9245633844908654e-05, "loss": 1.0544, "step": 117100 }, { "epoch": 1.5624375091653224, "grad_norm": 9.277572631835938, "learning_rate": 5.924369831240393e-05, "loss": 1.0483, "step": 117200 }, { "epoch": 1.5637706469717774, "grad_norm": 6.047488689422607, "learning_rate": 5.924176033170897e-05, "loss": 1.1115, "step": 117300 }, { "epoch": 1.5651037847782325, "grad_norm": 76.03152465820312, "learning_rate": 5.923981990298601e-05, "loss": 1.1438, "step": 117400 }, { "epoch": 1.5664369225846877, "grad_norm": 9.127727508544922, "learning_rate": 5.9237877026397505e-05, "loss": 1.0184, "step": 117500 }, { "epoch": 1.5677700603911426, "grad_norm": 15.148660659790039, "learning_rate": 5.923593170210609e-05, "loss": 0.9949, "step": 117600 }, { "epoch": 1.5691031981975976, "grad_norm": 7.862441539764404, "learning_rate": 5.9233983930274645e-05, "loss": 1.1514, "step": 117700 }, { "epoch": 1.5704363360040527, "grad_norm": 37.018524169921875, "learning_rate": 5.923203371106621e-05, "loss": 1.0797, "step": 117800 }, { "epoch": 1.571769473810508, "grad_norm": 28.121315002441406, "learning_rate": 
5.9230081044644066e-05, "loss": 1.1026, "step": 117900 }, { "epoch": 1.5731026116169629, "grad_norm": 32.340309143066406, "learning_rate": 5.922812593117168e-05, "loss": 1.0637, "step": 118000 }, { "epoch": 1.5744357494234178, "grad_norm": 10.422041893005371, "learning_rate": 5.9226168370812726e-05, "loss": 1.011, "step": 118100 }, { "epoch": 1.575768887229873, "grad_norm": 10.676474571228027, "learning_rate": 5.9224208363731075e-05, "loss": 1.1384, "step": 118200 }, { "epoch": 1.5771020250363281, "grad_norm": 17.888704299926758, "learning_rate": 5.922224591009083e-05, "loss": 1.1177, "step": 118300 }, { "epoch": 1.578435162842783, "grad_norm": 13.453577041625977, "learning_rate": 5.9220281010056274e-05, "loss": 1.1041, "step": 118400 }, { "epoch": 1.579768300649238, "grad_norm": 29.919376373291016, "learning_rate": 5.92183136637919e-05, "loss": 1.0949, "step": 118500 }, { "epoch": 1.5811014384556932, "grad_norm": 5.874375820159912, "learning_rate": 5.9216343871462416e-05, "loss": 1.0543, "step": 118600 }, { "epoch": 1.5824345762621483, "grad_norm": 60.75772476196289, "learning_rate": 5.921437163323272e-05, "loss": 1.1072, "step": 118700 }, { "epoch": 1.5837677140686033, "grad_norm": 9.214759826660156, "learning_rate": 5.921239694926792e-05, "loss": 1.5148, "step": 118800 }, { "epoch": 1.5851008518750582, "grad_norm": 41.61264419555664, "learning_rate": 5.921041981973332e-05, "loss": 1.0929, "step": 118900 }, { "epoch": 1.5864339896815134, "grad_norm": 23.013647079467773, "learning_rate": 5.920844024479447e-05, "loss": 1.1536, "step": 119000 }, { "epoch": 1.5877671274879686, "grad_norm": 35.60845184326172, "learning_rate": 5.9206458224617066e-05, "loss": 1.0325, "step": 119100 }, { "epoch": 1.5891002652944235, "grad_norm": 7.693451881408691, "learning_rate": 5.9204473759367055e-05, "loss": 1.0936, "step": 119200 }, { "epoch": 1.5904334031008784, "grad_norm": 7.621274471282959, "learning_rate": 5.9202486849210557e-05, "loss": 1.1024, "step": 119300 }, { "epoch": 
1.5917665409073336, "grad_norm": 2.8836236000061035, "learning_rate": 5.920049749431391e-05, "loss": 1.1111, "step": 119400 }, { "epoch": 1.5930996787137888, "grad_norm": 7.268186569213867, "learning_rate": 5.9198505694843664e-05, "loss": 0.9686, "step": 119500 }, { "epoch": 1.5944328165202437, "grad_norm": 22.30896759033203, "learning_rate": 5.919651145096656e-05, "loss": 1.0585, "step": 119600 }, { "epoch": 1.5957659543266987, "grad_norm": 11.539246559143066, "learning_rate": 5.919451476284956e-05, "loss": 1.0667, "step": 119700 }, { "epoch": 1.5970990921331538, "grad_norm": 8.2214994430542, "learning_rate": 5.91925156306598e-05, "loss": 1.0422, "step": 119800 }, { "epoch": 1.598432229939609, "grad_norm": 4.4735212326049805, "learning_rate": 5.9190514054564656e-05, "loss": 1.1671, "step": 119900 }, { "epoch": 1.599765367746064, "grad_norm": 5.342032432556152, "learning_rate": 5.9188510034731697e-05, "loss": 1.0355, "step": 120000 }, { "epoch": 1.6010985055525189, "grad_norm": 14.06742000579834, "learning_rate": 5.91865035713287e-05, "loss": 1.1169, "step": 120100 }, { "epoch": 1.602431643358974, "grad_norm": 10.49070930480957, "learning_rate": 5.918449466452361e-05, "loss": 1.1121, "step": 120200 }, { "epoch": 1.6037647811654292, "grad_norm": 17.368099212646484, "learning_rate": 5.918248331448463e-05, "loss": 1.0517, "step": 120300 }, { "epoch": 1.6050979189718841, "grad_norm": 4.701587677001953, "learning_rate": 5.918046952138015e-05, "loss": 1.0014, "step": 120400 }, { "epoch": 1.606431056778339, "grad_norm": 7.3443498611450195, "learning_rate": 5.917845328537872e-05, "loss": 1.0272, "step": 120500 }, { "epoch": 1.6077641945847942, "grad_norm": 17.184436798095703, "learning_rate": 5.917643460664919e-05, "loss": 1.0836, "step": 120600 }, { "epoch": 1.6090973323912494, "grad_norm": 30.439342498779297, "learning_rate": 5.9174413485360513e-05, "loss": 1.0329, "step": 120700 }, { "epoch": 1.6104304701977044, "grad_norm": 7.808774471282959, "learning_rate": 
5.917238992168189e-05, "loss": 1.0046, "step": 120800 }, { "epoch": 1.6117636080041593, "grad_norm": 25.387454986572266, "learning_rate": 5.917036391578276e-05, "loss": 1.0353, "step": 120900 }, { "epoch": 1.6130967458106145, "grad_norm": 7.006316184997559, "learning_rate": 5.916833546783271e-05, "loss": 1.1013, "step": 121000 }, { "epoch": 1.6144298836170696, "grad_norm": 7.441618919372559, "learning_rate": 5.916632489898663e-05, "loss": 1.1232, "step": 121100 }, { "epoch": 1.6157630214235246, "grad_norm": 13.719457626342773, "learning_rate": 5.916429159186067e-05, "loss": 1.1443, "step": 121200 }, { "epoch": 1.6170961592299795, "grad_norm": 24.965566635131836, "learning_rate": 5.916225584319215e-05, "loss": 1.2093, "step": 121300 }, { "epoch": 1.6184292970364347, "grad_norm": 8.482102394104004, "learning_rate": 5.91602176531515e-05, "loss": 1.1163, "step": 121400 }, { "epoch": 1.6197624348428898, "grad_norm": 14.103384017944336, "learning_rate": 5.915817702190936e-05, "loss": 1.0386, "step": 121500 }, { "epoch": 1.6210955726493448, "grad_norm": 6.007056713104248, "learning_rate": 5.9156133949636546e-05, "loss": 1.0251, "step": 121600 }, { "epoch": 1.6224287104557997, "grad_norm": 14.315899848937988, "learning_rate": 5.91540884365041e-05, "loss": 1.0877, "step": 121700 }, { "epoch": 1.623761848262255, "grad_norm": 5.818728923797607, "learning_rate": 5.915204048268327e-05, "loss": 1.1189, "step": 121800 }, { "epoch": 1.62509498606871, "grad_norm": 2.670956611633301, "learning_rate": 5.9149990088345517e-05, "loss": 0.9912, "step": 121900 }, { "epoch": 1.626428123875165, "grad_norm": 26.369855880737305, "learning_rate": 5.914793725366248e-05, "loss": 1.1083, "step": 122000 }, { "epoch": 1.62776126168162, "grad_norm": 276.3042907714844, "learning_rate": 5.914588197880602e-05, "loss": 1.0729, "step": 122100 }, { "epoch": 1.629094399488075, "grad_norm": 50.300228118896484, "learning_rate": 5.9143824263948186e-05, "loss": 0.9643, "step": 122200 }, { "epoch": 
1.6304275372945303, "grad_norm": 9.573346138000488, "learning_rate": 5.914176410926126e-05, "loss": 1.0263, "step": 122300 }, { "epoch": 1.6317606751009852, "grad_norm": 6.695359706878662, "learning_rate": 5.9139701514917714e-05, "loss": 0.9896, "step": 122400 }, { "epoch": 1.6330938129074402, "grad_norm": 8.770630836486816, "learning_rate": 5.913765714350336e-05, "loss": 1.1162, "step": 122500 }, { "epoch": 1.6344269507138953, "grad_norm": 14.19379711151123, "learning_rate": 5.913558969475702e-05, "loss": 0.9772, "step": 122600 }, { "epoch": 1.6357600885203505, "grad_norm": 17.011837005615234, "learning_rate": 5.913351980687097e-05, "loss": 1.0045, "step": 122700 }, { "epoch": 1.6370932263268054, "grad_norm": 3.0861432552337646, "learning_rate": 5.9131447480018473e-05, "loss": 1.0781, "step": 122800 }, { "epoch": 1.6384263641332604, "grad_norm": 5.6266326904296875, "learning_rate": 5.912937271437302e-05, "loss": 0.942, "step": 122900 }, { "epoch": 1.6397595019397155, "grad_norm": 11.014402389526367, "learning_rate": 5.9127295510108305e-05, "loss": 1.1447, "step": 123000 }, { "epoch": 1.6410926397461707, "grad_norm": 5.802671432495117, "learning_rate": 5.912521586739824e-05, "loss": 0.9526, "step": 123100 }, { "epoch": 1.6424257775526256, "grad_norm": 11.949726104736328, "learning_rate": 5.9123133786416905e-05, "loss": 1.1913, "step": 123200 }, { "epoch": 1.6437589153590806, "grad_norm": 6.929421424865723, "learning_rate": 5.912104926733861e-05, "loss": 1.0109, "step": 123300 }, { "epoch": 1.6450920531655358, "grad_norm": 12.73901081085205, "learning_rate": 5.9118962310337866e-05, "loss": 1.0665, "step": 123400 }, { "epoch": 1.646425190971991, "grad_norm": 5.102193832397461, "learning_rate": 5.911687291558939e-05, "loss": 0.9965, "step": 123500 }, { "epoch": 1.6477583287784459, "grad_norm": 3.3305869102478027, "learning_rate": 5.911478108326811e-05, "loss": 1.0564, "step": 123600 }, { "epoch": 1.6490914665849008, "grad_norm": 4.0498528480529785, "learning_rate": 
5.911268681354912e-05, "loss": 1.011, "step": 123700 }, { "epoch": 1.650424604391356, "grad_norm": 17.095172882080078, "learning_rate": 5.911059010660777e-05, "loss": 1.0568, "step": 123800 }, { "epoch": 1.6517577421978111, "grad_norm": 93.4796371459961, "learning_rate": 5.9108490962619576e-05, "loss": 0.926, "step": 123900 }, { "epoch": 1.653090880004266, "grad_norm": 14.991497039794922, "learning_rate": 5.910638938176028e-05, "loss": 1.102, "step": 124000 }, { "epoch": 1.654424017810721, "grad_norm": 17.772790908813477, "learning_rate": 5.910428536420582e-05, "loss": 0.9942, "step": 124100 }, { "epoch": 1.6557571556171762, "grad_norm": 32.1773567199707, "learning_rate": 5.9102199986733246e-05, "loss": 1.0088, "step": 124200 }, { "epoch": 1.6570902934236313, "grad_norm": 4.652344226837158, "learning_rate": 5.910009112067964e-05, "loss": 0.9804, "step": 124300 }, { "epoch": 1.6584234312300863, "grad_norm": 11.596138954162598, "learning_rate": 5.909797981845812e-05, "loss": 0.9737, "step": 124400 }, { "epoch": 1.6597565690365412, "grad_norm": 7.907742023468018, "learning_rate": 5.909586608024545e-05, "loss": 0.9591, "step": 124500 }, { "epoch": 1.6610897068429964, "grad_norm": 6.943394184112549, "learning_rate": 5.90937499062186e-05, "loss": 1.092, "step": 124600 }, { "epoch": 1.6624228446494516, "grad_norm": 7.274913787841797, "learning_rate": 5.909163129655472e-05, "loss": 0.9718, "step": 124700 }, { "epoch": 1.6637559824559065, "grad_norm": 4.180917263031006, "learning_rate": 5.908951025143118e-05, "loss": 1.0373, "step": 124800 }, { "epoch": 1.6650891202623614, "grad_norm": 13.343239784240723, "learning_rate": 5.9087386771025524e-05, "loss": 1.0645, "step": 124900 }, { "epoch": 1.6664222580688166, "grad_norm": 17.17283058166504, "learning_rate": 5.908526085551554e-05, "loss": 1.0237, "step": 125000 }, { "epoch": 1.6677553958752718, "grad_norm": 9.153670310974121, "learning_rate": 5.908313250507921e-05, "loss": 1.169, "step": 125100 }, { "epoch": 
1.6690885336817267, "grad_norm": 11.675641059875488, "learning_rate": 5.90810017198947e-05, "loss": 1.0681, "step": 125200 }, { "epoch": 1.6704216714881817, "grad_norm": 21.1568546295166, "learning_rate": 5.90788685001404e-05, "loss": 1.0134, "step": 125300 }, { "epoch": 1.6717548092946368, "grad_norm": 73.02965545654297, "learning_rate": 5.907673284599489e-05, "loss": 0.9564, "step": 125400 }, { "epoch": 1.673087947101092, "grad_norm": 7.401341915130615, "learning_rate": 5.907459475763697e-05, "loss": 1.0477, "step": 125500 }, { "epoch": 1.674421084907547, "grad_norm": 3.81575083732605, "learning_rate": 5.907245423524563e-05, "loss": 0.8927, "step": 125600 }, { "epoch": 1.6757542227140019, "grad_norm": 4.277335166931152, "learning_rate": 5.9070311279000055e-05, "loss": 1.0818, "step": 125700 }, { "epoch": 1.677087360520457, "grad_norm": 8.855399131774902, "learning_rate": 5.906816588907966e-05, "loss": 1.0766, "step": 125800 }, { "epoch": 1.6784204983269122, "grad_norm": 3.5384554862976074, "learning_rate": 5.906601806566406e-05, "loss": 1.1184, "step": 125900 }, { "epoch": 1.6797536361333671, "grad_norm": 4.842370510101318, "learning_rate": 5.906386780893303e-05, "loss": 0.8946, "step": 126000 }, { "epoch": 1.681086773939822, "grad_norm": 20.950969696044922, "learning_rate": 5.906171511906662e-05, "loss": 0.8931, "step": 126100 }, { "epoch": 1.6824199117462773, "grad_norm": 21.34187889099121, "learning_rate": 5.905955999624502e-05, "loss": 0.9867, "step": 126200 }, { "epoch": 1.6837530495527324, "grad_norm": 20.939584732055664, "learning_rate": 5.905740244064867e-05, "loss": 1.0057, "step": 126300 }, { "epoch": 1.6850861873591874, "grad_norm": 5.040999412536621, "learning_rate": 5.9055242452458185e-05, "loss": 0.9921, "step": 126400 }, { "epoch": 1.6864193251656423, "grad_norm": 5.87464714050293, "learning_rate": 5.905308003185439e-05, "loss": 0.982, "step": 126500 }, { "epoch": 1.6877524629720975, "grad_norm": 3.7990241050720215, "learning_rate": 
5.9050915179018315e-05, "loss": 1.0722, "step": 126600 }, { "epoch": 1.6890856007785526, "grad_norm": 11.578743934631348, "learning_rate": 5.9048747894131196e-05, "loss": 1.1247, "step": 126700 }, { "epoch": 1.6904187385850076, "grad_norm": 21.318256378173828, "learning_rate": 5.904657817737448e-05, "loss": 1.0864, "step": 126800 }, { "epoch": 1.6917518763914625, "grad_norm": 21.72938346862793, "learning_rate": 5.90444060289298e-05, "loss": 0.9637, "step": 126900 }, { "epoch": 1.6930850141979177, "grad_norm": 36.40229034423828, "learning_rate": 5.9042231448978996e-05, "loss": 0.9279, "step": 127000 }, { "epoch": 1.6944181520043728, "grad_norm": 5.348294258117676, "learning_rate": 5.904005443770413e-05, "loss": 1.0836, "step": 127100 }, { "epoch": 1.6957512898108278, "grad_norm": 20.156831741333008, "learning_rate": 5.9037874995287445e-05, "loss": 1.0515, "step": 127200 }, { "epoch": 1.6970844276172827, "grad_norm": 12.688535690307617, "learning_rate": 5.903569312191141e-05, "loss": 0.9854, "step": 127300 }, { "epoch": 1.698417565423738, "grad_norm": 4.008232116699219, "learning_rate": 5.903350881775866e-05, "loss": 1.0542, "step": 127400 }, { "epoch": 1.699750703230193, "grad_norm": 5.864956855773926, "learning_rate": 5.903132208301208e-05, "loss": 0.9765, "step": 127500 }, { "epoch": 1.701083841036648, "grad_norm": 11.455629348754883, "learning_rate": 5.902913291785473e-05, "loss": 1.0425, "step": 127600 }, { "epoch": 1.702416978843103, "grad_norm": 35.598480224609375, "learning_rate": 5.902694132246989e-05, "loss": 1.1037, "step": 127700 }, { "epoch": 1.703750116649558, "grad_norm": 2.8728187084198, "learning_rate": 5.902474729704101e-05, "loss": 0.9179, "step": 127800 }, { "epoch": 1.7050832544560133, "grad_norm": 23.463958740234375, "learning_rate": 5.902255084175179e-05, "loss": 1.0529, "step": 127900 }, { "epoch": 1.7064163922624682, "grad_norm": 14.904051780700684, "learning_rate": 5.902035195678609e-05, "loss": 1.0837, "step": 128000 }, { "epoch": 
1.7077495300689232, "grad_norm": 5.7856011390686035, "learning_rate": 5.9018150642328015e-05, "loss": 1.0772, "step": 128100 }, { "epoch": 1.7090826678753783, "grad_norm": 12.680891990661621, "learning_rate": 5.901594689856183e-05, "loss": 0.9938, "step": 128200 }, { "epoch": 1.7104158056818335, "grad_norm": 18.12403678894043, "learning_rate": 5.901374072567204e-05, "loss": 1.0522, "step": 128300 }, { "epoch": 1.7117489434882884, "grad_norm": 9.414825439453125, "learning_rate": 5.901153212384335e-05, "loss": 0.9852, "step": 128400 }, { "epoch": 1.7130820812947434, "grad_norm": 8.137036323547363, "learning_rate": 5.9009321093260626e-05, "loss": 1.0443, "step": 128500 }, { "epoch": 1.7144152191011985, "grad_norm": 13.444110870361328, "learning_rate": 5.900710763410898e-05, "loss": 1.0475, "step": 128600 }, { "epoch": 1.7157483569076537, "grad_norm": 77.32196807861328, "learning_rate": 5.900489174657373e-05, "loss": 1.0419, "step": 128700 }, { "epoch": 1.7170814947141086, "grad_norm": 9.505863189697266, "learning_rate": 5.900267343084039e-05, "loss": 1.0515, "step": 128800 }, { "epoch": 1.7184146325205636, "grad_norm": 34.85601043701172, "learning_rate": 5.900045268709464e-05, "loss": 1.1064, "step": 128900 }, { "epoch": 1.7197477703270188, "grad_norm": 11.356162071228027, "learning_rate": 5.8998229515522405e-05, "loss": 1.0149, "step": 129000 }, { "epoch": 1.721080908133474, "grad_norm": 4.880529880523682, "learning_rate": 5.899600391630981e-05, "loss": 1.0256, "step": 129100 }, { "epoch": 1.7224140459399289, "grad_norm": 9.819894790649414, "learning_rate": 5.899377588964317e-05, "loss": 0.9857, "step": 129200 }, { "epoch": 1.7237471837463838, "grad_norm": 13.788296699523926, "learning_rate": 5.8991545435709014e-05, "loss": 0.9882, "step": 129300 }, { "epoch": 1.725080321552839, "grad_norm": 4.481779098510742, "learning_rate": 5.8989312554694054e-05, "loss": 0.9604, "step": 129400 }, { "epoch": 1.7264134593592941, "grad_norm": 8.345255851745605, "learning_rate": 
5.898707724678524e-05, "loss": 1.0191, "step": 129500 }, { "epoch": 1.727746597165749, "grad_norm": 4.398279190063477, "learning_rate": 5.8984839512169684e-05, "loss": 1.0298, "step": 129600 }, { "epoch": 1.729079734972204, "grad_norm": 5.046751022338867, "learning_rate": 5.898259935103473e-05, "loss": 0.9963, "step": 129700 }, { "epoch": 1.7304128727786592, "grad_norm": 35.58378601074219, "learning_rate": 5.898035676356792e-05, "loss": 0.9821, "step": 129800 }, { "epoch": 1.7317460105851143, "grad_norm": 6.2874603271484375, "learning_rate": 5.8978111749957e-05, "loss": 1.066, "step": 129900 }, { "epoch": 1.7330791483915693, "grad_norm": 6.821561336517334, "learning_rate": 5.897586431038991e-05, "loss": 1.0039, "step": 130000 }, { "epoch": 1.7344122861980242, "grad_norm": 10.24402141571045, "learning_rate": 5.897361444505481e-05, "loss": 0.9524, "step": 130100 }, { "epoch": 1.7357454240044794, "grad_norm": 3.416046142578125, "learning_rate": 5.8971362154140025e-05, "loss": 1.0726, "step": 130200 }, { "epoch": 1.7370785618109346, "grad_norm": 5.5439066886901855, "learning_rate": 5.896910743783414e-05, "loss": 0.9874, "step": 130300 }, { "epoch": 1.7384116996173895, "grad_norm": 16.73900604248047, "learning_rate": 5.8966850296325894e-05, "loss": 1.0656, "step": 130400 }, { "epoch": 1.7397448374238444, "grad_norm": 29.423187255859375, "learning_rate": 5.8964590729804245e-05, "loss": 1.0244, "step": 130500 }, { "epoch": 1.7410779752302994, "grad_norm": 3.159780263900757, "learning_rate": 5.896232873845837e-05, "loss": 0.9608, "step": 130600 }, { "epoch": 1.7424111130367546, "grad_norm": 10.492265701293945, "learning_rate": 5.896006432247763e-05, "loss": 1.0179, "step": 130700 }, { "epoch": 1.7437442508432097, "grad_norm": 23.256296157836914, "learning_rate": 5.89577974820516e-05, "loss": 1.0601, "step": 130800 }, { "epoch": 1.7450773886496647, "grad_norm": 6.734389781951904, "learning_rate": 5.895552821737004e-05, "loss": 1.0001, "step": 130900 }, { "epoch": 
1.7464105264561196, "grad_norm": 12.030269622802734, "learning_rate": 5.895325652862293e-05, "loss": 0.8747, "step": 131000 }, { "epoch": 1.7477436642625748, "grad_norm": 3.6501545906066895, "learning_rate": 5.895098241600047e-05, "loss": 0.9593, "step": 131100 }, { "epoch": 1.74907680206903, "grad_norm": 13.442198753356934, "learning_rate": 5.894870587969301e-05, "loss": 0.9355, "step": 131200 }, { "epoch": 1.7504099398754849, "grad_norm": 10.20988655090332, "learning_rate": 5.894642691989114e-05, "loss": 1.0106, "step": 131300 }, { "epoch": 1.7517430776819398, "grad_norm": 26.740283966064453, "learning_rate": 5.894414553678568e-05, "loss": 0.9569, "step": 131400 }, { "epoch": 1.753076215488395, "grad_norm": 6.381852626800537, "learning_rate": 5.8941861730567576e-05, "loss": 0.9778, "step": 131500 }, { "epoch": 1.7544093532948501, "grad_norm": 14.480917930603027, "learning_rate": 5.893957550142804e-05, "loss": 0.9791, "step": 131600 }, { "epoch": 1.755742491101305, "grad_norm": 135.4573974609375, "learning_rate": 5.893728684955847e-05, "loss": 1.1139, "step": 131700 }, { "epoch": 1.75707562890776, "grad_norm": 22.208465576171875, "learning_rate": 5.893499577515047e-05, "loss": 0.9689, "step": 131800 }, { "epoch": 1.7584087667142152, "grad_norm": 5.5821757316589355, "learning_rate": 5.8932702278395834e-05, "loss": 1.0092, "step": 131900 }, { "epoch": 1.7597419045206704, "grad_norm": 9.057134628295898, "learning_rate": 5.893040635948657e-05, "loss": 1.036, "step": 132000 }, { "epoch": 1.7610750423271253, "grad_norm": 3.6389522552490234, "learning_rate": 5.8928108018614875e-05, "loss": 0.994, "step": 132100 }, { "epoch": 1.7624081801335802, "grad_norm": 6.502557754516602, "learning_rate": 5.892580725597317e-05, "loss": 1.0275, "step": 132200 }, { "epoch": 1.7637413179400354, "grad_norm": 7.229946613311768, "learning_rate": 5.892350407175405e-05, "loss": 1.0021, "step": 132300 }, { "epoch": 1.7650744557464906, "grad_norm": 7.154219627380371, "learning_rate": 
5.892119846615036e-05, "loss": 0.9734, "step": 132400 }, { "epoch": 1.7664075935529455, "grad_norm": 13.842524528503418, "learning_rate": 5.891889043935509e-05, "loss": 0.9328, "step": 132500 }, { "epoch": 1.7677407313594005, "grad_norm": 41.19913101196289, "learning_rate": 5.891657999156149e-05, "loss": 1.0999, "step": 132600 }, { "epoch": 1.7690738691658556, "grad_norm": 6.616876125335693, "learning_rate": 5.891426712296295e-05, "loss": 1.0214, "step": 132700 }, { "epoch": 1.7704070069723108, "grad_norm": 21.747560501098633, "learning_rate": 5.891195183375312e-05, "loss": 0.9885, "step": 132800 }, { "epoch": 1.7717401447787657, "grad_norm": 9.232473373413086, "learning_rate": 5.8909634124125816e-05, "loss": 0.955, "step": 132900 }, { "epoch": 1.7730732825852207, "grad_norm": 11.465886116027832, "learning_rate": 5.8907313994275076e-05, "loss": 1.0213, "step": 133000 }, { "epoch": 1.7744064203916758, "grad_norm": 18.831634521484375, "learning_rate": 5.890499144439513e-05, "loss": 0.9913, "step": 133100 }, { "epoch": 1.775739558198131, "grad_norm": 22.94124412536621, "learning_rate": 5.890266647468042e-05, "loss": 1.0393, "step": 133200 }, { "epoch": 1.777072696004586, "grad_norm": 6.356540203094482, "learning_rate": 5.890036237119571e-05, "loss": 0.9703, "step": 133300 }, { "epoch": 1.778405833811041, "grad_norm": 10.770771980285645, "learning_rate": 5.889803258658907e-05, "loss": 1.1505, "step": 133400 }, { "epoch": 1.779738971617496, "grad_norm": 9.6284761428833, "learning_rate": 5.8895700382730244e-05, "loss": 0.9959, "step": 133500 }, { "epoch": 1.7810721094239512, "grad_norm": 22.816808700561523, "learning_rate": 5.889336575981446e-05, "loss": 0.9726, "step": 133600 }, { "epoch": 1.7824052472304062, "grad_norm": 6.09254264831543, "learning_rate": 5.889102871803718e-05, "loss": 0.965, "step": 133700 }, { "epoch": 1.783738385036861, "grad_norm": 5.301441192626953, "learning_rate": 5.888868925759404e-05, "loss": 1.1259, "step": 133800 }, { "epoch": 
1.7850715228433163, "grad_norm": 18.6987361907959, "learning_rate": 5.888634737868091e-05, "loss": 1.0127, "step": 133900 }, { "epoch": 1.7864046606497714, "grad_norm": 14.731962203979492, "learning_rate": 5.8884003081493835e-05, "loss": 1.0438, "step": 134000 }, { "epoch": 1.7877377984562264, "grad_norm": 8.026268005371094, "learning_rate": 5.888165636622907e-05, "loss": 1.0694, "step": 134100 }, { "epoch": 1.7890709362626813, "grad_norm": 7.268362045288086, "learning_rate": 5.8879307233083074e-05, "loss": 1.0246, "step": 134200 }, { "epoch": 1.7904040740691365, "grad_norm": 22.305540084838867, "learning_rate": 5.8876955682252515e-05, "loss": 0.9746, "step": 134300 }, { "epoch": 1.7917372118755917, "grad_norm": 2.7981815338134766, "learning_rate": 5.887460171393425e-05, "loss": 0.9983, "step": 134400 }, { "epoch": 1.7930703496820466, "grad_norm": 17.419448852539062, "learning_rate": 5.887224532832535e-05, "loss": 0.9787, "step": 134500 }, { "epoch": 1.7944034874885015, "grad_norm": 4.794266700744629, "learning_rate": 5.886988652562309e-05, "loss": 1.0212, "step": 134600 }, { "epoch": 1.7957366252949567, "grad_norm": 6.41156005859375, "learning_rate": 5.8867525306024933e-05, "loss": 0.9815, "step": 134700 }, { "epoch": 1.7970697631014119, "grad_norm": 22.048173904418945, "learning_rate": 5.8865161669728555e-05, "loss": 1.0552, "step": 134800 }, { "epoch": 1.7984029009078668, "grad_norm": 14.099875450134277, "learning_rate": 5.8862795616931834e-05, "loss": 0.9625, "step": 134900 }, { "epoch": 1.7997360387143218, "grad_norm": 6.892061233520508, "learning_rate": 5.886042714783285e-05, "loss": 0.9572, "step": 135000 }, { "epoch": 1.801069176520777, "grad_norm": 10.867799758911133, "learning_rate": 5.8858056262629875e-05, "loss": 1.0088, "step": 135100 }, { "epoch": 1.802402314327232, "grad_norm": 94.00987243652344, "learning_rate": 5.885568296152141e-05, "loss": 0.9786, "step": 135200 }, { "epoch": 1.803735452133687, "grad_norm": 7.132715225219727, "learning_rate": 
5.885333101383136e-05, "loss": 0.9662, "step": 135300 }, { "epoch": 1.805068589940142, "grad_norm": 20.39360237121582, "learning_rate": 5.885095290566224e-05, "loss": 0.9035, "step": 135400 }, { "epoch": 1.8064017277465971, "grad_norm": 15.233184814453125, "learning_rate": 5.8848572382182277e-05, "loss": 0.9783, "step": 135500 }, { "epoch": 1.8077348655530523, "grad_norm": 4.0790534019470215, "learning_rate": 5.8846189443590784e-05, "loss": 0.9717, "step": 135600 }, { "epoch": 1.8090680033595072, "grad_norm": 7.090280055999756, "learning_rate": 5.884380409008724e-05, "loss": 1.0542, "step": 135700 }, { "epoch": 1.8104011411659622, "grad_norm": 2.6983280181884766, "learning_rate": 5.884141632187133e-05, "loss": 0.9793, "step": 135800 }, { "epoch": 1.8117342789724173, "grad_norm": 18.336793899536133, "learning_rate": 5.883902613914297e-05, "loss": 1.0054, "step": 135900 }, { "epoch": 1.8130674167788725, "grad_norm": 39.59661865234375, "learning_rate": 5.8836633542102254e-05, "loss": 0.9467, "step": 136000 }, { "epoch": 1.8144005545853275, "grad_norm": 6.208508014678955, "learning_rate": 5.883423853094947e-05, "loss": 1.1654, "step": 136100 }, { "epoch": 1.8157336923917824, "grad_norm": 6.210187911987305, "learning_rate": 5.883184110588513e-05, "loss": 1.0538, "step": 136200 }, { "epoch": 1.8170668301982376, "grad_norm": 45.018585205078125, "learning_rate": 5.882944126710995e-05, "loss": 1.06, "step": 136300 }, { "epoch": 1.8183999680046927, "grad_norm": 37.923545837402344, "learning_rate": 5.8827039014824816e-05, "loss": 1.0222, "step": 136400 }, { "epoch": 1.8197331058111477, "grad_norm": 11.500107765197754, "learning_rate": 5.882463434923085e-05, "loss": 1.1446, "step": 136500 }, { "epoch": 1.8210662436176026, "grad_norm": 12.704032897949219, "learning_rate": 5.8822227270529355e-05, "loss": 1.078, "step": 136600 }, { "epoch": 1.8223993814240578, "grad_norm": 13.45181941986084, "learning_rate": 5.881981777892185e-05, "loss": 1.1059, "step": 136700 }, { "epoch": 
1.823732519230513, "grad_norm": 8.055557250976562, "learning_rate": 5.881740587461005e-05, "loss": 1.1547, "step": 136800 }, { "epoch": 1.8250656570369679, "grad_norm": 10.343531608581543, "learning_rate": 5.881499155779587e-05, "loss": 1.1727, "step": 136900 }, { "epoch": 1.8263987948434228, "grad_norm": 5.7513837814331055, "learning_rate": 5.8812574828681425e-05, "loss": 1.0078, "step": 137000 }, { "epoch": 1.827731932649878, "grad_norm": 6.620877742767334, "learning_rate": 5.881015568746905e-05, "loss": 0.9919, "step": 137100 }, { "epoch": 1.8290650704563332, "grad_norm": 5.6875529289245605, "learning_rate": 5.8807734134361254e-05, "loss": 1.0328, "step": 137200 }, { "epoch": 1.830398208262788, "grad_norm": 11.621871948242188, "learning_rate": 5.880531016956076e-05, "loss": 0.952, "step": 137300 }, { "epoch": 1.831731346069243, "grad_norm": 15.773863792419434, "learning_rate": 5.880288379327051e-05, "loss": 1.1705, "step": 137400 }, { "epoch": 1.8330644838756982, "grad_norm": 8.244169235229492, "learning_rate": 5.880045500569362e-05, "loss": 1.0719, "step": 137500 }, { "epoch": 1.8343976216821534, "grad_norm": 6.053536415100098, "learning_rate": 5.879802380703343e-05, "loss": 1.0283, "step": 137600 }, { "epoch": 1.8357307594886083, "grad_norm": 11.265652656555176, "learning_rate": 5.879559019749345e-05, "loss": 1.0572, "step": 137700 }, { "epoch": 1.8370638972950633, "grad_norm": 8.213422775268555, "learning_rate": 5.879315417727744e-05, "loss": 1.0005, "step": 137800 }, { "epoch": 1.8383970351015184, "grad_norm": 4.313119888305664, "learning_rate": 5.8790740142827375e-05, "loss": 1.0642, "step": 137900 }, { "epoch": 1.8397301729079736, "grad_norm": 7.15620756149292, "learning_rate": 5.8788299325972964e-05, "loss": 1.0514, "step": 138000 }, { "epoch": 1.8410633107144285, "grad_norm": 3.66398549079895, "learning_rate": 5.8785856099052875e-05, "loss": 1.0152, "step": 138100 }, { "epoch": 1.8423964485208835, "grad_norm": 12.125128746032715, "learning_rate": 
5.878341046227166e-05, "loss": 0.9812, "step": 138200 }, { "epoch": 1.8437295863273386, "grad_norm": 3.7264883518218994, "learning_rate": 5.8780986908225555e-05, "loss": 1.017, "step": 138300 }, { "epoch": 1.8450627241337938, "grad_norm": 18.00465965270996, "learning_rate": 5.8778536476430003e-05, "loss": 1.103, "step": 138400 }, { "epoch": 1.8463958619402487, "grad_norm": 12.485156059265137, "learning_rate": 5.87760836353861e-05, "loss": 1.0304, "step": 138500 }, { "epoch": 1.8477289997467037, "grad_norm": 43.59517288208008, "learning_rate": 5.877362838529917e-05, "loss": 1.0754, "step": 138600 }, { "epoch": 1.8490621375531588, "grad_norm": 22.154258728027344, "learning_rate": 5.877117072637478e-05, "loss": 0.9489, "step": 138700 }, { "epoch": 1.850395275359614, "grad_norm": 1.9579824209213257, "learning_rate": 5.8768710658818675e-05, "loss": 0.9573, "step": 138800 }, { "epoch": 1.851728413166069, "grad_norm": 25.205930709838867, "learning_rate": 5.87662481828368e-05, "loss": 0.9939, "step": 138900 }, { "epoch": 1.853061550972524, "grad_norm": 393.660888671875, "learning_rate": 5.8763783298635304e-05, "loss": 0.9463, "step": 139000 }, { "epoch": 1.854394688778979, "grad_norm": 31.897628784179688, "learning_rate": 5.876131600642054e-05, "loss": 1.0527, "step": 139100 }, { "epoch": 1.8557278265854342, "grad_norm": 18.154996871948242, "learning_rate": 5.8758846306399074e-05, "loss": 1.1711, "step": 139200 }, { "epoch": 1.8570609643918892, "grad_norm": 6.8696608543396, "learning_rate": 5.8756374198777634e-05, "loss": 1.0885, "step": 139300 }, { "epoch": 1.8583941021983441, "grad_norm": 9.824455261230469, "learning_rate": 5.875389968376321e-05, "loss": 1.0282, "step": 139400 }, { "epoch": 1.8597272400047993, "grad_norm": 21.529508590698242, "learning_rate": 5.8751422761562927e-05, "loss": 1.0442, "step": 139500 }, { "epoch": 1.8610603778112544, "grad_norm": 13.387959480285645, "learning_rate": 5.8748943432384184e-05, "loss": 1.03, "step": 139600 }, { "epoch": 
1.8623935156177094, "grad_norm": 14.130036354064941, "learning_rate": 5.8746461696434505e-05, "loss": 0.9754, "step": 139700 }, { "epoch": 1.8637266534241643, "grad_norm": 8.723565101623535, "learning_rate": 5.874397755392168e-05, "loss": 1.0525, "step": 139800 }, { "epoch": 1.8650597912306195, "grad_norm": 35.292938232421875, "learning_rate": 5.874149100505365e-05, "loss": 1.0217, "step": 139900 }, { "epoch": 1.8663929290370747, "grad_norm": 68.18289184570312, "learning_rate": 5.873900205003861e-05, "loss": 1.0414, "step": 140000 }, { "epoch": 1.8677260668435296, "grad_norm": 31.224510192871094, "learning_rate": 5.8736510689084894e-05, "loss": 0.9988, "step": 140100 }, { "epoch": 1.8690592046499845, "grad_norm": 21.99742317199707, "learning_rate": 5.87340169224011e-05, "loss": 1.0408, "step": 140200 }, { "epoch": 1.8703923424564397, "grad_norm": 18.842426300048828, "learning_rate": 5.873152075019598e-05, "loss": 0.9865, "step": 140300 }, { "epoch": 1.8717254802628949, "grad_norm": 6.555134296417236, "learning_rate": 5.872902217267852e-05, "loss": 0.9898, "step": 140400 }, { "epoch": 1.8730586180693498, "grad_norm": 8.956182479858398, "learning_rate": 5.872652119005788e-05, "loss": 1.0632, "step": 140500 }, { "epoch": 1.8743917558758048, "grad_norm": 15.811453819274902, "learning_rate": 5.872401780254344e-05, "loss": 0.9571, "step": 140600 }, { "epoch": 1.87572489368226, "grad_norm": 1.890555739402771, "learning_rate": 5.8721512010344776e-05, "loss": 1.0286, "step": 140700 }, { "epoch": 1.877058031488715, "grad_norm": 8.027750968933105, "learning_rate": 5.871900381367167e-05, "loss": 0.8977, "step": 140800 }, { "epoch": 1.87839116929517, "grad_norm": 10.94406509399414, "learning_rate": 5.8716493212734093e-05, "loss": 1.1955, "step": 140900 }, { "epoch": 1.879724307101625, "grad_norm": 6.057521343231201, "learning_rate": 5.8713980207742216e-05, "loss": 1.0236, "step": 141000 }, { "epoch": 1.8810574449080801, "grad_norm": 9.475712776184082, "learning_rate": 
5.871146479890645e-05, "loss": 1.0693, "step": 141100 }, { "epoch": 1.8823905827145353, "grad_norm": 25.8647518157959, "learning_rate": 5.870894698643735e-05, "loss": 0.9613, "step": 141200 }, { "epoch": 1.8837237205209902, "grad_norm": 3.5990374088287354, "learning_rate": 5.8706426770545684e-05, "loss": 1.0397, "step": 141300 }, { "epoch": 1.8850568583274452, "grad_norm": 6.472697734832764, "learning_rate": 5.870390415144249e-05, "loss": 0.9781, "step": 141400 }, { "epoch": 1.8863899961339003, "grad_norm": 3.841646909713745, "learning_rate": 5.87013791293389e-05, "loss": 0.829, "step": 141500 }, { "epoch": 1.8877231339403555, "grad_norm": 5.581139087677002, "learning_rate": 5.869885170444634e-05, "loss": 0.9877, "step": 141600 }, { "epoch": 1.8890562717468105, "grad_norm": 11.641477584838867, "learning_rate": 5.8696321876976375e-05, "loss": 1.0458, "step": 141700 }, { "epoch": 1.8903894095532654, "grad_norm": 5.951430320739746, "learning_rate": 5.86937896471408e-05, "loss": 0.9421, "step": 141800 }, { "epoch": 1.8917225473597206, "grad_norm": 11.246418952941895, "learning_rate": 5.869128037336147e-05, "loss": 0.9858, "step": 141900 }, { "epoch": 1.8930556851661757, "grad_norm": 9.125296592712402, "learning_rate": 5.868874336344921e-05, "loss": 1.0355, "step": 142000 }, { "epoch": 1.8943888229726307, "grad_norm": 18.807098388671875, "learning_rate": 5.8686203951805795e-05, "loss": 1.0267, "step": 142100 }, { "epoch": 1.8957219607790856, "grad_norm": 8.180209159851074, "learning_rate": 5.8683662138643814e-05, "loss": 0.9647, "step": 142200 }, { "epoch": 1.8970550985855408, "grad_norm": 34.0649528503418, "learning_rate": 5.8681117924176056e-05, "loss": 1.0365, "step": 142300 }, { "epoch": 1.898388236391996, "grad_norm": 3.701293706893921, "learning_rate": 5.867857130861552e-05, "loss": 1.0138, "step": 142400 }, { "epoch": 1.8997213741984509, "grad_norm": 2.6684377193450928, "learning_rate": 5.867602229217539e-05, "loss": 0.9922, "step": 142500 }, { "epoch": 
1.9010545120049058, "grad_norm": 5.5484137535095215, "learning_rate": 5.8673470875069065e-05, "loss": 0.9995, "step": 142600 }, { "epoch": 1.902387649811361, "grad_norm": 19.67209243774414, "learning_rate": 5.867091705751015e-05, "loss": 0.9617, "step": 142700 }, { "epoch": 1.9037207876178162, "grad_norm": 4.637143611907959, "learning_rate": 5.866836083971245e-05, "loss": 0.9123, "step": 142800 }, { "epoch": 1.905053925424271, "grad_norm": 5.371326923370361, "learning_rate": 5.8665802221889925e-05, "loss": 0.9427, "step": 142900 }, { "epoch": 1.906387063230726, "grad_norm": 6.538115501403809, "learning_rate": 5.8663241204256815e-05, "loss": 0.9667, "step": 143000 }, { "epoch": 1.9077202010371812, "grad_norm": 6.2176008224487305, "learning_rate": 5.86606777870275e-05, "loss": 1.013, "step": 143100 }, { "epoch": 1.9090533388436364, "grad_norm": 6.03598165512085, "learning_rate": 5.86581119704166e-05, "loss": 0.9499, "step": 143200 }, { "epoch": 1.9103864766500913, "grad_norm": 11.664010047912598, "learning_rate": 5.86555437546389e-05, "loss": 1.0367, "step": 143300 }, { "epoch": 1.9117196144565463, "grad_norm": 8.835166931152344, "learning_rate": 5.86529731399094e-05, "loss": 0.8986, "step": 143400 }, { "epoch": 1.9130527522630014, "grad_norm": 4.395211219787598, "learning_rate": 5.865040012644331e-05, "loss": 0.987, "step": 143500 }, { "epoch": 1.9143858900694566, "grad_norm": 19.41562271118164, "learning_rate": 5.8647824714456045e-05, "loss": 1.0069, "step": 143600 }, { "epoch": 1.9157190278759115, "grad_norm": 3.9383506774902344, "learning_rate": 5.864524690416319e-05, "loss": 0.8747, "step": 143700 }, { "epoch": 1.9170521656823665, "grad_norm": 8.231568336486816, "learning_rate": 5.8642666695780565e-05, "loss": 0.9977, "step": 143800 }, { "epoch": 1.9183853034888216, "grad_norm": 5.669800281524658, "learning_rate": 5.864008408952418e-05, "loss": 1.0242, "step": 143900 }, { "epoch": 1.9197184412952768, "grad_norm": 83.90917205810547, "learning_rate": 
5.863749908561022e-05, "loss": 1.0195, "step": 144000 }, { "epoch": 1.9210515791017317, "grad_norm": 8.24555778503418, "learning_rate": 5.8634937570135306e-05, "loss": 0.9395, "step": 144100 }, { "epoch": 1.9223847169081867, "grad_norm": 9.741211891174316, "learning_rate": 5.863234779552684e-05, "loss": 0.9871, "step": 144200 }, { "epoch": 1.9237178547146419, "grad_norm": 7.822269916534424, "learning_rate": 5.862975562390847e-05, "loss": 0.973, "step": 144300 }, { "epoch": 1.925050992521097, "grad_norm": 9.709981918334961, "learning_rate": 5.8627161055497206e-05, "loss": 0.9364, "step": 144400 }, { "epoch": 1.926384130327552, "grad_norm": 6.3973541259765625, "learning_rate": 5.862456409051027e-05, "loss": 0.9814, "step": 144500 }, { "epoch": 1.927717268134007, "grad_norm": 5.304189682006836, "learning_rate": 5.862196472916505e-05, "loss": 0.9715, "step": 144600 }, { "epoch": 1.929050405940462, "grad_norm": 12.960403442382812, "learning_rate": 5.8619362971679164e-05, "loss": 0.9479, "step": 144700 }, { "epoch": 1.9303835437469172, "grad_norm": 4.619377613067627, "learning_rate": 5.861675881827044e-05, "loss": 0.9449, "step": 144800 }, { "epoch": 1.9317166815533722, "grad_norm": 13.780502319335938, "learning_rate": 5.861415226915686e-05, "loss": 0.9225, "step": 144900 }, { "epoch": 1.9330498193598271, "grad_norm": 12.108665466308594, "learning_rate": 5.861154332455667e-05, "loss": 0.9536, "step": 145000 }, { "epoch": 1.9343829571662823, "grad_norm": 6.380523681640625, "learning_rate": 5.860893198468825e-05, "loss": 0.9024, "step": 145100 }, { "epoch": 1.9357160949727374, "grad_norm": 7.643500804901123, "learning_rate": 5.8606318249770234e-05, "loss": 1.0303, "step": 145200 }, { "epoch": 1.9370492327791924, "grad_norm": 371.33935546875, "learning_rate": 5.860370212002142e-05, "loss": 0.9841, "step": 145300 }, { "epoch": 1.9383823705856473, "grad_norm": 16.201501846313477, "learning_rate": 5.860108359566083e-05, "loss": 0.9931, "step": 145400 }, { "epoch": 
1.9397155083921025, "grad_norm": 5.242247104644775, "learning_rate": 5.85984626769077e-05, "loss": 0.9036, "step": 145500 }, { "epoch": 1.9410486461985577, "grad_norm": 42.34733581542969, "learning_rate": 5.859583936398141e-05, "loss": 1.0001, "step": 145600 }, { "epoch": 1.9423817840050126, "grad_norm": 11.450817108154297, "learning_rate": 5.859321365710159e-05, "loss": 1.058, "step": 145700 }, { "epoch": 1.9437149218114675, "grad_norm": 26.405559539794922, "learning_rate": 5.859058555648806e-05, "loss": 1.0482, "step": 145800 }, { "epoch": 1.9450480596179227, "grad_norm": 32.55730056762695, "learning_rate": 5.8587955062360835e-05, "loss": 1.0528, "step": 145900 }, { "epoch": 1.9463811974243779, "grad_norm": 16.224214553833008, "learning_rate": 5.858532217494012e-05, "loss": 1.1323, "step": 146000 }, { "epoch": 1.9477143352308328, "grad_norm": 7.66840934753418, "learning_rate": 5.858268689444635e-05, "loss": 1.0102, "step": 146100 }, { "epoch": 1.9490474730372878, "grad_norm": 13.100380897521973, "learning_rate": 5.858004922110013e-05, "loss": 1.0571, "step": 146200 }, { "epoch": 1.950380610843743, "grad_norm": 10.383312225341797, "learning_rate": 5.8577409155122284e-05, "loss": 0.9877, "step": 146300 }, { "epoch": 1.951713748650198, "grad_norm": 9.005138397216797, "learning_rate": 5.8574766696733816e-05, "loss": 0.9193, "step": 146400 }, { "epoch": 1.953046886456653, "grad_norm": 5.8133625984191895, "learning_rate": 5.857212184615597e-05, "loss": 0.9711, "step": 146500 }, { "epoch": 1.954380024263108, "grad_norm": 16.26742172241211, "learning_rate": 5.856947460361014e-05, "loss": 0.9043, "step": 146600 }, { "epoch": 1.9557131620695631, "grad_norm": 6.22686243057251, "learning_rate": 5.8566824969317964e-05, "loss": 0.9575, "step": 146700 }, { "epoch": 1.9570462998760183, "grad_norm": 4.507073879241943, "learning_rate": 5.856417294350125e-05, "loss": 0.9752, "step": 146800 }, { "epoch": 1.9583794376824732, "grad_norm": 5.562924385070801, "learning_rate": 
5.856151852638201e-05, "loss": 0.9911, "step": 146900 }, { "epoch": 1.9597125754889282, "grad_norm": 2.747011184692383, "learning_rate": 5.855886171818249e-05, "loss": 0.8962, "step": 147000 }, { "epoch": 1.9610457132953834, "grad_norm": 9.145270347595215, "learning_rate": 5.855620251912508e-05, "loss": 0.9219, "step": 147100 }, { "epoch": 1.9623788511018385, "grad_norm": 10.42357349395752, "learning_rate": 5.855354092943241e-05, "loss": 1.0134, "step": 147200 }, { "epoch": 1.9637119889082935, "grad_norm": 14.733039855957031, "learning_rate": 5.8550876949327303e-05, "loss": 0.9634, "step": 147300 }, { "epoch": 1.9650451267147484, "grad_norm": 3.7961580753326416, "learning_rate": 5.8548210579032784e-05, "loss": 1.0213, "step": 147400 }, { "epoch": 1.9663782645212036, "grad_norm": 17.556394577026367, "learning_rate": 5.854554181877206e-05, "loss": 1.0212, "step": 147500 }, { "epoch": 1.9677114023276587, "grad_norm": 13.032634735107422, "learning_rate": 5.8542870668768564e-05, "loss": 1.0341, "step": 147600 }, { "epoch": 1.9690445401341137, "grad_norm": 9.907201766967773, "learning_rate": 5.854019712924591e-05, "loss": 0.9626, "step": 147700 }, { "epoch": 1.9703776779405686, "grad_norm": 4.71930456161499, "learning_rate": 5.853752120042792e-05, "loss": 0.9644, "step": 147800 }, { "epoch": 1.9717108157470238, "grad_norm": 9.702959060668945, "learning_rate": 5.853486967754267e-05, "loss": 0.9716, "step": 147900 }, { "epoch": 1.973043953553479, "grad_norm": 7.916568756103516, "learning_rate": 5.853218899469362e-05, "loss": 0.9755, "step": 148000 }, { "epoch": 1.974377091359934, "grad_norm": 8.667984962463379, "learning_rate": 5.8529505923219654e-05, "loss": 0.88, "step": 148100 }, { "epoch": 1.9757102291663888, "grad_norm": 15.728622436523438, "learning_rate": 5.852682046334539e-05, "loss": 0.9629, "step": 148200 }, { "epoch": 1.977043366972844, "grad_norm": 8.509275436401367, "learning_rate": 5.852413261529563e-05, "loss": 1.0659, "step": 148300 }, { "epoch": 
1.9783765047792992, "grad_norm": 4.050390720367432, "learning_rate": 5.8521442379295404e-05, "loss": 0.9689, "step": 148400 }, { "epoch": 1.979709642585754, "grad_norm": 17.30630111694336, "learning_rate": 5.851874975556994e-05, "loss": 1.0443, "step": 148500 }, { "epoch": 1.981042780392209, "grad_norm": 3.5679574012756348, "learning_rate": 5.851605474434464e-05, "loss": 1.0057, "step": 148600 }, { "epoch": 1.9823759181986642, "grad_norm": 5.707697868347168, "learning_rate": 5.851335734584512e-05, "loss": 1.0122, "step": 148700 }, { "epoch": 1.9837090560051194, "grad_norm": 4.256319046020508, "learning_rate": 5.851065756029721e-05, "loss": 1.0095, "step": 148800 }, { "epoch": 1.9850421938115743, "grad_norm": 15.709830284118652, "learning_rate": 5.850795538792692e-05, "loss": 0.9562, "step": 148900 }, { "epoch": 1.9863753316180293, "grad_norm": 5.663149833679199, "learning_rate": 5.8505250828960476e-05, "loss": 1.038, "step": 149000 }, { "epoch": 1.9877084694244844, "grad_norm": 39.26630401611328, "learning_rate": 5.8502543883624285e-05, "loss": 1.026, "step": 149100 }, { "epoch": 1.9890416072309396, "grad_norm": 5.309559345245361, "learning_rate": 5.849983455214497e-05, "loss": 0.9478, "step": 149200 }, { "epoch": 1.9903747450373945, "grad_norm": 16.611251831054688, "learning_rate": 5.849712283474934e-05, "loss": 0.9298, "step": 149300 }, { "epoch": 1.9917078828438495, "grad_norm": 20.287250518798828, "learning_rate": 5.8494408731664426e-05, "loss": 0.9718, "step": 149400 }, { "epoch": 1.9930410206503046, "grad_norm": 6.833389759063721, "learning_rate": 5.849169224311743e-05, "loss": 1.0557, "step": 149500 }, { "epoch": 1.9943741584567598, "grad_norm": 10.28609848022461, "learning_rate": 5.848897336933577e-05, "loss": 0.9099, "step": 149600 }, { "epoch": 1.9957072962632147, "grad_norm": 10.099634170532227, "learning_rate": 5.8486252110547074e-05, "loss": 1.0303, "step": 149700 }, { "epoch": 1.9970404340696697, "grad_norm": 6.196499824523926, "learning_rate": 
5.848352846697914e-05, "loss": 0.9765, "step": 149800 }, { "epoch": 1.9983735718761249, "grad_norm": 6.007256507873535, "learning_rate": 5.848080243885999e-05, "loss": 1.0017, "step": 149900 }, { "epoch": 1.99970670968258, "grad_norm": 4.948955059051514, "learning_rate": 5.847807402641785e-05, "loss": 1.0069, "step": 150000 }, { "epoch": 2.001039847489035, "grad_norm": 31.502723693847656, "learning_rate": 5.8475343229881106e-05, "loss": 0.9395, "step": 150100 }, { "epoch": 2.00237298529549, "grad_norm": 15.855053901672363, "learning_rate": 5.8472610049478393e-05, "loss": 1.0374, "step": 150200 }, { "epoch": 2.003706123101945, "grad_norm": 16.83685874938965, "learning_rate": 5.846987448543853e-05, "loss": 0.9336, "step": 150300 }, { "epoch": 2.0050392609084002, "grad_norm": 8.547715187072754, "learning_rate": 5.846713653799051e-05, "loss": 0.9334, "step": 150400 }, { "epoch": 2.006372398714855, "grad_norm": 3.5255022048950195, "learning_rate": 5.846439620736356e-05, "loss": 0.8933, "step": 150500 }, { "epoch": 2.00770553652131, "grad_norm": 12.878783226013184, "learning_rate": 5.846165349378709e-05, "loss": 0.9356, "step": 150600 }, { "epoch": 2.009038674327765, "grad_norm": 13.963788032531738, "learning_rate": 5.84589083974907e-05, "loss": 0.9692, "step": 150700 }, { "epoch": 2.0103718121342204, "grad_norm": 7.930510997772217, "learning_rate": 5.8456160918704215e-05, "loss": 0.9402, "step": 150800 }, { "epoch": 2.0117049499406754, "grad_norm": 4.557428359985352, "learning_rate": 5.8453411057657636e-05, "loss": 0.9805, "step": 150900 }, { "epoch": 2.0130380877471303, "grad_norm": 619.6597900390625, "learning_rate": 5.845065881458118e-05, "loss": 0.9064, "step": 151000 }, { "epoch": 2.0143712255535853, "grad_norm": 8.626474380493164, "learning_rate": 5.844790418970525e-05, "loss": 0.9988, "step": 151100 }, { "epoch": 2.0157043633600407, "grad_norm": 18.213825225830078, "learning_rate": 5.844514718326045e-05, "loss": 0.841, "step": 151200 }, { "epoch": 
2.0170375011664956, "grad_norm": 13.489676475524902, "learning_rate": 5.844238779547761e-05, "loss": 0.9271, "step": 151300 }, { "epoch": 2.0183706389729505, "grad_norm": 8.343445777893066, "learning_rate": 5.843962602658771e-05, "loss": 0.9592, "step": 151400 }, { "epoch": 2.0197037767794055, "grad_norm": 25.767547607421875, "learning_rate": 5.843686187682198e-05, "loss": 0.921, "step": 151500 }, { "epoch": 2.021036914585861, "grad_norm": 8.223828315734863, "learning_rate": 5.84340953464118e-05, "loss": 0.959, "step": 151600 }, { "epoch": 2.022370052392316, "grad_norm": 18.332353591918945, "learning_rate": 5.8431326435588796e-05, "loss": 0.9908, "step": 151700 }, { "epoch": 2.0237031901987708, "grad_norm": 40.888145446777344, "learning_rate": 5.842855514458477e-05, "loss": 0.8779, "step": 151800 }, { "epoch": 2.0250363280052257, "grad_norm": 24.901826858520508, "learning_rate": 5.842578147363172e-05, "loss": 0.935, "step": 151900 }, { "epoch": 2.026369465811681, "grad_norm": 47.81048583984375, "learning_rate": 5.842300542296185e-05, "loss": 0.9323, "step": 152000 }, { "epoch": 2.027702603618136, "grad_norm": 10.23277473449707, "learning_rate": 5.842022699280756e-05, "loss": 0.9475, "step": 152100 }, { "epoch": 2.029035741424591, "grad_norm": 5.061823844909668, "learning_rate": 5.841744618340146e-05, "loss": 0.9068, "step": 152200 }, { "epoch": 2.030368879231046, "grad_norm": 23.19776725769043, "learning_rate": 5.8414662994976336e-05, "loss": 0.9679, "step": 152300 }, { "epoch": 2.0317020170375013, "grad_norm": 9.401873588562012, "learning_rate": 5.8411877427765204e-05, "loss": 0.894, "step": 152400 }, { "epoch": 2.0330351548439562, "grad_norm": 11.779426574707031, "learning_rate": 5.840908948200125e-05, "loss": 0.9054, "step": 152500 }, { "epoch": 2.034368292650411, "grad_norm": 24.75333023071289, "learning_rate": 5.840629915791788e-05, "loss": 0.875, "step": 152600 }, { "epoch": 2.035701430456866, "grad_norm": 11.71400260925293, "learning_rate": 
5.8403506455748693e-05, "loss": 1.0443, "step": 152700 }, { "epoch": 2.0370345682633215, "grad_norm": 13.882397651672363, "learning_rate": 5.840071137572747e-05, "loss": 0.8752, "step": 152800 }, { "epoch": 2.0383677060697765, "grad_norm": 11.639689445495605, "learning_rate": 5.839791391808823e-05, "loss": 0.9435, "step": 152900 }, { "epoch": 2.0397008438762314, "grad_norm": 93.84835815429688, "learning_rate": 5.839511408306515e-05, "loss": 0.9096, "step": 153000 }, { "epoch": 2.0410339816826863, "grad_norm": 6.0128703117370605, "learning_rate": 5.839233990478048e-05, "loss": 1.0055, "step": 153100 }, { "epoch": 2.0423671194891417, "grad_norm": 21.754669189453125, "learning_rate": 5.83895353394611e-05, "loss": 0.9359, "step": 153200 }, { "epoch": 2.0437002572955967, "grad_norm": 26.053226470947266, "learning_rate": 5.8386728397459305e-05, "loss": 0.9332, "step": 153300 }, { "epoch": 2.0450333951020516, "grad_norm": 16.880157470703125, "learning_rate": 5.8383919079010095e-05, "loss": 0.8898, "step": 153400 }, { "epoch": 2.0463665329085066, "grad_norm": 7.161216735839844, "learning_rate": 5.8381107384348655e-05, "loss": 0.9521, "step": 153500 }, { "epoch": 2.047699670714962, "grad_norm": 3.5983471870422363, "learning_rate": 5.8378293313710365e-05, "loss": 1.0217, "step": 153600 }, { "epoch": 2.049032808521417, "grad_norm": 35.464569091796875, "learning_rate": 5.837547686733082e-05, "loss": 1.0432, "step": 153700 }, { "epoch": 2.050365946327872, "grad_norm": 7.835151195526123, "learning_rate": 5.83726580454458e-05, "loss": 0.9371, "step": 153800 }, { "epoch": 2.051699084134327, "grad_norm": 2.2857766151428223, "learning_rate": 5.836983684829128e-05, "loss": 0.991, "step": 153900 }, { "epoch": 2.053032221940782, "grad_norm": 6.437869071960449, "learning_rate": 5.8367013276103455e-05, "loss": 0.9096, "step": 154000 }, { "epoch": 2.054365359747237, "grad_norm": 13.157176971435547, "learning_rate": 5.83641873291187e-05, "loss": 0.9069, "step": 154100 }, { "epoch": 
2.055698497553692, "grad_norm": 12.31402587890625, "learning_rate": 5.8361359007573594e-05, "loss": 0.8913, "step": 154200 }, { "epoch": 2.057031635360147, "grad_norm": 6.013667583465576, "learning_rate": 5.835852831170491e-05, "loss": 0.9679, "step": 154300 }, { "epoch": 2.0583647731666024, "grad_norm": 17.957651138305664, "learning_rate": 5.835569524174963e-05, "loss": 0.8614, "step": 154400 }, { "epoch": 2.0596979109730573, "grad_norm": 14.897080421447754, "learning_rate": 5.835288816413275e-05, "loss": 0.9379, "step": 154500 }, { "epoch": 2.0610310487795123, "grad_norm": 20.277978897094727, "learning_rate": 5.8350078760136375e-05, "loss": 0.9969, "step": 154600 }, { "epoch": 2.062364186585967, "grad_norm": 15.210801124572754, "learning_rate": 5.8347238616810314e-05, "loss": 0.931, "step": 154700 }, { "epoch": 2.0636973243924226, "grad_norm": 191.13768005371094, "learning_rate": 5.8344396100342795e-05, "loss": 0.9659, "step": 154800 }, { "epoch": 2.0650304621988775, "grad_norm": 4.81959342956543, "learning_rate": 5.834155121097179e-05, "loss": 0.9448, "step": 154900 }, { "epoch": 2.0663636000053325, "grad_norm": 8.297738075256348, "learning_rate": 5.833870394893545e-05, "loss": 1.0795, "step": 155000 }, { "epoch": 2.0676967378117874, "grad_norm": 5.562886714935303, "learning_rate": 5.833585431447214e-05, "loss": 0.9585, "step": 155100 }, { "epoch": 2.069029875618243, "grad_norm": 8.59869384765625, "learning_rate": 5.8333002307820444e-05, "loss": 0.9662, "step": 155200 }, { "epoch": 2.0703630134246978, "grad_norm": 5.954481601715088, "learning_rate": 5.83301479292191e-05, "loss": 0.9919, "step": 155300 }, { "epoch": 2.0716961512311527, "grad_norm": 21.132503509521484, "learning_rate": 5.8327291178907075e-05, "loss": 0.9982, "step": 155400 }, { "epoch": 2.0730292890376076, "grad_norm": 8.812986373901367, "learning_rate": 5.832443205712353e-05, "loss": 0.9709, "step": 155500 }, { "epoch": 2.074362426844063, "grad_norm": 4.981845855712891, "learning_rate": 
5.832157056410782e-05, "loss": 0.9049, "step": 155600 }, { "epoch": 2.075695564650518, "grad_norm": 11.187894821166992, "learning_rate": 5.831870670009951e-05, "loss": 0.9046, "step": 155700 }, { "epoch": 2.077028702456973, "grad_norm": 4.718724250793457, "learning_rate": 5.831584046533832e-05, "loss": 0.9233, "step": 155800 }, { "epoch": 2.078361840263428, "grad_norm": 39.19499206542969, "learning_rate": 5.831297186006424e-05, "loss": 0.9131, "step": 155900 }, { "epoch": 2.0796949780698832, "grad_norm": 18.054244995117188, "learning_rate": 5.831010088451741e-05, "loss": 1.0874, "step": 156000 }, { "epoch": 2.081028115876338, "grad_norm": 38.15821838378906, "learning_rate": 5.830722753893817e-05, "loss": 1.0456, "step": 156100 }, { "epoch": 2.082361253682793, "grad_norm": 7.459341526031494, "learning_rate": 5.830435182356707e-05, "loss": 1.0958, "step": 156200 }, { "epoch": 2.083694391489248, "grad_norm": 25.264265060424805, "learning_rate": 5.830147373864485e-05, "loss": 1.0466, "step": 156300 }, { "epoch": 2.0850275292957035, "grad_norm": 59.922279357910156, "learning_rate": 5.829859328441248e-05, "loss": 0.994, "step": 156400 }, { "epoch": 2.0863606671021584, "grad_norm": 25.895240783691406, "learning_rate": 5.829571046111108e-05, "loss": 0.9241, "step": 156500 }, { "epoch": 2.0876938049086133, "grad_norm": 29.399627685546875, "learning_rate": 5.8292825268981994e-05, "loss": 1.1284, "step": 156600 }, { "epoch": 2.0890269427150683, "grad_norm": 9.937167167663574, "learning_rate": 5.8289937708266765e-05, "loss": 1.0312, "step": 156700 }, { "epoch": 2.0903600805215237, "grad_norm": 12.424627304077148, "learning_rate": 5.8287047779207124e-05, "loss": 1.0325, "step": 156800 }, { "epoch": 2.0916932183279786, "grad_norm": 6.016711235046387, "learning_rate": 5.828415548204502e-05, "loss": 1.0463, "step": 156900 }, { "epoch": 2.0930263561344336, "grad_norm": 67.27659606933594, "learning_rate": 5.828126081702256e-05, "loss": 1.0672, "step": 157000 }, { "epoch": 
2.0943594939408885, "grad_norm": 5.859604835510254, "learning_rate": 5.8278363784382114e-05, "loss": 0.9661, "step": 157100 }, { "epoch": 2.095692631747344, "grad_norm": 4.747981548309326, "learning_rate": 5.827546438436619e-05, "loss": 1.1017, "step": 157200 }, { "epoch": 2.097025769553799, "grad_norm": 39.89076232910156, "learning_rate": 5.827256261721751e-05, "loss": 1.0664, "step": 157300 }, { "epoch": 2.0983589073602538, "grad_norm": 15.131988525390625, "learning_rate": 5.8269658483179015e-05, "loss": 0.9984, "step": 157400 }, { "epoch": 2.0996920451667087, "grad_norm": 18.32444190979004, "learning_rate": 5.8266751982493826e-05, "loss": 0.972, "step": 157500 }, { "epoch": 2.101025182973164, "grad_norm": 11.565061569213867, "learning_rate": 5.8263843115405265e-05, "loss": 1.0871, "step": 157600 }, { "epoch": 2.102358320779619, "grad_norm": 9.534890174865723, "learning_rate": 5.826093188215686e-05, "loss": 0.8943, "step": 157700 }, { "epoch": 2.103691458586074, "grad_norm": 16.10320472717285, "learning_rate": 5.825801828299231e-05, "loss": 1.0478, "step": 157800 }, { "epoch": 2.105024596392529, "grad_norm": 9.144424438476562, "learning_rate": 5.825510231815556e-05, "loss": 1.0102, "step": 157900 }, { "epoch": 2.1063577341989843, "grad_norm": 34.22707748413086, "learning_rate": 5.825218398789071e-05, "loss": 1.1873, "step": 158000 }, { "epoch": 2.1076908720054393, "grad_norm": 5.239530563354492, "learning_rate": 5.824926329244208e-05, "loss": 1.0888, "step": 158100 }, { "epoch": 2.109024009811894, "grad_norm": 38.587955474853516, "learning_rate": 5.8246340232054164e-05, "loss": 1.1538, "step": 158200 }, { "epoch": 2.110357147618349, "grad_norm": 30.26551628112793, "learning_rate": 5.824341480697169e-05, "loss": 1.11, "step": 158300 }, { "epoch": 2.1116902854248045, "grad_norm": 18.250150680541992, "learning_rate": 5.824048701743955e-05, "loss": 1.0783, "step": 158400 }, { "epoch": 2.1130234232312595, "grad_norm": 10.878432273864746, "learning_rate": 
5.823755686370287e-05, "loss": 1.1268, "step": 158500 }, { "epoch": 2.1143565610377144, "grad_norm": 11.0975923538208, "learning_rate": 5.823462434600694e-05, "loss": 1.142, "step": 158600 }, { "epoch": 2.1156896988441694, "grad_norm": 8.003459930419922, "learning_rate": 5.8231689464597257e-05, "loss": 0.9545, "step": 158700 }, { "epoch": 2.1170228366506247, "grad_norm": 12.843964576721191, "learning_rate": 5.822875221971953e-05, "loss": 0.9674, "step": 158800 }, { "epoch": 2.1183559744570797, "grad_norm": 33.54901885986328, "learning_rate": 5.822581261161965e-05, "loss": 1.0362, "step": 158900 }, { "epoch": 2.1196891122635346, "grad_norm": 14.931583404541016, "learning_rate": 5.8222870640543715e-05, "loss": 0.9855, "step": 159000 }, { "epoch": 2.1210222500699896, "grad_norm": 10.681347846984863, "learning_rate": 5.8219955761770785e-05, "loss": 1.0631, "step": 159100 }, { "epoch": 2.122355387876445, "grad_norm": 21.93189239501953, "learning_rate": 5.821700908910542e-05, "loss": 1.0557, "step": 159200 }, { "epoch": 2.1236885256829, "grad_norm": 13.425495147705078, "learning_rate": 5.821406005420101e-05, "loss": 0.9338, "step": 159300 }, { "epoch": 2.125021663489355, "grad_norm": 12.14931583404541, "learning_rate": 5.821110865730443e-05, "loss": 1.0227, "step": 159400 }, { "epoch": 2.12635480129581, "grad_norm": 15.503982543945312, "learning_rate": 5.820815489866275e-05, "loss": 0.9368, "step": 159500 }, { "epoch": 2.127687939102265, "grad_norm": 3.8319194316864014, "learning_rate": 5.820519877852327e-05, "loss": 1.0598, "step": 159600 }, { "epoch": 2.12902107690872, "grad_norm": 17.046236038208008, "learning_rate": 5.820224029713347e-05, "loss": 1.0342, "step": 159700 }, { "epoch": 2.130354214715175, "grad_norm": 16.40085220336914, "learning_rate": 5.8199279454741e-05, "loss": 0.9395, "step": 159800 }, { "epoch": 2.13168735252163, "grad_norm": 5.1409478187561035, "learning_rate": 5.819631625159375e-05, "loss": 0.9393, "step": 159900 }, { "epoch": 2.1330204903280854, 
"grad_norm": 22.719852447509766, "learning_rate": 5.819335068793979e-05, "loss": 1.108, "step": 160000 }, { "epoch": 2.1343536281345403, "grad_norm": 10.421948432922363, "learning_rate": 5.819038276402737e-05, "loss": 0.8802, "step": 160100 }, { "epoch": 2.1356867659409953, "grad_norm": 12.601269721984863, "learning_rate": 5.818741248010498e-05, "loss": 1.0493, "step": 160200 }, { "epoch": 2.13701990374745, "grad_norm": 102.3021469116211, "learning_rate": 5.8184439836421276e-05, "loss": 0.9726, "step": 160300 }, { "epoch": 2.1383530415539056, "grad_norm": 4.574350833892822, "learning_rate": 5.8181464833225104e-05, "loss": 0.969, "step": 160400 }, { "epoch": 2.1396861793603605, "grad_norm": 4.8571248054504395, "learning_rate": 5.8178487470765526e-05, "loss": 0.9698, "step": 160500 }, { "epoch": 2.1410193171668155, "grad_norm": 8.088032722473145, "learning_rate": 5.817550774929182e-05, "loss": 0.9963, "step": 160600 }, { "epoch": 2.1423524549732704, "grad_norm": 9.211963653564453, "learning_rate": 5.817252566905341e-05, "loss": 0.9159, "step": 160700 }, { "epoch": 2.143685592779726, "grad_norm": 7.379790306091309, "learning_rate": 5.816954123029997e-05, "loss": 0.9026, "step": 160800 }, { "epoch": 2.1450187305861808, "grad_norm": 7.304496765136719, "learning_rate": 5.816655443328133e-05, "loss": 1.0329, "step": 160900 }, { "epoch": 2.1463518683926357, "grad_norm": 12.241649627685547, "learning_rate": 5.8163565278247536e-05, "loss": 0.9617, "step": 161000 }, { "epoch": 2.1476850061990906, "grad_norm": 7.121824741363525, "learning_rate": 5.8160573765448834e-05, "loss": 0.9517, "step": 161100 }, { "epoch": 2.149018144005546, "grad_norm": 17.649913787841797, "learning_rate": 5.815757989513568e-05, "loss": 0.9036, "step": 161200 }, { "epoch": 2.150351281812001, "grad_norm": 12.227380752563477, "learning_rate": 5.815458366755869e-05, "loss": 1.0107, "step": 161300 }, { "epoch": 2.151684419618456, "grad_norm": 3.3361928462982178, "learning_rate": 5.8151585082968706e-05, 
"loss": 0.9772, "step": 161400 }, { "epoch": 2.153017557424911, "grad_norm": 14.507036209106445, "learning_rate": 5.8148584141616764e-05, "loss": 0.9124, "step": 161500 }, { "epoch": 2.1543506952313662, "grad_norm": 42.036014556884766, "learning_rate": 5.814558084375408e-05, "loss": 1.0197, "step": 161600 }, { "epoch": 2.155683833037821, "grad_norm": 15.273947715759277, "learning_rate": 5.81425751896321e-05, "loss": 1.0235, "step": 161700 }, { "epoch": 2.157016970844276, "grad_norm": 22.658308029174805, "learning_rate": 5.8139567179502434e-05, "loss": 0.9648, "step": 161800 }, { "epoch": 2.158350108650731, "grad_norm": 8.385843276977539, "learning_rate": 5.813655681361691e-05, "loss": 0.9794, "step": 161900 }, { "epoch": 2.1596832464571865, "grad_norm": 6.93516731262207, "learning_rate": 5.813354409222754e-05, "loss": 0.9399, "step": 162000 }, { "epoch": 2.1610163842636414, "grad_norm": 15.795055389404297, "learning_rate": 5.8130529015586546e-05, "loss": 0.9168, "step": 162100 }, { "epoch": 2.1623495220700963, "grad_norm": 15.3910551071167, "learning_rate": 5.812751158394634e-05, "loss": 0.8715, "step": 162200 }, { "epoch": 2.1636826598765513, "grad_norm": 5.072242259979248, "learning_rate": 5.8124491797559526e-05, "loss": 0.9398, "step": 162300 }, { "epoch": 2.1650157976830067, "grad_norm": 10.121647834777832, "learning_rate": 5.812146965667891e-05, "loss": 0.8529, "step": 162400 }, { "epoch": 2.1663489354894616, "grad_norm": 9.261104583740234, "learning_rate": 5.811844516155751e-05, "loss": 0.93, "step": 162500 }, { "epoch": 2.1676820732959166, "grad_norm": 4.156920909881592, "learning_rate": 5.811541831244851e-05, "loss": 1.0168, "step": 162600 }, { "epoch": 2.1690152111023715, "grad_norm": 9.814631462097168, "learning_rate": 5.811238910960532e-05, "loss": 0.9536, "step": 162700 }, { "epoch": 2.170348348908827, "grad_norm": 14.621956825256348, "learning_rate": 5.8109357553281534e-05, "loss": 0.8421, "step": 162800 }, { "epoch": 2.171681486715282, "grad_norm": 
7.21920919418335, "learning_rate": 5.8106323643730945e-05, "loss": 0.9029, "step": 162900 }, { "epoch": 2.1730146245217368, "grad_norm": 8.920235633850098, "learning_rate": 5.810328738120753e-05, "loss": 0.899, "step": 163000 }, { "epoch": 2.1743477623281917, "grad_norm": 15.810770034790039, "learning_rate": 5.810024876596549e-05, "loss": 1.0203, "step": 163100 }, { "epoch": 2.175680900134647, "grad_norm": 8.334254264831543, "learning_rate": 5.809720779825921e-05, "loss": 0.9182, "step": 163200 }, { "epoch": 2.177014037941102, "grad_norm": 34.5583610534668, "learning_rate": 5.809416447834325e-05, "loss": 0.9271, "step": 163300 }, { "epoch": 2.178347175747557, "grad_norm": 12.255109786987305, "learning_rate": 5.809111880647241e-05, "loss": 0.9852, "step": 163400 }, { "epoch": 2.179680313554012, "grad_norm": 8.126191139221191, "learning_rate": 5.8088070782901645e-05, "loss": 0.9205, "step": 163500 }, { "epoch": 2.1810134513604673, "grad_norm": 15.622180938720703, "learning_rate": 5.808502040788614e-05, "loss": 0.9829, "step": 163600 }, { "epoch": 2.1823465891669223, "grad_norm": 8.54246997833252, "learning_rate": 5.808196768168126e-05, "loss": 0.9611, "step": 163700 }, { "epoch": 2.183679726973377, "grad_norm": 10.089151382446289, "learning_rate": 5.807891260454256e-05, "loss": 0.9666, "step": 163800 }, { "epoch": 2.185012864779832, "grad_norm": 8.295586585998535, "learning_rate": 5.807585517672582e-05, "loss": 1.0575, "step": 163900 }, { "epoch": 2.1863460025862875, "grad_norm": 6.542031764984131, "learning_rate": 5.807279539848698e-05, "loss": 0.9617, "step": 164000 }, { "epoch": 2.1876791403927425, "grad_norm": 4.1980719566345215, "learning_rate": 5.806973327008221e-05, "loss": 0.908, "step": 164100 }, { "epoch": 2.1890122781991974, "grad_norm": 14.442049026489258, "learning_rate": 5.8066668791767846e-05, "loss": 0.9478, "step": 164200 }, { "epoch": 2.1903454160056524, "grad_norm": 3.580416440963745, "learning_rate": 5.8063632643710066e-05, "loss": 0.8279, "step": 
164300 }, { "epoch": 2.1916785538121077, "grad_norm": 3.3912909030914307, "learning_rate": 5.8060563489839055e-05, "loss": 0.9326, "step": 164400 }, { "epoch": 2.1930116916185627, "grad_norm": 4.0787434577941895, "learning_rate": 5.805749198682614e-05, "loss": 0.9419, "step": 164500 }, { "epoch": 2.1943448294250176, "grad_norm": 11.130158424377441, "learning_rate": 5.805441813492844e-05, "loss": 1.0037, "step": 164600 }, { "epoch": 2.1956779672314726, "grad_norm": 32.47975158691406, "learning_rate": 5.805134193440328e-05, "loss": 0.9591, "step": 164700 }, { "epoch": 2.197011105037928, "grad_norm": 9.497849464416504, "learning_rate": 5.8048263385508205e-05, "loss": 0.9021, "step": 164800 }, { "epoch": 2.198344242844383, "grad_norm": 15.156950950622559, "learning_rate": 5.804518248850093e-05, "loss": 0.9677, "step": 164900 }, { "epoch": 2.199677380650838, "grad_norm": 22.35879135131836, "learning_rate": 5.804209924363939e-05, "loss": 0.9568, "step": 165000 }, { "epoch": 2.201010518457293, "grad_norm": 7.96636438369751, "learning_rate": 5.80390136511817e-05, "loss": 0.9378, "step": 165100 }, { "epoch": 2.202343656263748, "grad_norm": 12.374824523925781, "learning_rate": 5.8035925711386156e-05, "loss": 0.9778, "step": 165200 }, { "epoch": 2.203676794070203, "grad_norm": 35.804019927978516, "learning_rate": 5.803283542451131e-05, "loss": 0.9148, "step": 165300 }, { "epoch": 2.205009931876658, "grad_norm": 9.50378704071045, "learning_rate": 5.802974279081583e-05, "loss": 0.9625, "step": 165400 }, { "epoch": 2.206343069683113, "grad_norm": 11.307624816894531, "learning_rate": 5.802664781055866e-05, "loss": 0.986, "step": 165500 }, { "epoch": 2.2076762074895684, "grad_norm": 5.020549774169922, "learning_rate": 5.802355048399887e-05, "loss": 1.0779, "step": 165600 }, { "epoch": 2.2090093452960233, "grad_norm": 7.414650917053223, "learning_rate": 5.802045081139577e-05, "loss": 0.9563, "step": 165700 }, { "epoch": 2.2103424831024783, "grad_norm": 7.298057556152344, 
"learning_rate": 5.801734879300886e-05, "loss": 0.9962, "step": 165800 }, { "epoch": 2.211675620908933, "grad_norm": 5.790229320526123, "learning_rate": 5.8014244429097836e-05, "loss": 0.9239, "step": 165900 }, { "epoch": 2.2130087587153886, "grad_norm": 15.781787872314453, "learning_rate": 5.801113771992257e-05, "loss": 0.9092, "step": 166000 }, { "epoch": 2.2143418965218435, "grad_norm": 15.884072303771973, "learning_rate": 5.8008028665743144e-05, "loss": 0.917, "step": 166100 }, { "epoch": 2.2156750343282985, "grad_norm": 5.907567024230957, "learning_rate": 5.8004917266819855e-05, "loss": 0.8738, "step": 166200 }, { "epoch": 2.2170081721347534, "grad_norm": 12.873859405517578, "learning_rate": 5.8001803523413174e-05, "loss": 0.8613, "step": 166300 }, { "epoch": 2.218341309941209, "grad_norm": 19.79654312133789, "learning_rate": 5.7998687435783776e-05, "loss": 0.9786, "step": 166400 }, { "epoch": 2.2196744477476638, "grad_norm": 6.022929668426514, "learning_rate": 5.7995569004192514e-05, "loss": 0.9145, "step": 166500 }, { "epoch": 2.2210075855541187, "grad_norm": 19.678409576416016, "learning_rate": 5.7992448228900464e-05, "loss": 0.9556, "step": 166600 }, { "epoch": 2.2223407233605736, "grad_norm": 14.083244323730469, "learning_rate": 5.79893251101689e-05, "loss": 0.8623, "step": 166700 }, { "epoch": 2.223673861167029, "grad_norm": 4.506038665771484, "learning_rate": 5.798619964825925e-05, "loss": 0.9595, "step": 166800 }, { "epoch": 2.225006998973484, "grad_norm": 2.7216246128082275, "learning_rate": 5.79830718434332e-05, "loss": 0.9196, "step": 166900 }, { "epoch": 2.226340136779939, "grad_norm": 18.87946891784668, "learning_rate": 5.797994169595258e-05, "loss": 0.8717, "step": 167000 }, { "epoch": 2.227673274586394, "grad_norm": 7.042524814605713, "learning_rate": 5.797680920607945e-05, "loss": 0.8836, "step": 167100 }, { "epoch": 2.2290064123928492, "grad_norm": 15.471376419067383, "learning_rate": 5.7973674374076034e-05, "loss": 0.9542, "step": 167200 }, { 
"epoch": 2.230339550199304, "grad_norm": 11.673738479614258, "learning_rate": 5.7970537200204787e-05, "loss": 0.9572, "step": 167300 }, { "epoch": 2.231672688005759, "grad_norm": 8.486309051513672, "learning_rate": 5.796742909147317e-05, "loss": 0.9942, "step": 167400 }, { "epoch": 2.233005825812214, "grad_norm": 9.688340187072754, "learning_rate": 5.7964287258066464e-05, "loss": 0.9607, "step": 167500 }, { "epoch": 2.2343389636186695, "grad_norm": 5.805440425872803, "learning_rate": 5.7961143083577785e-05, "loss": 0.9672, "step": 167600 }, { "epoch": 2.2356721014251244, "grad_norm": 7.542809009552002, "learning_rate": 5.795799656827034e-05, "loss": 0.9639, "step": 167700 }, { "epoch": 2.2370052392315793, "grad_norm": 11.9628324508667, "learning_rate": 5.795484771240756e-05, "loss": 0.8885, "step": 167800 }, { "epoch": 2.2383383770380343, "grad_norm": 11.74487018585205, "learning_rate": 5.795169651625305e-05, "loss": 0.9788, "step": 167900 }, { "epoch": 2.2396715148444897, "grad_norm": 10.399934768676758, "learning_rate": 5.7948542980070616e-05, "loss": 0.8488, "step": 168000 }, { "epoch": 2.2410046526509446, "grad_norm": 16.293439865112305, "learning_rate": 5.794538710412426e-05, "loss": 0.9053, "step": 168100 }, { "epoch": 2.2423377904573996, "grad_norm": 17.100461959838867, "learning_rate": 5.79422288886782e-05, "loss": 0.8595, "step": 168200 }, { "epoch": 2.2436709282638545, "grad_norm": 17.357276916503906, "learning_rate": 5.79390683339968e-05, "loss": 0.915, "step": 168300 }, { "epoch": 2.24500406607031, "grad_norm": 4.6226301193237305, "learning_rate": 5.793590544034469e-05, "loss": 0.8813, "step": 168400 }, { "epoch": 2.246337203876765, "grad_norm": 11.38253402709961, "learning_rate": 5.793274020798663e-05, "loss": 0.8984, "step": 168500 }, { "epoch": 2.2476703416832198, "grad_norm": 18.7725830078125, "learning_rate": 5.7929572637187614e-05, "loss": 0.8917, "step": 168600 }, { "epoch": 2.2490034794896747, "grad_norm": 7.442286968231201, "learning_rate": 
5.7926402728212806e-05, "loss": 0.9818, "step": 168700 }, { "epoch": 2.2503366172961297, "grad_norm": 6.745561599731445, "learning_rate": 5.792323048132761e-05, "loss": 0.8574, "step": 168800 }, { "epoch": 2.251669755102585, "grad_norm": 10.45546817779541, "learning_rate": 5.7920055896797574e-05, "loss": 0.9108, "step": 168900 }, { "epoch": 2.25300289290904, "grad_norm": 3.807060718536377, "learning_rate": 5.7916878974888465e-05, "loss": 0.976, "step": 169000 }, { "epoch": 2.254336030715495, "grad_norm": 6.936534404754639, "learning_rate": 5.791369971586626e-05, "loss": 0.8317, "step": 169100 }, { "epoch": 2.2556691685219503, "grad_norm": 8.776269912719727, "learning_rate": 5.79105181199971e-05, "loss": 0.8976, "step": 169200 }, { "epoch": 2.2570023063284053, "grad_norm": 5.5864129066467285, "learning_rate": 5.790733418754734e-05, "loss": 0.9475, "step": 169300 }, { "epoch": 2.25833544413486, "grad_norm": 7.267307281494141, "learning_rate": 5.790414791878354e-05, "loss": 0.9433, "step": 169400 }, { "epoch": 2.259668581941315, "grad_norm": 7.154885292053223, "learning_rate": 5.790095931397244e-05, "loss": 0.9654, "step": 169500 }, { "epoch": 2.26100171974777, "grad_norm": 10.87378978729248, "learning_rate": 5.7897768373380965e-05, "loss": 0.9617, "step": 169600 }, { "epoch": 2.2623348575542255, "grad_norm": 13.753266334533691, "learning_rate": 5.789457509727628e-05, "loss": 0.8666, "step": 169700 }, { "epoch": 2.2636679953606804, "grad_norm": 8.156198501586914, "learning_rate": 5.789137948592569e-05, "loss": 0.8835, "step": 169800 }, { "epoch": 2.2650011331671354, "grad_norm": 5.669678211212158, "learning_rate": 5.788818153959672e-05, "loss": 0.9171, "step": 169900 }, { "epoch": 2.2663342709735907, "grad_norm": 7.135474681854248, "learning_rate": 5.7885013272923456e-05, "loss": 0.9944, "step": 170000 }, { "epoch": 2.2676674087800457, "grad_norm": 19.212486267089844, "learning_rate": 5.7881810680784225e-05, "loss": 0.8906, "step": 170100 }, { "epoch": 
2.2690005465865006, "grad_norm": 6.248098373413086, "learning_rate": 5.787860575446769e-05, "loss": 0.9016, "step": 170200 }, { "epoch": 2.2703336843929556, "grad_norm": 16.999919891357422, "learning_rate": 5.787539849424216e-05, "loss": 0.9, "step": 170300 }, { "epoch": 2.2716668221994105, "grad_norm": 6.524509906768799, "learning_rate": 5.787218890037614e-05, "loss": 0.9638, "step": 170400 }, { "epoch": 2.272999960005866, "grad_norm": 74.42312622070312, "learning_rate": 5.7868976973138323e-05, "loss": 0.9309, "step": 170500 }, { "epoch": 2.274333097812321, "grad_norm": 7.396635055541992, "learning_rate": 5.786576271279762e-05, "loss": 1.0199, "step": 170600 }, { "epoch": 2.275666235618776, "grad_norm": 9.899057388305664, "learning_rate": 5.786254611962308e-05, "loss": 0.9296, "step": 170700 }, { "epoch": 2.276999373425231, "grad_norm": 26.35314178466797, "learning_rate": 5.785932719388403e-05, "loss": 0.8841, "step": 170800 }, { "epoch": 2.278332511231686, "grad_norm": 272.8354187011719, "learning_rate": 5.785610593584992e-05, "loss": 0.8914, "step": 170900 }, { "epoch": 2.279665649038141, "grad_norm": 25.87502098083496, "learning_rate": 5.785288234579042e-05, "loss": 0.8407, "step": 171000 }, { "epoch": 2.280998786844596, "grad_norm": 21.65571403503418, "learning_rate": 5.7849688694734875e-05, "loss": 1.0472, "step": 171100 }, { "epoch": 2.282331924651051, "grad_norm": 9.135978698730469, "learning_rate": 5.7846460464747943e-05, "loss": 0.9487, "step": 171200 }, { "epoch": 2.2836650624575063, "grad_norm": 37.57064437866211, "learning_rate": 5.784322990354313e-05, "loss": 0.9194, "step": 171300 }, { "epoch": 2.2849982002639613, "grad_norm": 5.914510726928711, "learning_rate": 5.7839997011390866e-05, "loss": 0.9441, "step": 171400 }, { "epoch": 2.286331338070416, "grad_norm": 10.315113067626953, "learning_rate": 5.783676178856182e-05, "loss": 0.959, "step": 171500 }, { "epoch": 2.2876644758768716, "grad_norm": 43.05485916137695, "learning_rate": 
5.783352423532682e-05, "loss": 0.9771, "step": 171600 }, { "epoch": 2.2889976136833265, "grad_norm": 7.964520454406738, "learning_rate": 5.783028435195691e-05, "loss": 0.9603, "step": 171700 }, { "epoch": 2.2903307514897815, "grad_norm": 3.4938788414001465, "learning_rate": 5.782704213872333e-05, "loss": 1.0516, "step": 171800 }, { "epoch": 2.2916638892962364, "grad_norm": 5.09851598739624, "learning_rate": 5.7823797595897496e-05, "loss": 1.0136, "step": 171900 }, { "epoch": 2.2929970271026914, "grad_norm": 10.670677185058594, "learning_rate": 5.7820550723751046e-05, "loss": 0.952, "step": 172000 }, { "epoch": 2.2943301649091468, "grad_norm": 3.839142084121704, "learning_rate": 5.781730152255577e-05, "loss": 0.9163, "step": 172100 }, { "epoch": 2.2956633027156017, "grad_norm": 6.482143878936768, "learning_rate": 5.78140499925837e-05, "loss": 0.9325, "step": 172200 }, { "epoch": 2.2969964405220566, "grad_norm": 9.160943984985352, "learning_rate": 5.7810796134107055e-05, "loss": 0.7884, "step": 172300 }, { "epoch": 2.298329578328512, "grad_norm": 5.8831305503845215, "learning_rate": 5.780753994739822e-05, "loss": 0.8737, "step": 172400 }, { "epoch": 2.299662716134967, "grad_norm": 9.377514839172363, "learning_rate": 5.780431402939898e-05, "loss": 0.9191, "step": 172500 }, { "epoch": 2.300995853941422, "grad_norm": 28.921268463134766, "learning_rate": 5.780105321031928e-05, "loss": 0.9423, "step": 172600 }, { "epoch": 2.302328991747877, "grad_norm": 22.5611515045166, "learning_rate": 5.7797790063823035e-05, "loss": 0.8742, "step": 172700 }, { "epoch": 2.303662129554332, "grad_norm": 11.45319652557373, "learning_rate": 5.7794524590183436e-05, "loss": 0.9103, "step": 172800 }, { "epoch": 2.304995267360787, "grad_norm": 8.823529243469238, "learning_rate": 5.779125678967386e-05, "loss": 0.8472, "step": 172900 }, { "epoch": 2.306328405167242, "grad_norm": 6.632887840270996, "learning_rate": 5.778798666256786e-05, "loss": 0.8902, "step": 173000 }, { "epoch": 
2.307661542973697, "grad_norm": 16.690492630004883, "learning_rate": 5.7784714209139225e-05, "loss": 0.8937, "step": 173100 }, { "epoch": 2.3089946807801525, "grad_norm": 19.995990753173828, "learning_rate": 5.778143942966188e-05, "loss": 0.9642, "step": 173200 }, { "epoch": 2.3103278185866074, "grad_norm": 25.97408676147461, "learning_rate": 5.777816232441002e-05, "loss": 1.0111, "step": 173300 }, { "epoch": 2.3116609563930623, "grad_norm": 10.909085273742676, "learning_rate": 5.777488289365797e-05, "loss": 1.2097, "step": 173400 }, { "epoch": 2.3129940941995173, "grad_norm": 9.378416061401367, "learning_rate": 5.7771601137680276e-05, "loss": 0.9148, "step": 173500 }, { "epoch": 2.3143272320059722, "grad_norm": 23.037906646728516, "learning_rate": 5.776831705675168e-05, "loss": 0.9979, "step": 173600 }, { "epoch": 2.3156603698124276, "grad_norm": 13.380654335021973, "learning_rate": 5.7765030651147114e-05, "loss": 0.9783, "step": 173700 }, { "epoch": 2.3169935076188826, "grad_norm": 6.0310492515563965, "learning_rate": 5.7761741921141695e-05, "loss": 1.0107, "step": 173800 }, { "epoch": 2.3183266454253375, "grad_norm": 11.913926124572754, "learning_rate": 5.775845086701077e-05, "loss": 0.9901, "step": 173900 }, { "epoch": 2.319659783231793, "grad_norm": 8.882743835449219, "learning_rate": 5.775515748902983e-05, "loss": 0.9309, "step": 174000 }, { "epoch": 2.320992921038248, "grad_norm": 5.525155067443848, "learning_rate": 5.77518617874746e-05, "loss": 0.9285, "step": 174100 }, { "epoch": 2.322326058844703, "grad_norm": 7.640848636627197, "learning_rate": 5.774856376262098e-05, "loss": 1.0079, "step": 174200 }, { "epoch": 2.3236591966511577, "grad_norm": 11.67833423614502, "learning_rate": 5.7745263414745085e-05, "loss": 1.0051, "step": 174300 }, { "epoch": 2.3249923344576127, "grad_norm": 73.89511108398438, "learning_rate": 5.774196074412319e-05, "loss": 1.031, "step": 174400 }, { "epoch": 2.326325472264068, "grad_norm": 5.64599084854126, "learning_rate": 
5.7738655751031785e-05, "loss": 0.9309, "step": 174500 }, { "epoch": 2.327658610070523, "grad_norm": 4.290924072265625, "learning_rate": 5.773534843574757e-05, "loss": 1.0014, "step": 174600 }, { "epoch": 2.328991747876978, "grad_norm": 4.89362096786499, "learning_rate": 5.7732038798547405e-05, "loss": 0.9243, "step": 174700 }, { "epoch": 2.3303248856834333, "grad_norm": 3.0671162605285645, "learning_rate": 5.772872683970838e-05, "loss": 0.852, "step": 174800 }, { "epoch": 2.3316580234898883, "grad_norm": 6.40576696395874, "learning_rate": 5.772541255950774e-05, "loss": 0.9266, "step": 174900 }, { "epoch": 2.332991161296343, "grad_norm": 15.741097450256348, "learning_rate": 5.772209595822297e-05, "loss": 0.9677, "step": 175000 }, { "epoch": 2.334324299102798, "grad_norm": 5.106222629547119, "learning_rate": 5.7718777036131706e-05, "loss": 0.8886, "step": 175100 }, { "epoch": 2.335657436909253, "grad_norm": 15.472319602966309, "learning_rate": 5.771545579351181e-05, "loss": 0.9488, "step": 175200 }, { "epoch": 2.3369905747157085, "grad_norm": 4.733688831329346, "learning_rate": 5.7712132230641325e-05, "loss": 0.8788, "step": 175300 }, { "epoch": 2.3383237125221634, "grad_norm": 5.404839515686035, "learning_rate": 5.770880634779848e-05, "loss": 0.8871, "step": 175400 }, { "epoch": 2.3396568503286184, "grad_norm": 13.988819122314453, "learning_rate": 5.770547814526171e-05, "loss": 0.8849, "step": 175500 }, { "epoch": 2.3409899881350738, "grad_norm": 0.7812204957008362, "learning_rate": 5.770214762330966e-05, "loss": 0.9036, "step": 175600 }, { "epoch": 2.3423231259415287, "grad_norm": 24.02560043334961, "learning_rate": 5.769881478222112e-05, "loss": 0.93, "step": 175700 }, { "epoch": 2.3436562637479836, "grad_norm": 6.613696575164795, "learning_rate": 5.769547962227513e-05, "loss": 0.8778, "step": 175800 }, { "epoch": 2.3449894015544386, "grad_norm": 5.254739284515381, "learning_rate": 5.769214214375088e-05, "loss": 0.9547, "step": 175900 }, { "epoch": 
2.3463225393608935, "grad_norm": 18.34488868713379, "learning_rate": 5.7688802346927796e-05, "loss": 0.8984, "step": 176000 }, { "epoch": 2.347655677167349, "grad_norm": 6.824354648590088, "learning_rate": 5.7685460232085455e-05, "loss": 0.9182, "step": 176100 }, { "epoch": 2.348988814973804, "grad_norm": 4.032736778259277, "learning_rate": 5.768211579950365e-05, "loss": 0.9593, "step": 176200 }, { "epoch": 2.350321952780259, "grad_norm": 14.585145950317383, "learning_rate": 5.7678769049462376e-05, "loss": 0.9567, "step": 176300 }, { "epoch": 2.351655090586714, "grad_norm": 48.300437927246094, "learning_rate": 5.767541998224181e-05, "loss": 0.9304, "step": 176400 }, { "epoch": 2.352988228393169, "grad_norm": 9.99682903289795, "learning_rate": 5.767206859812232e-05, "loss": 0.8594, "step": 176500 }, { "epoch": 2.354321366199624, "grad_norm": 4.656745910644531, "learning_rate": 5.766871489738448e-05, "loss": 0.9066, "step": 176600 }, { "epoch": 2.355654504006079, "grad_norm": 7.003809928894043, "learning_rate": 5.7665358880309054e-05, "loss": 0.8933, "step": 176700 }, { "epoch": 2.356987641812534, "grad_norm": 10.976523399353027, "learning_rate": 5.766200054717698e-05, "loss": 1.0209, "step": 176800 }, { "epoch": 2.3583207796189893, "grad_norm": 50.61436462402344, "learning_rate": 5.7658639898269425e-05, "loss": 0.8674, "step": 176900 }, { "epoch": 2.3596539174254443, "grad_norm": 31.055360794067383, "learning_rate": 5.765527693386773e-05, "loss": 0.9213, "step": 177000 }, { "epoch": 2.3609870552318992, "grad_norm": 16.15890121459961, "learning_rate": 5.765191165425342e-05, "loss": 1.0084, "step": 177100 }, { "epoch": 2.3623201930383546, "grad_norm": 10.711358070373535, "learning_rate": 5.764854405970823e-05, "loss": 0.9011, "step": 177200 }, { "epoch": 2.3636533308448096, "grad_norm": 23.23415184020996, "learning_rate": 5.764517415051409e-05, "loss": 1.024, "step": 177300 }, { "epoch": 2.3649864686512645, "grad_norm": 35.27497482299805, "learning_rate": 
5.764180192695312e-05, "loss": 0.8709, "step": 177400 }, { "epoch": 2.3663196064577194, "grad_norm": 7.164013862609863, "learning_rate": 5.763842738930762e-05, "loss": 1.0354, "step": 177500 }, { "epoch": 2.3676527442641744, "grad_norm": 6.799283027648926, "learning_rate": 5.7635050537860104e-05, "loss": 0.9018, "step": 177600 }, { "epoch": 2.3689858820706298, "grad_norm": 24.12723159790039, "learning_rate": 5.763167137289327e-05, "loss": 0.924, "step": 177700 }, { "epoch": 2.3703190198770847, "grad_norm": 10.092991828918457, "learning_rate": 5.762828989469002e-05, "loss": 1.0203, "step": 177800 }, { "epoch": 2.3716521576835397, "grad_norm": 8.144224166870117, "learning_rate": 5.762490610353342e-05, "loss": 0.9971, "step": 177900 }, { "epoch": 2.372985295489995, "grad_norm": 27.670625686645508, "learning_rate": 5.762151999970676e-05, "loss": 0.9592, "step": 178000 }, { "epoch": 2.37431843329645, "grad_norm": 11.842615127563477, "learning_rate": 5.7618131583493534e-05, "loss": 1.018, "step": 178100 }, { "epoch": 2.375651571102905, "grad_norm": 3.787975311279297, "learning_rate": 5.761474085517738e-05, "loss": 0.9381, "step": 178200 }, { "epoch": 2.37698470890936, "grad_norm": 8.721558570861816, "learning_rate": 5.761134781504217e-05, "loss": 0.9517, "step": 178300 }, { "epoch": 2.378317846715815, "grad_norm": 16.85736083984375, "learning_rate": 5.7607952463371966e-05, "loss": 0.9026, "step": 178400 }, { "epoch": 2.37965098452227, "grad_norm": 23.30556869506836, "learning_rate": 5.760458878851997e-05, "loss": 0.8638, "step": 178500 }, { "epoch": 2.380984122328725, "grad_norm": 6.516043663024902, "learning_rate": 5.760118883774095e-05, "loss": 0.9733, "step": 178600 }, { "epoch": 2.38231726013518, "grad_norm": 7.562189102172852, "learning_rate": 5.759778657627741e-05, "loss": 0.8576, "step": 178700 }, { "epoch": 2.3836503979416355, "grad_norm": 87.16138458251953, "learning_rate": 5.7594382004414176e-05, "loss": 0.9611, "step": 178800 }, { "epoch": 2.3849835357480904, 
"grad_norm": 5.620585918426514, "learning_rate": 5.759097512243626e-05, "loss": 1.0059, "step": 178900 }, { "epoch": 2.3863166735545454, "grad_norm": 4.342288494110107, "learning_rate": 5.758756593062889e-05, "loss": 1.027, "step": 179000 }, { "epoch": 2.3876498113610003, "grad_norm": 10.993585586547852, "learning_rate": 5.758415442927745e-05, "loss": 0.9861, "step": 179100 }, { "epoch": 2.3889829491674552, "grad_norm": 14.077459335327148, "learning_rate": 5.7580740618667564e-05, "loss": 1.0132, "step": 179200 }, { "epoch": 2.3903160869739106, "grad_norm": 7.662405490875244, "learning_rate": 5.7577324499085013e-05, "loss": 0.9168, "step": 179300 }, { "epoch": 2.3916492247803656, "grad_norm": 6.470981121063232, "learning_rate": 5.757390607081579e-05, "loss": 0.9499, "step": 179400 }, { "epoch": 2.3929823625868205, "grad_norm": 46.68390655517578, "learning_rate": 5.757048533414606e-05, "loss": 0.878, "step": 179500 }, { "epoch": 2.394315500393276, "grad_norm": 5.4957499504089355, "learning_rate": 5.756706228936221e-05, "loss": 0.8911, "step": 179600 }, { "epoch": 2.395648638199731, "grad_norm": 10.856915473937988, "learning_rate": 5.756363693675082e-05, "loss": 0.9063, "step": 179700 }, { "epoch": 2.396981776006186, "grad_norm": 6.300324440002441, "learning_rate": 5.756020927659861e-05, "loss": 0.9083, "step": 179800 }, { "epoch": 2.3983149138126407, "grad_norm": 29.53473663330078, "learning_rate": 5.755677930919257e-05, "loss": 0.8779, "step": 179900 }, { "epoch": 2.3996480516190957, "grad_norm": 6.499782085418701, "learning_rate": 5.7553347034819836e-05, "loss": 0.9136, "step": 180000 }, { "epoch": 2.400981189425551, "grad_norm": 4.125040531158447, "learning_rate": 5.754991245376774e-05, "loss": 1.05, "step": 180100 }, { "epoch": 2.402314327232006, "grad_norm": 8.500948905944824, "learning_rate": 5.7546475566323816e-05, "loss": 0.9508, "step": 180200 }, { "epoch": 2.403647465038461, "grad_norm": 14.253974914550781, "learning_rate": 5.75430363727758e-05, "loss": 
0.9908, "step": 180300 }, { "epoch": 2.4049806028449163, "grad_norm": 14.704117774963379, "learning_rate": 5.75395948734116e-05, "loss": 0.9444, "step": 180400 }, { "epoch": 2.4063137406513713, "grad_norm": 11.858834266662598, "learning_rate": 5.753615106851931e-05, "loss": 0.9443, "step": 180500 }, { "epoch": 2.407646878457826, "grad_norm": 10.830971717834473, "learning_rate": 5.753273943089858e-05, "loss": 0.9315, "step": 180600 }, { "epoch": 2.408980016264281, "grad_norm": 8.815362930297852, "learning_rate": 5.7529291038863356e-05, "loss": 1.029, "step": 180700 }, { "epoch": 2.410313154070736, "grad_norm": 34.55770492553711, "learning_rate": 5.752584034216267e-05, "loss": 0.8992, "step": 180800 }, { "epoch": 2.4116462918771915, "grad_norm": 39.1789665222168, "learning_rate": 5.752238734108539e-05, "loss": 0.983, "step": 180900 }, { "epoch": 2.4129794296836464, "grad_norm": 3.147153854370117, "learning_rate": 5.7518932035920604e-05, "loss": 0.9819, "step": 181000 }, { "epoch": 2.4143125674901014, "grad_norm": 11.472395896911621, "learning_rate": 5.751547442695757e-05, "loss": 0.9511, "step": 181100 }, { "epoch": 2.4156457052965563, "grad_norm": 52.57477569580078, "learning_rate": 5.751201451448577e-05, "loss": 0.9783, "step": 181200 }, { "epoch": 2.4169788431030117, "grad_norm": 43.54500198364258, "learning_rate": 5.750855229879482e-05, "loss": 0.9624, "step": 181300 }, { "epoch": 2.4183119809094666, "grad_norm": 23.025371551513672, "learning_rate": 5.75050877801746e-05, "loss": 0.9337, "step": 181400 }, { "epoch": 2.4196451187159216, "grad_norm": 8.468856811523438, "learning_rate": 5.7501620958915124e-05, "loss": 1.0184, "step": 181500 }, { "epoch": 2.4209782565223765, "grad_norm": 12.198507308959961, "learning_rate": 5.749815183530664e-05, "loss": 1.0526, "step": 181600 }, { "epoch": 2.422311394328832, "grad_norm": 40.79458999633789, "learning_rate": 5.749468040963956e-05, "loss": 0.9575, "step": 181700 }, { "epoch": 2.423644532135287, "grad_norm": 
3.9143688678741455, "learning_rate": 5.7491206682204516e-05, "loss": 0.9519, "step": 181800 }, { "epoch": 2.424977669941742, "grad_norm": 5.522427558898926, "learning_rate": 5.74877306532923e-05, "loss": 0.9019, "step": 181900 }, { "epoch": 2.4263108077481967, "grad_norm": 3.1926968097686768, "learning_rate": 5.748425232319393e-05, "loss": 0.9801, "step": 182000 }, { "epoch": 2.427643945554652, "grad_norm": 16.030832290649414, "learning_rate": 5.748077169220059e-05, "loss": 0.9141, "step": 182100 }, { "epoch": 2.428977083361107, "grad_norm": 4.7711262702941895, "learning_rate": 5.747728876060368e-05, "loss": 1.0075, "step": 182200 }, { "epoch": 2.430310221167562, "grad_norm": 25.396093368530273, "learning_rate": 5.747380352869475e-05, "loss": 0.9567, "step": 182300 }, { "epoch": 2.431643358974017, "grad_norm": 4.2878007888793945, "learning_rate": 5.747031599676561e-05, "loss": 0.9068, "step": 182400 }, { "epoch": 2.4329764967804723, "grad_norm": 10.356338500976562, "learning_rate": 5.74668261651082e-05, "loss": 0.8256, "step": 182500 }, { "epoch": 2.4343096345869273, "grad_norm": 7.003758430480957, "learning_rate": 5.746333403401469e-05, "loss": 0.9175, "step": 182600 }, { "epoch": 2.4356427723933822, "grad_norm": 9.877434730529785, "learning_rate": 5.745983960377743e-05, "loss": 0.8585, "step": 182700 }, { "epoch": 2.436975910199837, "grad_norm": 9.554931640625, "learning_rate": 5.745634287468895e-05, "loss": 0.9428, "step": 182800 }, { "epoch": 2.4383090480062926, "grad_norm": 9.802769660949707, "learning_rate": 5.7452843847042005e-05, "loss": 0.9615, "step": 182900 }, { "epoch": 2.4396421858127475, "grad_norm": 19.17452621459961, "learning_rate": 5.7449342521129506e-05, "loss": 0.9711, "step": 183000 }, { "epoch": 2.4409753236192024, "grad_norm": 11.708124160766602, "learning_rate": 5.744583889724458e-05, "loss": 0.986, "step": 183100 }, { "epoch": 2.4423084614256574, "grad_norm": 4.075823783874512, "learning_rate": 5.744233297568054e-05, "loss": 0.8978, "step": 
183200 }, { "epoch": 2.4436415992321128, "grad_norm": 9.857738494873047, "learning_rate": 5.743882475673089e-05, "loss": 0.8933, "step": 183300 }, { "epoch": 2.4449747370385677, "grad_norm": 9.565342903137207, "learning_rate": 5.743531424068932e-05, "loss": 0.9241, "step": 183400 }, { "epoch": 2.4463078748450227, "grad_norm": 5.851468086242676, "learning_rate": 5.743180142784972e-05, "loss": 0.7822, "step": 183500 }, { "epoch": 2.4476410126514776, "grad_norm": 3.270627975463867, "learning_rate": 5.742828631850619e-05, "loss": 0.9119, "step": 183600 }, { "epoch": 2.448974150457933, "grad_norm": 3.8789594173431396, "learning_rate": 5.742476891295299e-05, "loss": 0.8863, "step": 183700 }, { "epoch": 2.450307288264388, "grad_norm": 8.83968448638916, "learning_rate": 5.742124921148458e-05, "loss": 0.9108, "step": 183800 }, { "epoch": 2.451640426070843, "grad_norm": 62.249610900878906, "learning_rate": 5.741776244572887e-05, "loss": 0.9214, "step": 183900 }, { "epoch": 2.452973563877298, "grad_norm": 9.843441009521484, "learning_rate": 5.7414238176266025e-05, "loss": 0.9272, "step": 184000 }, { "epoch": 2.454306701683753, "grad_norm": 47.50164031982422, "learning_rate": 5.741071161176957e-05, "loss": 0.8365, "step": 184100 }, { "epoch": 2.455639839490208, "grad_norm": 11.017227172851562, "learning_rate": 5.7407182752534745e-05, "loss": 0.8589, "step": 184200 }, { "epoch": 2.456972977296663, "grad_norm": 9.370798110961914, "learning_rate": 5.7403651598856976e-05, "loss": 1.0268, "step": 184300 }, { "epoch": 2.458306115103118, "grad_norm": 17.906314849853516, "learning_rate": 5.7400118151031886e-05, "loss": 0.8837, "step": 184400 }, { "epoch": 2.4596392529095734, "grad_norm": 9.301961898803711, "learning_rate": 5.739658240935527e-05, "loss": 0.8842, "step": 184500 }, { "epoch": 2.4609723907160284, "grad_norm": 13.017679214477539, "learning_rate": 5.739304437412314e-05, "loss": 0.8901, "step": 184600 }, { "epoch": 2.4623055285224833, "grad_norm": 14.942523002624512, 
"learning_rate": 5.738950404563169e-05, "loss": 0.8442, "step": 184700 }, { "epoch": 2.4636386663289382, "grad_norm": 5.444016456604004, "learning_rate": 5.7385961424177284e-05, "loss": 0.8511, "step": 184800 }, { "epoch": 2.4649718041353936, "grad_norm": 4.52464485168457, "learning_rate": 5.738241651005653e-05, "loss": 0.9072, "step": 184900 }, { "epoch": 2.4663049419418486, "grad_norm": 3.391129732131958, "learning_rate": 5.737886930356618e-05, "loss": 0.8549, "step": 185000 }, { "epoch": 2.4676380797483035, "grad_norm": 4.507004261016846, "learning_rate": 5.737531980500319e-05, "loss": 0.95, "step": 185100 }, { "epoch": 2.4689712175547585, "grad_norm": 55.78239822387695, "learning_rate": 5.7371768014664726e-05, "loss": 0.9072, "step": 185200 }, { "epoch": 2.470304355361214, "grad_norm": 3.4618232250213623, "learning_rate": 5.7368213932848125e-05, "loss": 0.9067, "step": 185300 }, { "epoch": 2.471637493167669, "grad_norm": 8.544364929199219, "learning_rate": 5.736465755985093e-05, "loss": 0.937, "step": 185400 }, { "epoch": 2.4729706309741237, "grad_norm": 19.629314422607422, "learning_rate": 5.736109889597086e-05, "loss": 0.8759, "step": 185500 }, { "epoch": 2.4743037687805787, "grad_norm": 11.136116981506348, "learning_rate": 5.735753794150585e-05, "loss": 0.851, "step": 185600 }, { "epoch": 2.475636906587034, "grad_norm": 5.427245140075684, "learning_rate": 5.7353974696754e-05, "loss": 0.9427, "step": 185700 }, { "epoch": 2.476970044393489, "grad_norm": 5.895517349243164, "learning_rate": 5.73504091620136e-05, "loss": 0.9171, "step": 185800 }, { "epoch": 2.478303182199944, "grad_norm": 9.664649963378906, "learning_rate": 5.7346841337583175e-05, "loss": 0.8742, "step": 185900 }, { "epoch": 2.479636320006399, "grad_norm": 16.956707000732422, "learning_rate": 5.734327122376139e-05, "loss": 0.904, "step": 186000 }, { "epoch": 2.4809694578128543, "grad_norm": 12.956076622009277, "learning_rate": 5.733969882084714e-05, "loss": 0.8615, "step": 186100 }, { "epoch": 
2.482302595619309, "grad_norm": 1.7504470348358154, "learning_rate": 5.733612412913948e-05, "loss": 0.9675, "step": 186200 }, { "epoch": 2.483635733425764, "grad_norm": 22.863632202148438, "learning_rate": 5.733254714893768e-05, "loss": 0.928, "step": 186300 }, { "epoch": 2.484968871232219, "grad_norm": 15.970070838928223, "learning_rate": 5.7328967880541194e-05, "loss": 0.862, "step": 186400 }, { "epoch": 2.4863020090386745, "grad_norm": 4.626001358032227, "learning_rate": 5.732538632424967e-05, "loss": 0.9033, "step": 186500 }, { "epoch": 2.4876351468451294, "grad_norm": 10.520577430725098, "learning_rate": 5.732180248036293e-05, "loss": 0.9139, "step": 186600 }, { "epoch": 2.4889682846515844, "grad_norm": 7.02032995223999, "learning_rate": 5.731821634918101e-05, "loss": 0.925, "step": 186700 }, { "epoch": 2.4903014224580393, "grad_norm": 43.98715591430664, "learning_rate": 5.7314627931004144e-05, "loss": 0.8648, "step": 186800 }, { "epoch": 2.4916345602644947, "grad_norm": 11.59911060333252, "learning_rate": 5.731103722613272e-05, "loss": 0.9138, "step": 186900 }, { "epoch": 2.4929676980709496, "grad_norm": 6.968249797821045, "learning_rate": 5.7307444234867356e-05, "loss": 0.9557, "step": 187000 }, { "epoch": 2.4943008358774046, "grad_norm": 4.134568214416504, "learning_rate": 5.730384895750885e-05, "loss": 0.7593, "step": 187100 }, { "epoch": 2.4956339736838595, "grad_norm": 3.858826160430908, "learning_rate": 5.730025139435816e-05, "loss": 0.9569, "step": 187200 }, { "epoch": 2.496967111490315, "grad_norm": 1.2197777032852173, "learning_rate": 5.7296651545716496e-05, "loss": 0.9958, "step": 187300 }, { "epoch": 2.49830024929677, "grad_norm": 7.917428016662598, "learning_rate": 5.729304941188521e-05, "loss": 0.8227, "step": 187400 }, { "epoch": 2.499633387103225, "grad_norm": 13.755780220031738, "learning_rate": 5.728944499316586e-05, "loss": 0.8481, "step": 187500 }, { "epoch": 2.50096652490968, "grad_norm": 11.412001609802246, "learning_rate": 
5.7285838289860194e-05, "loss": 0.9749, "step": 187600 }, { "epoch": 2.502299662716135, "grad_norm": 4.838199615478516, "learning_rate": 5.728222930227016e-05, "loss": 0.8518, "step": 187700 }, { "epoch": 2.50363280052259, "grad_norm": 2.9218389987945557, "learning_rate": 5.727861803069789e-05, "loss": 0.8211, "step": 187800 }, { "epoch": 2.504965938329045, "grad_norm": 3.5195398330688477, "learning_rate": 5.727500447544571e-05, "loss": 0.9911, "step": 187900 }, { "epoch": 2.5062990761355, "grad_norm": 10.726255416870117, "learning_rate": 5.727138863681614e-05, "loss": 0.9035, "step": 188000 }, { "epoch": 2.5076322139419553, "grad_norm": 10.27153205871582, "learning_rate": 5.726777051511187e-05, "loss": 0.912, "step": 188100 }, { "epoch": 2.5089653517484103, "grad_norm": 3.68229079246521, "learning_rate": 5.7264150110635814e-05, "loss": 0.846, "step": 188200 }, { "epoch": 2.5102984895548652, "grad_norm": 9.742674827575684, "learning_rate": 5.726052742369105e-05, "loss": 0.9022, "step": 188300 }, { "epoch": 2.51163162736132, "grad_norm": 4.1448140144348145, "learning_rate": 5.725690245458087e-05, "loss": 0.887, "step": 188400 }, { "epoch": 2.5129647651677756, "grad_norm": 17.49011993408203, "learning_rate": 5.725327520360873e-05, "loss": 0.8816, "step": 188500 }, { "epoch": 2.5142979029742305, "grad_norm": 12.284893989562988, "learning_rate": 5.72496456710783e-05, "loss": 0.9218, "step": 188600 }, { "epoch": 2.5156310407806854, "grad_norm": 16.2833309173584, "learning_rate": 5.724601385729344e-05, "loss": 0.9475, "step": 188700 }, { "epoch": 2.5169641785871404, "grad_norm": 2.3089916706085205, "learning_rate": 5.724237976255818e-05, "loss": 0.8957, "step": 188800 }, { "epoch": 2.5182973163935953, "grad_norm": 2.714859962463379, "learning_rate": 5.7238743387176764e-05, "loss": 0.8505, "step": 188900 }, { "epoch": 2.5196304542000507, "grad_norm": 6.900971412658691, "learning_rate": 5.7235104731453616e-05, "loss": 0.8999, "step": 189000 }, { "epoch": 
2.5209635920065057, "grad_norm": 13.64380931854248, "learning_rate": 5.723146379569335e-05, "loss": 0.9839, "step": 189100 }, { "epoch": 2.5222967298129606, "grad_norm": 95.81458282470703, "learning_rate": 5.722782058020078e-05, "loss": 0.8731, "step": 189200 }, { "epoch": 2.523629867619416, "grad_norm": 12.948078155517578, "learning_rate": 5.7224211551512256e-05, "loss": 0.9509, "step": 189300 }, { "epoch": 2.524963005425871, "grad_norm": 9.999978065490723, "learning_rate": 5.722056380025997e-05, "loss": 0.9483, "step": 189400 }, { "epoch": 2.526296143232326, "grad_norm": 14.990596771240234, "learning_rate": 5.7216913770187875e-05, "loss": 0.9853, "step": 189500 }, { "epoch": 2.527629281038781, "grad_norm": 12.680957794189453, "learning_rate": 5.7213261461601565e-05, "loss": 0.8913, "step": 189600 }, { "epoch": 2.5289624188452358, "grad_norm": 42.6542854309082, "learning_rate": 5.720960687480679e-05, "loss": 0.9703, "step": 189700 }, { "epoch": 2.530295556651691, "grad_norm": 12.583633422851562, "learning_rate": 5.72059500101095e-05, "loss": 0.8375, "step": 189800 }, { "epoch": 2.531628694458146, "grad_norm": 14.86093521118164, "learning_rate": 5.720229086781582e-05, "loss": 0.9128, "step": 189900 }, { "epoch": 2.532961832264601, "grad_norm": 4.572847366333008, "learning_rate": 5.719862944823211e-05, "loss": 0.9172, "step": 190000 }, { "epoch": 2.5342949700710564, "grad_norm": 18.75384521484375, "learning_rate": 5.7194965751664886e-05, "loss": 0.86, "step": 190100 }, { "epoch": 2.5356281078775114, "grad_norm": 73.35372924804688, "learning_rate": 5.7191299778420846e-05, "loss": 0.9169, "step": 190200 }, { "epoch": 2.5369612456839663, "grad_norm": 136.47964477539062, "learning_rate": 5.718763152880691e-05, "loss": 0.9309, "step": 190300 }, { "epoch": 2.5382943834904212, "grad_norm": 21.60659408569336, "learning_rate": 5.718396100313016e-05, "loss": 0.9891, "step": 190400 }, { "epoch": 2.539627521296876, "grad_norm": 10.683351516723633, "learning_rate": 
5.7180288201697883e-05, "loss": 0.8994, "step": 190500 }, { "epoch": 2.5409606591033316, "grad_norm": 26.94028091430664, "learning_rate": 5.717661312481756e-05, "loss": 0.9358, "step": 190600 }, { "epoch": 2.5422937969097865, "grad_norm": 10.380480766296387, "learning_rate": 5.7172972557578e-05, "loss": 0.9204, "step": 190700 }, { "epoch": 2.5436269347162415, "grad_norm": 15.505949020385742, "learning_rate": 5.716929295347157e-05, "loss": 0.9438, "step": 190800 }, { "epoch": 2.544960072522697, "grad_norm": 5.301143169403076, "learning_rate": 5.716561107483758e-05, "loss": 0.912, "step": 190900 }, { "epoch": 2.546293210329152, "grad_norm": 30.394474029541016, "learning_rate": 5.716192692198426e-05, "loss": 0.8853, "step": 191000 }, { "epoch": 2.5476263481356067, "grad_norm": 14.054719924926758, "learning_rate": 5.715824049522004e-05, "loss": 0.9327, "step": 191100 }, { "epoch": 2.5489594859420617, "grad_norm": 15.475951194763184, "learning_rate": 5.715455179485354e-05, "loss": 1.0074, "step": 191200 }, { "epoch": 2.5502926237485166, "grad_norm": 20.804224014282227, "learning_rate": 5.715089774218195e-05, "loss": 0.8892, "step": 191300 }, { "epoch": 2.551625761554972, "grad_norm": 5.932125091552734, "learning_rate": 5.714720451826581e-05, "loss": 0.9379, "step": 191400 }, { "epoch": 2.552958899361427, "grad_norm": 12.463311195373535, "learning_rate": 5.714350902167127e-05, "loss": 0.8531, "step": 191500 }, { "epoch": 2.554292037167882, "grad_norm": 12.431017875671387, "learning_rate": 5.713981125270773e-05, "loss": 0.9282, "step": 191600 }, { "epoch": 2.5556251749743373, "grad_norm": 23.940519332885742, "learning_rate": 5.713611121168474e-05, "loss": 0.9221, "step": 191700 }, { "epoch": 2.556958312780792, "grad_norm": 55.973426818847656, "learning_rate": 5.713240889891206e-05, "loss": 0.8831, "step": 191800 }, { "epoch": 2.558291450587247, "grad_norm": 18.58709144592285, "learning_rate": 5.712870431469963e-05, "loss": 0.9527, "step": 191900 }, { "epoch": 
2.559624588393702, "grad_norm": 9.483757972717285, "learning_rate": 5.71249974593576e-05, "loss": 0.9572, "step": 192000 }, { "epoch": 2.560957726200157, "grad_norm": 5.675667762756348, "learning_rate": 5.7121288333196285e-05, "loss": 0.8806, "step": 192100 }, { "epoch": 2.5622908640066124, "grad_norm": 24.018375396728516, "learning_rate": 5.7117576936526204e-05, "loss": 0.9097, "step": 192200 }, { "epoch": 2.5636240018130674, "grad_norm": 5.3436808586120605, "learning_rate": 5.711386326965807e-05, "loss": 1.0493, "step": 192300 }, { "epoch": 2.5649571396195223, "grad_norm": 7.741073131561279, "learning_rate": 5.711014733290276e-05, "loss": 0.9481, "step": 192400 }, { "epoch": 2.5662902774259777, "grad_norm": 7.250056743621826, "learning_rate": 5.7106429126571387e-05, "loss": 0.8936, "step": 192500 }, { "epoch": 2.5676234152324326, "grad_norm": 7.175361156463623, "learning_rate": 5.710270865097521e-05, "loss": 0.8858, "step": 192600 }, { "epoch": 2.5689565530388876, "grad_norm": 3.039489984512329, "learning_rate": 5.70989859064257e-05, "loss": 0.8774, "step": 192700 }, { "epoch": 2.5702896908453425, "grad_norm": 8.51569652557373, "learning_rate": 5.709526089323451e-05, "loss": 0.8694, "step": 192800 }, { "epoch": 2.5716228286517975, "grad_norm": 14.36115550994873, "learning_rate": 5.709153361171349e-05, "loss": 0.9459, "step": 192900 }, { "epoch": 2.572955966458253, "grad_norm": 23.934125900268555, "learning_rate": 5.7087804062174675e-05, "loss": 0.8173, "step": 193000 }, { "epoch": 2.574289104264708, "grad_norm": 9.387582778930664, "learning_rate": 5.70840722449303e-05, "loss": 0.9889, "step": 193100 }, { "epoch": 2.5756222420711627, "grad_norm": 10.772202491760254, "learning_rate": 5.708033816029277e-05, "loss": 0.9329, "step": 193200 }, { "epoch": 2.576955379877618, "grad_norm": 6.6285080909729, "learning_rate": 5.707660180857469e-05, "loss": 0.7779, "step": 193300 }, { "epoch": 2.578288517684073, "grad_norm": 9.640190124511719, "learning_rate": 
5.707286319008886e-05, "loss": 0.8568, "step": 193400 }, { "epoch": 2.579621655490528, "grad_norm": 10.561788558959961, "learning_rate": 5.706912230514827e-05, "loss": 0.9488, "step": 193500 }, { "epoch": 2.580954793296983, "grad_norm": 2.0055835247039795, "learning_rate": 5.706537915406609e-05, "loss": 0.8324, "step": 193600 }, { "epoch": 2.582287931103438, "grad_norm": 5.113940715789795, "learning_rate": 5.706163373715567e-05, "loss": 0.7744, "step": 193700 }, { "epoch": 2.5836210689098933, "grad_norm": 4.869402885437012, "learning_rate": 5.705788605473059e-05, "loss": 0.8104, "step": 193800 }, { "epoch": 2.5849542067163482, "grad_norm": 8.2017240524292, "learning_rate": 5.7054136107104584e-05, "loss": 0.8372, "step": 193900 }, { "epoch": 2.586287344522803, "grad_norm": 4.345639228820801, "learning_rate": 5.705038389459159e-05, "loss": 0.8873, "step": 194000 }, { "epoch": 2.5876204823292586, "grad_norm": 13.927671432495117, "learning_rate": 5.704662941750572e-05, "loss": 0.8659, "step": 194100 }, { "epoch": 2.5889536201357135, "grad_norm": 9.010117530822754, "learning_rate": 5.704287267616129e-05, "loss": 0.8667, "step": 194200 }, { "epoch": 2.5902867579421684, "grad_norm": 4.893955230712891, "learning_rate": 5.703911367087281e-05, "loss": 0.947, "step": 194300 }, { "epoch": 2.5916198957486234, "grad_norm": 12.344923973083496, "learning_rate": 5.703535240195497e-05, "loss": 0.9053, "step": 194400 }, { "epoch": 2.5929530335550783, "grad_norm": 6.643254280090332, "learning_rate": 5.703158886972265e-05, "loss": 0.8005, "step": 194500 }, { "epoch": 2.5942861713615337, "grad_norm": 2.308816432952881, "learning_rate": 5.702782307449091e-05, "loss": 0.9251, "step": 194600 }, { "epoch": 2.5956193091679887, "grad_norm": 4.2440924644470215, "learning_rate": 5.702405501657504e-05, "loss": 0.828, "step": 194700 }, { "epoch": 2.5969524469744436, "grad_norm": 6.746159553527832, "learning_rate": 5.702028469629046e-05, "loss": 0.9254, "step": 194800 }, { "epoch": 
2.598285584780899, "grad_norm": 17.30436897277832, "learning_rate": 5.701651211395282e-05, "loss": 0.8753, "step": 194900 }, { "epoch": 2.599618722587354, "grad_norm": 15.869844436645508, "learning_rate": 5.701273726987794e-05, "loss": 0.8917, "step": 195000 }, { "epoch": 2.600951860393809, "grad_norm": 3.2467222213745117, "learning_rate": 5.700896016438187e-05, "loss": 0.8325, "step": 195100 }, { "epoch": 2.602284998200264, "grad_norm": 21.191665649414062, "learning_rate": 5.700518079778078e-05, "loss": 0.867, "step": 195200 }, { "epoch": 2.6036181360067188, "grad_norm": 11.58130168914795, "learning_rate": 5.700139917039109e-05, "loss": 0.8718, "step": 195300 }, { "epoch": 2.604951273813174, "grad_norm": 16.288503646850586, "learning_rate": 5.699761528252936e-05, "loss": 0.8549, "step": 195400 }, { "epoch": 2.606284411619629, "grad_norm": 9.243151664733887, "learning_rate": 5.69938291345124e-05, "loss": 0.9342, "step": 195500 }, { "epoch": 2.607617549426084, "grad_norm": 6.474632263183594, "learning_rate": 5.6990040726657156e-05, "loss": 0.8034, "step": 195600 }, { "epoch": 2.6089506872325394, "grad_norm": 4.7096686363220215, "learning_rate": 5.698625005928077e-05, "loss": 0.93, "step": 195700 }, { "epoch": 2.6102838250389944, "grad_norm": 11.98466682434082, "learning_rate": 5.69824571327006e-05, "loss": 0.7904, "step": 195800 }, { "epoch": 2.6116169628454493, "grad_norm": 6.689655303955078, "learning_rate": 5.6978661947234184e-05, "loss": 0.8285, "step": 195900 }, { "epoch": 2.6129501006519043, "grad_norm": 30.686601638793945, "learning_rate": 5.697486450319923e-05, "loss": 0.8542, "step": 196000 }, { "epoch": 2.614283238458359, "grad_norm": 11.150355339050293, "learning_rate": 5.6971064800913645e-05, "loss": 0.9826, "step": 196100 }, { "epoch": 2.6156163762648146, "grad_norm": 5.400226593017578, "learning_rate": 5.696726284069554e-05, "loss": 0.878, "step": 196200 }, { "epoch": 2.6169495140712695, "grad_norm": 4.412319660186768, "learning_rate": 
5.69634586228632e-05, "loss": 0.8218, "step": 196300 }, { "epoch": 2.6182826518777245, "grad_norm": 9.198378562927246, "learning_rate": 5.695965214773511e-05, "loss": 0.9185, "step": 196400 }, { "epoch": 2.61961578968418, "grad_norm": 6.266569137573242, "learning_rate": 5.6955843415629904e-05, "loss": 0.9139, "step": 196500 }, { "epoch": 2.620948927490635, "grad_norm": 6.853569984436035, "learning_rate": 5.6952032426866476e-05, "loss": 0.8373, "step": 196600 }, { "epoch": 2.6222820652970897, "grad_norm": 9.772175788879395, "learning_rate": 5.694821918176384e-05, "loss": 0.8222, "step": 196700 }, { "epoch": 2.6236152031035447, "grad_norm": 7.46972131729126, "learning_rate": 5.694440368064125e-05, "loss": 0.7838, "step": 196800 }, { "epoch": 2.6249483409099996, "grad_norm": 17.404817581176758, "learning_rate": 5.6940585923818124e-05, "loss": 0.7999, "step": 196900 }, { "epoch": 2.626281478716455, "grad_norm": 5.3818278312683105, "learning_rate": 5.6936765911614064e-05, "loss": 0.8222, "step": 197000 }, { "epoch": 2.62761461652291, "grad_norm": 10.787400245666504, "learning_rate": 5.6932943644348875e-05, "loss": 0.9409, "step": 197100 }, { "epoch": 2.628947754329365, "grad_norm": 11.109642028808594, "learning_rate": 5.692911912234254e-05, "loss": 0.7591, "step": 197200 }, { "epoch": 2.6302808921358203, "grad_norm": 6.968142032623291, "learning_rate": 5.692529234591525e-05, "loss": 0.8459, "step": 197300 }, { "epoch": 2.6316140299422752, "grad_norm": 7.900240898132324, "learning_rate": 5.692146331538735e-05, "loss": 0.8622, "step": 197400 }, { "epoch": 2.63294716774873, "grad_norm": 11.274792671203613, "learning_rate": 5.691763203107941e-05, "loss": 0.861, "step": 197500 }, { "epoch": 2.634280305555185, "grad_norm": 5.059036731719971, "learning_rate": 5.691379849331216e-05, "loss": 0.87, "step": 197600 }, { "epoch": 2.63561344336164, "grad_norm": 14.788020133972168, "learning_rate": 5.6909962702406556e-05, "loss": 0.8551, "step": 197700 }, { "epoch": 
2.6369465811680954, "grad_norm": 3.590946674346924, "learning_rate": 5.690612465868369e-05, "loss": 0.774, "step": 197800 }, { "epoch": 2.6382797189745504, "grad_norm": 7.267292022705078, "learning_rate": 5.690228436246489e-05, "loss": 0.9274, "step": 197900 }, { "epoch": 2.6396128567810053, "grad_norm": 4.4792256355285645, "learning_rate": 5.6898441814071645e-05, "loss": 0.7653, "step": 198000 }, { "epoch": 2.6409459945874607, "grad_norm": 15.18313217163086, "learning_rate": 5.689459701382563e-05, "loss": 0.8824, "step": 198100 }, { "epoch": 2.6422791323939157, "grad_norm": 5.117770195007324, "learning_rate": 5.689074996204875e-05, "loss": 0.8541, "step": 198200 }, { "epoch": 2.6436122702003706, "grad_norm": 12.651885032653809, "learning_rate": 5.6886900659063034e-05, "loss": 0.9304, "step": 198300 }, { "epoch": 2.6449454080068255, "grad_norm": 5.904500961303711, "learning_rate": 5.6883049105190756e-05, "loss": 0.8124, "step": 198400 }, { "epoch": 2.6462785458132805, "grad_norm": 6.825811862945557, "learning_rate": 5.687919530075436e-05, "loss": 0.9496, "step": 198500 }, { "epoch": 2.647611683619736, "grad_norm": 17.319704055786133, "learning_rate": 5.687533924607645e-05, "loss": 0.8462, "step": 198600 }, { "epoch": 2.648944821426191, "grad_norm": 19.313379287719727, "learning_rate": 5.687148094147986e-05, "loss": 0.8226, "step": 198700 }, { "epoch": 2.6502779592326458, "grad_norm": 11.919974327087402, "learning_rate": 5.68676203872876e-05, "loss": 0.8624, "step": 198800 }, { "epoch": 2.651611097039101, "grad_norm": 4.638187885284424, "learning_rate": 5.686375758382285e-05, "loss": 0.8235, "step": 198900 }, { "epoch": 2.652944234845556, "grad_norm": 4.482738971710205, "learning_rate": 5.6859892531408995e-05, "loss": 0.8366, "step": 199000 }, { "epoch": 2.654277372652011, "grad_norm": 9.947405815124512, "learning_rate": 5.685602523036961e-05, "loss": 0.8762, "step": 199100 }, { "epoch": 2.655610510458466, "grad_norm": 7.966471195220947, "learning_rate": 
5.685215568102844e-05, "loss": 0.9336, "step": 199200 }, { "epoch": 2.656943648264921, "grad_norm": 13.481854438781738, "learning_rate": 5.684832261280907e-05, "loss": 0.8654, "step": 199300 }, { "epoch": 2.6582767860713763, "grad_norm": 12.411259651184082, "learning_rate": 5.684444859031132e-05, "loss": 0.8266, "step": 199400 }, { "epoch": 2.6596099238778312, "grad_norm": 13.128730773925781, "learning_rate": 5.6840572320480956e-05, "loss": 0.8525, "step": 199500 }, { "epoch": 2.660943061684286, "grad_norm": 3.1292834281921387, "learning_rate": 5.683673259993249e-05, "loss": 0.8528, "step": 199600 }, { "epoch": 2.6622761994907416, "grad_norm": 12.152811050415039, "learning_rate": 5.683285185887584e-05, "loss": 0.8668, "step": 199700 }, { "epoch": 2.6636093372971965, "grad_norm": 17.898109436035156, "learning_rate": 5.6828968871457407e-05, "loss": 0.8352, "step": 199800 }, { "epoch": 2.6649424751036515, "grad_norm": 7.841987133026123, "learning_rate": 5.682508363800228e-05, "loss": 0.8425, "step": 199900 }, { "epoch": 2.6662756129101064, "grad_norm": 17.661006927490234, "learning_rate": 5.6821196158835706e-05, "loss": 0.893, "step": 200000 }, { "epoch": 2.6676087507165613, "grad_norm": 5.116596698760986, "learning_rate": 5.6817306434283135e-05, "loss": 0.7908, "step": 200100 }, { "epoch": 2.6689418885230167, "grad_norm": 11.872044563293457, "learning_rate": 5.681341446467021e-05, "loss": 0.8688, "step": 200200 }, { "epoch": 2.6702750263294717, "grad_norm": 10.606745719909668, "learning_rate": 5.6809520250322746e-05, "loss": 0.9533, "step": 200300 }, { "epoch": 2.6716081641359266, "grad_norm": 5.922530651092529, "learning_rate": 5.680562379156676e-05, "loss": 0.8847, "step": 200400 }, { "epoch": 2.672941301942382, "grad_norm": 14.729166984558105, "learning_rate": 5.6801764086863964e-05, "loss": 0.9134, "step": 200500 }, { "epoch": 2.674274439748837, "grad_norm": 15.767383575439453, "learning_rate": 5.679786316270566e-05, "loss": 0.8056, "step": 200600 }, { "epoch": 
2.675607577555292, "grad_norm": 5.26537561416626, "learning_rate": 5.679395999511472e-05, "loss": 0.9095, "step": 200700 }, { "epoch": 2.676940715361747, "grad_norm": 8.43250846862793, "learning_rate": 5.679005458441792e-05, "loss": 0.837, "step": 200800 }, { "epoch": 2.6782738531682018, "grad_norm": 1.8891582489013672, "learning_rate": 5.6786146930942193e-05, "loss": 0.9368, "step": 200900 }, { "epoch": 2.679606990974657, "grad_norm": 11.223736763000488, "learning_rate": 5.678223703501468e-05, "loss": 0.9461, "step": 201000 }, { "epoch": 2.680940128781112, "grad_norm": 5.555866718292236, "learning_rate": 5.677832489696272e-05, "loss": 0.8341, "step": 201100 }, { "epoch": 2.682273266587567, "grad_norm": 10.825348854064941, "learning_rate": 5.67744105171138e-05, "loss": 0.8927, "step": 201200 }, { "epoch": 2.6836064043940224, "grad_norm": 7.722110748291016, "learning_rate": 5.677049389579563e-05, "loss": 0.8457, "step": 201300 }, { "epoch": 2.6849395422004774, "grad_norm": 10.730108261108398, "learning_rate": 5.6766575033336104e-05, "loss": 0.8366, "step": 201400 }, { "epoch": 2.6862726800069323, "grad_norm": 47.1463508605957, "learning_rate": 5.67626539300633e-05, "loss": 0.7677, "step": 201500 }, { "epoch": 2.6876058178133873, "grad_norm": 12.913433074951172, "learning_rate": 5.675873058630546e-05, "loss": 0.8291, "step": 201600 }, { "epoch": 2.688938955619842, "grad_norm": 6.0199761390686035, "learning_rate": 5.675480500239105e-05, "loss": 0.7938, "step": 201700 }, { "epoch": 2.6902720934262976, "grad_norm": 8.101310729980469, "learning_rate": 5.6750877178648704e-05, "loss": 0.8769, "step": 201800 }, { "epoch": 2.6916052312327525, "grad_norm": 3.9120776653289795, "learning_rate": 5.674694711540725e-05, "loss": 0.8469, "step": 201900 }, { "epoch": 2.6929383690392075, "grad_norm": 5.562835693359375, "learning_rate": 5.674301481299569e-05, "loss": 0.7891, "step": 202000 }, { "epoch": 2.694271506845663, "grad_norm": 9.4999418258667, "learning_rate": 
5.673908027174323e-05, "loss": 0.8269, "step": 202100 }, { "epoch": 2.695604644652118, "grad_norm": 11.657959938049316, "learning_rate": 5.673514349197927e-05, "loss": 1.0097, "step": 202200 }, { "epoch": 2.6969377824585727, "grad_norm": 2.436680316925049, "learning_rate": 5.673120447403336e-05, "loss": 0.8001, "step": 202300 }, { "epoch": 2.6982709202650277, "grad_norm": 8.531296730041504, "learning_rate": 5.672726321823528e-05, "loss": 0.8037, "step": 202400 }, { "epoch": 2.6996040580714826, "grad_norm": 12.82028579711914, "learning_rate": 5.672331972491498e-05, "loss": 0.8329, "step": 202500 }, { "epoch": 2.700937195877938, "grad_norm": 19.372262954711914, "learning_rate": 5.671937399440259e-05, "loss": 0.911, "step": 202600 }, { "epoch": 2.702270333684393, "grad_norm": 6.529332637786865, "learning_rate": 5.671542602702843e-05, "loss": 0.9704, "step": 202700 }, { "epoch": 2.703603471490848, "grad_norm": 2.808598279953003, "learning_rate": 5.671147582312302e-05, "loss": 0.8007, "step": 202800 }, { "epoch": 2.7049366092973033, "grad_norm": 8.536545753479004, "learning_rate": 5.670752338301706e-05, "loss": 0.882, "step": 202900 }, { "epoch": 2.7062697471037582, "grad_norm": 5.819326877593994, "learning_rate": 5.670356870704143e-05, "loss": 0.8461, "step": 203000 }, { "epoch": 2.707602884910213, "grad_norm": 2.1790387630462646, "learning_rate": 5.6699611795527196e-05, "loss": 0.9159, "step": 203100 }, { "epoch": 2.708936022716668, "grad_norm": 6.035168170928955, "learning_rate": 5.669565264880564e-05, "loss": 0.8914, "step": 203200 }, { "epoch": 2.710269160523123, "grad_norm": 6.537728786468506, "learning_rate": 5.669169126720819e-05, "loss": 0.8324, "step": 203300 }, { "epoch": 2.7116022983295784, "grad_norm": 5.232849597930908, "learning_rate": 5.6687767298287815e-05, "loss": 0.9049, "step": 203400 }, { "epoch": 2.7129354361360334, "grad_norm": 3.6120872497558594, "learning_rate": 5.668380147027416e-05, "loss": 0.8746, "step": 203500 }, { "epoch": 
2.7142685739424883, "grad_norm": 5.817102909088135, "learning_rate": 5.6679833408376765e-05, "loss": 0.8485, "step": 203600 }, { "epoch": 2.7156017117489437, "grad_norm": 5.197171688079834, "learning_rate": 5.667586311292781e-05, "loss": 0.7769, "step": 203700 }, { "epoch": 2.7169348495553987, "grad_norm": 8.872209548950195, "learning_rate": 5.6671890584259686e-05, "loss": 0.8695, "step": 203800 }, { "epoch": 2.7182679873618536, "grad_norm": 6.881358623504639, "learning_rate": 5.666791582270497e-05, "loss": 0.8464, "step": 203900 }, { "epoch": 2.7196011251683085, "grad_norm": 7.541750907897949, "learning_rate": 5.66639388285964e-05, "loss": 0.8612, "step": 204000 }, { "epoch": 2.7209342629747635, "grad_norm": 9.716181755065918, "learning_rate": 5.6659959602266925e-05, "loss": 0.953, "step": 204100 }, { "epoch": 2.722267400781219, "grad_norm": 4.789022445678711, "learning_rate": 5.665597814404967e-05, "loss": 0.8138, "step": 204200 }, { "epoch": 2.723600538587674, "grad_norm": 5.970549583435059, "learning_rate": 5.6651994454277955e-05, "loss": 0.8646, "step": 204300 }, { "epoch": 2.7249336763941288, "grad_norm": 4.362027168273926, "learning_rate": 5.6648008533285275e-05, "loss": 0.8266, "step": 204400 }, { "epoch": 2.726266814200584, "grad_norm": 6.233395099639893, "learning_rate": 5.664402038140532e-05, "loss": 0.8749, "step": 204500 }, { "epoch": 2.727599952007039, "grad_norm": 3.579021692276001, "learning_rate": 5.6640029998971975e-05, "loss": 0.854, "step": 204600 }, { "epoch": 2.728933089813494, "grad_norm": 12.982717514038086, "learning_rate": 5.6636037386319285e-05, "loss": 0.8403, "step": 204700 }, { "epoch": 2.730266227619949, "grad_norm": 2.9769411087036133, "learning_rate": 5.6632042543781516e-05, "loss": 0.8479, "step": 204800 }, { "epoch": 2.731599365426404, "grad_norm": 7.175867557525635, "learning_rate": 5.6628045471693096e-05, "loss": 0.7711, "step": 204900 }, { "epoch": 2.7329325032328593, "grad_norm": 9.013981819152832, "learning_rate": 
5.6624046170388644e-05, "loss": 0.9328, "step": 205000 }, { "epoch": 2.7342656410393142, "grad_norm": 3.419032096862793, "learning_rate": 5.662004464020298e-05, "loss": 0.9045, "step": 205100 }, { "epoch": 2.735598778845769, "grad_norm": 30.89655113220215, "learning_rate": 5.661604088147109e-05, "loss": 0.8976, "step": 205200 }, { "epoch": 2.7369319166522246, "grad_norm": 5.410957336425781, "learning_rate": 5.661203489452816e-05, "loss": 0.8416, "step": 205300 }, { "epoch": 2.7382650544586795, "grad_norm": 5.939827919006348, "learning_rate": 5.660802667970955e-05, "loss": 0.8627, "step": 205400 }, { "epoch": 2.7395981922651345, "grad_norm": 15.127639770507812, "learning_rate": 5.660401623735083e-05, "loss": 0.8081, "step": 205500 }, { "epoch": 2.7409313300715894, "grad_norm": 37.72124481201172, "learning_rate": 5.660000356778772e-05, "loss": 0.7758, "step": 205600 }, { "epoch": 2.7422644678780443, "grad_norm": 3.7609970569610596, "learning_rate": 5.659598867135618e-05, "loss": 0.9399, "step": 205700 }, { "epoch": 2.7435976056844997, "grad_norm": 6.670751571655273, "learning_rate": 5.6591971548392295e-05, "loss": 0.8297, "step": 205800 }, { "epoch": 2.7449307434909547, "grad_norm": 10.642436981201172, "learning_rate": 5.658795219923238e-05, "loss": 0.7558, "step": 205900 }, { "epoch": 2.7462638812974096, "grad_norm": 7.5895466804504395, "learning_rate": 5.6583930624212915e-05, "loss": 0.8633, "step": 206000 }, { "epoch": 2.747597019103865, "grad_norm": 5.442039966583252, "learning_rate": 5.6579906823670586e-05, "loss": 0.8343, "step": 206100 }, { "epoch": 2.74893015691032, "grad_norm": 73.27313232421875, "learning_rate": 5.657588079794224e-05, "loss": 0.8157, "step": 206200 }, { "epoch": 2.750263294716775, "grad_norm": 5.1653733253479, "learning_rate": 5.657185254736493e-05, "loss": 0.8064, "step": 206300 }, { "epoch": 2.75159643252323, "grad_norm": 16.899606704711914, "learning_rate": 5.656782207227588e-05, "loss": 0.8588, "step": 206400 }, { "epoch": 
2.7529295703296848, "grad_norm": 8.020820617675781, "learning_rate": 5.6563789373012526e-05, "loss": 0.875, "step": 206500 }, { "epoch": 2.75426270813614, "grad_norm": 2.319972276687622, "learning_rate": 5.655975444991246e-05, "loss": 0.7998, "step": 206600 }, { "epoch": 2.755595845942595, "grad_norm": 1.9590176343917847, "learning_rate": 5.655571730331347e-05, "loss": 0.7703, "step": 206700 }, { "epoch": 2.75692898374905, "grad_norm": 4.487187385559082, "learning_rate": 5.6551677933553534e-05, "loss": 0.8907, "step": 206800 }, { "epoch": 2.7582621215555054, "grad_norm": 25.292081832885742, "learning_rate": 5.6547636340970835e-05, "loss": 0.8419, "step": 206900 }, { "epoch": 2.7595952593619604, "grad_norm": 5.304779529571533, "learning_rate": 5.654359252590369e-05, "loss": 0.8453, "step": 207000 }, { "epoch": 2.7609283971684153, "grad_norm": 7.315831184387207, "learning_rate": 5.653954648869066e-05, "loss": 0.8549, "step": 207100 }, { "epoch": 2.7622615349748703, "grad_norm": 5.739187717437744, "learning_rate": 5.653549822967046e-05, "loss": 0.8274, "step": 207200 }, { "epoch": 2.763594672781325, "grad_norm": 6.272340774536133, "learning_rate": 5.653144774918199e-05, "loss": 0.7985, "step": 207300 }, { "epoch": 2.7649278105877806, "grad_norm": 4.031361103057861, "learning_rate": 5.652739504756435e-05, "loss": 0.8302, "step": 207400 }, { "epoch": 2.7662609483942355, "grad_norm": 8.835556030273438, "learning_rate": 5.652334012515682e-05, "loss": 0.8778, "step": 207500 }, { "epoch": 2.7675940862006905, "grad_norm": 5.298642635345459, "learning_rate": 5.651928298229887e-05, "loss": 0.8776, "step": 207600 }, { "epoch": 2.768927224007146, "grad_norm": 21.358318328857422, "learning_rate": 5.6515264223948246e-05, "loss": 0.8023, "step": 207700 }, { "epoch": 2.770260361813601, "grad_norm": 11.732666969299316, "learning_rate": 5.65112026634046e-05, "loss": 0.7795, "step": 207800 }, { "epoch": 2.7715934996200557, "grad_norm": 5.519641876220703, "learning_rate": 
5.650713888342664e-05, "loss": 0.9639, "step": 207900 }, { "epoch": 2.7729266374265107, "grad_norm": 7.528768062591553, "learning_rate": 5.650307288435457e-05, "loss": 0.9154, "step": 208000 }, { "epoch": 2.7742597752329656, "grad_norm": 7.8538079261779785, "learning_rate": 5.649900466652878e-05, "loss": 0.8694, "step": 208100 }, { "epoch": 2.775592913039421, "grad_norm": 3.8365633487701416, "learning_rate": 5.649493423028986e-05, "loss": 0.7915, "step": 208200 }, { "epoch": 2.776926050845876, "grad_norm": 4.975947856903076, "learning_rate": 5.649086157597855e-05, "loss": 0.8799, "step": 208300 }, { "epoch": 2.778259188652331, "grad_norm": 2.4239354133605957, "learning_rate": 5.648678670393583e-05, "loss": 0.7993, "step": 208400 }, { "epoch": 2.7795923264587863, "grad_norm": 7.591439723968506, "learning_rate": 5.648270961450281e-05, "loss": 0.8049, "step": 208500 }, { "epoch": 2.7809254642652412, "grad_norm": 8.706523895263672, "learning_rate": 5.6478630308020835e-05, "loss": 0.8012, "step": 208600 }, { "epoch": 2.782258602071696, "grad_norm": 10.386486053466797, "learning_rate": 5.6474548784831395e-05, "loss": 0.7934, "step": 208700 }, { "epoch": 2.783591739878151, "grad_norm": 5.874204635620117, "learning_rate": 5.647046504527617e-05, "loss": 0.8347, "step": 208800 }, { "epoch": 2.784924877684606, "grad_norm": 4.446353912353516, "learning_rate": 5.6466379089697063e-05, "loss": 0.7812, "step": 208900 }, { "epoch": 2.7862580154910614, "grad_norm": 9.673798561096191, "learning_rate": 5.646229091843613e-05, "loss": 0.8671, "step": 209000 }, { "epoch": 2.7875911532975164, "grad_norm": 5.010857582092285, "learning_rate": 5.6458200531835624e-05, "loss": 0.8098, "step": 209100 }, { "epoch": 2.7889242911039713, "grad_norm": 9.003173828125, "learning_rate": 5.645410793023797e-05, "loss": 0.8919, "step": 209200 }, { "epoch": 2.7902574289104267, "grad_norm": 13.050959587097168, "learning_rate": 5.6450013113985785e-05, "loss": 0.8251, "step": 209300 }, { "epoch": 
2.7915905667168817, "grad_norm": 9.24220085144043, "learning_rate": 5.644591608342188e-05, "loss": 0.8151, "step": 209400 }, { "epoch": 2.7929237045233366, "grad_norm": 4.638727188110352, "learning_rate": 5.644181683888925e-05, "loss": 0.789, "step": 209500 }, { "epoch": 2.7942568423297915, "grad_norm": 131.6635284423828, "learning_rate": 5.6437715380731055e-05, "loss": 0.8137, "step": 209600 }, { "epoch": 2.7955899801362465, "grad_norm": 6.661667823791504, "learning_rate": 5.643361170929068e-05, "loss": 0.814, "step": 209700 }, { "epoch": 2.796923117942702, "grad_norm": 7.877896308898926, "learning_rate": 5.642950582491165e-05, "loss": 0.8253, "step": 209800 }, { "epoch": 2.798256255749157, "grad_norm": 8.62459945678711, "learning_rate": 5.6425397727937705e-05, "loss": 0.8897, "step": 209900 }, { "epoch": 2.7995893935556118, "grad_norm": 2.4673547744750977, "learning_rate": 5.6421287418712766e-05, "loss": 0.9169, "step": 210000 }, { "epoch": 2.800922531362067, "grad_norm": 3.505629062652588, "learning_rate": 5.6417174897580934e-05, "loss": 0.8311, "step": 210100 }, { "epoch": 2.802255669168522, "grad_norm": 4.549429416656494, "learning_rate": 5.641306016488649e-05, "loss": 0.8798, "step": 210200 }, { "epoch": 2.803588806974977, "grad_norm": 2.3036646842956543, "learning_rate": 5.640894322097391e-05, "loss": 0.8029, "step": 210300 }, { "epoch": 2.804921944781432, "grad_norm": 4.878140926361084, "learning_rate": 5.640482406618786e-05, "loss": 0.7689, "step": 210400 }, { "epoch": 2.806255082587887, "grad_norm": 11.128988265991211, "learning_rate": 5.6400702700873166e-05, "loss": 0.9014, "step": 210500 }, { "epoch": 2.8075882203943423, "grad_norm": 5.013498783111572, "learning_rate": 5.6396579125374866e-05, "loss": 0.7666, "step": 210600 }, { "epoch": 2.8089213582007972, "grad_norm": 7.091034889221191, "learning_rate": 5.639245334003817e-05, "loss": 0.7962, "step": 210700 }, { "epoch": 2.810254496007252, "grad_norm": 3.688847541809082, "learning_rate": 
5.638832534520847e-05, "loss": 0.808, "step": 210800 }, { "epoch": 2.8115876338137076, "grad_norm": 4.047696590423584, "learning_rate": 5.638419514123137e-05, "loss": 0.7739, "step": 210900 }, { "epoch": 2.8129207716201625, "grad_norm": 9.582799911499023, "learning_rate": 5.6380062728452606e-05, "loss": 0.8636, "step": 211000 }, { "epoch": 2.8142539094266175, "grad_norm": 13.70408821105957, "learning_rate": 5.637592810721817e-05, "loss": 0.85, "step": 211100 }, { "epoch": 2.8155870472330724, "grad_norm": 11.264286994934082, "learning_rate": 5.637179127787415e-05, "loss": 0.8586, "step": 211200 }, { "epoch": 2.8169201850395273, "grad_norm": 3.2988944053649902, "learning_rate": 5.6367652240766914e-05, "loss": 0.8189, "step": 211300 }, { "epoch": 2.8182533228459827, "grad_norm": 4.912891864776611, "learning_rate": 5.636351099624294e-05, "loss": 0.8934, "step": 211400 }, { "epoch": 2.8195864606524377, "grad_norm": 3.042203903198242, "learning_rate": 5.635936754464894e-05, "loss": 0.8315, "step": 211500 }, { "epoch": 2.8209195984588926, "grad_norm": 16.52178192138672, "learning_rate": 5.6355221886331776e-05, "loss": 0.8595, "step": 211600 }, { "epoch": 2.822252736265348, "grad_norm": 7.692200660705566, "learning_rate": 5.635111551120587e-05, "loss": 0.8339, "step": 211700 }, { "epoch": 2.823585874071803, "grad_norm": 10.226642608642578, "learning_rate": 5.634696546254233e-05, "loss": 0.8504, "step": 211800 }, { "epoch": 2.824919011878258, "grad_norm": 11.993839263916016, "learning_rate": 5.634281320819389e-05, "loss": 0.8441, "step": 211900 }, { "epoch": 2.826252149684713, "grad_norm": 5.153610706329346, "learning_rate": 5.633865874850817e-05, "loss": 0.7928, "step": 212000 }, { "epoch": 2.8275852874911678, "grad_norm": 8.481472969055176, "learning_rate": 5.633450208383297e-05, "loss": 0.8351, "step": 212100 }, { "epoch": 2.828918425297623, "grad_norm": 37.72245407104492, "learning_rate": 5.633034321451625e-05, "loss": 0.7925, "step": 212200 }, { "epoch": 
2.830251563104078, "grad_norm": 8.207730293273926, "learning_rate": 5.6326182140906215e-05, "loss": 0.8181, "step": 212300 }, { "epoch": 2.831584700910533, "grad_norm": 22.759353637695312, "learning_rate": 5.6322018863351196e-05, "loss": 0.8334, "step": 212400 }, { "epoch": 2.8329178387169884, "grad_norm": 20.425418853759766, "learning_rate": 5.6317853382199734e-05, "loss": 0.8464, "step": 212500 }, { "epoch": 2.8342509765234434, "grad_norm": 2.258699893951416, "learning_rate": 5.631368569780054e-05, "loss": 0.7897, "step": 212600 }, { "epoch": 2.8355841143298983, "grad_norm": 7.097095489501953, "learning_rate": 5.630951581050253e-05, "loss": 0.8314, "step": 212700 }, { "epoch": 2.8369172521363533, "grad_norm": 9.145004272460938, "learning_rate": 5.630534372065479e-05, "loss": 0.9003, "step": 212800 }, { "epoch": 2.838250389942808, "grad_norm": 10.19577693939209, "learning_rate": 5.63011694286066e-05, "loss": 0.8125, "step": 212900 }, { "epoch": 2.8395835277492636, "grad_norm": 10.369868278503418, "learning_rate": 5.6296992934707415e-05, "loss": 0.9056, "step": 213000 }, { "epoch": 2.8409166655557185, "grad_norm": 3.281827688217163, "learning_rate": 5.629281423930687e-05, "loss": 0.8289, "step": 213100 }, { "epoch": 2.8422498033621735, "grad_norm": 7.4367804527282715, "learning_rate": 5.628863334275481e-05, "loss": 0.8415, "step": 213200 }, { "epoch": 2.843582941168629, "grad_norm": 34.345699310302734, "learning_rate": 5.628445024540123e-05, "loss": 0.7889, "step": 213300 }, { "epoch": 2.844916078975084, "grad_norm": 5.056133270263672, "learning_rate": 5.628026494759632e-05, "loss": 0.8132, "step": 213400 }, { "epoch": 2.8462492167815387, "grad_norm": 7.028249740600586, "learning_rate": 5.627607744969048e-05, "loss": 0.7688, "step": 213500 }, { "epoch": 2.8475823545879937, "grad_norm": 15.262004852294922, "learning_rate": 5.627188775203426e-05, "loss": 0.7928, "step": 213600 }, { "epoch": 2.8489154923944486, "grad_norm": 14.873286247253418, "learning_rate": 
5.6267695854978424e-05, "loss": 0.9041, "step": 213700 }, { "epoch": 2.850248630200904, "grad_norm": 24.987022399902344, "learning_rate": 5.626354371071907e-05, "loss": 0.7609, "step": 213800 }, { "epoch": 2.851581768007359, "grad_norm": 10.216285705566406, "learning_rate": 5.6259347437902204e-05, "loss": 0.7979, "step": 213900 }, { "epoch": 2.852914905813814, "grad_norm": 15.946219444274902, "learning_rate": 5.625514896673553e-05, "loss": 0.7463, "step": 214000 }, { "epoch": 2.8542480436202693, "grad_norm": 20.84738540649414, "learning_rate": 5.6250948297570554e-05, "loss": 0.8473, "step": 214100 }, { "epoch": 2.8555811814267242, "grad_norm": 77.87663269042969, "learning_rate": 5.624674543075894e-05, "loss": 0.7819, "step": 214200 }, { "epoch": 2.856914319233179, "grad_norm": 135.85105895996094, "learning_rate": 5.6242582428169056e-05, "loss": 0.8205, "step": 214300 }, { "epoch": 2.858247457039634, "grad_norm": 3.376671075820923, "learning_rate": 5.6238375189087575e-05, "loss": 0.7811, "step": 214400 }, { "epoch": 2.859580594846089, "grad_norm": 14.134055137634277, "learning_rate": 5.623416575341204e-05, "loss": 0.8778, "step": 214500 }, { "epoch": 2.8609137326525445, "grad_norm": 2.9110944271087646, "learning_rate": 5.622995412149484e-05, "loss": 0.8645, "step": 214600 }, { "epoch": 2.8622468704589994, "grad_norm": 9.2081880569458, "learning_rate": 5.622574029368856e-05, "loss": 0.9387, "step": 214700 }, { "epoch": 2.8635800082654543, "grad_norm": 6.572634696960449, "learning_rate": 5.622152427034599e-05, "loss": 0.8173, "step": 214800 }, { "epoch": 2.8649131460719097, "grad_norm": 20.833797454833984, "learning_rate": 5.621730605182005e-05, "loss": 0.8007, "step": 214900 }, { "epoch": 2.8662462838783647, "grad_norm": 22.308578491210938, "learning_rate": 5.6213085638463896e-05, "loss": 0.8701, "step": 215000 }, { "epoch": 2.8675794216848196, "grad_norm": 74.98894500732422, "learning_rate": 5.620886303063085e-05, "loss": 0.7296, "step": 215100 }, { "epoch": 
2.8689125594912745, "grad_norm": 18.914562225341797, "learning_rate": 5.62046382286744e-05, "loss": 0.8965, "step": 215200 }, { "epoch": 2.8702456972977295, "grad_norm": 4.38890266418457, "learning_rate": 5.6200411232948245e-05, "loss": 0.8781, "step": 215300 }, { "epoch": 2.871578835104185, "grad_norm": 40.41130828857422, "learning_rate": 5.6196182043806254e-05, "loss": 0.9078, "step": 215400 }, { "epoch": 2.87291197291064, "grad_norm": 51.561431884765625, "learning_rate": 5.6191950661602476e-05, "loss": 0.8666, "step": 215500 }, { "epoch": 2.8742451107170948, "grad_norm": 5.614627838134766, "learning_rate": 5.618771708669115e-05, "loss": 0.7747, "step": 215600 }, { "epoch": 2.87557824852355, "grad_norm": 21.152576446533203, "learning_rate": 5.6183481319426705e-05, "loss": 0.7891, "step": 215700 }, { "epoch": 2.876911386330005, "grad_norm": 9.204086303710938, "learning_rate": 5.6179243360163746e-05, "loss": 0.8083, "step": 215800 }, { "epoch": 2.87824452413646, "grad_norm": 3.6766812801361084, "learning_rate": 5.617500320925705e-05, "loss": 0.849, "step": 215900 }, { "epoch": 2.879577661942915, "grad_norm": 8.327140808105469, "learning_rate": 5.6170760867061595e-05, "loss": 0.8839, "step": 216000 }, { "epoch": 2.88091079974937, "grad_norm": 27.24889373779297, "learning_rate": 5.616655879010778e-05, "loss": 0.8365, "step": 216100 }, { "epoch": 2.8822439375558253, "grad_norm": 5.1772308349609375, "learning_rate": 5.616231208830449e-05, "loss": 0.9171, "step": 216200 }, { "epoch": 2.8835770753622803, "grad_norm": 6.081535816192627, "learning_rate": 5.615806319627491e-05, "loss": 0.8355, "step": 216300 }, { "epoch": 2.884910213168735, "grad_norm": 7.305371284484863, "learning_rate": 5.6153812114374714e-05, "loss": 0.8605, "step": 216400 }, { "epoch": 2.8862433509751906, "grad_norm": 8.67818546295166, "learning_rate": 5.6149558842959824e-05, "loss": 0.8572, "step": 216500 }, { "epoch": 2.8875764887816455, "grad_norm": 3.0363004207611084, "learning_rate": 
5.6145303382386296e-05, "loss": 0.8981, "step": 216600 }, { "epoch": 2.8889096265881005, "grad_norm": 4.218711853027344, "learning_rate": 5.6141045733010386e-05, "loss": 0.8154, "step": 216700 }, { "epoch": 2.8902427643945554, "grad_norm": 2.5066232681274414, "learning_rate": 5.6136785895188525e-05, "loss": 0.8159, "step": 216800 }, { "epoch": 2.8915759022010104, "grad_norm": 41.40628433227539, "learning_rate": 5.6132523869277355e-05, "loss": 0.9359, "step": 216900 }, { "epoch": 2.8929090400074657, "grad_norm": 8.91246509552002, "learning_rate": 5.6128259655633646e-05, "loss": 0.8087, "step": 217000 }, { "epoch": 2.8942421778139207, "grad_norm": 3.7414047718048096, "learning_rate": 5.612399325461441e-05, "loss": 0.8059, "step": 217100 }, { "epoch": 2.8955753156203756, "grad_norm": 7.626338958740234, "learning_rate": 5.611972466657681e-05, "loss": 0.8893, "step": 217200 }, { "epoch": 2.8969084534268306, "grad_norm": 7.278754234313965, "learning_rate": 5.61154538918782e-05, "loss": 0.8442, "step": 217300 }, { "epoch": 2.898241591233286, "grad_norm": 1.9884192943572998, "learning_rate": 5.611118093087611e-05, "loss": 0.9326, "step": 217400 }, { "epoch": 2.899574729039741, "grad_norm": 7.304215431213379, "learning_rate": 5.610690578392825e-05, "loss": 0.7979, "step": 217500 }, { "epoch": 2.900907866846196, "grad_norm": 5.1941022872924805, "learning_rate": 5.6102628451392555e-05, "loss": 0.8301, "step": 217600 }, { "epoch": 2.902241004652651, "grad_norm": 8.924396514892578, "learning_rate": 5.609834893362708e-05, "loss": 0.8429, "step": 217700 }, { "epoch": 2.9035741424591057, "grad_norm": 6.995909214019775, "learning_rate": 5.60940672309901e-05, "loss": 0.8409, "step": 217800 }, { "epoch": 2.904907280265561, "grad_norm": 8.111733436584473, "learning_rate": 5.608978334384006e-05, "loss": 0.8476, "step": 217900 }, { "epoch": 2.906240418072016, "grad_norm": 4.761727333068848, "learning_rate": 5.608549727253562e-05, "loss": 0.8143, "step": 218000 }, { "epoch": 
2.907573555878471, "grad_norm": 17.007247924804688, "learning_rate": 5.6081209017435556e-05, "loss": 0.8649, "step": 218100 }, { "epoch": 2.9089066936849264, "grad_norm": 19.397550582885742, "learning_rate": 5.6076918578898895e-05, "loss": 0.9394, "step": 218200 }, { "epoch": 2.9102398314913813, "grad_norm": 3.3887579441070557, "learning_rate": 5.607262595728481e-05, "loss": 0.8969, "step": 218300 }, { "epoch": 2.9115729692978363, "grad_norm": 10.191397666931152, "learning_rate": 5.6068331152952674e-05, "loss": 0.812, "step": 218400 }, { "epoch": 2.912906107104291, "grad_norm": 6.137408256530762, "learning_rate": 5.606403416626203e-05, "loss": 0.8574, "step": 218500 }, { "epoch": 2.914239244910746, "grad_norm": 12.47164535522461, "learning_rate": 5.60597349975726e-05, "loss": 0.8504, "step": 218600 }, { "epoch": 2.9155723827172015, "grad_norm": 8.207802772521973, "learning_rate": 5.6055433647244305e-05, "loss": 0.9144, "step": 218700 }, { "epoch": 2.9169055205236565, "grad_norm": 4.235064506530762, "learning_rate": 5.605113011563724e-05, "loss": 0.8158, "step": 218800 }, { "epoch": 2.9182386583301114, "grad_norm": 13.90906810760498, "learning_rate": 5.604682440311168e-05, "loss": 0.8479, "step": 218900 }, { "epoch": 2.919571796136567, "grad_norm": 5.800599575042725, "learning_rate": 5.604251651002809e-05, "loss": 0.8529, "step": 219000 }, { "epoch": 2.9209049339430218, "grad_norm": 3.8218276500701904, "learning_rate": 5.6038206436747106e-05, "loss": 0.8637, "step": 219100 }, { "epoch": 2.9222380717494767, "grad_norm": 18.078670501708984, "learning_rate": 5.603389418362957e-05, "loss": 0.83, "step": 219200 }, { "epoch": 2.9235712095559316, "grad_norm": 7.12617826461792, "learning_rate": 5.602957975103647e-05, "loss": 0.8238, "step": 219300 }, { "epoch": 2.9249043473623866, "grad_norm": 20.359704971313477, "learning_rate": 5.6025263139329004e-05, "loss": 0.791, "step": 219400 }, { "epoch": 2.926237485168842, "grad_norm": 23.309982299804688, "learning_rate": 
5.6020987547556794e-05, "loss": 0.7419, "step": 219500 }, { "epoch": 2.927570622975297, "grad_norm": 4.065730571746826, "learning_rate": 5.6016666600487036e-05, "loss": 0.8668, "step": 219600 }, { "epoch": 2.928903760781752, "grad_norm": 7.4707159996032715, "learning_rate": 5.601234347538395e-05, "loss": 0.7111, "step": 219700 }, { "epoch": 2.9302368985882072, "grad_norm": 4.670587062835693, "learning_rate": 5.600801817260947e-05, "loss": 0.7704, "step": 219800 }, { "epoch": 2.931570036394662, "grad_norm": 5.748922348022461, "learning_rate": 5.600369069252569e-05, "loss": 0.8058, "step": 219900 }, { "epoch": 2.932903174201117, "grad_norm": 4.400135040283203, "learning_rate": 5.59993610354949e-05, "loss": 0.8136, "step": 220000 }, { "epoch": 2.934236312007572, "grad_norm": 5.03159236907959, "learning_rate": 5.5995029201879564e-05, "loss": 0.8472, "step": 220100 }, { "epoch": 2.935569449814027, "grad_norm": 10.54122257232666, "learning_rate": 5.5990695192042324e-05, "loss": 0.7963, "step": 220200 }, { "epoch": 2.9369025876204824, "grad_norm": 6.6853179931640625, "learning_rate": 5.598635900634601e-05, "loss": 0.8024, "step": 220300 }, { "epoch": 2.9382357254269373, "grad_norm": 3.194432258605957, "learning_rate": 5.598202064515363e-05, "loss": 0.8627, "step": 220400 }, { "epoch": 2.9395688632333923, "grad_norm": 9.981322288513184, "learning_rate": 5.59776801088284e-05, "loss": 0.8028, "step": 220500 }, { "epoch": 2.9409020010398477, "grad_norm": 3.190920352935791, "learning_rate": 5.597333739773367e-05, "loss": 0.9057, "step": 220600 }, { "epoch": 2.9422351388463026, "grad_norm": 14.333292007446289, "learning_rate": 5.596899251223301e-05, "loss": 0.8707, "step": 220700 }, { "epoch": 2.9435682766527576, "grad_norm": 13.899702072143555, "learning_rate": 5.596464545269016e-05, "loss": 0.8327, "step": 220800 }, { "epoch": 2.9449014144592125, "grad_norm": 3.0526421070098877, "learning_rate": 5.5960296219469024e-05, "loss": 0.8394, "step": 220900 }, { "epoch": 
2.9462345522656674, "grad_norm": 1.733453631401062, "learning_rate": 5.595594481293374e-05, "loss": 0.832, "step": 221000 }, { "epoch": 2.947567690072123, "grad_norm": 10.761844635009766, "learning_rate": 5.595159123344857e-05, "loss": 0.7249, "step": 221100 }, { "epoch": 2.9489008278785778, "grad_norm": 31.60205078125, "learning_rate": 5.5947235481378e-05, "loss": 0.8146, "step": 221200 }, { "epoch": 2.9502339656850327, "grad_norm": 10.29630184173584, "learning_rate": 5.594287755708666e-05, "loss": 0.9406, "step": 221300 }, { "epoch": 2.951567103491488, "grad_norm": 10.467941284179688, "learning_rate": 5.5938517460939385e-05, "loss": 0.7591, "step": 221400 }, { "epoch": 2.952900241297943, "grad_norm": 9.920910835266113, "learning_rate": 5.593415519330119e-05, "loss": 0.8714, "step": 221500 }, { "epoch": 2.954233379104398, "grad_norm": 12.636817932128906, "learning_rate": 5.592979075453729e-05, "loss": 0.832, "step": 221600 }, { "epoch": 2.955566516910853, "grad_norm": 6.3582587242126465, "learning_rate": 5.5925424145013034e-05, "loss": 0.7861, "step": 221700 }, { "epoch": 2.956899654717308, "grad_norm": 3.60569429397583, "learning_rate": 5.5921055365093995e-05, "loss": 0.9304, "step": 221800 }, { "epoch": 2.9582327925237633, "grad_norm": 80.61800384521484, "learning_rate": 5.5916684415145913e-05, "loss": 0.8025, "step": 221900 }, { "epoch": 2.959565930330218, "grad_norm": 96.71614837646484, "learning_rate": 5.591231129553471e-05, "loss": 0.8099, "step": 222000 }, { "epoch": 2.960899068136673, "grad_norm": 5.711443901062012, "learning_rate": 5.590793600662649e-05, "loss": 0.7667, "step": 222100 }, { "epoch": 2.9622322059431285, "grad_norm": 5.621124267578125, "learning_rate": 5.590355854878752e-05, "loss": 0.8993, "step": 222200 }, { "epoch": 2.9635653437495835, "grad_norm": 2.1069750785827637, "learning_rate": 5.58991789223843e-05, "loss": 0.8356, "step": 222300 }, { "epoch": 2.9648984815560384, "grad_norm": 4.5022735595703125, "learning_rate": 
5.589479712778346e-05, "loss": 0.7779, "step": 222400 }, { "epoch": 2.9662316193624934, "grad_norm": 67.44171905517578, "learning_rate": 5.589041316535183e-05, "loss": 0.7831, "step": 222500 }, { "epoch": 2.9675647571689483, "grad_norm": 3.7683422565460205, "learning_rate": 5.588602703545641e-05, "loss": 0.8135, "step": 222600 }, { "epoch": 2.9688978949754037, "grad_norm": 4.67572546005249, "learning_rate": 5.588163873846442e-05, "loss": 0.7348, "step": 222700 }, { "epoch": 2.9702310327818586, "grad_norm": 4.295060634613037, "learning_rate": 5.5877248274743226e-05, "loss": 0.8708, "step": 222800 }, { "epoch": 2.9715641705883136, "grad_norm": 4.412056922912598, "learning_rate": 5.5872855644660366e-05, "loss": 0.8404, "step": 222900 }, { "epoch": 2.972897308394769, "grad_norm": 5.323659896850586, "learning_rate": 5.58684608485836e-05, "loss": 0.8306, "step": 223000 }, { "epoch": 2.974230446201224, "grad_norm": 4.499324798583984, "learning_rate": 5.586406388688083e-05, "loss": 0.8439, "step": 223100 }, { "epoch": 2.975563584007679, "grad_norm": 4.819942951202393, "learning_rate": 5.585966475992016e-05, "loss": 0.8217, "step": 223200 }, { "epoch": 2.976896721814134, "grad_norm": 11.893436431884766, "learning_rate": 5.585526346806988e-05, "loss": 0.9574, "step": 223300 }, { "epoch": 2.9782298596205887, "grad_norm": 12.098329544067383, "learning_rate": 5.585086001169844e-05, "loss": 0.8632, "step": 223400 }, { "epoch": 2.979562997427044, "grad_norm": 6.688928127288818, "learning_rate": 5.5846498458091074e-05, "loss": 0.7813, "step": 223500 }, { "epoch": 2.980896135233499, "grad_norm": 10.868239402770996, "learning_rate": 5.584209069541945e-05, "loss": 0.8697, "step": 223600 }, { "epoch": 2.982229273039954, "grad_norm": 26.208269119262695, "learning_rate": 5.583768076932945e-05, "loss": 0.8725, "step": 223700 }, { "epoch": 2.9835624108464094, "grad_norm": 6.472638130187988, "learning_rate": 5.583326868019027e-05, "loss": 0.8442, "step": 223800 }, { "epoch": 
2.9848955486528643, "grad_norm": 4.646242618560791, "learning_rate": 5.5828854428371264e-05, "loss": 0.7972, "step": 223900 }, { "epoch": 2.9862286864593193, "grad_norm": 3.887420177459717, "learning_rate": 5.582443801424199e-05, "loss": 0.7937, "step": 224000 }, { "epoch": 2.987561824265774, "grad_norm": 6.971560478210449, "learning_rate": 5.5820019438172166e-05, "loss": 0.8827, "step": 224100 }, { "epoch": 2.988894962072229, "grad_norm": 17.446651458740234, "learning_rate": 5.581559870053171e-05, "loss": 0.8691, "step": 224200 }, { "epoch": 2.9902280998786845, "grad_norm": 13.25003433227539, "learning_rate": 5.58111758016907e-05, "loss": 0.7708, "step": 224300 }, { "epoch": 2.9915612376851395, "grad_norm": 4.728490829467773, "learning_rate": 5.580675074201943e-05, "loss": 0.818, "step": 224400 }, { "epoch": 2.9928943754915944, "grad_norm": 13.448555946350098, "learning_rate": 5.580232352188833e-05, "loss": 0.8187, "step": 224500 }, { "epoch": 2.99422751329805, "grad_norm": 3.704737663269043, "learning_rate": 5.5797894141668036e-05, "loss": 0.8287, "step": 224600 }, { "epoch": 2.9955606511045048, "grad_norm": 3.1429433822631836, "learning_rate": 5.5793462601729376e-05, "loss": 0.7975, "step": 224700 }, { "epoch": 2.9968937889109597, "grad_norm": 5.144189834594727, "learning_rate": 5.578902890244332e-05, "loss": 0.7893, "step": 224800 }, { "epoch": 2.9982269267174146, "grad_norm": 2.5764000415802, "learning_rate": 5.578459304418108e-05, "loss": 0.8374, "step": 224900 }, { "epoch": 2.9995600645238696, "grad_norm": 10.082521438598633, "learning_rate": 5.5780155027313975e-05, "loss": 0.9006, "step": 225000 }, { "epoch": 3.000893202330325, "grad_norm": 4.37031888961792, "learning_rate": 5.5775714852213564e-05, "loss": 0.7887, "step": 225100 }, { "epoch": 3.00222634013678, "grad_norm": 7.491313934326172, "learning_rate": 5.5771272519251555e-05, "loss": 0.7282, "step": 225200 }, { "epoch": 3.003559477943235, "grad_norm": 7.796011924743652, "learning_rate": 
5.5766828028799845e-05, "loss": 0.9005, "step": 225300 }, { "epoch": 3.0048926157496902, "grad_norm": 13.606353759765625, "learning_rate": 5.576238138123052e-05, "loss": 0.8053, "step": 225400 }, { "epoch": 3.006225753556145, "grad_norm": 10.114619255065918, "learning_rate": 5.575793257691583e-05, "loss": 0.7873, "step": 225500 }, { "epoch": 3.0075588913626, "grad_norm": 14.773933410644531, "learning_rate": 5.5753481616228235e-05, "loss": 0.7605, "step": 225600 }, { "epoch": 3.008892029169055, "grad_norm": 2.3873701095581055, "learning_rate": 5.574902849954033e-05, "loss": 0.778, "step": 225700 }, { "epoch": 3.0102251669755105, "grad_norm": 8.498668670654297, "learning_rate": 5.574457322722493e-05, "loss": 0.7552, "step": 225800 }, { "epoch": 3.0115583047819654, "grad_norm": 20.640945434570312, "learning_rate": 5.574011579965501e-05, "loss": 0.9344, "step": 225900 }, { "epoch": 3.0128914425884203, "grad_norm": 17.28620719909668, "learning_rate": 5.573565621720374e-05, "loss": 0.8324, "step": 226000 }, { "epoch": 3.0142245803948753, "grad_norm": 6.108672142028809, "learning_rate": 5.5731194480244455e-05, "loss": 0.7788, "step": 226100 }, { "epoch": 3.0155577182013307, "grad_norm": 18.707908630371094, "learning_rate": 5.5726730589150675e-05, "loss": 0.8016, "step": 226200 }, { "epoch": 3.0168908560077856, "grad_norm": 13.157783508300781, "learning_rate": 5.5722264544296116e-05, "loss": 0.8411, "step": 226300 }, { "epoch": 3.0182239938142406, "grad_norm": 4.390705585479736, "learning_rate": 5.571779634605465e-05, "loss": 0.7804, "step": 226400 }, { "epoch": 3.0195571316206955, "grad_norm": 34.1609992980957, "learning_rate": 5.571332599480035e-05, "loss": 0.7603, "step": 226500 }, { "epoch": 3.020890269427151, "grad_norm": 17.655839920043945, "learning_rate": 5.570885349090744e-05, "loss": 0.8656, "step": 226600 }, { "epoch": 3.022223407233606, "grad_norm": 22.187824249267578, "learning_rate": 5.570437883475037e-05, "loss": 0.8882, "step": 226700 }, { "epoch": 
3.0235565450400608, "grad_norm": 17.408376693725586, "learning_rate": 5.569990202670372e-05, "loss": 0.8197, "step": 226800 }, { "epoch": 3.0248896828465157, "grad_norm": 6.8516411781311035, "learning_rate": 5.569542306714229e-05, "loss": 0.8535, "step": 226900 }, { "epoch": 3.026222820652971, "grad_norm": 3.690539836883545, "learning_rate": 5.569094195644104e-05, "loss": 0.9012, "step": 227000 }, { "epoch": 3.027555958459426, "grad_norm": 8.30081558227539, "learning_rate": 5.568645869497511e-05, "loss": 0.8868, "step": 227100 }, { "epoch": 3.028889096265881, "grad_norm": 3.868410110473633, "learning_rate": 5.5681973283119834e-05, "loss": 0.7997, "step": 227200 }, { "epoch": 3.030222234072336, "grad_norm": 82.17601013183594, "learning_rate": 5.567748572125071e-05, "loss": 0.7278, "step": 227300 }, { "epoch": 3.0315553718787913, "grad_norm": 22.749156951904297, "learning_rate": 5.567299600974343e-05, "loss": 0.8327, "step": 227400 }, { "epoch": 3.0328885096852463, "grad_norm": 13.832113265991211, "learning_rate": 5.566850414897384e-05, "loss": 0.8466, "step": 227500 }, { "epoch": 3.034221647491701, "grad_norm": 79.18486022949219, "learning_rate": 5.5664010139318e-05, "loss": 0.8341, "step": 227600 }, { "epoch": 3.035554785298156, "grad_norm": 13.165462493896484, "learning_rate": 5.565951398115214e-05, "loss": 0.9523, "step": 227700 }, { "epoch": 3.0368879231046115, "grad_norm": 41.36241912841797, "learning_rate": 5.565501567485265e-05, "loss": 0.8958, "step": 227800 }, { "epoch": 3.0382210609110665, "grad_norm": 34.81140899658203, "learning_rate": 5.565051522079611e-05, "loss": 0.8641, "step": 227900 }, { "epoch": 3.0395541987175214, "grad_norm": 126.24305725097656, "learning_rate": 5.564601261935931e-05, "loss": 0.8765, "step": 228000 }, { "epoch": 3.0408873365239764, "grad_norm": 5.937127590179443, "learning_rate": 5.5641507870919166e-05, "loss": 0.8123, "step": 228100 }, { "epoch": 3.0422204743304317, "grad_norm": 4.016146183013916, "learning_rate": 
5.563704605542804e-05, "loss": 0.7843, "step": 228200 }, { "epoch": 3.0435536121368867, "grad_norm": 23.288272857666016, "learning_rate": 5.56325370355734e-05, "loss": 0.9145, "step": 228300 }, { "epoch": 3.0448867499433416, "grad_norm": 5.869715690612793, "learning_rate": 5.562802586984356e-05, "loss": 0.7082, "step": 228400 }, { "epoch": 3.0462198877497966, "grad_norm": 12.441618919372559, "learning_rate": 5.562351255861619e-05, "loss": 0.8723, "step": 228500 }, { "epoch": 3.047553025556252, "grad_norm": 10.590901374816895, "learning_rate": 5.561899710226911e-05, "loss": 0.8348, "step": 228600 }, { "epoch": 3.048886163362707, "grad_norm": 6.632433891296387, "learning_rate": 5.561447950118035e-05, "loss": 0.8347, "step": 228700 }, { "epoch": 3.050219301169162, "grad_norm": 5.7465691566467285, "learning_rate": 5.5610004963796e-05, "loss": 0.7523, "step": 228800 }, { "epoch": 3.051552438975617, "grad_norm": 12.122199058532715, "learning_rate": 5.5605483095796636e-05, "loss": 0.7219, "step": 228900 }, { "epoch": 3.052885576782072, "grad_norm": 4.023875713348389, "learning_rate": 5.5600959084186957e-05, "loss": 0.8133, "step": 229000 }, { "epoch": 3.054218714588527, "grad_norm": 6.405729293823242, "learning_rate": 5.5596432929345674e-05, "loss": 0.8673, "step": 229100 }, { "epoch": 3.055551852394982, "grad_norm": 65.47309112548828, "learning_rate": 5.559190463165171e-05, "loss": 0.8848, "step": 229200 }, { "epoch": 3.056884990201437, "grad_norm": 5.402429103851318, "learning_rate": 5.558737419148417e-05, "loss": 0.7592, "step": 229300 }, { "epoch": 3.0582181280078924, "grad_norm": 14.853043556213379, "learning_rate": 5.558284160922231e-05, "loss": 0.8981, "step": 229400 }, { "epoch": 3.0595512658143473, "grad_norm": 7.109285831451416, "learning_rate": 5.557830688524561e-05, "loss": 0.8112, "step": 229500 }, { "epoch": 3.0608844036208023, "grad_norm": 8.33376693725586, "learning_rate": 5.557377001993368e-05, "loss": 0.8537, "step": 229600 }, { "epoch": 
3.062217541427257, "grad_norm": 13.793822288513184, "learning_rate": 5.5569231013666336e-05, "loss": 0.8844, "step": 229700 }, { "epoch": 3.0635506792337126, "grad_norm": 11.067919731140137, "learning_rate": 5.556468986682358e-05, "loss": 0.834, "step": 229800 }, { "epoch": 3.0648838170401675, "grad_norm": 52.28904342651367, "learning_rate": 5.556014657978557e-05, "loss": 0.7449, "step": 229900 }, { "epoch": 3.0662169548466225, "grad_norm": 12.198089599609375, "learning_rate": 5.555560115293266e-05, "loss": 0.7668, "step": 230000 }, { "epoch": 3.0675500926530774, "grad_norm": 3.6590399742126465, "learning_rate": 5.555105358664538e-05, "loss": 0.8341, "step": 230100 }, { "epoch": 3.068883230459533, "grad_norm": 26.31719207763672, "learning_rate": 5.554650388130444e-05, "loss": 0.8327, "step": 230200 }, { "epoch": 3.0702163682659878, "grad_norm": 12.079144477844238, "learning_rate": 5.554195203729073e-05, "loss": 0.8374, "step": 230300 }, { "epoch": 3.0715495060724427, "grad_norm": 20.63292694091797, "learning_rate": 5.55373980549853e-05, "loss": 0.9042, "step": 230400 }, { "epoch": 3.0728826438788976, "grad_norm": 6.641359329223633, "learning_rate": 5.553284193476941e-05, "loss": 0.8094, "step": 230500 }, { "epoch": 3.0742157816853526, "grad_norm": 20.473758697509766, "learning_rate": 5.5528283677024485e-05, "loss": 0.7842, "step": 230600 }, { "epoch": 3.075548919491808, "grad_norm": 5.467502117156982, "learning_rate": 5.5523723282132114e-05, "loss": 0.8977, "step": 230700 }, { "epoch": 3.076882057298263, "grad_norm": 17.30173110961914, "learning_rate": 5.5519160750474084e-05, "loss": 0.8654, "step": 230800 }, { "epoch": 3.078215195104718, "grad_norm": 6.1838812828063965, "learning_rate": 5.5514596082432373e-05, "loss": 0.9512, "step": 230900 }, { "epoch": 3.0795483329111732, "grad_norm": 15.81331729888916, "learning_rate": 5.55100292783891e-05, "loss": 0.8437, "step": 231000 }, { "epoch": 3.080881470717628, "grad_norm": 3.625096321105957, "learning_rate": 
5.550546033872659e-05, "loss": 0.779, "step": 231100 }, { "epoch": 3.082214608524083, "grad_norm": 9.61219310760498, "learning_rate": 5.550088926382734e-05, "loss": 0.8066, "step": 231200 }, { "epoch": 3.083547746330538, "grad_norm": 15.824125289916992, "learning_rate": 5.549631605407403e-05, "loss": 0.7529, "step": 231300 }, { "epoch": 3.084880884136993, "grad_norm": 8.66683578491211, "learning_rate": 5.549174070984951e-05, "loss": 0.9559, "step": 231400 }, { "epoch": 3.0862140219434484, "grad_norm": 2.9783878326416016, "learning_rate": 5.548720901688242e-05, "loss": 0.8695, "step": 231500 }, { "epoch": 3.0875471597499033, "grad_norm": 4.867984771728516, "learning_rate": 5.5482675232667335e-05, "loss": 0.8496, "step": 231600 }, { "epoch": 3.0888802975563583, "grad_norm": 12.628862380981445, "learning_rate": 5.547809352999078e-05, "loss": 0.8812, "step": 231700 }, { "epoch": 3.0902134353628137, "grad_norm": 28.640701293945312, "learning_rate": 5.547350969436855e-05, "loss": 0.8525, "step": 231800 }, { "epoch": 3.0915465731692686, "grad_norm": 62.77066421508789, "learning_rate": 5.5468923726184375e-05, "loss": 0.9591, "step": 231900 }, { "epoch": 3.0928797109757236, "grad_norm": 9.490784645080566, "learning_rate": 5.5464335625822195e-05, "loss": 0.8173, "step": 232000 }, { "epoch": 3.0942128487821785, "grad_norm": 20.5423583984375, "learning_rate": 5.5459745393666114e-05, "loss": 0.8821, "step": 232100 }, { "epoch": 3.0955459865886334, "grad_norm": 22.418630599975586, "learning_rate": 5.54551530301004e-05, "loss": 0.9007, "step": 232200 }, { "epoch": 3.096879124395089, "grad_norm": 30.271865844726562, "learning_rate": 5.545055853550952e-05, "loss": 0.8465, "step": 232300 }, { "epoch": 3.0982122622015438, "grad_norm": 66.4676513671875, "learning_rate": 5.54459619102781e-05, "loss": 0.9058, "step": 232400 }, { "epoch": 3.0995454000079987, "grad_norm": 25.767255783081055, "learning_rate": 5.544136315479096e-05, "loss": 0.9421, "step": 232500 }, { "epoch": 
3.100878537814454, "grad_norm": 33.35421371459961, "learning_rate": 5.5436762269433103e-05, "loss": 0.9248, "step": 232600 }, { "epoch": 3.102211675620909, "grad_norm": 26.284832000732422, "learning_rate": 5.543215925458969e-05, "loss": 0.8737, "step": 232700 }, { "epoch": 3.103544813427364, "grad_norm": 33.75792694091797, "learning_rate": 5.542755411064607e-05, "loss": 0.9676, "step": 232800 }, { "epoch": 3.104877951233819, "grad_norm": 75.93854522705078, "learning_rate": 5.542294683798778e-05, "loss": 0.9166, "step": 232900 }, { "epoch": 3.106211089040274, "grad_norm": 40.90751647949219, "learning_rate": 5.541833743700051e-05, "loss": 0.9163, "step": 233000 }, { "epoch": 3.1075442268467293, "grad_norm": 9.331055641174316, "learning_rate": 5.5413725908070165e-05, "loss": 0.9511, "step": 233100 }, { "epoch": 3.108877364653184, "grad_norm": 283.3279724121094, "learning_rate": 5.5409112251582796e-05, "loss": 0.9401, "step": 233200 }, { "epoch": 3.110210502459639, "grad_norm": 36.20569610595703, "learning_rate": 5.540454263628946e-05, "loss": 0.9137, "step": 233300 }, { "epoch": 3.1115436402660945, "grad_norm": 12.99903392791748, "learning_rate": 5.539992474711287e-05, "loss": 0.9713, "step": 233400 }, { "epoch": 3.1128767780725495, "grad_norm": 24.5657901763916, "learning_rate": 5.539530473153466e-05, "loss": 0.9153, "step": 233500 }, { "epoch": 3.1142099158790044, "grad_norm": 11.655362129211426, "learning_rate": 5.5390682589941584e-05, "loss": 0.973, "step": 233600 }, { "epoch": 3.1155430536854594, "grad_norm": 5.602494716644287, "learning_rate": 5.53860583227206e-05, "loss": 0.8993, "step": 233700 }, { "epoch": 3.1168761914919143, "grad_norm": 13.579225540161133, "learning_rate": 5.5381431930258836e-05, "loss": 0.9307, "step": 233800 }, { "epoch": 3.1182093292983697, "grad_norm": 13.75529670715332, "learning_rate": 5.53768034129436e-05, "loss": 0.9331, "step": 233900 }, { "epoch": 3.1195424671048246, "grad_norm": 6.855217933654785, "learning_rate": 
5.5372172771162385e-05, "loss": 0.8903, "step": 234000 }, { "epoch": 3.1208756049112796, "grad_norm": 7.299376010894775, "learning_rate": 5.536754000530284e-05, "loss": 0.9182, "step": 234100 }, { "epoch": 3.1222087427177345, "grad_norm": 26.170961380004883, "learning_rate": 5.536290511575281e-05, "loss": 0.8912, "step": 234200 }, { "epoch": 3.12354188052419, "grad_norm": 5.06658411026001, "learning_rate": 5.5358268102900314e-05, "loss": 0.9097, "step": 234300 }, { "epoch": 3.124875018330645, "grad_norm": 5.416306495666504, "learning_rate": 5.535362896713355e-05, "loss": 0.9044, "step": 234400 }, { "epoch": 3.1262081561371, "grad_norm": 5.923246383666992, "learning_rate": 5.53489877088409e-05, "loss": 0.8372, "step": 234500 }, { "epoch": 3.1275412939435547, "grad_norm": 14.37340259552002, "learning_rate": 5.5344344328410894e-05, "loss": 0.965, "step": 234600 }, { "epoch": 3.12887443175001, "grad_norm": 5.1112165451049805, "learning_rate": 5.533969882623227e-05, "loss": 0.8781, "step": 234700 }, { "epoch": 3.130207569556465, "grad_norm": 2.4819326400756836, "learning_rate": 5.533505120269394e-05, "loss": 0.8179, "step": 234800 }, { "epoch": 3.13154070736292, "grad_norm": 13.629648208618164, "learning_rate": 5.5330401458185e-05, "loss": 0.9989, "step": 234900 }, { "epoch": 3.1328738451693754, "grad_norm": 9.706526756286621, "learning_rate": 5.532574959309468e-05, "loss": 0.8634, "step": 235000 }, { "epoch": 3.1342069829758303, "grad_norm": 4.314345359802246, "learning_rate": 5.5321095607812446e-05, "loss": 0.8212, "step": 235100 }, { "epoch": 3.1355401207822853, "grad_norm": 5.8476972579956055, "learning_rate": 5.531643950272791e-05, "loss": 0.8827, "step": 235200 }, { "epoch": 3.13687325858874, "grad_norm": 5.0695905685424805, "learning_rate": 5.5311781278230864e-05, "loss": 0.8684, "step": 235300 }, { "epoch": 3.138206396395195, "grad_norm": 230.1542205810547, "learning_rate": 5.530712093471127e-05, "loss": 0.9033, "step": 235400 }, { "epoch": 3.1395395342016506, 
"grad_norm": 39.42318344116211, "learning_rate": 5.530245847255929e-05, "loss": 0.8884, "step": 235500 }, { "epoch": 3.1408726720081055, "grad_norm": 35.7872428894043, "learning_rate": 5.5297793892165246e-05, "loss": 1.0244, "step": 235600 }, { "epoch": 3.1422058098145604, "grad_norm": 47.4526252746582, "learning_rate": 5.529312719391964e-05, "loss": 1.0679, "step": 235700 }, { "epoch": 3.1435389476210154, "grad_norm": 18.738819122314453, "learning_rate": 5.528845837821317e-05, "loss": 1.0009, "step": 235800 }, { "epoch": 3.1448720854274708, "grad_norm": 7.017409324645996, "learning_rate": 5.528378744543666e-05, "loss": 0.9816, "step": 235900 }, { "epoch": 3.1462052232339257, "grad_norm": 49.93830108642578, "learning_rate": 5.527911439598118e-05, "loss": 1.1106, "step": 236000 }, { "epoch": 3.1475383610403807, "grad_norm": 40.64863204956055, "learning_rate": 5.527443923023793e-05, "loss": 0.9907, "step": 236100 }, { "epoch": 3.1488714988468356, "grad_norm": 39.05976104736328, "learning_rate": 5.52697619485983e-05, "loss": 1.1077, "step": 236200 }, { "epoch": 3.150204636653291, "grad_norm": 8.266751289367676, "learning_rate": 5.526508255145386e-05, "loss": 0.9847, "step": 236300 }, { "epoch": 3.151537774459746, "grad_norm": 123.42279815673828, "learning_rate": 5.5260401039196334e-05, "loss": 0.9679, "step": 236400 }, { "epoch": 3.152870912266201, "grad_norm": 150.79029846191406, "learning_rate": 5.525571741221767e-05, "loss": 0.9129, "step": 236500 }, { "epoch": 3.154204050072656, "grad_norm": 3.38964581489563, "learning_rate": 5.525103167090996e-05, "loss": 1.0218, "step": 236600 }, { "epoch": 3.155537187879111, "grad_norm": 43.96664810180664, "learning_rate": 5.524639070468063e-05, "loss": 0.9497, "step": 236700 }, { "epoch": 3.156870325685566, "grad_norm": 60.006038665771484, "learning_rate": 5.524170075702531e-05, "loss": 1.1212, "step": 236800 }, { "epoch": 3.158203463492021, "grad_norm": 133.60125732421875, "learning_rate": 5.523700869621439e-05, "loss": 
1.1296, "step": 236900 }, { "epoch": 3.159536601298476, "grad_norm": 22.318567276000977, "learning_rate": 5.5232314522640655e-05, "loss": 0.9842, "step": 237000 }, { "epoch": 3.1608697391049314, "grad_norm": 12.324549674987793, "learning_rate": 5.522766521001146e-05, "loss": 1.1034, "step": 237100 }, { "epoch": 3.1622028769113864, "grad_norm": 119.33197021484375, "learning_rate": 5.5222966833209034e-05, "loss": 1.1606, "step": 237200 }, { "epoch": 3.1635360147178413, "grad_norm": 10.046595573425293, "learning_rate": 5.521826634481933e-05, "loss": 1.1233, "step": 237300 }, { "epoch": 3.1648691525242962, "grad_norm": 172.47900390625, "learning_rate": 5.521356374523587e-05, "loss": 1.0458, "step": 237400 }, { "epoch": 3.1662022903307516, "grad_norm": 13.852143287658691, "learning_rate": 5.520885903485233e-05, "loss": 1.1805, "step": 237500 }, { "epoch": 3.1675354281372066, "grad_norm": 9.542901992797852, "learning_rate": 5.520415221406258e-05, "loss": 1.0681, "step": 237600 }, { "epoch": 3.1688685659436615, "grad_norm": 75.89056396484375, "learning_rate": 5.519944328326065e-05, "loss": 1.0353, "step": 237700 }, { "epoch": 3.1702017037501165, "grad_norm": 119.34503173828125, "learning_rate": 5.519473224284077e-05, "loss": 1.0067, "step": 237800 }, { "epoch": 3.171534841556572, "grad_norm": 26.798770904541016, "learning_rate": 5.519001909319733e-05, "loss": 1.2144, "step": 237900 }, { "epoch": 3.172867979363027, "grad_norm": 67.91123962402344, "learning_rate": 5.518530383472489e-05, "loss": 1.0627, "step": 238000 }, { "epoch": 3.1742011171694817, "grad_norm": 2.3148996829986572, "learning_rate": 5.51805864678182e-05, "loss": 1.0501, "step": 238100 }, { "epoch": 3.1755342549759367, "grad_norm": 151.2959747314453, "learning_rate": 5.5175866992872195e-05, "loss": 0.9776, "step": 238200 }, { "epoch": 3.176867392782392, "grad_norm": 109.68241119384766, "learning_rate": 5.517114541028197e-05, "loss": 0.9971, "step": 238300 }, { "epoch": 3.178200530588847, "grad_norm": 
75.9361572265625, "learning_rate": 5.516642172044279e-05, "loss": 1.0541, "step": 238400 }, { "epoch": 3.179533668395302, "grad_norm": 54.312198638916016, "learning_rate": 5.5161695923750114e-05, "loss": 1.1369, "step": 238500 }, { "epoch": 3.180866806201757, "grad_norm": 27.77895164489746, "learning_rate": 5.5156968020599584e-05, "loss": 1.1279, "step": 238600 }, { "epoch": 3.1821999440082123, "grad_norm": 35.791568756103516, "learning_rate": 5.5152238011386977e-05, "loss": 1.168, "step": 238700 }, { "epoch": 3.183533081814667, "grad_norm": 38.96428298950195, "learning_rate": 5.51475058965083e-05, "loss": 1.063, "step": 238800 }, { "epoch": 3.184866219621122, "grad_norm": 7.896603107452393, "learning_rate": 5.5142771676359695e-05, "loss": 1.0928, "step": 238900 }, { "epoch": 3.186199357427577, "grad_norm": 170.27442932128906, "learning_rate": 5.5138035351337505e-05, "loss": 1.1071, "step": 239000 }, { "epoch": 3.1875324952340325, "grad_norm": 56.01560592651367, "learning_rate": 5.513329692183824e-05, "loss": 1.2954, "step": 239100 }, { "epoch": 3.1888656330404874, "grad_norm": 69.6558609008789, "learning_rate": 5.5128556388258584e-05, "loss": 1.472, "step": 239200 }, { "epoch": 3.1901987708469424, "grad_norm": 65.16993713378906, "learning_rate": 5.51238137509954e-05, "loss": 1.2951, "step": 239300 }, { "epoch": 3.1915319086533973, "grad_norm": 192.11036682128906, "learning_rate": 5.511906901044572e-05, "loss": 1.302, "step": 239400 }, { "epoch": 3.1928650464598527, "grad_norm": 115.76988983154297, "learning_rate": 5.511432216700677e-05, "loss": 1.2011, "step": 239500 }, { "epoch": 3.1941981842663076, "grad_norm": 96.13262939453125, "learning_rate": 5.510957322107593e-05, "loss": 1.5657, "step": 239600 }, { "epoch": 3.1955313220727626, "grad_norm": 39.632728576660156, "learning_rate": 5.510482217305078e-05, "loss": 1.3096, "step": 239700 }, { "epoch": 3.1968644598792175, "grad_norm": 521.146240234375, "learning_rate": 5.510006902332905e-05, "loss": 1.3195, "step": 
239800 }, { "epoch": 3.198197597685673, "grad_norm": 14.838930130004883, "learning_rate": 5.509531377230866e-05, "loss": 1.2065, "step": 239900 }, { "epoch": 3.199530735492128, "grad_norm": 29.53390884399414, "learning_rate": 5.509055642038771e-05, "loss": 1.1918, "step": 240000 }, { "epoch": 3.200863873298583, "grad_norm": 94.82846069335938, "learning_rate": 5.508579696796447e-05, "loss": 1.1532, "step": 240100 }, { "epoch": 3.2021970111050377, "grad_norm": 17.583011627197266, "learning_rate": 5.5081035415437384e-05, "loss": 1.0211, "step": 240200 }, { "epoch": 3.203530148911493, "grad_norm": 70.4919204711914, "learning_rate": 5.5076271763205085e-05, "loss": 1.0512, "step": 240300 }, { "epoch": 3.204863286717948, "grad_norm": 43.86053466796875, "learning_rate": 5.507150601166635e-05, "loss": 1.1705, "step": 240400 }, { "epoch": 3.206196424524403, "grad_norm": 10.945452690124512, "learning_rate": 5.506673816122016e-05, "loss": 1.0616, "step": 240500 }, { "epoch": 3.207529562330858, "grad_norm": 49.13788604736328, "learning_rate": 5.506196821226567e-05, "loss": 1.05, "step": 240600 }, { "epoch": 3.2088627001373133, "grad_norm": 19.313650131225586, "learning_rate": 5.5057196165202224e-05, "loss": 0.9924, "step": 240700 }, { "epoch": 3.2101958379437683, "grad_norm": 10.972599029541016, "learning_rate": 5.5052422020429285e-05, "loss": 1.1208, "step": 240800 }, { "epoch": 3.2115289757502232, "grad_norm": 34.248779296875, "learning_rate": 5.5047645778346545e-05, "loss": 1.108, "step": 240900 }, { "epoch": 3.212862113556678, "grad_norm": 16.767244338989258, "learning_rate": 5.504286743935386e-05, "loss": 1.0142, "step": 241000 }, { "epoch": 3.2141952513631336, "grad_norm": 12.180190086364746, "learning_rate": 5.503808700385126e-05, "loss": 1.1414, "step": 241100 }, { "epoch": 3.2155283891695885, "grad_norm": 51.59714126586914, "learning_rate": 5.503330447223894e-05, "loss": 1.0494, "step": 241200 }, { "epoch": 3.2168615269760434, "grad_norm": 17.990615844726562, 
"learning_rate": 5.502851984491728e-05, "loss": 0.9621, "step": 241300 }, { "epoch": 3.2181946647824984, "grad_norm": 39.63703155517578, "learning_rate": 5.502373312228683e-05, "loss": 1.0291, "step": 241400 }, { "epoch": 3.2195278025889538, "grad_norm": 12.651480674743652, "learning_rate": 5.5018944304748336e-05, "loss": 1.0982, "step": 241500 }, { "epoch": 3.2208609403954087, "grad_norm": 64.87610626220703, "learning_rate": 5.501415339270268e-05, "loss": 1.0099, "step": 241600 }, { "epoch": 3.2221940782018637, "grad_norm": 6.626151084899902, "learning_rate": 5.500936038655096e-05, "loss": 1.1212, "step": 241700 }, { "epoch": 3.2235272160083186, "grad_norm": 48.65848922729492, "learning_rate": 5.500456528669442e-05, "loss": 1.081, "step": 241800 }, { "epoch": 3.224860353814774, "grad_norm": 12.22689437866211, "learning_rate": 5.4999768093534495e-05, "loss": 1.0503, "step": 241900 }, { "epoch": 3.226193491621229, "grad_norm": 4.808488845825195, "learning_rate": 5.49949688074728e-05, "loss": 0.9505, "step": 242000 }, { "epoch": 3.227526629427684, "grad_norm": 36.68666076660156, "learning_rate": 5.49901674289111e-05, "loss": 0.9988, "step": 242100 }, { "epoch": 3.228859767234139, "grad_norm": 14.03719425201416, "learning_rate": 5.498541200331252e-05, "loss": 0.9469, "step": 242200 }, { "epoch": 3.230192905040594, "grad_norm": 248.42347717285156, "learning_rate": 5.498060646187184e-05, "loss": 1.0777, "step": 242300 }, { "epoch": 3.231526042847049, "grad_norm": 18.922178268432617, "learning_rate": 5.497584691581152e-05, "loss": 1.0226, "step": 242400 }, { "epoch": 3.232859180653504, "grad_norm": 29.75818634033203, "learning_rate": 5.497103721308502e-05, "loss": 1.0563, "step": 242500 }, { "epoch": 3.234192318459959, "grad_norm": 21.305004119873047, "learning_rate": 5.4966225419862005e-05, "loss": 1.0526, "step": 242600 }, { "epoch": 3.2355254562664144, "grad_norm": 24.889467239379883, "learning_rate": 5.49614115365453e-05, "loss": 1.114, "step": 242700 }, { "epoch": 
3.2368585940728694, "grad_norm": 53.82320022583008, "learning_rate": 5.49565955635379e-05, "loss": 1.0406, "step": 242800 }, { "epoch": 3.2381917318793243, "grad_norm": 7.413285255432129, "learning_rate": 5.495177750124299e-05, "loss": 1.0032, "step": 242900 }, { "epoch": 3.2395248696857792, "grad_norm": 24.526399612426758, "learning_rate": 5.494695735006393e-05, "loss": 1.0276, "step": 243000 }, { "epoch": 3.2408580074922346, "grad_norm": 11.451824188232422, "learning_rate": 5.4942135110404225e-05, "loss": 1.0091, "step": 243100 }, { "epoch": 3.2421911452986896, "grad_norm": 102.41023254394531, "learning_rate": 5.4937310782667605e-05, "loss": 0.9803, "step": 243200 }, { "epoch": 3.2435242831051445, "grad_norm": 14.895102500915527, "learning_rate": 5.4932484367257934e-05, "loss": 1.0672, "step": 243300 }, { "epoch": 3.2448574209115995, "grad_norm": 16.261531829833984, "learning_rate": 5.4927655864579265e-05, "loss": 1.0868, "step": 243400 }, { "epoch": 3.246190558718055, "grad_norm": 23.757875442504883, "learning_rate": 5.492282527503583e-05, "loss": 1.0328, "step": 243500 }, { "epoch": 3.24752369652451, "grad_norm": 31.398395538330078, "learning_rate": 5.491799259903202e-05, "loss": 1.1044, "step": 243600 }, { "epoch": 3.2488568343309647, "grad_norm": 13.196992874145508, "learning_rate": 5.491315783697242e-05, "loss": 1.0292, "step": 243700 }, { "epoch": 3.2501899721374197, "grad_norm": 32.49329376220703, "learning_rate": 5.490832098926177e-05, "loss": 1.0597, "step": 243800 }, { "epoch": 3.251523109943875, "grad_norm": 5.983699798583984, "learning_rate": 5.4903482056305015e-05, "loss": 1.0668, "step": 243900 }, { "epoch": 3.25285624775033, "grad_norm": 4.267114162445068, "learning_rate": 5.4898641038507245e-05, "loss": 0.9876, "step": 244000 }, { "epoch": 3.254189385556785, "grad_norm": 21.648405075073242, "learning_rate": 5.4893797936273734e-05, "loss": 0.9359, "step": 244100 }, { "epoch": 3.25552252336324, "grad_norm": 70.63455200195312, "learning_rate": 
5.488895275000993e-05, "loss": 0.9535, "step": 244200 }, { "epoch": 3.2568556611696953, "grad_norm": 11.70775318145752, "learning_rate": 5.4884105480121455e-05, "loss": 1.0525, "step": 244300 }, { "epoch": 3.25818879897615, "grad_norm": 17.84592628479004, "learning_rate": 5.4879256127014116e-05, "loss": 1.0184, "step": 244400 }, { "epoch": 3.259521936782605, "grad_norm": 43.5318489074707, "learning_rate": 5.4874404691093885e-05, "loss": 0.9518, "step": 244500 }, { "epoch": 3.26085507458906, "grad_norm": 11.367446899414062, "learning_rate": 5.486955117276689e-05, "loss": 1.0032, "step": 244600 }, { "epoch": 3.2621882123955155, "grad_norm": 2.8854339122772217, "learning_rate": 5.486469557243948e-05, "loss": 0.958, "step": 244700 }, { "epoch": 3.2635213502019704, "grad_norm": 655.1524658203125, "learning_rate": 5.485983789051813e-05, "loss": 1.1007, "step": 244800 }, { "epoch": 3.2648544880084254, "grad_norm": 253.1002960205078, "learning_rate": 5.485497812740952e-05, "loss": 1.0176, "step": 244900 }, { "epoch": 3.2661876258148803, "grad_norm": 10.740411758422852, "learning_rate": 5.485011628352049e-05, "loss": 0.8969, "step": 245000 }, { "epoch": 3.2675207636213357, "grad_norm": 29.399667739868164, "learning_rate": 5.484525235925805e-05, "loss": 1.0199, "step": 245100 }, { "epoch": 3.2688539014277906, "grad_norm": 50.91224670410156, "learning_rate": 5.484038635502942e-05, "loss": 0.9526, "step": 245200 }, { "epoch": 3.2701870392342456, "grad_norm": 66.98694610595703, "learning_rate": 5.4835518271241935e-05, "loss": 0.9475, "step": 245300 }, { "epoch": 3.2715201770407005, "grad_norm": 59.357608795166016, "learning_rate": 5.483064810830315e-05, "loss": 0.9751, "step": 245400 }, { "epoch": 3.272853314847156, "grad_norm": 32.2253532409668, "learning_rate": 5.4825775866620785e-05, "loss": 0.9313, "step": 245500 }, { "epoch": 3.274186452653611, "grad_norm": 34.780765533447266, "learning_rate": 5.482090154660271e-05, "loss": 0.8955, "step": 245600 }, { "epoch": 
3.275519590460066, "grad_norm": 18.89608383178711, "learning_rate": 5.4816025148657006e-05, "loss": 0.9992, "step": 245700 }, { "epoch": 3.2768527282665207, "grad_norm": 5.084780216217041, "learning_rate": 5.4811146673191903e-05, "loss": 0.936, "step": 245800 }, { "epoch": 3.278185866072976, "grad_norm": 26.750804901123047, "learning_rate": 5.480626612061581e-05, "loss": 0.877, "step": 245900 }, { "epoch": 3.279519003879431, "grad_norm": 7.099696636199951, "learning_rate": 5.480138349133732e-05, "loss": 0.9293, "step": 246000 }, { "epoch": 3.280852141685886, "grad_norm": 66.49244689941406, "learning_rate": 5.479649878576519e-05, "loss": 1.0034, "step": 246100 }, { "epoch": 3.282185279492341, "grad_norm": 9.258814811706543, "learning_rate": 5.4791612004308327e-05, "loss": 0.9734, "step": 246200 }, { "epoch": 3.2835184172987963, "grad_norm": 9.527475357055664, "learning_rate": 5.478672314737587e-05, "loss": 1.0496, "step": 246300 }, { "epoch": 3.2848515551052513, "grad_norm": 130.56979370117188, "learning_rate": 5.478183221537708e-05, "loss": 1.0188, "step": 246400 }, { "epoch": 3.2861846929117062, "grad_norm": 19.675945281982422, "learning_rate": 5.477693920872143e-05, "loss": 0.8773, "step": 246500 }, { "epoch": 3.287517830718161, "grad_norm": 24.572919845581055, "learning_rate": 5.4772044127818516e-05, "loss": 0.9428, "step": 246600 }, { "epoch": 3.2888509685246166, "grad_norm": 4.036107063293457, "learning_rate": 5.476714697307816e-05, "loss": 0.8737, "step": 246700 }, { "epoch": 3.2901841063310715, "grad_norm": 7.457381725311279, "learning_rate": 5.4762247744910336e-05, "loss": 0.9802, "step": 246800 }, { "epoch": 3.2915172441375264, "grad_norm": 69.87682342529297, "learning_rate": 5.4757346443725186e-05, "loss": 0.9187, "step": 246900 }, { "epoch": 3.2928503819439814, "grad_norm": 1.0662363767623901, "learning_rate": 5.475244306993304e-05, "loss": 0.9328, "step": 247000 }, { "epoch": 3.2941835197504368, "grad_norm": 12.51457405090332, "learning_rate": 
5.4747537623944376e-05, "loss": 0.8836, "step": 247100 }, { "epoch": 3.2955166575568917, "grad_norm": 135.50741577148438, "learning_rate": 5.4742630106169876e-05, "loss": 0.9912, "step": 247200 }, { "epoch": 3.2968497953633467, "grad_norm": 33.81473922729492, "learning_rate": 5.4737720517020384e-05, "loss": 0.9123, "step": 247300 }, { "epoch": 3.2981829331698016, "grad_norm": 80.58493041992188, "learning_rate": 5.4732808856906906e-05, "loss": 0.9011, "step": 247400 }, { "epoch": 3.299516070976257, "grad_norm": 6.856149196624756, "learning_rate": 5.472789512624064e-05, "loss": 0.853, "step": 247500 }, { "epoch": 3.300849208782712, "grad_norm": 43.259246826171875, "learning_rate": 5.472297932543294e-05, "loss": 0.9226, "step": 247600 }, { "epoch": 3.302182346589167, "grad_norm": 4.457376956939697, "learning_rate": 5.4718061454895345e-05, "loss": 1.0211, "step": 247700 }, { "epoch": 3.303515484395622, "grad_norm": 11.680906295776367, "learning_rate": 5.471314151503956e-05, "loss": 0.8483, "step": 247800 }, { "epoch": 3.304848622202077, "grad_norm": 4.974902629852295, "learning_rate": 5.470821950627747e-05, "loss": 0.8884, "step": 247900 }, { "epoch": 3.306181760008532, "grad_norm": 16.622331619262695, "learning_rate": 5.470329542902113e-05, "loss": 1.0112, "step": 248000 }, { "epoch": 3.307514897814987, "grad_norm": 2.5010437965393066, "learning_rate": 5.469836928368277e-05, "loss": 0.9061, "step": 248100 }, { "epoch": 3.308848035621442, "grad_norm": 8.403154373168945, "learning_rate": 5.469344107067479e-05, "loss": 0.8757, "step": 248200 }, { "epoch": 3.3101811734278974, "grad_norm": 6.506626129150391, "learning_rate": 5.468851079040977e-05, "loss": 0.831, "step": 248300 }, { "epoch": 3.3115143112343524, "grad_norm": 12.917576789855957, "learning_rate": 5.4683627777001056e-05, "loss": 0.9938, "step": 248400 }, { "epoch": 3.3128474490408073, "grad_norm": 42.356319427490234, "learning_rate": 5.467869338412263e-05, "loss": 0.9363, "step": 248500 }, { "epoch": 
3.3141805868472622, "grad_norm": 23.85137367248535, "learning_rate": 5.467375692522178e-05, "loss": 0.9026, "step": 248600 }, { "epoch": 3.3155137246537176, "grad_norm": 2.830627918243408, "learning_rate": 5.4668818400711775e-05, "loss": 0.9078, "step": 248700 }, { "epoch": 3.3168468624601726, "grad_norm": 122.04475402832031, "learning_rate": 5.466387781100606e-05, "loss": 0.9554, "step": 248800 }, { "epoch": 3.3181800002666275, "grad_norm": 6.426831245422363, "learning_rate": 5.465893515651824e-05, "loss": 0.9683, "step": 248900 }, { "epoch": 3.3195131380730825, "grad_norm": 15.185344696044922, "learning_rate": 5.46539904376621e-05, "loss": 0.8277, "step": 249000 }, { "epoch": 3.320846275879538, "grad_norm": 21.78784942626953, "learning_rate": 5.4649043654851596e-05, "loss": 0.9765, "step": 249100 }, { "epoch": 3.322179413685993, "grad_norm": 74.19667053222656, "learning_rate": 5.464409480850087e-05, "loss": 0.9345, "step": 249200 }, { "epoch": 3.3235125514924477, "grad_norm": 45.545162200927734, "learning_rate": 5.46391438990242e-05, "loss": 0.9277, "step": 249300 }, { "epoch": 3.3248456892989027, "grad_norm": 6.072159290313721, "learning_rate": 5.4634190926836074e-05, "loss": 0.8615, "step": 249400 }, { "epoch": 3.326178827105358, "grad_norm": 10.740707397460938, "learning_rate": 5.462923589235114e-05, "loss": 0.9097, "step": 249500 }, { "epoch": 3.327511964911813, "grad_norm": 11.519430160522461, "learning_rate": 5.4624278795984214e-05, "loss": 0.9906, "step": 249600 }, { "epoch": 3.328845102718268, "grad_norm": 11.48894214630127, "learning_rate": 5.4619319638150285e-05, "loss": 0.9176, "step": 249700 }, { "epoch": 3.330178240524723, "grad_norm": 41.96733474731445, "learning_rate": 5.461435841926453e-05, "loss": 0.9334, "step": 249800 }, { "epoch": 3.3315113783311783, "grad_norm": 13.443733215332031, "learning_rate": 5.460939513974227e-05, "loss": 0.9627, "step": 249900 }, { "epoch": 3.332844516137633, "grad_norm": 2.677353858947754, "learning_rate": 
5.4604429799999025e-05, "loss": 0.9602, "step": 250000 }, { "epoch": 3.334177653944088, "grad_norm": 6.796391010284424, "learning_rate": 5.459946240045049e-05, "loss": 0.9138, "step": 250100 }, { "epoch": 3.335510791750543, "grad_norm": 68.9503173828125, "learning_rate": 5.45944929415125e-05, "loss": 0.8587, "step": 250200 }, { "epoch": 3.3368439295569985, "grad_norm": 6.923464775085449, "learning_rate": 5.458952142360109e-05, "loss": 0.8903, "step": 250300 }, { "epoch": 3.3381770673634534, "grad_norm": 27.270587921142578, "learning_rate": 5.458454784713245e-05, "loss": 0.8746, "step": 250400 }, { "epoch": 3.3395102051699084, "grad_norm": 15.671549797058105, "learning_rate": 5.457957221252297e-05, "loss": 0.863, "step": 250500 }, { "epoch": 3.3408433429763633, "grad_norm": 8.065800666809082, "learning_rate": 5.4574594520189185e-05, "loss": 0.7833, "step": 250600 }, { "epoch": 3.3421764807828187, "grad_norm": 11.157132148742676, "learning_rate": 5.4569614770547815e-05, "loss": 0.9644, "step": 250700 }, { "epoch": 3.3435096185892736, "grad_norm": 16.52591323852539, "learning_rate": 5.456463296401574e-05, "loss": 0.9075, "step": 250800 }, { "epoch": 3.3448427563957286, "grad_norm": 4.606525897979736, "learning_rate": 5.4559649101010033e-05, "loss": 0.9153, "step": 250900 }, { "epoch": 3.3461758942021835, "grad_norm": 11.548784255981445, "learning_rate": 5.455466318194792e-05, "loss": 0.7852, "step": 251000 }, { "epoch": 3.347509032008639, "grad_norm": 7.6068620681762695, "learning_rate": 5.454967520724682e-05, "loss": 0.8542, "step": 251100 }, { "epoch": 3.348842169815094, "grad_norm": 5.166468143463135, "learning_rate": 5.454468517732428e-05, "loss": 0.8627, "step": 251200 }, { "epoch": 3.350175307621549, "grad_norm": 6.721941947937012, "learning_rate": 5.453969309259808e-05, "loss": 0.8354, "step": 251300 }, { "epoch": 3.3515084454280037, "grad_norm": 9.65803337097168, "learning_rate": 5.4534698953486116e-05, "loss": 0.8552, "step": 251400 }, { "epoch": 
3.352841583234459, "grad_norm": 7.698729515075684, "learning_rate": 5.4529702760406515e-05, "loss": 0.9151, "step": 251500 }, { "epoch": 3.354174721040914, "grad_norm": 4.053490161895752, "learning_rate": 5.4524704513777505e-05, "loss": 0.9717, "step": 251600 }, { "epoch": 3.355507858847369, "grad_norm": 9.021767616271973, "learning_rate": 5.4519704214017555e-05, "loss": 0.8923, "step": 251700 }, { "epoch": 3.356840996653824, "grad_norm": 14.682426452636719, "learning_rate": 5.451470186154526e-05, "loss": 0.8451, "step": 251800 }, { "epoch": 3.3581741344602793, "grad_norm": 47.33797836303711, "learning_rate": 5.45096974567794e-05, "loss": 0.9549, "step": 251900 }, { "epoch": 3.3595072722667343, "grad_norm": 4.373838901519775, "learning_rate": 5.450469100013893e-05, "loss": 0.8893, "step": 252000 }, { "epoch": 3.3608404100731892, "grad_norm": 7.437839984893799, "learning_rate": 5.449968249204298e-05, "loss": 0.7936, "step": 252100 }, { "epoch": 3.362173547879644, "grad_norm": 6.800704479217529, "learning_rate": 5.4494671932910825e-05, "loss": 0.8187, "step": 252200 }, { "epoch": 3.3635066856860996, "grad_norm": 8.95647144317627, "learning_rate": 5.4489659323161964e-05, "loss": 0.8682, "step": 252300 }, { "epoch": 3.3648398234925545, "grad_norm": 8.441305160522461, "learning_rate": 5.448464466321602e-05, "loss": 0.9223, "step": 252400 }, { "epoch": 3.3661729612990094, "grad_norm": 13.335646629333496, "learning_rate": 5.4479627953492805e-05, "loss": 0.8638, "step": 252500 }, { "epoch": 3.3675060991054644, "grad_norm": 7.647675514221191, "learning_rate": 5.44746091944123e-05, "loss": 0.8427, "step": 252600 }, { "epoch": 3.3688392369119198, "grad_norm": 6.906610488891602, "learning_rate": 5.446958838639468e-05, "loss": 0.7826, "step": 252700 }, { "epoch": 3.3701723747183747, "grad_norm": 7.990323543548584, "learning_rate": 5.446456552986023e-05, "loss": 0.8384, "step": 252800 }, { "epoch": 3.3715055125248297, "grad_norm": 10.27931022644043, "learning_rate": 
5.4459540625229495e-05, "loss": 1.0066, "step": 252900 }, { "epoch": 3.3728386503312846, "grad_norm": 7.156965732574463, "learning_rate": 5.4454513672923117e-05, "loss": 0.8664, "step": 253000 }, { "epoch": 3.37417178813774, "grad_norm": 3.784658193588257, "learning_rate": 5.444948467336193e-05, "loss": 0.7516, "step": 253100 }, { "epoch": 3.375504925944195, "grad_norm": 14.667325019836426, "learning_rate": 5.4444503947561364e-05, "loss": 0.844, "step": 253200 }, { "epoch": 3.37683806375065, "grad_norm": 5.366697788238525, "learning_rate": 5.443947087521584e-05, "loss": 0.8765, "step": 253300 }, { "epoch": 3.378171201557105, "grad_norm": 70.15642547607422, "learning_rate": 5.443443575687485e-05, "loss": 0.8568, "step": 253400 }, { "epoch": 3.37950433936356, "grad_norm": 6.525642395019531, "learning_rate": 5.4429398592959925e-05, "loss": 0.9277, "step": 253500 }, { "epoch": 3.380837477170015, "grad_norm": 11.436717987060547, "learning_rate": 5.442435938389276e-05, "loss": 0.8357, "step": 253600 }, { "epoch": 3.38217061497647, "grad_norm": 7.280603885650635, "learning_rate": 5.441931813009522e-05, "loss": 0.9831, "step": 253700 }, { "epoch": 3.383503752782925, "grad_norm": 23.63903045654297, "learning_rate": 5.441427483198934e-05, "loss": 0.9613, "step": 253800 }, { "epoch": 3.38483689058938, "grad_norm": 13.056439399719238, "learning_rate": 5.440922948999733e-05, "loss": 0.9606, "step": 253900 }, { "epoch": 3.3861700283958354, "grad_norm": 54.230709075927734, "learning_rate": 5.440418210454157e-05, "loss": 0.7842, "step": 254000 }, { "epoch": 3.3875031662022903, "grad_norm": 9.880938529968262, "learning_rate": 5.4399132676044615e-05, "loss": 0.9233, "step": 254100 }, { "epoch": 3.3888363040087452, "grad_norm": 19.628522872924805, "learning_rate": 5.439408120492919e-05, "loss": 0.8719, "step": 254200 }, { "epoch": 3.3901694418152006, "grad_norm": 11.972478866577148, "learning_rate": 5.438902769161818e-05, "loss": 0.8475, "step": 254300 }, { "epoch": 
3.3915025796216556, "grad_norm": 11.598776817321777, "learning_rate": 5.438397213653465e-05, "loss": 0.9197, "step": 254400 }, { "epoch": 3.3928357174281105, "grad_norm": 3.63519024848938, "learning_rate": 5.4378914540101845e-05, "loss": 1.0572, "step": 254500 }, { "epoch": 3.3941688552345655, "grad_norm": 43.80859375, "learning_rate": 5.437385490274316e-05, "loss": 1.0347, "step": 254600 }, { "epoch": 3.3955019930410204, "grad_norm": 23.66025161743164, "learning_rate": 5.4368793224882166e-05, "loss": 0.9898, "step": 254700 }, { "epoch": 3.396835130847476, "grad_norm": 15.164233207702637, "learning_rate": 5.436372950694263e-05, "loss": 1.0326, "step": 254800 }, { "epoch": 3.3981682686539307, "grad_norm": 5.437252044677734, "learning_rate": 5.4358663749348456e-05, "loss": 0.9393, "step": 254900 }, { "epoch": 3.3995014064603857, "grad_norm": 8.536689758300781, "learning_rate": 5.435364664058479e-05, "loss": 0.9291, "step": 255000 }, { "epoch": 3.400834544266841, "grad_norm": 19.523950576782227, "learning_rate": 5.434857682533976e-05, "loss": 0.8802, "step": 255100 }, { "epoch": 3.402167682073296, "grad_norm": 19.031726837158203, "learning_rate": 5.434350497170861e-05, "loss": 1.0157, "step": 255200 }, { "epoch": 3.403500819879751, "grad_norm": 75.77684783935547, "learning_rate": 5.433843108011598e-05, "loss": 0.904, "step": 255300 }, { "epoch": 3.404833957686206, "grad_norm": 22.80597686767578, "learning_rate": 5.433335515098662e-05, "loss": 0.985, "step": 255400 }, { "epoch": 3.406167095492661, "grad_norm": 44.696475982666016, "learning_rate": 5.4328277184745465e-05, "loss": 1.0473, "step": 255500 }, { "epoch": 3.407500233299116, "grad_norm": 20.402050018310547, "learning_rate": 5.432319718181764e-05, "loss": 0.9645, "step": 255600 }, { "epoch": 3.408833371105571, "grad_norm": 13.349115371704102, "learning_rate": 5.431811514262842e-05, "loss": 0.8896, "step": 255700 }, { "epoch": 3.410166508912026, "grad_norm": 7.541125297546387, "learning_rate": 
5.4313031067603256e-05, "loss": 0.9179, "step": 255800 }, { "epoch": 3.4114996467184815, "grad_norm": 21.483562469482422, "learning_rate": 5.430794495716777e-05, "loss": 0.9732, "step": 255900 }, { "epoch": 3.4128327845249364, "grad_norm": 14.844100952148438, "learning_rate": 5.4302856811747755e-05, "loss": 0.9356, "step": 256000 }, { "epoch": 3.4141659223313914, "grad_norm": 10.56740665435791, "learning_rate": 5.429776663176918e-05, "loss": 1.0026, "step": 256100 }, { "epoch": 3.4154990601378463, "grad_norm": 55.519657135009766, "learning_rate": 5.4292674417658175e-05, "loss": 0.9276, "step": 256200 }, { "epoch": 3.4168321979443013, "grad_norm": 9.382831573486328, "learning_rate": 5.428758016984105e-05, "loss": 0.9172, "step": 256300 }, { "epoch": 3.4181653357507567, "grad_norm": 49.277835845947266, "learning_rate": 5.428248388874428e-05, "loss": 0.8919, "step": 256400 }, { "epoch": 3.4194984735572116, "grad_norm": 12.05991268157959, "learning_rate": 5.427738557479449e-05, "loss": 0.9603, "step": 256500 }, { "epoch": 3.4208316113636665, "grad_norm": 7.496159076690674, "learning_rate": 5.427228522841852e-05, "loss": 0.9706, "step": 256600 }, { "epoch": 3.422164749170122, "grad_norm": 16.903871536254883, "learning_rate": 5.4267182850043335e-05, "loss": 0.938, "step": 256700 }, { "epoch": 3.423497886976577, "grad_norm": 3.649028778076172, "learning_rate": 5.426207844009609e-05, "loss": 0.8194, "step": 256800 }, { "epoch": 3.424831024783032, "grad_norm": 2.469714879989624, "learning_rate": 5.4256971999004136e-05, "loss": 0.9283, "step": 256900 }, { "epoch": 3.4261641625894868, "grad_norm": 17.577228546142578, "learning_rate": 5.4251863527194946e-05, "loss": 0.8703, "step": 257000 }, { "epoch": 3.4274973003959417, "grad_norm": 24.35905647277832, "learning_rate": 5.424675302509618e-05, "loss": 0.9307, "step": 257100 }, { "epoch": 3.428830438202397, "grad_norm": 45.38491439819336, "learning_rate": 5.424164049313569e-05, "loss": 1.0009, "step": 257200 }, { "epoch": 
3.430163576008852, "grad_norm": 22.426410675048828, "learning_rate": 5.423652593174148e-05, "loss": 0.9694, "step": 257300 }, { "epoch": 3.431496713815307, "grad_norm": 11.417949676513672, "learning_rate": 5.423140934134172e-05, "loss": 1.027, "step": 257400 }, { "epoch": 3.4328298516217624, "grad_norm": 27.858484268188477, "learning_rate": 5.4226290722364746e-05, "loss": 0.9491, "step": 257500 }, { "epoch": 3.4341629894282173, "grad_norm": 7.785845756530762, "learning_rate": 5.4221170075239084e-05, "loss": 0.9526, "step": 257600 }, { "epoch": 3.4354961272346722, "grad_norm": 23.288524627685547, "learning_rate": 5.421604740039341e-05, "loss": 0.9558, "step": 257700 }, { "epoch": 3.436829265041127, "grad_norm": 131.7282257080078, "learning_rate": 5.4210922698256584e-05, "loss": 1.0512, "step": 257800 }, { "epoch": 3.438162402847582, "grad_norm": 14.517962455749512, "learning_rate": 5.4205795969257635e-05, "loss": 1.0574, "step": 257900 }, { "epoch": 3.4394955406540375, "grad_norm": 27.0507755279541, "learning_rate": 5.420066721382575e-05, "loss": 0.9649, "step": 258000 }, { "epoch": 3.4408286784604925, "grad_norm": 118.66785430908203, "learning_rate": 5.419553643239029e-05, "loss": 1.017, "step": 258100 }, { "epoch": 3.4421618162669474, "grad_norm": 26.98771858215332, "learning_rate": 5.4190403625380787e-05, "loss": 0.9864, "step": 258200 }, { "epoch": 3.443494954073403, "grad_norm": 19.944974899291992, "learning_rate": 5.418526879322695e-05, "loss": 1.0076, "step": 258300 }, { "epoch": 3.4448280918798577, "grad_norm": 52.79111862182617, "learning_rate": 5.4180131936358655e-05, "loss": 0.993, "step": 258400 }, { "epoch": 3.4461612296863127, "grad_norm": 22.716922760009766, "learning_rate": 5.417499305520594e-05, "loss": 1.0312, "step": 258500 }, { "epoch": 3.4474943674927676, "grad_norm": 10.036187171936035, "learning_rate": 5.416985215019899e-05, "loss": 0.8932, "step": 258600 }, { "epoch": 3.4488275052992226, "grad_norm": 109.33732604980469, "learning_rate": 
5.4164709221768226e-05, "loss": 1.0169, "step": 258700 }, { "epoch": 3.450160643105678, "grad_norm": 21.50236701965332, "learning_rate": 5.4159615729870825e-05, "loss": 0.8673, "step": 258800 }, { "epoch": 3.451493780912133, "grad_norm": 16.71588706970215, "learning_rate": 5.41544687761077e-05, "loss": 1.0385, "step": 258900 }, { "epoch": 3.452826918718588, "grad_norm": 124.0373306274414, "learning_rate": 5.414931980020859e-05, "loss": 0.9662, "step": 259000 }, { "epoch": 3.454160056525043, "grad_norm": 6.239383220672607, "learning_rate": 5.4144168802604566e-05, "loss": 0.894, "step": 259100 }, { "epoch": 3.455493194331498, "grad_norm": 26.90391731262207, "learning_rate": 5.413901578372683e-05, "loss": 1.0005, "step": 259200 }, { "epoch": 3.456826332137953, "grad_norm": 99.47506713867188, "learning_rate": 5.4133860744006776e-05, "loss": 0.969, "step": 259300 }, { "epoch": 3.458159469944408, "grad_norm": 4.635208606719971, "learning_rate": 5.412870368387599e-05, "loss": 1.0353, "step": 259400 }, { "epoch": 3.459492607750863, "grad_norm": 21.77749252319336, "learning_rate": 5.4123544603766194e-05, "loss": 1.0454, "step": 259500 }, { "epoch": 3.4608257455573184, "grad_norm": 10.506856918334961, "learning_rate": 5.411838350410928e-05, "loss": 0.9484, "step": 259600 }, { "epoch": 3.4621588833637733, "grad_norm": 12.139592170715332, "learning_rate": 5.411322038533733e-05, "loss": 1.0013, "step": 259700 }, { "epoch": 3.4634920211702283, "grad_norm": 11.908894538879395, "learning_rate": 5.4108055247882574e-05, "loss": 0.9419, "step": 259800 }, { "epoch": 3.4648251589766836, "grad_norm": 4.852936744689941, "learning_rate": 5.410288809217744e-05, "loss": 1.0772, "step": 259900 }, { "epoch": 3.4661582967831386, "grad_norm": 8.407777786254883, "learning_rate": 5.409771891865448e-05, "loss": 0.9622, "step": 260000 }, { "epoch": 3.4674914345895935, "grad_norm": 24.422853469848633, "learning_rate": 5.409254772774646e-05, "loss": 0.9458, "step": 260100 }, { "epoch": 
3.4688245723960485, "grad_norm": 6.659424781799316, "learning_rate": 5.4087374519886294e-05, "loss": 0.9022, "step": 260200 }, { "epoch": 3.4701577102025034, "grad_norm": 32.53437805175781, "learning_rate": 5.408219929550706e-05, "loss": 0.9424, "step": 260300 }, { "epoch": 3.471490848008959, "grad_norm": 123.85906219482422, "learning_rate": 5.4077022055042015e-05, "loss": 1.0058, "step": 260400 }, { "epoch": 3.4728239858154137, "grad_norm": 10.781240463256836, "learning_rate": 5.407184279892458e-05, "loss": 0.9828, "step": 260500 }, { "epoch": 3.4741571236218687, "grad_norm": 10.931221961975098, "learning_rate": 5.406666152758835e-05, "loss": 0.894, "step": 260600 }, { "epoch": 3.475490261428324, "grad_norm": 6.166298866271973, "learning_rate": 5.4061478241467074e-05, "loss": 0.9133, "step": 260700 }, { "epoch": 3.476823399234779, "grad_norm": 7.538830280303955, "learning_rate": 5.4056292940994696e-05, "loss": 0.9485, "step": 260800 }, { "epoch": 3.478156537041234, "grad_norm": 18.192277908325195, "learning_rate": 5.40511056266053e-05, "loss": 0.9854, "step": 260900 }, { "epoch": 3.479489674847689, "grad_norm": 44.85555648803711, "learning_rate": 5.4045916298733166e-05, "loss": 0.9367, "step": 261000 }, { "epoch": 3.480822812654144, "grad_norm": 6.016756534576416, "learning_rate": 5.404072495781272e-05, "loss": 1.0048, "step": 261100 }, { "epoch": 3.4821559504605992, "grad_norm": 11.332769393920898, "learning_rate": 5.403553160427855e-05, "loss": 0.8939, "step": 261200 }, { "epoch": 3.483489088267054, "grad_norm": 16.165515899658203, "learning_rate": 5.403033623856544e-05, "loss": 0.8498, "step": 261300 }, { "epoch": 3.484822226073509, "grad_norm": 34.05329132080078, "learning_rate": 5.402513886110835e-05, "loss": 0.9907, "step": 261400 }, { "epoch": 3.4861553638799645, "grad_norm": 37.343414306640625, "learning_rate": 5.401993947234235e-05, "loss": 0.9477, "step": 261500 }, { "epoch": 3.4874885016864194, "grad_norm": 36.91941452026367, "learning_rate": 
5.4014738072702736e-05, "loss": 0.9422, "step": 261600 }, { "epoch": 3.4888216394928744, "grad_norm": 48.01537322998047, "learning_rate": 5.400953466262495e-05, "loss": 0.9381, "step": 261700 }, { "epoch": 3.4901547772993293, "grad_norm": 24.5910587310791, "learning_rate": 5.400432924254462e-05, "loss": 0.893, "step": 261800 }, { "epoch": 3.4914879151057843, "grad_norm": 22.526283264160156, "learning_rate": 5.3999121812897506e-05, "loss": 0.9172, "step": 261900 }, { "epoch": 3.4928210529122397, "grad_norm": 10.374889373779297, "learning_rate": 5.399391237411956e-05, "loss": 0.999, "step": 262000 }, { "epoch": 3.4941541907186946, "grad_norm": 11.464898109436035, "learning_rate": 5.398870092664691e-05, "loss": 0.9725, "step": 262100 }, { "epoch": 3.4954873285251495, "grad_norm": 4.243405342102051, "learning_rate": 5.3983487470915834e-05, "loss": 0.9664, "step": 262200 }, { "epoch": 3.496820466331605, "grad_norm": 30.193559646606445, "learning_rate": 5.39782720073628e-05, "loss": 0.9411, "step": 262300 }, { "epoch": 3.49815360413806, "grad_norm": 9.491408348083496, "learning_rate": 5.397305453642441e-05, "loss": 0.8748, "step": 262400 }, { "epoch": 3.499486741944515, "grad_norm": 9.584452629089355, "learning_rate": 5.396783505853747e-05, "loss": 0.8796, "step": 262500 }, { "epoch": 3.5008198797509698, "grad_norm": 40.953880310058594, "learning_rate": 5.396261357413892e-05, "loss": 1.0101, "step": 262600 }, { "epoch": 3.5021530175574247, "grad_norm": 13.967059135437012, "learning_rate": 5.395744232849927e-05, "loss": 0.9296, "step": 262700 }, { "epoch": 3.50348615536388, "grad_norm": 20.0158634185791, "learning_rate": 5.395221685244329e-05, "loss": 0.8586, "step": 262800 }, { "epoch": 3.504819293170335, "grad_norm": 8.650063514709473, "learning_rate": 5.394698937118322e-05, "loss": 0.9506, "step": 262900 }, { "epoch": 3.50615243097679, "grad_norm": 7.936328411102295, "learning_rate": 5.394175988515668e-05, "loss": 0.9601, "step": 263000 }, { "epoch": 
3.5074855687832454, "grad_norm": 42.02079391479492, "learning_rate": 5.393652839480148e-05, "loss": 0.9509, "step": 263100 }, { "epoch": 3.5088187065897003, "grad_norm": 11.96915054321289, "learning_rate": 5.393129490055557e-05, "loss": 0.9853, "step": 263200 }, { "epoch": 3.5101518443961552, "grad_norm": 15.781047821044922, "learning_rate": 5.392605940285709e-05, "loss": 1.0618, "step": 263300 }, { "epoch": 3.51148498220261, "grad_norm": 12.77486801147461, "learning_rate": 5.3920821902144344e-05, "loss": 1.0407, "step": 263400 }, { "epoch": 3.512818120009065, "grad_norm": 13.527844429016113, "learning_rate": 5.391558239885578e-05, "loss": 0.9295, "step": 263500 }, { "epoch": 3.5141512578155205, "grad_norm": 37.226768493652344, "learning_rate": 5.3910340893430065e-05, "loss": 0.9741, "step": 263600 }, { "epoch": 3.5154843956219755, "grad_norm": 45.33626937866211, "learning_rate": 5.390509738630597e-05, "loss": 1.0431, "step": 263700 }, { "epoch": 3.5168175334284304, "grad_norm": 12.825387001037598, "learning_rate": 5.3899851877922485e-05, "loss": 0.9409, "step": 263800 }, { "epoch": 3.518150671234886, "grad_norm": 11.85519027709961, "learning_rate": 5.389460436871873e-05, "loss": 1.0085, "step": 263900 }, { "epoch": 3.5194838090413407, "grad_norm": 3.5273096561431885, "learning_rate": 5.3889354859134026e-05, "loss": 1.0205, "step": 264000 }, { "epoch": 3.5208169468477957, "grad_norm": 171.09910583496094, "learning_rate": 5.388410334960783e-05, "loss": 0.9973, "step": 264100 }, { "epoch": 3.5221500846542506, "grad_norm": 4.740816593170166, "learning_rate": 5.38788498405798e-05, "loss": 0.9645, "step": 264200 }, { "epoch": 3.5234832224607056, "grad_norm": 9.429879188537598, "learning_rate": 5.3873594332489735e-05, "loss": 1.0165, "step": 264300 }, { "epoch": 3.524816360267161, "grad_norm": 9.676434516906738, "learning_rate": 5.38683368257776e-05, "loss": 0.9235, "step": 264400 }, { "epoch": 3.526149498073616, "grad_norm": 6.840790748596191, "learning_rate": 
5.386307732088355e-05, "loss": 0.8531, "step": 264500 }, { "epoch": 3.527482635880071, "grad_norm": 7.200995922088623, "learning_rate": 5.3857815818247894e-05, "loss": 0.9429, "step": 264600 }, { "epoch": 3.528815773686526, "grad_norm": 4.304939270019531, "learning_rate": 5.38525523183111e-05, "loss": 0.8928, "step": 264700 }, { "epoch": 3.530148911492981, "grad_norm": 8.813733100891113, "learning_rate": 5.384728682151381e-05, "loss": 0.948, "step": 264800 }, { "epoch": 3.531482049299436, "grad_norm": 7.812871932983398, "learning_rate": 5.3842019328296854e-05, "loss": 0.8919, "step": 264900 }, { "epoch": 3.532815187105891, "grad_norm": 14.078702926635742, "learning_rate": 5.383674983910118e-05, "loss": 0.882, "step": 265000 }, { "epoch": 3.534148324912346, "grad_norm": 18.02888298034668, "learning_rate": 5.3831478354367964e-05, "loss": 0.8737, "step": 265100 }, { "epoch": 3.5354814627188014, "grad_norm": 6.239128112792969, "learning_rate": 5.382620487453849e-05, "loss": 0.9037, "step": 265200 }, { "epoch": 3.5368146005252563, "grad_norm": 3.793897867202759, "learning_rate": 5.382092940005427e-05, "loss": 0.8477, "step": 265300 }, { "epoch": 3.5381477383317113, "grad_norm": 49.383453369140625, "learning_rate": 5.381565193135691e-05, "loss": 0.9953, "step": 265400 }, { "epoch": 3.5394808761381666, "grad_norm": 5.896852016448975, "learning_rate": 5.381037246888825e-05, "loss": 0.8351, "step": 265500 }, { "epoch": 3.5408140139446216, "grad_norm": 13.297842025756836, "learning_rate": 5.380509101309028e-05, "loss": 0.9255, "step": 265600 }, { "epoch": 3.5421471517510765, "grad_norm": 7.380673885345459, "learning_rate": 5.3799807564405116e-05, "loss": 0.9182, "step": 265700 }, { "epoch": 3.5434802895575315, "grad_norm": 56.63745880126953, "learning_rate": 5.379452212327508e-05, "loss": 0.9339, "step": 265800 }, { "epoch": 3.5448134273639864, "grad_norm": 48.467796325683594, "learning_rate": 5.378923469014267e-05, "loss": 0.8595, "step": 265900 }, { "epoch": 
3.546146565170442, "grad_norm": 12.50361156463623, "learning_rate": 5.378394526545051e-05, "loss": 0.9177, "step": 266000 }, { "epoch": 3.5474797029768967, "grad_norm": 7.251820087432861, "learning_rate": 5.3778653849641435e-05, "loss": 0.8863, "step": 266100 }, { "epoch": 3.5488128407833517, "grad_norm": 80.1898193359375, "learning_rate": 5.3773360443158415e-05, "loss": 0.9044, "step": 266200 }, { "epoch": 3.550145978589807, "grad_norm": 6.285276889801025, "learning_rate": 5.37680650464446e-05, "loss": 0.8477, "step": 266300 }, { "epoch": 3.551479116396262, "grad_norm": 57.7635612487793, "learning_rate": 5.37627676599433e-05, "loss": 0.9425, "step": 266400 }, { "epoch": 3.552812254202717, "grad_norm": 4.165036201477051, "learning_rate": 5.3757468284098e-05, "loss": 0.937, "step": 266500 }, { "epoch": 3.554145392009172, "grad_norm": 7.213791847229004, "learning_rate": 5.375216691935235e-05, "loss": 0.917, "step": 266600 }, { "epoch": 3.555478529815627, "grad_norm": 63.28120040893555, "learning_rate": 5.3746863566150145e-05, "loss": 1.0048, "step": 266700 }, { "epoch": 3.5568116676220822, "grad_norm": 53.403804779052734, "learning_rate": 5.3741558224935385e-05, "loss": 0.8639, "step": 266800 }, { "epoch": 3.558144805428537, "grad_norm": 6.023101806640625, "learning_rate": 5.3736250896152224e-05, "loss": 0.9666, "step": 266900 }, { "epoch": 3.559477943234992, "grad_norm": 7.466597557067871, "learning_rate": 5.373094158024495e-05, "loss": 0.8603, "step": 267000 }, { "epoch": 3.5608110810414475, "grad_norm": 6.904023170471191, "learning_rate": 5.372563027765805e-05, "loss": 0.9205, "step": 267100 }, { "epoch": 3.5621442188479024, "grad_norm": 6.509407997131348, "learning_rate": 5.372031698883618e-05, "loss": 0.8533, "step": 267200 }, { "epoch": 3.5634773566543574, "grad_norm": 5.300851345062256, "learning_rate": 5.371500171422414e-05, "loss": 0.9626, "step": 267300 }, { "epoch": 3.5648104944608123, "grad_norm": 3.5167226791381836, "learning_rate": 
5.3709684454266915e-05, "loss": 0.846, "step": 267400 }, { "epoch": 3.5661436322672673, "grad_norm": 13.102985382080078, "learning_rate": 5.3704365209409636e-05, "loss": 0.8534, "step": 267500 }, { "epoch": 3.5674767700737227, "grad_norm": 9.090873718261719, "learning_rate": 5.3699043980097635e-05, "loss": 0.913, "step": 267600 }, { "epoch": 3.5688099078801776, "grad_norm": 5.952266693115234, "learning_rate": 5.369372076677638e-05, "loss": 0.7917, "step": 267700 }, { "epoch": 3.5701430456866325, "grad_norm": 25.144681930541992, "learning_rate": 5.36883955698915e-05, "loss": 0.851, "step": 267800 }, { "epoch": 3.571476183493088, "grad_norm": 31.452245712280273, "learning_rate": 5.368306838988883e-05, "loss": 0.8853, "step": 267900 }, { "epoch": 3.572809321299543, "grad_norm": 42.524139404296875, "learning_rate": 5.367773922721432e-05, "loss": 0.8314, "step": 268000 }, { "epoch": 3.574142459105998, "grad_norm": 7.544164657592773, "learning_rate": 5.367240808231412e-05, "loss": 0.8366, "step": 268100 }, { "epoch": 3.5754755969124528, "grad_norm": 4.872640132904053, "learning_rate": 5.366707495563454e-05, "loss": 0.8773, "step": 268200 }, { "epoch": 3.5768087347189077, "grad_norm": 25.723674774169922, "learning_rate": 5.366173984762204e-05, "loss": 0.8353, "step": 268300 }, { "epoch": 3.578141872525363, "grad_norm": 3.4925990104675293, "learning_rate": 5.365640275872329e-05, "loss": 0.8344, "step": 268400 }, { "epoch": 3.579475010331818, "grad_norm": 79.42778015136719, "learning_rate": 5.365106368938506e-05, "loss": 0.8603, "step": 268500 }, { "epoch": 3.580808148138273, "grad_norm": 3.400064468383789, "learning_rate": 5.364572264005433e-05, "loss": 0.8311, "step": 268600 }, { "epoch": 3.5821412859447284, "grad_norm": 1.1430084705352783, "learning_rate": 5.364037961117825e-05, "loss": 0.7592, "step": 268700 }, { "epoch": 3.5834744237511833, "grad_norm": 18.252349853515625, "learning_rate": 5.3635034603204095e-05, "loss": 0.9305, "step": 268800 }, { "epoch": 
3.5848075615576382, "grad_norm": 5.410533428192139, "learning_rate": 5.362968761657936e-05, "loss": 0.9134, "step": 268900 }, { "epoch": 3.586140699364093, "grad_norm": 14.34504222869873, "learning_rate": 5.3624338651751654e-05, "loss": 0.8525, "step": 269000 }, { "epoch": 3.587473837170548, "grad_norm": 6.9254655838012695, "learning_rate": 5.3618987709168795e-05, "loss": 0.8914, "step": 269100 }, { "epoch": 3.5888069749770035, "grad_norm": 12.609107971191406, "learning_rate": 5.3613634789278735e-05, "loss": 0.8873, "step": 269200 }, { "epoch": 3.5901401127834585, "grad_norm": 24.00758934020996, "learning_rate": 5.36082798925296e-05, "loss": 0.8464, "step": 269300 }, { "epoch": 3.5914732505899134, "grad_norm": 2.2649383544921875, "learning_rate": 5.3602923019369696e-05, "loss": 0.8175, "step": 269400 }, { "epoch": 3.592806388396369, "grad_norm": 15.832942962646484, "learning_rate": 5.359756417024749e-05, "loss": 0.8899, "step": 269500 }, { "epoch": 3.5941395262028237, "grad_norm": 7.271198749542236, "learning_rate": 5.359220334561159e-05, "loss": 0.8351, "step": 269600 }, { "epoch": 3.5954726640092787, "grad_norm": 6.851893901824951, "learning_rate": 5.3586840545910786e-05, "loss": 0.7647, "step": 269700 }, { "epoch": 3.5968058018157336, "grad_norm": 8.173537254333496, "learning_rate": 5.358147577159406e-05, "loss": 0.8847, "step": 269800 }, { "epoch": 3.5981389396221886, "grad_norm": 6.3226165771484375, "learning_rate": 5.3576162700365994e-05, "loss": 0.8206, "step": 269900 }, { "epoch": 3.599472077428644, "grad_norm": 8.675185203552246, "learning_rate": 5.357079399789986e-05, "loss": 0.8664, "step": 270000 }, { "epoch": 3.600805215235099, "grad_norm": 5.946749687194824, "learning_rate": 5.3565423322161165e-05, "loss": 0.8072, "step": 270100 }, { "epoch": 3.602138353041554, "grad_norm": 5.3907623291015625, "learning_rate": 5.356005067359951e-05, "loss": 0.8698, "step": 270200 }, { "epoch": 3.603471490848009, "grad_norm": 9.228534698486328, "learning_rate": 
5.355467605266468e-05, "loss": 0.8177, "step": 270300 }, { "epoch": 3.604804628654464, "grad_norm": 4.474039077758789, "learning_rate": 5.354929945980662e-05, "loss": 0.9079, "step": 270400 }, { "epoch": 3.606137766460919, "grad_norm": 15.865964889526367, "learning_rate": 5.3543920895475435e-05, "loss": 0.7607, "step": 270500 }, { "epoch": 3.607470904267374, "grad_norm": 3.5810694694519043, "learning_rate": 5.353854036012142e-05, "loss": 0.8618, "step": 270600 }, { "epoch": 3.608804042073829, "grad_norm": 1.6078554391860962, "learning_rate": 5.353315785419501e-05, "loss": 0.8129, "step": 270700 }, { "epoch": 3.6101371798802844, "grad_norm": 6.922598361968994, "learning_rate": 5.3527773378146806e-05, "loss": 0.8242, "step": 270800 }, { "epoch": 3.6114703176867393, "grad_norm": 4.366153240203857, "learning_rate": 5.352238693242757e-05, "loss": 0.8737, "step": 270900 }, { "epoch": 3.6128034554931943, "grad_norm": 2.2540481090545654, "learning_rate": 5.351699851748825e-05, "loss": 0.8129, "step": 271000 }, { "epoch": 3.6141365932996496, "grad_norm": 13.215925216674805, "learning_rate": 5.3511608133779956e-05, "loss": 0.8467, "step": 271100 }, { "epoch": 3.6154697311061046, "grad_norm": 35.36745071411133, "learning_rate": 5.350621578175394e-05, "loss": 0.8438, "step": 271200 }, { "epoch": 3.6168028689125595, "grad_norm": 78.4917984008789, "learning_rate": 5.350082146186164e-05, "loss": 0.8044, "step": 271300 }, { "epoch": 3.6181360067190145, "grad_norm": 5.4864182472229, "learning_rate": 5.349542517455464e-05, "loss": 0.8138, "step": 271400 }, { "epoch": 3.6194691445254694, "grad_norm": 9.05914306640625, "learning_rate": 5.3490026920284704e-05, "loss": 0.8733, "step": 271500 }, { "epoch": 3.620802282331925, "grad_norm": 8.046319961547852, "learning_rate": 5.348462669950378e-05, "loss": 0.8234, "step": 271600 }, { "epoch": 3.6221354201383797, "grad_norm": 11.066679000854492, "learning_rate": 5.347922451266392e-05, "loss": 0.8603, "step": 271700 }, { "epoch": 
3.6234685579448347, "grad_norm": 5.949485778808594, "learning_rate": 5.3473820360217414e-05, "loss": 0.8355, "step": 271800 }, { "epoch": 3.62480169575129, "grad_norm": 17.80038833618164, "learning_rate": 5.346841424261665e-05, "loss": 0.8718, "step": 271900 }, { "epoch": 3.626134833557745, "grad_norm": 14.517935752868652, "learning_rate": 5.346306025086104e-05, "loss": 0.9544, "step": 272000 }, { "epoch": 3.6274679713642, "grad_norm": 7.600750923156738, "learning_rate": 5.345765022394995e-05, "loss": 0.7126, "step": 272100 }, { "epoch": 3.628801109170655, "grad_norm": 8.006924629211426, "learning_rate": 5.3452238233238335e-05, "loss": 0.8302, "step": 272200 }, { "epoch": 3.63013424697711, "grad_norm": 21.958927154541016, "learning_rate": 5.344682427917926e-05, "loss": 0.7565, "step": 272300 }, { "epoch": 3.6314673847835652, "grad_norm": 41.64734649658203, "learning_rate": 5.3441408362225964e-05, "loss": 0.8044, "step": 272400 }, { "epoch": 3.63280052259002, "grad_norm": 4.9467692375183105, "learning_rate": 5.343599048283186e-05, "loss": 0.8419, "step": 272500 }, { "epoch": 3.634133660396475, "grad_norm": 14.73542594909668, "learning_rate": 5.3430570641450496e-05, "loss": 1.0322, "step": 272600 }, { "epoch": 3.6354667982029305, "grad_norm": 8.712089538574219, "learning_rate": 5.342514883853563e-05, "loss": 0.7959, "step": 272700 }, { "epoch": 3.6367999360093854, "grad_norm": 19.577531814575195, "learning_rate": 5.3419725074541144e-05, "loss": 0.8167, "step": 272800 }, { "epoch": 3.6381330738158404, "grad_norm": 16.046350479125977, "learning_rate": 5.341429934992109e-05, "loss": 0.9769, "step": 272900 }, { "epoch": 3.6394662116222953, "grad_norm": 7.698108673095703, "learning_rate": 5.340887166512971e-05, "loss": 0.8225, "step": 273000 }, { "epoch": 3.6407993494287503, "grad_norm": 29.368261337280273, "learning_rate": 5.340344202062139e-05, "loss": 0.8436, "step": 273100 }, { "epoch": 3.6421324872352057, "grad_norm": 14.88512897491455, "learning_rate": 
5.339801041685067e-05, "loss": 0.8674, "step": 273200 }, { "epoch": 3.6434656250416606, "grad_norm": 6.699275493621826, "learning_rate": 5.339257685427228e-05, "loss": 0.8343, "step": 273300 }, { "epoch": 3.6447987628481155, "grad_norm": 13.923571586608887, "learning_rate": 5.3387141333341103e-05, "loss": 0.8287, "step": 273400 }, { "epoch": 3.646131900654571, "grad_norm": 12.403640747070312, "learning_rate": 5.338170385451216e-05, "loss": 0.8221, "step": 273500 }, { "epoch": 3.647465038461026, "grad_norm": 6.299748420715332, "learning_rate": 5.3376264418240686e-05, "loss": 0.7634, "step": 273600 }, { "epoch": 3.648798176267481, "grad_norm": 9.42530632019043, "learning_rate": 5.337082302498204e-05, "loss": 0.7134, "step": 273700 }, { "epoch": 3.6501313140739358, "grad_norm": 10.998556137084961, "learning_rate": 5.336537967519176e-05, "loss": 0.8496, "step": 273800 }, { "epoch": 3.6514644518803907, "grad_norm": 18.33515167236328, "learning_rate": 5.3359934369325545e-05, "loss": 0.9177, "step": 273900 }, { "epoch": 3.652797589686846, "grad_norm": 10.869627952575684, "learning_rate": 5.335448710783927e-05, "loss": 0.7901, "step": 274000 }, { "epoch": 3.654130727493301, "grad_norm": 17.100685119628906, "learning_rate": 5.3349037891188946e-05, "loss": 0.863, "step": 274100 }, { "epoch": 3.655463865299756, "grad_norm": 9.410653114318848, "learning_rate": 5.334364124121867e-05, "loss": 0.9056, "step": 274200 }, { "epoch": 3.6567970031062114, "grad_norm": 93.21263122558594, "learning_rate": 5.333818813514926e-05, "loss": 0.9122, "step": 274300 }, { "epoch": 3.6581301409126663, "grad_norm": 8.252012252807617, "learning_rate": 5.333278763554881e-05, "loss": 0.9631, "step": 274400 }, { "epoch": 3.6594632787191212, "grad_norm": 5.7947163581848145, "learning_rate": 5.332733064186816e-05, "loss": 0.8699, "step": 274500 }, { "epoch": 3.660796416525576, "grad_norm": 2.113999843597412, "learning_rate": 5.332187169529693e-05, "loss": 0.8411, "step": 274600 }, { "epoch": 
3.662129554332031, "grad_norm": 28.436302185058594, "learning_rate": 5.331641079629211e-05, "loss": 0.7609, "step": 274700 }, { "epoch": 3.6634626921384865, "grad_norm": 18.767906188964844, "learning_rate": 5.331100258348148e-05, "loss": 0.7809, "step": 274800 }, { "epoch": 3.6647958299449415, "grad_norm": 50.512393951416016, "learning_rate": 5.33055378004941e-05, "loss": 0.9372, "step": 274900 }, { "epoch": 3.6661289677513964, "grad_norm": 4.8961639404296875, "learning_rate": 5.330007106644056e-05, "loss": 0.8972, "step": 275000 }, { "epoch": 3.667462105557852, "grad_norm": 4.503507137298584, "learning_rate": 5.329460238177852e-05, "loss": 0.8141, "step": 275100 }, { "epoch": 3.6687952433643067, "grad_norm": 8.144170761108398, "learning_rate": 5.32891317469658e-05, "loss": 0.8828, "step": 275200 }, { "epoch": 3.6701283811707617, "grad_norm": 7.00837516784668, "learning_rate": 5.328365916246038e-05, "loss": 0.7997, "step": 275300 }, { "epoch": 3.6714615189772166, "grad_norm": 3.9859025478363037, "learning_rate": 5.3278184628720414e-05, "loss": 0.8547, "step": 275400 }, { "epoch": 3.6727946567836716, "grad_norm": 7.264124870300293, "learning_rate": 5.3272708146204207e-05, "loss": 0.7701, "step": 275500 }, { "epoch": 3.6741277945901265, "grad_norm": 0.7586125135421753, "learning_rate": 5.3267229715370244e-05, "loss": 0.8164, "step": 275600 }, { "epoch": 3.675460932396582, "grad_norm": 4.806887626647949, "learning_rate": 5.3261749336677144e-05, "loss": 0.9256, "step": 275700 }, { "epoch": 3.676794070203037, "grad_norm": 4.754878997802734, "learning_rate": 5.325626701058372e-05, "loss": 0.8086, "step": 275800 }, { "epoch": 3.6781272080094918, "grad_norm": 31.495500564575195, "learning_rate": 5.325078273754894e-05, "loss": 0.8313, "step": 275900 }, { "epoch": 3.679460345815947, "grad_norm": 26.8424072265625, "learning_rate": 5.324529651803191e-05, "loss": 0.8157, "step": 276000 }, { "epoch": 3.680793483622402, "grad_norm": 6.811697959899902, "learning_rate": 
5.323980835249194e-05, "loss": 0.8545, "step": 276100 }, { "epoch": 3.682126621428857, "grad_norm": 9.043444633483887, "learning_rate": 5.323431824138847e-05, "loss": 0.8498, "step": 276200 }, { "epoch": 3.683459759235312, "grad_norm": 11.799369812011719, "learning_rate": 5.322882618518112e-05, "loss": 0.7717, "step": 276300 }, { "epoch": 3.684792897041767, "grad_norm": 11.056285858154297, "learning_rate": 5.322333218432967e-05, "loss": 0.8388, "step": 276400 }, { "epoch": 3.6861260348482223, "grad_norm": 11.16865348815918, "learning_rate": 5.321783623929404e-05, "loss": 0.8801, "step": 276500 }, { "epoch": 3.6874591726546773, "grad_norm": 2.2857816219329834, "learning_rate": 5.3212338350534356e-05, "loss": 0.9176, "step": 276600 }, { "epoch": 3.688792310461132, "grad_norm": 5.933136463165283, "learning_rate": 5.320683851851088e-05, "loss": 0.8447, "step": 276700 }, { "epoch": 3.6901254482675876, "grad_norm": 26.20657730102539, "learning_rate": 5.320133674368403e-05, "loss": 0.9673, "step": 276800 }, { "epoch": 3.6914585860740425, "grad_norm": 5.43034029006958, "learning_rate": 5.319583302651442e-05, "loss": 0.94, "step": 276900 }, { "epoch": 3.6927917238804975, "grad_norm": 5.933419704437256, "learning_rate": 5.319032736746278e-05, "loss": 0.8027, "step": 277000 }, { "epoch": 3.6941248616869524, "grad_norm": 13.596923828125, "learning_rate": 5.318481976699003e-05, "loss": 0.922, "step": 277100 }, { "epoch": 3.6954579994934074, "grad_norm": 21.47899055480957, "learning_rate": 5.317931022555726e-05, "loss": 0.8553, "step": 277200 }, { "epoch": 3.6967911372998628, "grad_norm": 6.818812370300293, "learning_rate": 5.31737987436257e-05, "loss": 0.7899, "step": 277300 }, { "epoch": 3.6981242751063177, "grad_norm": 8.177650451660156, "learning_rate": 5.316828532165676e-05, "loss": 0.9074, "step": 277400 }, { "epoch": 3.6994574129127726, "grad_norm": 5.326920986175537, "learning_rate": 5.316276996011201e-05, "loss": 0.755, "step": 277500 }, { "epoch": 3.700790550719228, 
"grad_norm": 4.886800765991211, "learning_rate": 5.315725265945317e-05, "loss": 0.829, "step": 277600 }, { "epoch": 3.702123688525683, "grad_norm": 24.214658737182617, "learning_rate": 5.315173342014214e-05, "loss": 0.8428, "step": 277700 }, { "epoch": 3.703456826332138, "grad_norm": 53.51362991333008, "learning_rate": 5.314621224264097e-05, "loss": 0.9015, "step": 277800 }, { "epoch": 3.704789964138593, "grad_norm": 9.88304328918457, "learning_rate": 5.314068912741187e-05, "loss": 0.9026, "step": 277900 }, { "epoch": 3.706123101945048, "grad_norm": 5.261514663696289, "learning_rate": 5.3135164074917226e-05, "loss": 0.8621, "step": 278000 }, { "epoch": 3.707456239751503, "grad_norm": 8.676607131958008, "learning_rate": 5.312963708561957e-05, "loss": 0.8135, "step": 278100 }, { "epoch": 3.708789377557958, "grad_norm": 10.475987434387207, "learning_rate": 5.312410815998161e-05, "loss": 0.8062, "step": 278200 }, { "epoch": 3.710122515364413, "grad_norm": 11.826848030090332, "learning_rate": 5.31185772984662e-05, "loss": 0.8374, "step": 278300 }, { "epoch": 3.7114556531708685, "grad_norm": 6.788539409637451, "learning_rate": 5.3113044501536385e-05, "loss": 0.8137, "step": 278400 }, { "epoch": 3.7127887909773234, "grad_norm": 29.590911865234375, "learning_rate": 5.310750976965534e-05, "loss": 0.8534, "step": 278500 }, { "epoch": 3.7141219287837783, "grad_norm": 6.204885005950928, "learning_rate": 5.3101973103286425e-05, "loss": 0.8064, "step": 278600 }, { "epoch": 3.7154550665902333, "grad_norm": 16.902334213256836, "learning_rate": 5.309643450289313e-05, "loss": 0.8563, "step": 278700 }, { "epoch": 3.7167882043966882, "grad_norm": 28.61983871459961, "learning_rate": 5.309089396893916e-05, "loss": 0.7744, "step": 278800 }, { "epoch": 3.7181213422031436, "grad_norm": 12.851700782775879, "learning_rate": 5.3085351501888323e-05, "loss": 0.8661, "step": 278900 }, { "epoch": 3.7194544800095986, "grad_norm": 7.802393913269043, "learning_rate": 5.307980710220464e-05, "loss": 
0.8221, "step": 279000 }, { "epoch": 3.7207876178160535, "grad_norm": 11.658449172973633, "learning_rate": 5.307426077035225e-05, "loss": 0.8925, "step": 279100 }, { "epoch": 3.722120755622509, "grad_norm": 15.97349739074707, "learning_rate": 5.3068712506795494e-05, "loss": 0.8723, "step": 279200 }, { "epoch": 3.723453893428964, "grad_norm": 1.1406205892562866, "learning_rate": 5.306316231199884e-05, "loss": 0.7493, "step": 279300 }, { "epoch": 3.7247870312354188, "grad_norm": 12.490666389465332, "learning_rate": 5.305761018642694e-05, "loss": 0.8288, "step": 279400 }, { "epoch": 3.7261201690418737, "grad_norm": 17.81568145751953, "learning_rate": 5.305205613054461e-05, "loss": 0.7852, "step": 279500 }, { "epoch": 3.7274533068483287, "grad_norm": 3.8805325031280518, "learning_rate": 5.30465001448168e-05, "loss": 0.8257, "step": 279600 }, { "epoch": 3.728786444654784, "grad_norm": 7.742098331451416, "learning_rate": 5.304094222970864e-05, "loss": 0.8179, "step": 279700 }, { "epoch": 3.730119582461239, "grad_norm": 5.83899450302124, "learning_rate": 5.303538238568542e-05, "loss": 0.8021, "step": 279800 }, { "epoch": 3.731452720267694, "grad_norm": 11.47380542755127, "learning_rate": 5.302982061321262e-05, "loss": 0.7993, "step": 279900 }, { "epoch": 3.7327858580741493, "grad_norm": 5.622348308563232, "learning_rate": 5.3024256912755826e-05, "loss": 0.8433, "step": 280000 }, { "epoch": 3.7341189958806043, "grad_norm": 37.46951675415039, "learning_rate": 5.301869128478082e-05, "loss": 0.8286, "step": 280100 }, { "epoch": 3.735452133687059, "grad_norm": 118.23983001708984, "learning_rate": 5.301312372975355e-05, "loss": 0.8242, "step": 280200 }, { "epoch": 3.736785271493514, "grad_norm": 54.71044158935547, "learning_rate": 5.3007554248140096e-05, "loss": 0.8449, "step": 280300 }, { "epoch": 3.738118409299969, "grad_norm": 7.6794586181640625, "learning_rate": 5.3001982840406726e-05, "loss": 0.7835, "step": 280400 }, { "epoch": 3.7394515471064245, "grad_norm": 
25.195280075073242, "learning_rate": 5.2996409507019866e-05, "loss": 0.8447, "step": 280500 }, { "epoch": 3.7407846849128794, "grad_norm": 20.51214599609375, "learning_rate": 5.299083424844609e-05, "loss": 0.7609, "step": 280600 }, { "epoch": 3.7421178227193344, "grad_norm": 8.141054153442383, "learning_rate": 5.2985257065152146e-05, "loss": 0.8666, "step": 280700 }, { "epoch": 3.7434509605257897, "grad_norm": 5.483390808105469, "learning_rate": 5.2979677957604934e-05, "loss": 0.8173, "step": 280800 }, { "epoch": 3.7447840983322447, "grad_norm": 20.130680084228516, "learning_rate": 5.297409692627152e-05, "loss": 0.8154, "step": 280900 }, { "epoch": 3.7461172361386996, "grad_norm": 3.8348026275634766, "learning_rate": 5.2968513971619136e-05, "loss": 0.8627, "step": 281000 }, { "epoch": 3.7474503739451546, "grad_norm": 5.733528137207031, "learning_rate": 5.2962984952406784e-05, "loss": 0.841, "step": 281100 }, { "epoch": 3.7487835117516095, "grad_norm": 13.35064697265625, "learning_rate": 5.2957454049061256e-05, "loss": 0.909, "step": 281200 }, { "epoch": 3.750116649558065, "grad_norm": 17.736923217773438, "learning_rate": 5.2951865365690654e-05, "loss": 0.7563, "step": 281300 }, { "epoch": 3.75144978736452, "grad_norm": 19.567047119140625, "learning_rate": 5.294627476086224e-05, "loss": 0.8682, "step": 281400 }, { "epoch": 3.752782925170975, "grad_norm": 11.86171817779541, "learning_rate": 5.294068223504404e-05, "loss": 0.901, "step": 281500 }, { "epoch": 3.75411606297743, "grad_norm": 12.64025592803955, "learning_rate": 5.2935087788704224e-05, "loss": 0.9144, "step": 281600 }, { "epoch": 3.755449200783885, "grad_norm": 13.735821723937988, "learning_rate": 5.292949142231117e-05, "loss": 0.8176, "step": 281700 }, { "epoch": 3.75678233859034, "grad_norm": 7.684802532196045, "learning_rate": 5.292389313633337e-05, "loss": 0.8982, "step": 281800 }, { "epoch": 3.758115476396795, "grad_norm": 28.7979736328125, "learning_rate": 5.29182929312395e-05, "loss": 0.882, "step": 
281900 }, { "epoch": 3.75944861420325, "grad_norm": 9.746452331542969, "learning_rate": 5.291269080749839e-05, "loss": 0.8418, "step": 282000 }, { "epoch": 3.7607817520097053, "grad_norm": 9.487780570983887, "learning_rate": 5.290708676557903e-05, "loss": 0.8653, "step": 282100 }, { "epoch": 3.7621148898161603, "grad_norm": 8.89246940612793, "learning_rate": 5.2901480805950576e-05, "loss": 0.806, "step": 282200 }, { "epoch": 3.763448027622615, "grad_norm": 35.15156936645508, "learning_rate": 5.2895872929082345e-05, "loss": 0.7064, "step": 282300 }, { "epoch": 3.7647811654290706, "grad_norm": 3.243042230606079, "learning_rate": 5.289026313544379e-05, "loss": 0.7906, "step": 282400 }, { "epoch": 3.7661143032355255, "grad_norm": 4.992863655090332, "learning_rate": 5.288465142550458e-05, "loss": 0.7557, "step": 282500 }, { "epoch": 3.7674474410419805, "grad_norm": 3.535097122192383, "learning_rate": 5.2879037799734486e-05, "loss": 0.7608, "step": 282600 }, { "epoch": 3.7687805788484354, "grad_norm": 53.83796691894531, "learning_rate": 5.287342225860347e-05, "loss": 0.8283, "step": 282700 }, { "epoch": 3.7701137166548904, "grad_norm": 1.5114786624908447, "learning_rate": 5.286780480258165e-05, "loss": 0.8241, "step": 282800 }, { "epoch": 3.7714468544613458, "grad_norm": 7.334554195404053, "learning_rate": 5.2862185432139285e-05, "loss": 0.9061, "step": 282900 }, { "epoch": 3.7727799922678007, "grad_norm": 3.67539644241333, "learning_rate": 5.2856564147746825e-05, "loss": 0.8007, "step": 283000 }, { "epoch": 3.7741131300742556, "grad_norm": 8.544854164123535, "learning_rate": 5.285094094987487e-05, "loss": 0.812, "step": 283100 }, { "epoch": 3.775446267880711, "grad_norm": 4.759920120239258, "learning_rate": 5.2845315838994166e-05, "loss": 0.8099, "step": 283200 }, { "epoch": 3.776779405687166, "grad_norm": 7.416076183319092, "learning_rate": 5.283968881557564e-05, "loss": 0.8622, "step": 283300 }, { "epoch": 3.778112543493621, "grad_norm": 16.528026580810547, 
"learning_rate": 5.283405988009037e-05, "loss": 0.8184, "step": 283400 }, { "epoch": 3.779445681300076, "grad_norm": 8.69449234008789, "learning_rate": 5.2828429033009575e-05, "loss": 0.8476, "step": 283500 }, { "epoch": 3.780778819106531, "grad_norm": 7.9810261726379395, "learning_rate": 5.282285261184524e-05, "loss": 0.7716, "step": 283600 }, { "epoch": 3.782111956912986, "grad_norm": 4.042098045349121, "learning_rate": 5.281721796209196e-05, "loss": 0.8715, "step": 283700 }, { "epoch": 3.783445094719441, "grad_norm": 15.573029518127441, "learning_rate": 5.281158140215311e-05, "loss": 0.9002, "step": 283800 }, { "epoch": 3.784778232525896, "grad_norm": 13.761211395263672, "learning_rate": 5.280594293250059e-05, "loss": 0.7954, "step": 283900 }, { "epoch": 3.7861113703323515, "grad_norm": 5.915521621704102, "learning_rate": 5.280030255360641e-05, "loss": 0.8883, "step": 284000 }, { "epoch": 3.7874445081388064, "grad_norm": 267.3552551269531, "learning_rate": 5.279466026594277e-05, "loss": 0.8499, "step": 284100 }, { "epoch": 3.7887776459452613, "grad_norm": 41.416419982910156, "learning_rate": 5.2789016069982034e-05, "loss": 0.7956, "step": 284200 }, { "epoch": 3.7901107837517163, "grad_norm": 20.531171798706055, "learning_rate": 5.278336996619671e-05, "loss": 0.79, "step": 284300 }, { "epoch": 3.7914439215581712, "grad_norm": 2.8506758213043213, "learning_rate": 5.277772195505947e-05, "loss": 0.7711, "step": 284400 }, { "epoch": 3.7927770593646266, "grad_norm": 21.111289978027344, "learning_rate": 5.277207203704315e-05, "loss": 0.8463, "step": 284500 }, { "epoch": 3.7941101971710816, "grad_norm": 13.238577842712402, "learning_rate": 5.276642021262074e-05, "loss": 0.7496, "step": 284600 }, { "epoch": 3.7954433349775365, "grad_norm": 13.944062232971191, "learning_rate": 5.2760766482265397e-05, "loss": 0.8381, "step": 284700 }, { "epoch": 3.796776472783992, "grad_norm": 4.2707905769348145, "learning_rate": 5.275511084645043e-05, "loss": 0.758, "step": 284800 }, { 
"epoch": 3.798109610590447, "grad_norm": 3.661444902420044, "learning_rate": 5.274945330564932e-05, "loss": 0.7937, "step": 284900 }, { "epoch": 3.7994427483969018, "grad_norm": 17.354490280151367, "learning_rate": 5.274379386033568e-05, "loss": 0.8597, "step": 285000 }, { "epoch": 3.8007758862033567, "grad_norm": 41.578678131103516, "learning_rate": 5.273813251098332e-05, "loss": 0.85, "step": 285100 }, { "epoch": 3.8021090240098117, "grad_norm": 14.480527877807617, "learning_rate": 5.273246925806619e-05, "loss": 0.9386, "step": 285200 }, { "epoch": 3.803442161816267, "grad_norm": 14.678339958190918, "learning_rate": 5.2726804102058386e-05, "loss": 0.7667, "step": 285300 }, { "epoch": 3.804775299622722, "grad_norm": 35.07117462158203, "learning_rate": 5.272113704343418e-05, "loss": 0.8304, "step": 285400 }, { "epoch": 3.806108437429177, "grad_norm": 12.211941719055176, "learning_rate": 5.271546808266801e-05, "loss": 0.7496, "step": 285500 }, { "epoch": 3.8074415752356323, "grad_norm": 12.30476188659668, "learning_rate": 5.2709797220234455e-05, "loss": 0.8901, "step": 285600 }, { "epoch": 3.8087747130420873, "grad_norm": 25.449092864990234, "learning_rate": 5.270412445660826e-05, "loss": 0.8472, "step": 285700 }, { "epoch": 3.810107850848542, "grad_norm": 2.0459024906158447, "learning_rate": 5.269844979226435e-05, "loss": 0.8649, "step": 285800 }, { "epoch": 3.811440988654997, "grad_norm": 10.025466918945312, "learning_rate": 5.269277322767776e-05, "loss": 0.8723, "step": 285900 }, { "epoch": 3.812774126461452, "grad_norm": 5.334728240966797, "learning_rate": 5.268709476332373e-05, "loss": 0.8844, "step": 286000 }, { "epoch": 3.8141072642679075, "grad_norm": 9.676863670349121, "learning_rate": 5.2681414399677645e-05, "loss": 0.7897, "step": 286100 }, { "epoch": 3.8154404020743624, "grad_norm": 4.681800365447998, "learning_rate": 5.267573213721504e-05, "loss": 0.9064, "step": 286200 }, { "epoch": 3.8167735398808174, "grad_norm": 5.876117706298828, "learning_rate": 
5.267004797641163e-05, "loss": 0.801, "step": 286300 }, { "epoch": 3.8181066776872727, "grad_norm": 14.199224472045898, "learning_rate": 5.266436191774326e-05, "loss": 0.7983, "step": 286400 }, { "epoch": 3.8194398154937277, "grad_norm": 21.49662971496582, "learning_rate": 5.265867396168596e-05, "loss": 0.7255, "step": 286500 }, { "epoch": 3.8207729533001826, "grad_norm": 2.469348192214966, "learning_rate": 5.2652984108715895e-05, "loss": 0.8258, "step": 286600 }, { "epoch": 3.8221060911066376, "grad_norm": 17.62484359741211, "learning_rate": 5.2647292359309416e-05, "loss": 0.7175, "step": 286700 }, { "epoch": 3.8234392289130925, "grad_norm": 4.7123894691467285, "learning_rate": 5.2641598713943e-05, "loss": 0.7432, "step": 286800 }, { "epoch": 3.824772366719548, "grad_norm": 12.915946960449219, "learning_rate": 5.263590317309332e-05, "loss": 0.7754, "step": 286900 }, { "epoch": 3.826105504526003, "grad_norm": 9.770316123962402, "learning_rate": 5.2630205737237186e-05, "loss": 0.8804, "step": 287000 }, { "epoch": 3.827438642332458, "grad_norm": 13.722908973693848, "learning_rate": 5.262450640685156e-05, "loss": 0.8519, "step": 287100 }, { "epoch": 3.828771780138913, "grad_norm": 4.220076560974121, "learning_rate": 5.261880518241357e-05, "loss": 0.821, "step": 287200 }, { "epoch": 3.830104917945368, "grad_norm": 18.084932327270508, "learning_rate": 5.2613102064400516e-05, "loss": 0.8046, "step": 287300 }, { "epoch": 3.831438055751823, "grad_norm": 5.023919105529785, "learning_rate": 5.2607397053289844e-05, "loss": 0.7389, "step": 287400 }, { "epoch": 3.832771193558278, "grad_norm": 15.779045104980469, "learning_rate": 5.2601690149559145e-05, "loss": 0.9252, "step": 287500 }, { "epoch": 3.834104331364733, "grad_norm": 27.069873809814453, "learning_rate": 5.25959813536862e-05, "loss": 0.7731, "step": 287600 }, { "epoch": 3.8354374691711883, "grad_norm": 17.932266235351562, "learning_rate": 5.259027066614893e-05, "loss": 0.8082, "step": 287700 }, { "epoch": 
3.8367706069776433, "grad_norm": 15.043538093566895, "learning_rate": 5.25845580874254e-05, "loss": 0.7753, "step": 287800 }, { "epoch": 3.838103744784098, "grad_norm": 18.083925247192383, "learning_rate": 5.257884361799386e-05, "loss": 0.8641, "step": 287900 }, { "epoch": 3.8394368825905536, "grad_norm": 8.595149040222168, "learning_rate": 5.257312725833271e-05, "loss": 0.8323, "step": 288000 }, { "epoch": 3.8407700203970085, "grad_norm": 14.349325180053711, "learning_rate": 5.256740900892051e-05, "loss": 0.7215, "step": 288100 }, { "epoch": 3.8421031582034635, "grad_norm": 3.941537380218506, "learning_rate": 5.256168887023595e-05, "loss": 0.8278, "step": 288200 }, { "epoch": 3.8434362960099184, "grad_norm": 7.361851215362549, "learning_rate": 5.255596684275793e-05, "loss": 0.8159, "step": 288300 }, { "epoch": 3.8447694338163734, "grad_norm": 9.082711219787598, "learning_rate": 5.2550242926965466e-05, "loss": 0.8455, "step": 288400 }, { "epoch": 3.8461025716228288, "grad_norm": 8.945943832397461, "learning_rate": 5.2544517123337746e-05, "loss": 0.7829, "step": 288500 }, { "epoch": 3.8474357094292837, "grad_norm": 7.789073467254639, "learning_rate": 5.2538789432354126e-05, "loss": 0.6715, "step": 288600 }, { "epoch": 3.8487688472357386, "grad_norm": 2.2784249782562256, "learning_rate": 5.2533059854494096e-05, "loss": 0.8522, "step": 288700 }, { "epoch": 3.850101985042194, "grad_norm": 3.5291483402252197, "learning_rate": 5.2527328390237326e-05, "loss": 0.7724, "step": 288800 }, { "epoch": 3.851435122848649, "grad_norm": 7.424714088439941, "learning_rate": 5.2521595040063644e-05, "loss": 0.7884, "step": 288900 }, { "epoch": 3.852768260655104, "grad_norm": 6.6215386390686035, "learning_rate": 5.251585980445301e-05, "loss": 0.7584, "step": 289000 }, { "epoch": 3.854101398461559, "grad_norm": 23.612903594970703, "learning_rate": 5.251012268388557e-05, "loss": 0.7736, "step": 289100 }, { "epoch": 3.855434536268014, "grad_norm": 14.32138729095459, "learning_rate": 
5.250438367884162e-05, "loss": 0.8182, "step": 289200 }, { "epoch": 3.856767674074469, "grad_norm": 10.842520713806152, "learning_rate": 5.2498642789801615e-05, "loss": 0.7792, "step": 289300 }, { "epoch": 3.858100811880924, "grad_norm": 6.592386245727539, "learning_rate": 5.249290001724615e-05, "loss": 0.8028, "step": 289400 }, { "epoch": 3.859433949687379, "grad_norm": 83.94574737548828, "learning_rate": 5.2487155361655995e-05, "loss": 0.7744, "step": 289500 }, { "epoch": 3.8607670874938345, "grad_norm": 4.076933860778809, "learning_rate": 5.248140882351208e-05, "loss": 0.7518, "step": 289600 }, { "epoch": 3.8621002253002894, "grad_norm": 20.352863311767578, "learning_rate": 5.2475660403295494e-05, "loss": 0.8049, "step": 289700 }, { "epoch": 3.8634333631067443, "grad_norm": 13.13442325592041, "learning_rate": 5.246991010148746e-05, "loss": 0.8205, "step": 289800 }, { "epoch": 3.8647665009131993, "grad_norm": 5.467333793640137, "learning_rate": 5.2464157918569386e-05, "loss": 0.815, "step": 289900 }, { "epoch": 3.8660996387196542, "grad_norm": 44.68418502807617, "learning_rate": 5.245840385502283e-05, "loss": 0.7366, "step": 290000 }, { "epoch": 3.8674327765261096, "grad_norm": 17.51060676574707, "learning_rate": 5.2452647911329485e-05, "loss": 0.7811, "step": 290100 }, { "epoch": 3.8687659143325646, "grad_norm": 8.838266372680664, "learning_rate": 5.244689008797125e-05, "loss": 0.725, "step": 290200 }, { "epoch": 3.8700990521390195, "grad_norm": 12.286373138427734, "learning_rate": 5.244113038543013e-05, "loss": 0.8133, "step": 290300 }, { "epoch": 3.871432189945475, "grad_norm": 24.4243221282959, "learning_rate": 5.243536880418831e-05, "loss": 0.7429, "step": 290400 }, { "epoch": 3.87276532775193, "grad_norm": 3.5364255905151367, "learning_rate": 5.2429605344728136e-05, "loss": 0.7593, "step": 290500 }, { "epoch": 3.8740984655583848, "grad_norm": 13.124157905578613, "learning_rate": 5.2423897670197286e-05, "loss": 0.7085, "step": 290600 }, { "epoch": 
3.8754316033648397, "grad_norm": 9.838518142700195, "learning_rate": 5.24181304745182e-05, "loss": 0.8369, "step": 290700 }, { "epoch": 3.8767647411712947, "grad_norm": 3.8194243907928467, "learning_rate": 5.24123614020639e-05, "loss": 0.7833, "step": 290800 }, { "epoch": 3.87809787897775, "grad_norm": 5.586716651916504, "learning_rate": 5.240659045331735e-05, "loss": 0.7827, "step": 290900 }, { "epoch": 3.879431016784205, "grad_norm": 27.988544464111328, "learning_rate": 5.240081762876168e-05, "loss": 0.8315, "step": 291000 }, { "epoch": 3.88076415459066, "grad_norm": 6.084184169769287, "learning_rate": 5.239504292888018e-05, "loss": 0.7091, "step": 291100 }, { "epoch": 3.8820972923971153, "grad_norm": 8.883874893188477, "learning_rate": 5.238926635415627e-05, "loss": 0.7719, "step": 291200 }, { "epoch": 3.8834304302035703, "grad_norm": 13.92149829864502, "learning_rate": 5.2383487905073564e-05, "loss": 0.7524, "step": 291300 }, { "epoch": 3.884763568010025, "grad_norm": 6.484952449798584, "learning_rate": 5.2377707582115815e-05, "loss": 0.8296, "step": 291400 }, { "epoch": 3.88609670581648, "grad_norm": 7.400921821594238, "learning_rate": 5.237192538576692e-05, "loss": 0.8046, "step": 291500 }, { "epoch": 3.887429843622935, "grad_norm": 7.734588623046875, "learning_rate": 5.2366141316510956e-05, "loss": 0.7797, "step": 291600 }, { "epoch": 3.8887629814293905, "grad_norm": 6.272462844848633, "learning_rate": 5.236035537483215e-05, "loss": 0.7049, "step": 291700 }, { "epoch": 3.8900961192358454, "grad_norm": 37.40755081176758, "learning_rate": 5.235456756121487e-05, "loss": 0.8084, "step": 291800 }, { "epoch": 3.8914292570423004, "grad_norm": 13.370132446289062, "learning_rate": 5.234877787614366e-05, "loss": 0.7544, "step": 291900 }, { "epoch": 3.8927623948487557, "grad_norm": 14.618858337402344, "learning_rate": 5.234298632010322e-05, "loss": 0.8398, "step": 292000 }, { "epoch": 3.8940955326552107, "grad_norm": 5.055782794952393, "learning_rate": 
5.23371928935784e-05, "loss": 0.7394, "step": 292100 }, { "epoch": 3.8954286704616656, "grad_norm": 25.20567512512207, "learning_rate": 5.23313975970542e-05, "loss": 0.7796, "step": 292200 }, { "epoch": 3.8967618082681206, "grad_norm": 5.145323276519775, "learning_rate": 5.232560043101579e-05, "loss": 0.7375, "step": 292300 }, { "epoch": 3.8980949460745755, "grad_norm": 8.359809875488281, "learning_rate": 5.231980139594848e-05, "loss": 0.826, "step": 292400 }, { "epoch": 3.899428083881031, "grad_norm": 5.3065619468688965, "learning_rate": 5.2314000492337766e-05, "loss": 0.771, "step": 292500 }, { "epoch": 3.900761221687486, "grad_norm": 24.927194595336914, "learning_rate": 5.230819772066927e-05, "loss": 0.7598, "step": 292600 }, { "epoch": 3.902094359493941, "grad_norm": 4.467944145202637, "learning_rate": 5.230250919251265e-05, "loss": 0.7494, "step": 292700 }, { "epoch": 3.903427497300396, "grad_norm": 7.562213897705078, "learning_rate": 5.229670272352308e-05, "loss": 0.8275, "step": 292800 }, { "epoch": 3.904760635106851, "grad_norm": 13.641551971435547, "learning_rate": 5.2290894387923825e-05, "loss": 0.787, "step": 292900 }, { "epoch": 3.906093772913306, "grad_norm": 4.384075164794922, "learning_rate": 5.228508418620117e-05, "loss": 0.6581, "step": 293000 }, { "epoch": 3.907426910719761, "grad_norm": 4.826481819152832, "learning_rate": 5.227927211884151e-05, "loss": 0.883, "step": 293100 }, { "epoch": 3.908760048526216, "grad_norm": 86.46570587158203, "learning_rate": 5.227345818633143e-05, "loss": 0.8119, "step": 293200 }, { "epoch": 3.9100931863326713, "grad_norm": 48.2784538269043, "learning_rate": 5.2267642389157635e-05, "loss": 0.7732, "step": 293300 }, { "epoch": 3.9114263241391263, "grad_norm": 8.419891357421875, "learning_rate": 5.226182472780701e-05, "loss": 0.7313, "step": 293400 }, { "epoch": 3.912759461945581, "grad_norm": 5.602296352386475, "learning_rate": 5.225600520276659e-05, "loss": 0.8483, "step": 293500 }, { "epoch": 3.9140925997520366, 
"grad_norm": 3.0828816890716553, "learning_rate": 5.2250183814523575e-05, "loss": 0.7379, "step": 293600 }, { "epoch": 3.9154257375584915, "grad_norm": 2.9291601181030273, "learning_rate": 5.224436056356532e-05, "loss": 0.7731, "step": 293700 }, { "epoch": 3.9167588753649465, "grad_norm": 3.4690682888031006, "learning_rate": 5.223853545037931e-05, "loss": 0.7311, "step": 293800 }, { "epoch": 3.9180920131714014, "grad_norm": 4.076145648956299, "learning_rate": 5.2232708475453214e-05, "loss": 0.7405, "step": 293900 }, { "epoch": 3.9194251509778564, "grad_norm": 22.868579864501953, "learning_rate": 5.222687963927485e-05, "loss": 0.7897, "step": 294000 }, { "epoch": 3.9207582887843118, "grad_norm": 7.908239841461182, "learning_rate": 5.2221048942332195e-05, "loss": 0.7402, "step": 294100 }, { "epoch": 3.9220914265907667, "grad_norm": 2.3729848861694336, "learning_rate": 5.221521638511336e-05, "loss": 0.8291, "step": 294200 }, { "epoch": 3.9234245643972216, "grad_norm": 6.132571697235107, "learning_rate": 5.220938196810664e-05, "loss": 0.7325, "step": 294300 }, { "epoch": 3.924757702203677, "grad_norm": 6.539307117462158, "learning_rate": 5.220354569180047e-05, "loss": 0.7922, "step": 294400 }, { "epoch": 3.926090840010132, "grad_norm": 7.346495151519775, "learning_rate": 5.2197707556683445e-05, "loss": 0.7857, "step": 294500 }, { "epoch": 3.927423977816587, "grad_norm": 1.1885439157485962, "learning_rate": 5.219186756324432e-05, "loss": 0.7643, "step": 294600 }, { "epoch": 3.928757115623042, "grad_norm": 4.517455101013184, "learning_rate": 5.2186025711972e-05, "loss": 0.8314, "step": 294700 }, { "epoch": 3.930090253429497, "grad_norm": 5.979319095611572, "learning_rate": 5.218024044963395e-05, "loss": 0.8368, "step": 294800 }, { "epoch": 3.931423391235952, "grad_norm": 4.357511520385742, "learning_rate": 5.21743949027287e-05, "loss": 0.7904, "step": 294900 }, { "epoch": 3.932756529042407, "grad_norm": 10.917655944824219, "learning_rate": 5.216866446570778e-05, "loss": 
0.8377, "step": 295000 }, { "epoch": 3.934089666848862, "grad_norm": 19.55188751220703, "learning_rate": 5.2162815243663985e-05, "loss": 0.787, "step": 295100 }, { "epoch": 3.9354228046553175, "grad_norm": 6.860501289367676, "learning_rate": 5.215696416621917e-05, "loss": 0.8743, "step": 295200 }, { "epoch": 3.9367559424617724, "grad_norm": 4.164614677429199, "learning_rate": 5.215111123386316e-05, "loss": 0.8445, "step": 295300 }, { "epoch": 3.9380890802682273, "grad_norm": 18.783334732055664, "learning_rate": 5.214525644708594e-05, "loss": 0.8494, "step": 295400 }, { "epoch": 3.9394222180746823, "grad_norm": 5.90316915512085, "learning_rate": 5.213939980637766e-05, "loss": 0.8182, "step": 295500 }, { "epoch": 3.9407553558811372, "grad_norm": 26.598766326904297, "learning_rate": 5.213354131222861e-05, "loss": 0.7404, "step": 295600 }, { "epoch": 3.9420884936875926, "grad_norm": 30.999210357666016, "learning_rate": 5.212768096512927e-05, "loss": 0.8408, "step": 295700 }, { "epoch": 3.9434216314940476, "grad_norm": 14.756216049194336, "learning_rate": 5.212181876557022e-05, "loss": 0.8578, "step": 295800 }, { "epoch": 3.9447547693005025, "grad_norm": 4.537065029144287, "learning_rate": 5.211595471404223e-05, "loss": 0.6908, "step": 295900 }, { "epoch": 3.946087907106958, "grad_norm": 8.211098670959473, "learning_rate": 5.211008881103624e-05, "loss": 0.8212, "step": 296000 }, { "epoch": 3.947421044913413, "grad_norm": 5.522947788238525, "learning_rate": 5.2104221057043304e-05, "loss": 0.8281, "step": 296100 }, { "epoch": 3.948754182719868, "grad_norm": 6.190502643585205, "learning_rate": 5.209835145255466e-05, "loss": 0.7852, "step": 296200 }, { "epoch": 3.9500873205263227, "grad_norm": 19.57147979736328, "learning_rate": 5.20924799980617e-05, "loss": 0.8274, "step": 296300 }, { "epoch": 3.9514204583327777, "grad_norm": 9.017529487609863, "learning_rate": 5.2086606694055944e-05, "loss": 0.7814, "step": 296400 }, { "epoch": 3.952753596139233, "grad_norm": 
4.016287326812744, "learning_rate": 5.2080731541029104e-05, "loss": 0.7861, "step": 296500 }, { "epoch": 3.954086733945688, "grad_norm": 7.2443437576293945, "learning_rate": 5.2074854539473034e-05, "loss": 0.8118, "step": 296600 }, { "epoch": 3.955419871752143, "grad_norm": 12.181198120117188, "learning_rate": 5.206897568987971e-05, "loss": 0.7503, "step": 296700 }, { "epoch": 3.9567530095585983, "grad_norm": 211.9810333251953, "learning_rate": 5.206309499274131e-05, "loss": 0.8148, "step": 296800 }, { "epoch": 3.9580861473650533, "grad_norm": 9.641141891479492, "learning_rate": 5.205721244855016e-05, "loss": 0.8269, "step": 296900 }, { "epoch": 3.959419285171508, "grad_norm": 11.989686012268066, "learning_rate": 5.2051328057798714e-05, "loss": 0.8142, "step": 297000 }, { "epoch": 3.960752422977963, "grad_norm": 5.876277923583984, "learning_rate": 5.204544182097959e-05, "loss": 0.8231, "step": 297100 }, { "epoch": 3.962085560784418, "grad_norm": 4.481141567230225, "learning_rate": 5.2039553738585564e-05, "loss": 0.7647, "step": 297200 }, { "epoch": 3.9634186985908735, "grad_norm": 3.906426429748535, "learning_rate": 5.203366381110958e-05, "loss": 0.7325, "step": 297300 }, { "epoch": 3.9647518363973284, "grad_norm": 5.6467485427856445, "learning_rate": 5.2027772039044716e-05, "loss": 0.8436, "step": 297400 }, { "epoch": 3.9660849742037834, "grad_norm": 4.65558385848999, "learning_rate": 5.202187842288421e-05, "loss": 0.7662, "step": 297500 }, { "epoch": 3.9674181120102388, "grad_norm": 4.149421215057373, "learning_rate": 5.2015982963121464e-05, "loss": 0.8775, "step": 297600 }, { "epoch": 3.9687512498166937, "grad_norm": 3.810077428817749, "learning_rate": 5.201008566025003e-05, "loss": 0.8453, "step": 297700 }, { "epoch": 3.9700843876231486, "grad_norm": 8.864726066589355, "learning_rate": 5.2004186514763594e-05, "loss": 0.7554, "step": 297800 }, { "epoch": 3.9714175254296036, "grad_norm": 7.109074115753174, "learning_rate": 5.199828552715604e-05, "loss": 0.8229, 
"step": 297900 }, { "epoch": 3.9727506632360585, "grad_norm": 4.254584789276123, "learning_rate": 5.199238269792137e-05, "loss": 0.8318, "step": 298000 }, { "epoch": 3.974083801042514, "grad_norm": 4.903864860534668, "learning_rate": 5.198647802755373e-05, "loss": 0.7515, "step": 298100 }, { "epoch": 3.975416938848969, "grad_norm": 64.66783142089844, "learning_rate": 5.198057151654747e-05, "loss": 0.7681, "step": 298200 }, { "epoch": 3.976750076655424, "grad_norm": 3.7340807914733887, "learning_rate": 5.197466316539704e-05, "loss": 0.8243, "step": 298300 }, { "epoch": 3.978083214461879, "grad_norm": 2.908525228500366, "learning_rate": 5.196875297459709e-05, "loss": 0.8172, "step": 298400 }, { "epoch": 3.979416352268334, "grad_norm": 7.536036491394043, "learning_rate": 5.196284094464238e-05, "loss": 0.7051, "step": 298500 }, { "epoch": 3.980749490074789, "grad_norm": 10.173685073852539, "learning_rate": 5.195692707602787e-05, "loss": 0.7625, "step": 298600 }, { "epoch": 3.982082627881244, "grad_norm": 0.47871407866477966, "learning_rate": 5.195101136924864e-05, "loss": 0.7227, "step": 298700 }, { "epoch": 3.983415765687699, "grad_norm": 16.70237922668457, "learning_rate": 5.194509382479993e-05, "loss": 0.7145, "step": 298800 }, { "epoch": 3.9847489034941543, "grad_norm": 3.9684672355651855, "learning_rate": 5.193917444317714e-05, "loss": 0.7741, "step": 298900 }, { "epoch": 3.9860820413006093, "grad_norm": 5.663035869598389, "learning_rate": 5.1933253224875834e-05, "loss": 0.7174, "step": 299000 }, { "epoch": 3.9874151791070642, "grad_norm": 6.956714153289795, "learning_rate": 5.192733017039169e-05, "loss": 0.8537, "step": 299100 }, { "epoch": 3.9887483169135196, "grad_norm": 14.955069541931152, "learning_rate": 5.1921405280220604e-05, "loss": 0.6562, "step": 299200 }, { "epoch": 3.9900814547199746, "grad_norm": 11.491960525512695, "learning_rate": 5.1915478554858555e-05, "loss": 0.7918, "step": 299300 }, { "epoch": 3.9914145925264295, "grad_norm": 
5.903707027435303, "learning_rate": 5.190954999480173e-05, "loss": 0.8095, "step": 299400 }, { "epoch": 3.9927477303328844, "grad_norm": 0.5887353420257568, "learning_rate": 5.190361960054644e-05, "loss": 0.8413, "step": 299500 }, { "epoch": 3.9940808681393394, "grad_norm": 5.428269386291504, "learning_rate": 5.1897687372589174e-05, "loss": 0.7628, "step": 299600 }, { "epoch": 3.9954140059457948, "grad_norm": 11.673418998718262, "learning_rate": 5.189175331142654e-05, "loss": 0.7827, "step": 299700 }, { "epoch": 3.9967471437522497, "grad_norm": 10.626482963562012, "learning_rate": 5.188581741755533e-05, "loss": 0.7659, "step": 299800 }, { "epoch": 3.9980802815587047, "grad_norm": 17.020248413085938, "learning_rate": 5.1879879691472474e-05, "loss": 0.7954, "step": 299900 }, { "epoch": 3.99941341936516, "grad_norm": 4.82595157623291, "learning_rate": 5.187394013367507e-05, "loss": 0.8127, "step": 300000 }, { "epoch": 3.99941341936516, "eval_accuracy": 0.9052826211746134, "eval_cer": 0.13378265350609428, "eval_loss": 0.7741264700889587, "eval_runtime": 11119.2873, "eval_samples_per_second": 5.013, "eval_steps_per_second": 0.313, "eval_wer": 0.2486297765284249, "step": 300000 }, { "epoch": 4.000746557171615, "grad_norm": 3.1853182315826416, "learning_rate": 5.186799874466035e-05, "loss": 0.7371, "step": 300100 }, { "epoch": 4.00207969497807, "grad_norm": 13.23748779296875, "learning_rate": 5.1862055524925694e-05, "loss": 0.7503, "step": 300200 }, { "epoch": 4.003412832784525, "grad_norm": 5.39965295791626, "learning_rate": 5.185611047496867e-05, "loss": 0.7329, "step": 300300 }, { "epoch": 4.00474597059098, "grad_norm": 4.7204084396362305, "learning_rate": 5.185016359528698e-05, "loss": 0.7145, "step": 300400 }, { "epoch": 4.006079108397435, "grad_norm": 10.915873527526855, "learning_rate": 5.1844214886378474e-05, "loss": 0.779, "step": 300500 }, { "epoch": 4.00741224620389, "grad_norm": 12.18453598022461, "learning_rate": 5.1838264348741147e-05, "loss": 0.7843, 
"step": 300600 }, { "epoch": 4.0087453840103455, "grad_norm": 3.9269914627075195, "learning_rate": 5.183231198287318e-05, "loss": 0.7304, "step": 300700 }, { "epoch": 4.0100785218168005, "grad_norm": 3.96427583694458, "learning_rate": 5.182635778927287e-05, "loss": 0.7664, "step": 300800 }, { "epoch": 4.011411659623255, "grad_norm": 7.178703308105469, "learning_rate": 5.182040176843868e-05, "loss": 0.6851, "step": 300900 }, { "epoch": 4.01274479742971, "grad_norm": 11.984002113342285, "learning_rate": 5.181450350838564e-05, "loss": 0.8202, "step": 301000 }, { "epoch": 4.014077935236165, "grad_norm": 6.808746814727783, "learning_rate": 5.1808543852839624e-05, "loss": 0.7773, "step": 301100 }, { "epoch": 4.01541107304262, "grad_norm": 9.305115699768066, "learning_rate": 5.180258237155106e-05, "loss": 0.7963, "step": 301200 }, { "epoch": 4.016744210849075, "grad_norm": 1.1853455305099487, "learning_rate": 5.179661906501902e-05, "loss": 0.7, "step": 301300 }, { "epoch": 4.01807734865553, "grad_norm": 11.174756050109863, "learning_rate": 5.179065393374276e-05, "loss": 0.6986, "step": 301400 }, { "epoch": 4.019410486461986, "grad_norm": 5.007973670959473, "learning_rate": 5.178468697822162e-05, "loss": 0.7714, "step": 301500 }, { "epoch": 4.020743624268441, "grad_norm": 9.20291805267334, "learning_rate": 5.177871819895517e-05, "loss": 0.7515, "step": 301600 }, { "epoch": 4.022076762074896, "grad_norm": 3.710325002670288, "learning_rate": 5.177274759644308e-05, "loss": 0.8298, "step": 301700 }, { "epoch": 4.023409899881351, "grad_norm": 7.8267059326171875, "learning_rate": 5.17667751711852e-05, "loss": 0.6897, "step": 301800 }, { "epoch": 4.024743037687806, "grad_norm": 2.045884132385254, "learning_rate": 5.1760800923681515e-05, "loss": 0.6661, "step": 301900 }, { "epoch": 4.026076175494261, "grad_norm": 12.458800315856934, "learning_rate": 5.175482485443217e-05, "loss": 0.6939, "step": 302000 }, { "epoch": 4.027409313300716, "grad_norm": 11.444567680358887, 
"learning_rate": 5.174890675185594e-05, "loss": 0.7768, "step": 302100 }, { "epoch": 4.0287424511071706, "grad_norm": 5.126943588256836, "learning_rate": 5.17429270588213e-05, "loss": 0.7265, "step": 302200 }, { "epoch": 4.030075588913626, "grad_norm": 100.20039367675781, "learning_rate": 5.173694554553734e-05, "loss": 0.6687, "step": 302300 }, { "epoch": 4.031408726720081, "grad_norm": 3.0989718437194824, "learning_rate": 5.1730962212504824e-05, "loss": 0.7413, "step": 302400 }, { "epoch": 4.032741864526536, "grad_norm": 11.042068481445312, "learning_rate": 5.172497706022466e-05, "loss": 0.7749, "step": 302500 }, { "epoch": 4.034075002332991, "grad_norm": 4.8945746421813965, "learning_rate": 5.17189900891979e-05, "loss": 0.7818, "step": 302600 }, { "epoch": 4.035408140139446, "grad_norm": 3.077268600463867, "learning_rate": 5.171300129992574e-05, "loss": 0.7703, "step": 302700 }, { "epoch": 4.036741277945901, "grad_norm": 4.265362739562988, "learning_rate": 5.1707010692909575e-05, "loss": 0.7283, "step": 302800 }, { "epoch": 4.038074415752356, "grad_norm": 8.4100341796875, "learning_rate": 5.1701018268650886e-05, "loss": 0.7806, "step": 302900 }, { "epoch": 4.039407553558811, "grad_norm": 6.903780937194824, "learning_rate": 5.169502402765137e-05, "loss": 0.7092, "step": 303000 }, { "epoch": 4.040740691365267, "grad_norm": 12.153608322143555, "learning_rate": 5.168902797041281e-05, "loss": 0.7703, "step": 303100 }, { "epoch": 4.042073829171722, "grad_norm": 7.742288589477539, "learning_rate": 5.168303009743722e-05, "loss": 0.7029, "step": 303200 }, { "epoch": 4.043406966978177, "grad_norm": 14.941488265991211, "learning_rate": 5.167703040922668e-05, "loss": 0.6649, "step": 303300 }, { "epoch": 4.044740104784632, "grad_norm": 19.862083435058594, "learning_rate": 5.1671028906283495e-05, "loss": 0.6924, "step": 303400 }, { "epoch": 4.046073242591087, "grad_norm": 17.366756439208984, "learning_rate": 5.166502558911009e-05, "loss": 0.8129, "step": 303500 }, { "epoch": 
4.0474063803975415, "grad_norm": 6.698452949523926, "learning_rate": 5.165902045820903e-05, "loss": 0.7471, "step": 303600 }, { "epoch": 4.0487395182039965, "grad_norm": 14.744972229003906, "learning_rate": 5.1653013514083056e-05, "loss": 0.5733, "step": 303700 }, { "epoch": 4.050072656010451, "grad_norm": 13.559379577636719, "learning_rate": 5.164700475723504e-05, "loss": 0.7751, "step": 303800 }, { "epoch": 4.051405793816907, "grad_norm": 3.655327796936035, "learning_rate": 5.164099418816803e-05, "loss": 0.7622, "step": 303900 }, { "epoch": 4.052738931623362, "grad_norm": 7.54448127746582, "learning_rate": 5.1634981807385196e-05, "loss": 0.7094, "step": 304000 }, { "epoch": 4.054072069429817, "grad_norm": 28.948488235473633, "learning_rate": 5.16289676153899e-05, "loss": 0.7751, "step": 304100 }, { "epoch": 4.055405207236272, "grad_norm": 20.04469108581543, "learning_rate": 5.1622951612685605e-05, "loss": 0.6954, "step": 304200 }, { "epoch": 4.056738345042727, "grad_norm": 5.5337419509887695, "learning_rate": 5.161693379977596e-05, "loss": 0.8107, "step": 304300 }, { "epoch": 4.058071482849182, "grad_norm": 4.158088207244873, "learning_rate": 5.161091417716476e-05, "loss": 0.7431, "step": 304400 }, { "epoch": 4.059404620655637, "grad_norm": 22.267988204956055, "learning_rate": 5.1604892745355955e-05, "loss": 0.7296, "step": 304500 }, { "epoch": 4.060737758462092, "grad_norm": 18.27762222290039, "learning_rate": 5.159886950485363e-05, "loss": 0.7908, "step": 304600 }, { "epoch": 4.062070896268548, "grad_norm": 2.70481014251709, "learning_rate": 5.159284445616203e-05, "loss": 0.6821, "step": 304700 }, { "epoch": 4.063404034075003, "grad_norm": 17.327407836914062, "learning_rate": 5.1586817599785574e-05, "loss": 0.7799, "step": 304800 }, { "epoch": 4.0647371718814576, "grad_norm": 4.370835304260254, "learning_rate": 5.158078893622878e-05, "loss": 0.7381, "step": 304900 }, { "epoch": 4.0660703096879125, "grad_norm": 16.556045532226562, "learning_rate": 
5.157475846599637e-05, "loss": 0.7231, "step": 305000 }, { "epoch": 4.067403447494367, "grad_norm": 17.508277893066406, "learning_rate": 5.15687261895932e-05, "loss": 0.7569, "step": 305100 }, { "epoch": 4.068736585300822, "grad_norm": 19.315139770507812, "learning_rate": 5.156269210752426e-05, "loss": 0.718, "step": 305200 }, { "epoch": 4.070069723107277, "grad_norm": 6.445992946624756, "learning_rate": 5.15566562202947e-05, "loss": 0.6793, "step": 305300 }, { "epoch": 4.071402860913732, "grad_norm": 10.222743034362793, "learning_rate": 5.155061852840984e-05, "loss": 0.7835, "step": 305400 }, { "epoch": 4.072735998720188, "grad_norm": 3.970233201980591, "learning_rate": 5.154457903237513e-05, "loss": 0.7463, "step": 305500 }, { "epoch": 4.074069136526643, "grad_norm": 8.39959716796875, "learning_rate": 5.1538537732696174e-05, "loss": 0.7478, "step": 305600 }, { "epoch": 4.075402274333098, "grad_norm": 22.44239044189453, "learning_rate": 5.1532494629878736e-05, "loss": 0.6706, "step": 305700 }, { "epoch": 4.076735412139553, "grad_norm": 3.955111503601074, "learning_rate": 5.1526449724428725e-05, "loss": 0.7492, "step": 305800 }, { "epoch": 4.078068549946008, "grad_norm": 6.304575443267822, "learning_rate": 5.15204030168522e-05, "loss": 0.7125, "step": 305900 }, { "epoch": 4.079401687752463, "grad_norm": 8.806821823120117, "learning_rate": 5.151435450765537e-05, "loss": 0.8176, "step": 306000 }, { "epoch": 4.080734825558918, "grad_norm": 9.604864120483398, "learning_rate": 5.15083041973446e-05, "loss": 0.7859, "step": 306100 }, { "epoch": 4.082067963365373, "grad_norm": 1.0865999460220337, "learning_rate": 5.15022520864264e-05, "loss": 0.7304, "step": 306200 }, { "epoch": 4.0834011011718285, "grad_norm": 4.862327575683594, "learning_rate": 5.149619817540745e-05, "loss": 0.6575, "step": 306300 }, { "epoch": 4.0847342389782835, "grad_norm": 32.657257080078125, "learning_rate": 5.149014246479454e-05, "loss": 0.6818, "step": 306400 }, { "epoch": 4.086067376784738, 
"grad_norm": 4.558753490447998, "learning_rate": 5.1484084955094646e-05, "loss": 0.8429, "step": 306500 }, { "epoch": 4.087400514591193, "grad_norm": 7.716710567474365, "learning_rate": 5.147802564681488e-05, "loss": 0.6631, "step": 306600 }, { "epoch": 4.088733652397648, "grad_norm": 4.396974086761475, "learning_rate": 5.1471964540462524e-05, "loss": 0.7398, "step": 306700 }, { "epoch": 4.090066790204103, "grad_norm": 8.268854141235352, "learning_rate": 5.1465901636544976e-05, "loss": 0.6996, "step": 306800 }, { "epoch": 4.091399928010558, "grad_norm": 7.765451431274414, "learning_rate": 5.14598369355698e-05, "loss": 0.7345, "step": 306900 }, { "epoch": 4.092733065817013, "grad_norm": 6.405287265777588, "learning_rate": 5.145377043804474e-05, "loss": 0.7268, "step": 307000 }, { "epoch": 4.094066203623469, "grad_norm": 3.169851541519165, "learning_rate": 5.144770214447763e-05, "loss": 0.7264, "step": 307100 }, { "epoch": 4.095399341429924, "grad_norm": 11.698719024658203, "learning_rate": 5.1441632055376513e-05, "loss": 0.6774, "step": 307200 }, { "epoch": 4.096732479236379, "grad_norm": 4.114218235015869, "learning_rate": 5.143562089897453e-05, "loss": 0.7671, "step": 307300 }, { "epoch": 4.098065617042834, "grad_norm": 5.536536693572998, "learning_rate": 5.14295472382727e-05, "loss": 0.7663, "step": 307400 }, { "epoch": 4.099398754849289, "grad_norm": 3.3218741416931152, "learning_rate": 5.142347178355672e-05, "loss": 0.7621, "step": 307500 }, { "epoch": 4.100731892655744, "grad_norm": 9.370499610900879, "learning_rate": 5.141739453533522e-05, "loss": 0.71, "step": 307600 }, { "epoch": 4.102065030462199, "grad_norm": 2.1014161109924316, "learning_rate": 5.1411315494116976e-05, "loss": 0.7678, "step": 307700 }, { "epoch": 4.103398168268654, "grad_norm": 9.47592830657959, "learning_rate": 5.1405234660410874e-05, "loss": 0.674, "step": 307800 }, { "epoch": 4.104731306075109, "grad_norm": 11.523845672607422, "learning_rate": 5.1399152034726e-05, "loss": 0.7373, 
"step": 307900 }, { "epoch": 4.106064443881564, "grad_norm": 11.70095157623291, "learning_rate": 5.139306761757159e-05, "loss": 0.6724, "step": 308000 }, { "epoch": 4.107397581688019, "grad_norm": 10.865763664245605, "learning_rate": 5.138698140945698e-05, "loss": 0.7914, "step": 308100 }, { "epoch": 4.108730719494474, "grad_norm": 13.051876068115234, "learning_rate": 5.138089341089173e-05, "loss": 0.6207, "step": 308200 }, { "epoch": 4.110063857300929, "grad_norm": 22.65190887451172, "learning_rate": 5.1374803622385454e-05, "loss": 0.7623, "step": 308300 }, { "epoch": 4.111396995107384, "grad_norm": 245.55252075195312, "learning_rate": 5.136871204444801e-05, "loss": 0.7264, "step": 308400 }, { "epoch": 4.112730132913839, "grad_norm": 12.184490203857422, "learning_rate": 5.136261867758935e-05, "loss": 0.6744, "step": 308500 }, { "epoch": 4.114063270720294, "grad_norm": 7.62047815322876, "learning_rate": 5.13565235223196e-05, "loss": 0.7452, "step": 308600 }, { "epoch": 4.11539640852675, "grad_norm": 12.032438278198242, "learning_rate": 5.135042657914901e-05, "loss": 0.7681, "step": 308700 }, { "epoch": 4.116729546333205, "grad_norm": 13.564154624938965, "learning_rate": 5.134432784858802e-05, "loss": 0.7347, "step": 308800 }, { "epoch": 4.11806268413966, "grad_norm": 33.14181137084961, "learning_rate": 5.133822733114718e-05, "loss": 0.7378, "step": 308900 }, { "epoch": 4.119395821946115, "grad_norm": 3.855325937271118, "learning_rate": 5.133212502733722e-05, "loss": 0.7271, "step": 309000 }, { "epoch": 4.12072895975257, "grad_norm": 5.311984539031982, "learning_rate": 5.132602093766899e-05, "loss": 0.7903, "step": 309100 }, { "epoch": 4.1220620975590245, "grad_norm": 2.8379220962524414, "learning_rate": 5.131991506265352e-05, "loss": 0.7348, "step": 309200 }, { "epoch": 4.1233952353654795, "grad_norm": 7.027371883392334, "learning_rate": 5.131380740280195e-05, "loss": 0.784, "step": 309300 }, { "epoch": 4.124728373171934, "grad_norm": 31.39542579650879, 
"learning_rate": 5.130769795862564e-05, "loss": 0.6686, "step": 309400 }, { "epoch": 4.12606151097839, "grad_norm": 5.445221900939941, "learning_rate": 5.1301586730636e-05, "loss": 0.7056, "step": 309500 }, { "epoch": 4.127394648784845, "grad_norm": 61.600528717041016, "learning_rate": 5.129547371934468e-05, "loss": 0.7832, "step": 309600 }, { "epoch": 4.1287277865913, "grad_norm": 9.623259544372559, "learning_rate": 5.128935892526343e-05, "loss": 0.7867, "step": 309700 }, { "epoch": 4.130060924397755, "grad_norm": 2.0937068462371826, "learning_rate": 5.128324234890414e-05, "loss": 0.8258, "step": 309800 }, { "epoch": 4.13139406220421, "grad_norm": 14.740052223205566, "learning_rate": 5.1277123990778904e-05, "loss": 0.8089, "step": 309900 }, { "epoch": 4.132727200010665, "grad_norm": 6.793670654296875, "learning_rate": 5.127100385139993e-05, "loss": 0.7399, "step": 310000 }, { "epoch": 4.13406033781712, "grad_norm": 13.367440223693848, "learning_rate": 5.1264881931279555e-05, "loss": 0.6656, "step": 310100 }, { "epoch": 4.135393475623575, "grad_norm": 2.287529230117798, "learning_rate": 5.12587582309303e-05, "loss": 0.7782, "step": 310200 }, { "epoch": 4.136726613430031, "grad_norm": 8.08200740814209, "learning_rate": 5.1252632750864816e-05, "loss": 0.8191, "step": 310300 }, { "epoch": 4.138059751236486, "grad_norm": 7.227906703948975, "learning_rate": 5.124650549159592e-05, "loss": 0.6807, "step": 310400 }, { "epoch": 4.139392889042941, "grad_norm": 6.48392915725708, "learning_rate": 5.124037645363655e-05, "loss": 0.801, "step": 310500 }, { "epoch": 4.1407260268493955, "grad_norm": 11.690773010253906, "learning_rate": 5.123424563749982e-05, "loss": 0.7398, "step": 310600 }, { "epoch": 4.1420591646558504, "grad_norm": 0.7780767679214478, "learning_rate": 5.1228113043698986e-05, "loss": 0.7747, "step": 310700 }, { "epoch": 4.143392302462305, "grad_norm": 6.599091529846191, "learning_rate": 5.122197867274744e-05, "loss": 0.7471, "step": 310800 }, { "epoch": 
4.14472544026876, "grad_norm": 17.18409538269043, "learning_rate": 5.1215903895427294e-05, "loss": 0.7566, "step": 310900 }, { "epoch": 4.146058578075215, "grad_norm": 4.2640910148620605, "learning_rate": 5.1209765989473826e-05, "loss": 0.847, "step": 311000 }, { "epoch": 4.147391715881671, "grad_norm": 100.47086334228516, "learning_rate": 5.120362630790561e-05, "loss": 0.737, "step": 311100 }, { "epoch": 4.148724853688126, "grad_norm": 11.425575256347656, "learning_rate": 5.119748485123663e-05, "loss": 0.7768, "step": 311200 }, { "epoch": 4.150057991494581, "grad_norm": 16.84617805480957, "learning_rate": 5.1191341619981054e-05, "loss": 0.8262, "step": 311300 }, { "epoch": 4.151391129301036, "grad_norm": 6.643960475921631, "learning_rate": 5.118519661465315e-05, "loss": 0.8093, "step": 311400 }, { "epoch": 4.152724267107491, "grad_norm": 83.72212982177734, "learning_rate": 5.1179049835767375e-05, "loss": 0.9093, "step": 311500 }, { "epoch": 4.154057404913946, "grad_norm": 65.87942504882812, "learning_rate": 5.1172901283838296e-05, "loss": 0.7419, "step": 311600 }, { "epoch": 4.155390542720401, "grad_norm": 3.793771982192993, "learning_rate": 5.116675095938067e-05, "loss": 0.8579, "step": 311700 }, { "epoch": 4.156723680526856, "grad_norm": 58.632686614990234, "learning_rate": 5.116066039264387e-05, "loss": 0.7794, "step": 311800 }, { "epoch": 4.1580568183333115, "grad_norm": 12.192587852478027, "learning_rate": 5.115450654238638e-05, "loss": 0.789, "step": 311900 }, { "epoch": 4.1593899561397665, "grad_norm": 9.862421035766602, "learning_rate": 5.114835092114028e-05, "loss": 0.9125, "step": 312000 }, { "epoch": 4.160723093946221, "grad_norm": 9.629714965820312, "learning_rate": 5.114219352942092e-05, "loss": 0.8832, "step": 312100 }, { "epoch": 4.162056231752676, "grad_norm": 5.997868537902832, "learning_rate": 5.1136034367743765e-05, "loss": 0.7642, "step": 312200 }, { "epoch": 4.163389369559131, "grad_norm": 22.73073959350586, "learning_rate": 
5.112987343662442e-05, "loss": 0.8224, "step": 312300 }, { "epoch": 4.164722507365586, "grad_norm": 8.390926361083984, "learning_rate": 5.11237107365787e-05, "loss": 0.8216, "step": 312400 }, { "epoch": 4.166055645172041, "grad_norm": 75.41954040527344, "learning_rate": 5.11175462681225e-05, "loss": 0.8091, "step": 312500 }, { "epoch": 4.167388782978496, "grad_norm": 23.349292755126953, "learning_rate": 5.1111380031771886e-05, "loss": 0.8039, "step": 312600 }, { "epoch": 4.168721920784952, "grad_norm": 32.34123611450195, "learning_rate": 5.1105212028043085e-05, "loss": 0.8785, "step": 312700 }, { "epoch": 4.170055058591407, "grad_norm": 39.40290069580078, "learning_rate": 5.1099042257452474e-05, "loss": 0.836, "step": 312800 }, { "epoch": 4.171388196397862, "grad_norm": 1.7298297882080078, "learning_rate": 5.109287072051654e-05, "loss": 0.79, "step": 312900 }, { "epoch": 4.172721334204317, "grad_norm": 39.7358512878418, "learning_rate": 5.108669741775197e-05, "loss": 0.8522, "step": 313000 }, { "epoch": 4.174054472010772, "grad_norm": 10.832755088806152, "learning_rate": 5.108052234967556e-05, "loss": 0.8371, "step": 313100 }, { "epoch": 4.175387609817227, "grad_norm": 5.759565353393555, "learning_rate": 5.1074345516804274e-05, "loss": 0.8143, "step": 313200 }, { "epoch": 4.176720747623682, "grad_norm": 32.52663803100586, "learning_rate": 5.10681669196552e-05, "loss": 0.8377, "step": 313300 }, { "epoch": 4.178053885430137, "grad_norm": 596.7396850585938, "learning_rate": 5.1061986558745624e-05, "loss": 0.8169, "step": 313400 }, { "epoch": 4.179387023236592, "grad_norm": 6.445364952087402, "learning_rate": 5.105580443459291e-05, "loss": 0.9105, "step": 313500 }, { "epoch": 4.180720161043047, "grad_norm": 73.86817169189453, "learning_rate": 5.104962054771463e-05, "loss": 0.8267, "step": 313600 }, { "epoch": 4.182053298849502, "grad_norm": 21.717527389526367, "learning_rate": 5.104343489862848e-05, "loss": 0.848, "step": 313700 }, { "epoch": 4.183386436655957, 
"grad_norm": 7.660300254821777, "learning_rate": 5.103724748785227e-05, "loss": 0.7719, "step": 313800 }, { "epoch": 4.184719574462412, "grad_norm": 32.44230270385742, "learning_rate": 5.103105831590403e-05, "loss": 0.8379, "step": 313900 }, { "epoch": 4.186052712268867, "grad_norm": 14.593600273132324, "learning_rate": 5.1024867383301893e-05, "loss": 0.7904, "step": 314000 }, { "epoch": 4.187385850075322, "grad_norm": 5.530404090881348, "learning_rate": 5.1018674690564124e-05, "loss": 0.8777, "step": 314100 }, { "epoch": 4.188718987881777, "grad_norm": 8.15846061706543, "learning_rate": 5.101248023820916e-05, "loss": 0.8822, "step": 314200 }, { "epoch": 4.190052125688233, "grad_norm": 3.550025224685669, "learning_rate": 5.1006284026755605e-05, "loss": 0.8002, "step": 314300 }, { "epoch": 4.191385263494688, "grad_norm": 5.107466220855713, "learning_rate": 5.100008605672216e-05, "loss": 0.7591, "step": 314400 }, { "epoch": 4.192718401301143, "grad_norm": 51.529266357421875, "learning_rate": 5.0993886328627704e-05, "loss": 0.7645, "step": 314500 }, { "epoch": 4.194051539107598, "grad_norm": 23.660722732543945, "learning_rate": 5.0987684842991275e-05, "loss": 0.8376, "step": 314600 }, { "epoch": 4.195384676914053, "grad_norm": 13.621746063232422, "learning_rate": 5.098148160033202e-05, "loss": 0.8262, "step": 314700 }, { "epoch": 4.1967178147205075, "grad_norm": 4.969440937042236, "learning_rate": 5.097527660116927e-05, "loss": 0.8718, "step": 314800 }, { "epoch": 4.1980509525269625, "grad_norm": 9.396942138671875, "learning_rate": 5.096906984602248e-05, "loss": 0.7432, "step": 314900 }, { "epoch": 4.199384090333417, "grad_norm": 31.84966468811035, "learning_rate": 5.096286133541126e-05, "loss": 0.772, "step": 315000 }, { "epoch": 4.200717228139872, "grad_norm": 38.00093078613281, "learning_rate": 5.0956651069855375e-05, "loss": 0.7903, "step": 315100 }, { "epoch": 4.202050365946328, "grad_norm": 20.057405471801758, "learning_rate": 5.095043904987472e-05, "loss": 
0.7678, "step": 315200 }, { "epoch": 4.203383503752783, "grad_norm": 57.063053131103516, "learning_rate": 5.094422527598935e-05, "loss": 0.7389, "step": 315300 }, { "epoch": 4.204716641559238, "grad_norm": 28.07072639465332, "learning_rate": 5.0938009748719456e-05, "loss": 0.8284, "step": 315400 }, { "epoch": 4.206049779365693, "grad_norm": 9.968184471130371, "learning_rate": 5.093179246858539e-05, "loss": 0.7532, "step": 315500 }, { "epoch": 4.207382917172148, "grad_norm": 22.118255615234375, "learning_rate": 5.0925573436107644e-05, "loss": 0.7703, "step": 315600 }, { "epoch": 4.208716054978603, "grad_norm": 136.30975341796875, "learning_rate": 5.091935265180686e-05, "loss": 0.7871, "step": 315700 }, { "epoch": 4.210049192785058, "grad_norm": 34.40464401245117, "learning_rate": 5.0913192350227074e-05, "loss": 0.7521, "step": 315800 }, { "epoch": 4.211382330591514, "grad_norm": 109.88851165771484, "learning_rate": 5.090696808134793e-05, "loss": 0.7769, "step": 315900 }, { "epoch": 4.212715468397969, "grad_norm": 2.899031639099121, "learning_rate": 5.0900742062203315e-05, "loss": 0.7978, "step": 316000 }, { "epoch": 4.214048606204424, "grad_norm": 10.876358985900879, "learning_rate": 5.0894514293314476e-05, "loss": 0.8145, "step": 316100 }, { "epoch": 4.2153817440108785, "grad_norm": 6.115724563598633, "learning_rate": 5.088828477520275e-05, "loss": 0.8239, "step": 316200 }, { "epoch": 4.2167148818173334, "grad_norm": 30.27534294128418, "learning_rate": 5.088205350838967e-05, "loss": 0.8953, "step": 316300 }, { "epoch": 4.218048019623788, "grad_norm": 5.704623699188232, "learning_rate": 5.087582049339691e-05, "loss": 0.7979, "step": 316400 }, { "epoch": 4.219381157430243, "grad_norm": 2.753654718399048, "learning_rate": 5.086958573074625e-05, "loss": 0.8349, "step": 316500 }, { "epoch": 4.220714295236698, "grad_norm": 15.287988662719727, "learning_rate": 5.0863349220959656e-05, "loss": 0.7462, "step": 316600 }, { "epoch": 4.222047433043153, "grad_norm": 
25.87177276611328, "learning_rate": 5.085711096455924e-05, "loss": 0.827, "step": 316700 }, { "epoch": 4.223380570849609, "grad_norm": 12.4507417678833, "learning_rate": 5.085087096206724e-05, "loss": 0.8079, "step": 316800 }, { "epoch": 4.224713708656064, "grad_norm": 5.493698596954346, "learning_rate": 5.0844629214006056e-05, "loss": 0.7489, "step": 316900 }, { "epoch": 4.226046846462519, "grad_norm": 1.969923734664917, "learning_rate": 5.083838572089822e-05, "loss": 0.7868, "step": 317000 }, { "epoch": 4.227379984268974, "grad_norm": 14.27915096282959, "learning_rate": 5.0832140483266415e-05, "loss": 0.7246, "step": 317100 }, { "epoch": 4.228713122075429, "grad_norm": 35.2597770690918, "learning_rate": 5.082589350163348e-05, "loss": 0.8465, "step": 317200 }, { "epoch": 4.230046259881884, "grad_norm": 13.406697273254395, "learning_rate": 5.081964477652238e-05, "loss": 0.7753, "step": 317300 }, { "epoch": 4.231379397688339, "grad_norm": 6.209971904754639, "learning_rate": 5.0813394308456264e-05, "loss": 0.7945, "step": 317400 }, { "epoch": 4.2327125354947945, "grad_norm": 8.174581527709961, "learning_rate": 5.080714209795838e-05, "loss": 0.8366, "step": 317500 }, { "epoch": 4.2340456733012495, "grad_norm": 40.511444091796875, "learning_rate": 5.0800888145552153e-05, "loss": 0.7064, "step": 317600 }, { "epoch": 4.235378811107704, "grad_norm": 13.357099533081055, "learning_rate": 5.079463245176113e-05, "loss": 0.7645, "step": 317700 }, { "epoch": 4.236711948914159, "grad_norm": 100.42739868164062, "learning_rate": 5.078837501710904e-05, "loss": 0.8286, "step": 317800 }, { "epoch": 4.238045086720614, "grad_norm": 72.52742767333984, "learning_rate": 5.078211584211971e-05, "loss": 0.8948, "step": 317900 }, { "epoch": 4.239378224527069, "grad_norm": 3.8949267864227295, "learning_rate": 5.0775854927317166e-05, "loss": 0.8033, "step": 318000 }, { "epoch": 4.240711362333524, "grad_norm": 22.973337173461914, "learning_rate": 5.0769592273225535e-05, "loss": 0.8576, "step": 
318100 }, { "epoch": 4.242044500139979, "grad_norm": 16.366483688354492, "learning_rate": 5.076332788036911e-05, "loss": 0.7543, "step": 318200 }, { "epoch": 4.243377637946434, "grad_norm": 42.79903030395508, "learning_rate": 5.075706174927233e-05, "loss": 0.801, "step": 318300 }, { "epoch": 4.24471077575289, "grad_norm": 10.17547607421875, "learning_rate": 5.0750793880459764e-05, "loss": 0.8129, "step": 318400 }, { "epoch": 4.246043913559345, "grad_norm": 5.044783592224121, "learning_rate": 5.074452427445615e-05, "loss": 0.7688, "step": 318500 }, { "epoch": 4.2473770513658, "grad_norm": 4.983713150024414, "learning_rate": 5.073825293178636e-05, "loss": 0.8816, "step": 318600 }, { "epoch": 4.248710189172255, "grad_norm": 3.1441946029663086, "learning_rate": 5.07319798529754e-05, "loss": 0.8139, "step": 318700 }, { "epoch": 4.25004332697871, "grad_norm": 18.857887268066406, "learning_rate": 5.072570503854845e-05, "loss": 0.749, "step": 318800 }, { "epoch": 4.251376464785165, "grad_norm": 5.321173667907715, "learning_rate": 5.0719428489030804e-05, "loss": 0.7341, "step": 318900 }, { "epoch": 4.25270960259162, "grad_norm": 5.048271656036377, "learning_rate": 5.071315020494791e-05, "loss": 0.7708, "step": 319000 }, { "epoch": 4.254042740398075, "grad_norm": 6.5230302810668945, "learning_rate": 5.070687018682538e-05, "loss": 0.7393, "step": 319100 }, { "epoch": 4.25537587820453, "grad_norm": 3.326730251312256, "learning_rate": 5.070058843518895e-05, "loss": 0.7178, "step": 319200 }, { "epoch": 4.256709016010985, "grad_norm": 11.410584449768066, "learning_rate": 5.0694304950564515e-05, "loss": 0.8183, "step": 319300 }, { "epoch": 4.25804215381744, "grad_norm": 6.421915531158447, "learning_rate": 5.068801973347809e-05, "loss": 0.7646, "step": 319400 }, { "epoch": 4.259375291623895, "grad_norm": 11.6187744140625, "learning_rate": 5.068173278445588e-05, "loss": 0.7507, "step": 319500 }, { "epoch": 4.26070842943035, "grad_norm": 7.4825439453125, "learning_rate": 
5.0675506999397264e-05, "loss": 0.7885, "step": 319600 }, { "epoch": 4.262041567236805, "grad_norm": 7.739953517913818, "learning_rate": 5.0669216605388796e-05, "loss": 0.7682, "step": 319700 }, { "epoch": 4.26337470504326, "grad_norm": 4.735890865325928, "learning_rate": 5.0662924481018665e-05, "loss": 0.7925, "step": 319800 }, { "epoch": 4.264707842849715, "grad_norm": 8.110722541809082, "learning_rate": 5.065663062681363e-05, "loss": 0.7645, "step": 319900 }, { "epoch": 4.266040980656171, "grad_norm": 16.740516662597656, "learning_rate": 5.06503350433006e-05, "loss": 0.7475, "step": 320000 }, { "epoch": 4.267374118462626, "grad_norm": 6.236398696899414, "learning_rate": 5.064403773100662e-05, "loss": 0.7229, "step": 320100 }, { "epoch": 4.268707256269081, "grad_norm": 3.6311213970184326, "learning_rate": 5.063773869045887e-05, "loss": 0.7576, "step": 320200 }, { "epoch": 4.270040394075536, "grad_norm": 13.265114784240723, "learning_rate": 5.063143792218469e-05, "loss": 0.7261, "step": 320300 }, { "epoch": 4.2713735318819905, "grad_norm": 2.797632932662964, "learning_rate": 5.062513542671156e-05, "loss": 0.7076, "step": 320400 }, { "epoch": 4.2727066696884455, "grad_norm": 8.243858337402344, "learning_rate": 5.061883120456712e-05, "loss": 0.8123, "step": 320500 }, { "epoch": 4.2740398074949, "grad_norm": 4.409436225891113, "learning_rate": 5.061252525627912e-05, "loss": 0.8632, "step": 320600 }, { "epoch": 4.275372945301356, "grad_norm": 3.7755789756774902, "learning_rate": 5.060621758237549e-05, "loss": 0.6869, "step": 320700 }, { "epoch": 4.276706083107811, "grad_norm": 8.61677074432373, "learning_rate": 5.059990818338427e-05, "loss": 0.6927, "step": 320800 }, { "epoch": 4.278039220914266, "grad_norm": 9.033851623535156, "learning_rate": 5.059359705983368e-05, "loss": 0.7362, "step": 320900 }, { "epoch": 4.279372358720721, "grad_norm": 18.770843505859375, "learning_rate": 5.0587284212252056e-05, "loss": 0.6806, "step": 321000 }, { "epoch": 4.280705496527176, 
"grad_norm": 8.606554985046387, "learning_rate": 5.0580969641167896e-05, "loss": 0.7496, "step": 321100 }, { "epoch": 4.282038634333631, "grad_norm": 13.571919441223145, "learning_rate": 5.057465334710983e-05, "loss": 0.7567, "step": 321200 }, { "epoch": 4.283371772140086, "grad_norm": 8.408830642700195, "learning_rate": 5.0568335330606654e-05, "loss": 0.797, "step": 321300 }, { "epoch": 4.284704909946541, "grad_norm": 3.6571106910705566, "learning_rate": 5.056201559218728e-05, "loss": 0.8216, "step": 321400 }, { "epoch": 4.286038047752996, "grad_norm": 46.68545150756836, "learning_rate": 5.055569413238077e-05, "loss": 0.79, "step": 321500 }, { "epoch": 4.287371185559452, "grad_norm": 14.368212699890137, "learning_rate": 5.0549370951716345e-05, "loss": 0.602, "step": 321600 }, { "epoch": 4.288704323365907, "grad_norm": 38.33243942260742, "learning_rate": 5.054304605072337e-05, "loss": 0.7688, "step": 321700 }, { "epoch": 4.2900374611723615, "grad_norm": 61.2823486328125, "learning_rate": 5.053678270465051e-05, "loss": 0.7595, "step": 321800 }, { "epoch": 4.2913705989788165, "grad_norm": 3.3864388465881348, "learning_rate": 5.0530454381779136e-05, "loss": 0.7714, "step": 321900 }, { "epoch": 4.292703736785271, "grad_norm": 79.47239685058594, "learning_rate": 5.0524124340162826e-05, "loss": 0.752, "step": 322000 }, { "epoch": 4.294036874591726, "grad_norm": 4.420698642730713, "learning_rate": 5.051779258033152e-05, "loss": 0.7418, "step": 322100 }, { "epoch": 4.295370012398181, "grad_norm": 4.888479709625244, "learning_rate": 5.051145910281532e-05, "loss": 0.6898, "step": 322200 }, { "epoch": 4.296703150204637, "grad_norm": 7.624805450439453, "learning_rate": 5.0505123908144404e-05, "loss": 0.696, "step": 322300 }, { "epoch": 4.298036288011092, "grad_norm": 4.0029215812683105, "learning_rate": 5.049878699684915e-05, "loss": 0.7086, "step": 322400 }, { "epoch": 4.299369425817547, "grad_norm": 23.31371307373047, "learning_rate": 5.0492448369460074e-05, "loss": 0.8463, 
"step": 322500 }, { "epoch": 4.300702563624002, "grad_norm": 7.456881999969482, "learning_rate": 5.048610802650782e-05, "loss": 0.7952, "step": 322600 }, { "epoch": 4.302035701430457, "grad_norm": 7.494369983673096, "learning_rate": 5.047976596852318e-05, "loss": 0.7117, "step": 322700 }, { "epoch": 4.303368839236912, "grad_norm": 6.217690467834473, "learning_rate": 5.0473422196037086e-05, "loss": 0.7983, "step": 322800 }, { "epoch": 4.304701977043367, "grad_norm": 3.102309226989746, "learning_rate": 5.0467076709580626e-05, "loss": 0.7405, "step": 322900 }, { "epoch": 4.306035114849822, "grad_norm": 4.394599914550781, "learning_rate": 5.046072950968502e-05, "loss": 0.807, "step": 323000 }, { "epoch": 4.307368252656277, "grad_norm": 10.45299243927002, "learning_rate": 5.0454380596881636e-05, "loss": 0.7318, "step": 323100 }, { "epoch": 4.3087013904627325, "grad_norm": 20.557111740112305, "learning_rate": 5.044802997170199e-05, "loss": 0.7705, "step": 323200 }, { "epoch": 4.310034528269187, "grad_norm": 18.220842361450195, "learning_rate": 5.0441677634677734e-05, "loss": 0.7263, "step": 323300 }, { "epoch": 4.311367666075642, "grad_norm": 21.62270164489746, "learning_rate": 5.043532358634066e-05, "loss": 0.7715, "step": 323400 }, { "epoch": 4.312700803882097, "grad_norm": 6.819455146789551, "learning_rate": 5.0428967827222714e-05, "loss": 0.7126, "step": 323500 }, { "epoch": 4.314033941688552, "grad_norm": 19.071916580200195, "learning_rate": 5.042261035785598e-05, "loss": 0.7061, "step": 323600 }, { "epoch": 4.315367079495007, "grad_norm": 4.361313819885254, "learning_rate": 5.041625117877268e-05, "loss": 0.795, "step": 323700 }, { "epoch": 4.316700217301462, "grad_norm": 9.676730155944824, "learning_rate": 5.040989029050519e-05, "loss": 0.6895, "step": 323800 }, { "epoch": 4.318033355107918, "grad_norm": 2.7916321754455566, "learning_rate": 5.0403527693586024e-05, "loss": 0.7785, "step": 323900 }, { "epoch": 4.319366492914373, "grad_norm": 6.561089515686035, 
"learning_rate": 5.0397163388547834e-05, "loss": 0.7809, "step": 324000 }, { "epoch": 4.320699630720828, "grad_norm": 4.625338554382324, "learning_rate": 5.039079737592343e-05, "loss": 0.6539, "step": 324100 }, { "epoch": 4.322032768527283, "grad_norm": 23.82413101196289, "learning_rate": 5.0384429656245746e-05, "loss": 0.7519, "step": 324200 }, { "epoch": 4.323365906333738, "grad_norm": 7.127883434295654, "learning_rate": 5.037806023004786e-05, "loss": 0.7646, "step": 324300 }, { "epoch": 4.324699044140193, "grad_norm": 6.126817226409912, "learning_rate": 5.037168909786302e-05, "loss": 0.7135, "step": 324400 }, { "epoch": 4.326032181946648, "grad_norm": 1.5323551893234253, "learning_rate": 5.036537999704121e-05, "loss": 0.8121, "step": 324500 }, { "epoch": 4.327365319753103, "grad_norm": 7.598670482635498, "learning_rate": 5.0359005471529246e-05, "loss": 0.8208, "step": 324600 }, { "epoch": 4.3286984575595575, "grad_norm": 7.301164627075195, "learning_rate": 5.035262924162552e-05, "loss": 0.8489, "step": 324700 }, { "epoch": 4.330031595366013, "grad_norm": 5.215762138366699, "learning_rate": 5.03463150956338e-05, "loss": 0.7472, "step": 324800 }, { "epoch": 4.331364733172468, "grad_norm": 2.8085808753967285, "learning_rate": 5.033993547557869e-05, "loss": 0.8872, "step": 324900 }, { "epoch": 4.332697870978923, "grad_norm": 634.6810913085938, "learning_rate": 5.03335541527283e-05, "loss": 0.733, "step": 325000 }, { "epoch": 4.334031008785378, "grad_norm": 13.474733352661133, "learning_rate": 5.032717112761685e-05, "loss": 0.8426, "step": 325100 }, { "epoch": 4.335364146591833, "grad_norm": 94.42285919189453, "learning_rate": 5.032078640077871e-05, "loss": 0.8242, "step": 325200 }, { "epoch": 4.336697284398288, "grad_norm": 31.964141845703125, "learning_rate": 5.0314399972748396e-05, "loss": 0.7699, "step": 325300 }, { "epoch": 4.338030422204743, "grad_norm": 5.443176746368408, "learning_rate": 5.030801184406055e-05, "loss": 0.7064, "step": 325400 }, { "epoch": 
4.339363560011199, "grad_norm": 32.92766571044922, "learning_rate": 5.030162201524996e-05, "loss": 0.8125, "step": 325500 }, { "epoch": 4.340696697817654, "grad_norm": 2.530297040939331, "learning_rate": 5.029523048685158e-05, "loss": 0.7818, "step": 325600 }, { "epoch": 4.342029835624109, "grad_norm": 39.982879638671875, "learning_rate": 5.028883725940048e-05, "loss": 0.8176, "step": 325700 }, { "epoch": 4.343362973430564, "grad_norm": 9.88154411315918, "learning_rate": 5.028244233343188e-05, "loss": 0.6857, "step": 325800 }, { "epoch": 4.344696111237019, "grad_norm": 7.316488742828369, "learning_rate": 5.027604570948114e-05, "loss": 0.7858, "step": 325900 }, { "epoch": 4.3460292490434735, "grad_norm": 10.44239330291748, "learning_rate": 5.026964738808377e-05, "loss": 0.7806, "step": 326000 }, { "epoch": 4.3473623868499285, "grad_norm": 9.23131275177002, "learning_rate": 5.026324736977541e-05, "loss": 0.7092, "step": 326100 }, { "epoch": 4.348695524656383, "grad_norm": 13.91838550567627, "learning_rate": 5.0256845655091866e-05, "loss": 0.7138, "step": 326200 }, { "epoch": 4.350028662462838, "grad_norm": 11.349276542663574, "learning_rate": 5.025044224456905e-05, "loss": 0.7314, "step": 326300 }, { "epoch": 4.351361800269294, "grad_norm": 12.846551895141602, "learning_rate": 5.024403713874306e-05, "loss": 0.8891, "step": 326400 }, { "epoch": 4.352694938075749, "grad_norm": 17.00482177734375, "learning_rate": 5.023763033815008e-05, "loss": 0.6467, "step": 326500 }, { "epoch": 4.354028075882204, "grad_norm": 14.013162612915039, "learning_rate": 5.0231221843326485e-05, "loss": 0.7211, "step": 326600 }, { "epoch": 4.355361213688659, "grad_norm": 2.3490777015686035, "learning_rate": 5.022481165480878e-05, "loss": 0.6952, "step": 326700 }, { "epoch": 4.356694351495114, "grad_norm": 5.574021339416504, "learning_rate": 5.021839977313358e-05, "loss": 0.6928, "step": 326800 }, { "epoch": 4.358027489301569, "grad_norm": 17.135210037231445, "learning_rate": 
5.02119861988377e-05, "loss": 0.8017, "step": 326900 }, { "epoch": 4.359360627108024, "grad_norm": 5.178762912750244, "learning_rate": 5.0205570932458053e-05, "loss": 0.7166, "step": 327000 }, { "epoch": 4.360693764914479, "grad_norm": 46.3132209777832, "learning_rate": 5.019915397453168e-05, "loss": 0.7331, "step": 327100 }, { "epoch": 4.362026902720935, "grad_norm": 67.16956329345703, "learning_rate": 5.0192735325595827e-05, "loss": 0.7594, "step": 327200 }, { "epoch": 4.36336004052739, "grad_norm": 5.204036712646484, "learning_rate": 5.0186314986187825e-05, "loss": 0.6879, "step": 327300 }, { "epoch": 4.3646931783338445, "grad_norm": 15.802461624145508, "learning_rate": 5.017989295684516e-05, "loss": 0.7676, "step": 327400 }, { "epoch": 4.3660263161402995, "grad_norm": 4.3023576736450195, "learning_rate": 5.0173469238105466e-05, "loss": 0.685, "step": 327500 }, { "epoch": 4.367359453946754, "grad_norm": 30.254499435424805, "learning_rate": 5.01671080929406e-05, "loss": 0.8517, "step": 327600 }, { "epoch": 4.368692591753209, "grad_norm": 16.722076416015625, "learning_rate": 5.016068101390088e-05, "loss": 0.934, "step": 327700 }, { "epoch": 4.370025729559664, "grad_norm": 28.868030548095703, "learning_rate": 5.015425224707249e-05, "loss": 0.7093, "step": 327800 }, { "epoch": 4.371358867366119, "grad_norm": 6.862483978271484, "learning_rate": 5.0147821792993626e-05, "loss": 0.7865, "step": 327900 }, { "epoch": 4.372692005172575, "grad_norm": 14.441380500793457, "learning_rate": 5.014138965220264e-05, "loss": 0.8295, "step": 328000 }, { "epoch": 4.37402514297903, "grad_norm": 1102.6270751953125, "learning_rate": 5.0134955825238e-05, "loss": 0.8694, "step": 328100 }, { "epoch": 4.375358280785485, "grad_norm": 36.56635665893555, "learning_rate": 5.012852031263833e-05, "loss": 0.8417, "step": 328200 }, { "epoch": 4.37669141859194, "grad_norm": 4.749833583831787, "learning_rate": 5.012208311494239e-05, "loss": 0.8483, "step": 328300 }, { "epoch": 4.378024556398395, 
"grad_norm": 32.07921600341797, "learning_rate": 5.011564423268908e-05, "loss": 0.8534, "step": 328400 }, { "epoch": 4.37935769420485, "grad_norm": 21.68750762939453, "learning_rate": 5.0109203666417455e-05, "loss": 0.7568, "step": 328500 }, { "epoch": 4.380690832011305, "grad_norm": 35.79483413696289, "learning_rate": 5.0102761416666685e-05, "loss": 0.7941, "step": 328600 }, { "epoch": 4.38202396981776, "grad_norm": 8.646381378173828, "learning_rate": 5.009631748397609e-05, "loss": 0.8092, "step": 328700 }, { "epoch": 4.3833571076242155, "grad_norm": 963.8303833007812, "learning_rate": 5.008987186888515e-05, "loss": 0.8344, "step": 328800 }, { "epoch": 4.38469024543067, "grad_norm": 10.412981986999512, "learning_rate": 5.008342457193347e-05, "loss": 0.9054, "step": 328900 }, { "epoch": 4.386023383237125, "grad_norm": 7.266582012176514, "learning_rate": 5.007697559366078e-05, "loss": 0.7221, "step": 329000 }, { "epoch": 4.38735652104358, "grad_norm": 6.241432189941406, "learning_rate": 5.0070524934607e-05, "loss": 0.7367, "step": 329100 }, { "epoch": 4.388689658850035, "grad_norm": 411.1413269042969, "learning_rate": 5.006407259531213e-05, "loss": 0.7974, "step": 329200 }, { "epoch": 4.39002279665649, "grad_norm": 20.299522399902344, "learning_rate": 5.005761857631634e-05, "loss": 0.8286, "step": 329300 }, { "epoch": 4.391355934462945, "grad_norm": 21.720605850219727, "learning_rate": 5.005116287815997e-05, "loss": 0.7613, "step": 329400 }, { "epoch": 4.3926890722694, "grad_norm": 65.15131378173828, "learning_rate": 5.004470550138344e-05, "loss": 0.894, "step": 329500 }, { "epoch": 4.394022210075856, "grad_norm": 7.522080898284912, "learning_rate": 5.0038246446527354e-05, "loss": 0.7808, "step": 329600 }, { "epoch": 4.395355347882311, "grad_norm": 11.91016960144043, "learning_rate": 5.003178571413244e-05, "loss": 0.758, "step": 329700 }, { "epoch": 4.396688485688766, "grad_norm": 6.080156326293945, "learning_rate": 5.002532330473957e-05, "loss": 0.8153, "step": 
329800 }, { "epoch": 4.398021623495221, "grad_norm": 12.692529678344727, "learning_rate": 5.001885921888975e-05, "loss": 0.7141, "step": 329900 }, { "epoch": 4.399354761301676, "grad_norm": 36.004825592041016, "learning_rate": 5.001239345712415e-05, "loss": 0.865, "step": 330000 }, { "epoch": 4.400687899108131, "grad_norm": 63.02214813232422, "learning_rate": 5.0005926019984055e-05, "loss": 0.7935, "step": 330100 }, { "epoch": 4.402021036914586, "grad_norm": 7.851664066314697, "learning_rate": 4.9999456908010904e-05, "loss": 0.8008, "step": 330200 }, { "epoch": 4.4033541747210405, "grad_norm": 54.092254638671875, "learning_rate": 4.999298612174626e-05, "loss": 0.8076, "step": 330300 }, { "epoch": 4.404687312527496, "grad_norm": 9.448959350585938, "learning_rate": 4.9986513661731834e-05, "loss": 0.7587, "step": 330400 }, { "epoch": 4.406020450333951, "grad_norm": 20.73597526550293, "learning_rate": 4.998003952850949e-05, "loss": 0.7748, "step": 330500 }, { "epoch": 4.407353588140406, "grad_norm": 13.140054702758789, "learning_rate": 4.9973563722621227e-05, "loss": 0.811, "step": 330600 }, { "epoch": 4.408686725946861, "grad_norm": 9.287500381469727, "learning_rate": 4.996708624460916e-05, "loss": 0.7196, "step": 330700 }, { "epoch": 4.410019863753316, "grad_norm": 12.16799259185791, "learning_rate": 4.996060709501558e-05, "loss": 0.7881, "step": 330800 }, { "epoch": 4.411353001559771, "grad_norm": 14.543741226196289, "learning_rate": 4.99541262743829e-05, "loss": 0.7549, "step": 330900 }, { "epoch": 4.412686139366226, "grad_norm": 5.138039588928223, "learning_rate": 4.9947643783253664e-05, "loss": 0.7368, "step": 331000 }, { "epoch": 4.414019277172681, "grad_norm": 11.803509712219238, "learning_rate": 4.994115962217057e-05, "loss": 0.6949, "step": 331100 }, { "epoch": 4.415352414979137, "grad_norm": 16.723371505737305, "learning_rate": 4.993467379167646e-05, "loss": 0.7638, "step": 331200 }, { "epoch": 4.416685552785592, "grad_norm": 6.388904571533203, 
"learning_rate": 4.992818629231429e-05, "loss": 0.7794, "step": 331300 }, { "epoch": 4.418018690592047, "grad_norm": 8.794983863830566, "learning_rate": 4.992169712462718e-05, "loss": 0.8595, "step": 331400 }, { "epoch": 4.419351828398502, "grad_norm": 4.546991348266602, "learning_rate": 4.9915206289158396e-05, "loss": 0.7888, "step": 331500 }, { "epoch": 4.4206849662049565, "grad_norm": 7.602880477905273, "learning_rate": 4.9908713786451315e-05, "loss": 0.8247, "step": 331600 }, { "epoch": 4.4220181040114115, "grad_norm": 17.128080368041992, "learning_rate": 4.990221961704947e-05, "loss": 0.7817, "step": 331700 }, { "epoch": 4.423351241817866, "grad_norm": 9.418432235717773, "learning_rate": 4.9895723781496545e-05, "loss": 0.8111, "step": 331800 }, { "epoch": 4.424684379624321, "grad_norm": 95.39327239990234, "learning_rate": 4.9889226280336336e-05, "loss": 0.8018, "step": 331900 }, { "epoch": 4.426017517430777, "grad_norm": 37.94580078125, "learning_rate": 4.9882727114112804e-05, "loss": 0.8777, "step": 332000 }, { "epoch": 4.427350655237232, "grad_norm": 24.588768005371094, "learning_rate": 4.987622628337003e-05, "loss": 0.7154, "step": 332100 }, { "epoch": 4.428683793043687, "grad_norm": 4.135098934173584, "learning_rate": 4.986972378865224e-05, "loss": 0.7218, "step": 332200 }, { "epoch": 4.430016930850142, "grad_norm": 21.033048629760742, "learning_rate": 4.9863219630503814e-05, "loss": 0.7309, "step": 332300 }, { "epoch": 4.431350068656597, "grad_norm": 4.588006973266602, "learning_rate": 4.9856713809469254e-05, "loss": 0.7252, "step": 332400 }, { "epoch": 4.432683206463052, "grad_norm": 8.33082103729248, "learning_rate": 4.985020632609321e-05, "loss": 0.7295, "step": 332500 }, { "epoch": 4.434016344269507, "grad_norm": 13.894925117492676, "learning_rate": 4.984369718092047e-05, "loss": 0.7625, "step": 332600 }, { "epoch": 4.435349482075962, "grad_norm": 6.912850856781006, "learning_rate": 4.983718637449595e-05, "loss": 0.7293, "step": 332700 }, { "epoch": 
4.436682619882418, "grad_norm": 17.478872299194336, "learning_rate": 4.9830673907364715e-05, "loss": 0.637, "step": 332800 }, { "epoch": 4.438015757688873, "grad_norm": 6.548609256744385, "learning_rate": 4.9824159780071975e-05, "loss": 0.724, "step": 332900 }, { "epoch": 4.4393488954953275, "grad_norm": 9.06545639038086, "learning_rate": 4.981764399316307e-05, "loss": 0.7646, "step": 333000 }, { "epoch": 4.4406820333017825, "grad_norm": 8.8247709274292, "learning_rate": 4.9811126547183487e-05, "loss": 0.6963, "step": 333100 }, { "epoch": 4.442015171108237, "grad_norm": 8.823662757873535, "learning_rate": 4.980460744267883e-05, "loss": 0.8418, "step": 333200 }, { "epoch": 4.443348308914692, "grad_norm": 19.242097854614258, "learning_rate": 4.979808668019487e-05, "loss": 0.892, "step": 333300 }, { "epoch": 4.444681446721147, "grad_norm": 3.5370373725891113, "learning_rate": 4.9791564260277505e-05, "loss": 0.731, "step": 333400 }, { "epoch": 4.446014584527602, "grad_norm": 14.064180374145508, "learning_rate": 4.9785040183472765e-05, "loss": 0.8451, "step": 333500 }, { "epoch": 4.447347722334058, "grad_norm": 31.884492874145508, "learning_rate": 4.977857971585538e-05, "loss": 0.8112, "step": 333600 }, { "epoch": 4.448680860140513, "grad_norm": 12.82873249053955, "learning_rate": 4.9772052343469806e-05, "loss": 0.6967, "step": 333700 }, { "epoch": 4.450013997946968, "grad_norm": 7.548708915710449, "learning_rate": 4.976552331583034e-05, "loss": 0.6793, "step": 333800 }, { "epoch": 4.451347135753423, "grad_norm": 403.9815368652344, "learning_rate": 4.975899263348357e-05, "loss": 0.8942, "step": 333900 }, { "epoch": 4.452680273559878, "grad_norm": 7.22392463684082, "learning_rate": 4.9752460296976215e-05, "loss": 0.8522, "step": 334000 }, { "epoch": 4.454013411366333, "grad_norm": 13.490275382995605, "learning_rate": 4.974592630685516e-05, "loss": 0.8083, "step": 334100 }, { "epoch": 4.455346549172788, "grad_norm": null, "learning_rate": 4.9739456028280147e-05, "loss": 
0.7761, "step": 334200 }, { "epoch": 4.456679686979243, "grad_norm": 7.850437641143799, "learning_rate": 4.973291874909531e-05, "loss": 0.7407, "step": 334300 }, { "epoch": 4.4580128247856985, "grad_norm": 2.3869521617889404, "learning_rate": 4.9726379817932705e-05, "loss": 0.7337, "step": 334400 }, { "epoch": 4.459345962592153, "grad_norm": 2.3250226974487305, "learning_rate": 4.971983923533977e-05, "loss": 0.7831, "step": 334500 }, { "epoch": 4.460679100398608, "grad_norm": 372.6661682128906, "learning_rate": 4.971329700186406e-05, "loss": 0.7991, "step": 334600 }, { "epoch": 4.462012238205063, "grad_norm": 7.213209629058838, "learning_rate": 4.970675311805326e-05, "loss": 0.8389, "step": 334700 }, { "epoch": 4.463345376011518, "grad_norm": 11.874282836914062, "learning_rate": 4.9700207584455215e-05, "loss": 0.7496, "step": 334800 }, { "epoch": 4.464678513817973, "grad_norm": 5.418203353881836, "learning_rate": 4.969366040161789e-05, "loss": 0.722, "step": 334900 }, { "epoch": 4.466011651624428, "grad_norm": 45.621307373046875, "learning_rate": 4.96871115700894e-05, "loss": 0.7837, "step": 335000 }, { "epoch": 4.467344789430883, "grad_norm": 293.19573974609375, "learning_rate": 4.968056109041799e-05, "loss": 0.7673, "step": 335100 }, { "epoch": 4.468677927237339, "grad_norm": 5.822731018066406, "learning_rate": 4.967400896315203e-05, "loss": 0.7321, "step": 335200 }, { "epoch": 4.470011065043794, "grad_norm": 7.1360697746276855, "learning_rate": 4.9667455188840075e-05, "loss": 0.7652, "step": 335300 }, { "epoch": 4.471344202850249, "grad_norm": 51.360076904296875, "learning_rate": 4.9660899768030755e-05, "loss": 0.7484, "step": 335400 }, { "epoch": 4.472677340656704, "grad_norm": 622.8114013671875, "learning_rate": 4.965434270127289e-05, "loss": 0.8106, "step": 335500 }, { "epoch": 4.474010478463159, "grad_norm": 6.057236194610596, "learning_rate": 4.96477839891154e-05, "loss": 0.7498, "step": 335600 }, { "epoch": 4.475343616269614, "grad_norm": 
19.263193130493164, "learning_rate": 4.964122363210738e-05, "loss": 0.6995, "step": 335700 }, { "epoch": 4.476676754076069, "grad_norm": 26.69365119934082, "learning_rate": 4.9634661630798034e-05, "loss": 0.7793, "step": 335800 }, { "epoch": 4.4780098918825235, "grad_norm": 19.539825439453125, "learning_rate": 4.962809798573671e-05, "loss": 0.7176, "step": 335900 }, { "epoch": 4.479343029688979, "grad_norm": 26.067829132080078, "learning_rate": 4.962153269747288e-05, "loss": 0.7973, "step": 336000 }, { "epoch": 4.480676167495434, "grad_norm": 6.062092304229736, "learning_rate": 4.961496576655621e-05, "loss": 0.8866, "step": 336100 }, { "epoch": 4.482009305301889, "grad_norm": 5.938701152801514, "learning_rate": 4.960846288739322e-05, "loss": 0.7721, "step": 336200 }, { "epoch": 4.483342443108344, "grad_norm": 1.744908332824707, "learning_rate": 4.960189268923304e-05, "loss": 0.8356, "step": 336300 }, { "epoch": 4.484675580914799, "grad_norm": 7.808166027069092, "learning_rate": 4.95953208500642e-05, "loss": 0.7914, "step": 336400 }, { "epoch": 4.486008718721254, "grad_norm": 10.76846694946289, "learning_rate": 4.9588747370436866e-05, "loss": 0.7249, "step": 336500 }, { "epoch": 4.487341856527709, "grad_norm": 21.447410583496094, "learning_rate": 4.958217225090135e-05, "loss": 0.7203, "step": 336600 }, { "epoch": 4.488674994334164, "grad_norm": 5.630235195159912, "learning_rate": 4.95755954920081e-05, "loss": 0.7554, "step": 336700 }, { "epoch": 4.49000813214062, "grad_norm": 9.94130802154541, "learning_rate": 4.95690170943077e-05, "loss": 0.7446, "step": 336800 }, { "epoch": 4.491341269947075, "grad_norm": 6.2487969398498535, "learning_rate": 4.956243705835088e-05, "loss": 0.7532, "step": 336900 }, { "epoch": 4.49267440775353, "grad_norm": 20.25420570373535, "learning_rate": 4.955585538468849e-05, "loss": 0.6909, "step": 337000 }, { "epoch": 4.494007545559985, "grad_norm": 34.16858673095703, "learning_rate": 4.954927207387155e-05, "loss": 0.6971, "step": 337100 }, 
{ "epoch": 4.4953406833664395, "grad_norm": 31.902576446533203, "learning_rate": 4.954268712645116e-05, "loss": 0.7046, "step": 337200 }, { "epoch": 4.4966738211728945, "grad_norm": 5.8503313064575195, "learning_rate": 4.9536100542978614e-05, "loss": 0.7483, "step": 337300 }, { "epoch": 4.498006958979349, "grad_norm": 11.555658340454102, "learning_rate": 4.9529512324005315e-05, "loss": 0.7514, "step": 337400 }, { "epoch": 4.499340096785804, "grad_norm": 23.63335418701172, "learning_rate": 4.9522922470082794e-05, "loss": 0.7926, "step": 337500 }, { "epoch": 4.500673234592259, "grad_norm": 11.10025691986084, "learning_rate": 4.951633098176277e-05, "loss": 0.7557, "step": 337600 }, { "epoch": 4.502006372398715, "grad_norm": 8.227973937988281, "learning_rate": 4.950973785959701e-05, "loss": 0.7125, "step": 337700 }, { "epoch": 4.50333951020517, "grad_norm": 21.90609359741211, "learning_rate": 4.950314310413751e-05, "loss": 0.7683, "step": 337800 }, { "epoch": 4.504672648011625, "grad_norm": 3.717312812805176, "learning_rate": 4.9496546715936345e-05, "loss": 0.7887, "step": 337900 }, { "epoch": 4.50600578581808, "grad_norm": 9.439708709716797, "learning_rate": 4.948994869554575e-05, "loss": 0.7587, "step": 338000 }, { "epoch": 4.507338923624535, "grad_norm": 98.49059295654297, "learning_rate": 4.9483415048113154e-05, "loss": 0.7792, "step": 338100 }, { "epoch": 4.50867206143099, "grad_norm": 2.773221969604492, "learning_rate": 4.9476813781309034e-05, "loss": 0.7347, "step": 338200 }, { "epoch": 4.510005199237445, "grad_norm": 4.74717378616333, "learning_rate": 4.947021088396747e-05, "loss": 0.7718, "step": 338300 }, { "epoch": 4.511338337043901, "grad_norm": 9.115506172180176, "learning_rate": 4.946360635664121e-05, "loss": 0.8728, "step": 338400 }, { "epoch": 4.512671474850356, "grad_norm": 16.088531494140625, "learning_rate": 4.945700019988319e-05, "loss": 0.7249, "step": 338500 }, { "epoch": 4.5140046126568105, "grad_norm": 34.11576461791992, "learning_rate": 
4.9450392414246445e-05, "loss": 0.7145, "step": 338600 }, { "epoch": 4.5153377504632655, "grad_norm": 12.392226219177246, "learning_rate": 4.9443783000284165e-05, "loss": 0.6916, "step": 338700 }, { "epoch": 4.51667088826972, "grad_norm": 6.6551313400268555, "learning_rate": 4.943717195854966e-05, "loss": 0.8596, "step": 338800 }, { "epoch": 4.518004026076175, "grad_norm": 4.435255527496338, "learning_rate": 4.943055928959639e-05, "loss": 0.7336, "step": 338900 }, { "epoch": 4.51933716388263, "grad_norm": 5.211879253387451, "learning_rate": 4.942394499397795e-05, "loss": 0.7059, "step": 339000 }, { "epoch": 4.520670301689085, "grad_norm": 5.302968978881836, "learning_rate": 4.9417329072248066e-05, "loss": 0.7833, "step": 339100 }, { "epoch": 4.52200343949554, "grad_norm": 6.5472822189331055, "learning_rate": 4.941071152496059e-05, "loss": 0.7153, "step": 339200 }, { "epoch": 4.523336577301996, "grad_norm": 5.154623508453369, "learning_rate": 4.940409235266955e-05, "loss": 0.7925, "step": 339300 }, { "epoch": 4.524669715108451, "grad_norm": 2.2789971828460693, "learning_rate": 4.9397471555929045e-05, "loss": 0.7182, "step": 339400 }, { "epoch": 4.526002852914906, "grad_norm": 7.175196170806885, "learning_rate": 4.9390849135293377e-05, "loss": 0.766, "step": 339500 }, { "epoch": 4.527335990721361, "grad_norm": 6.020444869995117, "learning_rate": 4.9384225091316944e-05, "loss": 0.7757, "step": 339600 }, { "epoch": 4.528669128527816, "grad_norm": 5.166047096252441, "learning_rate": 4.937759942455429e-05, "loss": 0.7376, "step": 339700 }, { "epoch": 4.530002266334271, "grad_norm": 6.942443370819092, "learning_rate": 4.9370972135560104e-05, "loss": 0.7321, "step": 339800 }, { "epoch": 4.531335404140726, "grad_norm": 6.362654685974121, "learning_rate": 4.936434322488919e-05, "loss": 0.8175, "step": 339900 }, { "epoch": 4.5326685419471815, "grad_norm": 3.817260503768921, "learning_rate": 4.9357712693096496e-05, "loss": 0.7332, "step": 340000 }, { "epoch": 
4.534001679753636, "grad_norm": 9.65871524810791, "learning_rate": 4.935108054073712e-05, "loss": 0.7496, "step": 340100 }, { "epoch": 4.535334817560091, "grad_norm": 8.839977264404297, "learning_rate": 4.934444676836628e-05, "loss": 0.7804, "step": 340200 }, { "epoch": 4.536667955366546, "grad_norm": 5.313350200653076, "learning_rate": 4.933781137653934e-05, "loss": 0.6942, "step": 340300 }, { "epoch": 4.538001093173001, "grad_norm": 10.146084785461426, "learning_rate": 4.933117436581179e-05, "loss": 0.7018, "step": 340400 }, { "epoch": 4.539334230979456, "grad_norm": 4.996520042419434, "learning_rate": 4.932453573673924e-05, "loss": 0.7878, "step": 340500 }, { "epoch": 4.540667368785911, "grad_norm": 2.831453561782837, "learning_rate": 4.93178954898775e-05, "loss": 0.7431, "step": 340600 }, { "epoch": 4.542000506592366, "grad_norm": 10.192998886108398, "learning_rate": 4.931125362578243e-05, "loss": 0.7745, "step": 340700 }, { "epoch": 4.543333644398821, "grad_norm": 5.680927276611328, "learning_rate": 4.9304610145010085e-05, "loss": 0.6728, "step": 340800 }, { "epoch": 4.544666782205277, "grad_norm": 6.754660129547119, "learning_rate": 4.9297965048116634e-05, "loss": 0.7687, "step": 340900 }, { "epoch": 4.545999920011732, "grad_norm": 9.078397750854492, "learning_rate": 4.9291318335658384e-05, "loss": 0.7329, "step": 341000 }, { "epoch": 4.547333057818187, "grad_norm": 9.221770286560059, "learning_rate": 4.928467000819176e-05, "loss": 0.7184, "step": 341100 }, { "epoch": 4.548666195624642, "grad_norm": 5.184329986572266, "learning_rate": 4.927802006627337e-05, "loss": 0.6679, "step": 341200 }, { "epoch": 4.549999333431097, "grad_norm": 10.891254425048828, "learning_rate": 4.9271368510459896e-05, "loss": 0.7094, "step": 341300 }, { "epoch": 4.551332471237552, "grad_norm": 14.611030578613281, "learning_rate": 4.926471534130821e-05, "loss": 0.7301, "step": 341400 }, { "epoch": 4.5526656090440065, "grad_norm": 11.23036003112793, "learning_rate": 
4.925806055937528e-05, "loss": 0.7881, "step": 341500 }, { "epoch": 4.553998746850462, "grad_norm": 19.21023941040039, "learning_rate": 4.925140416521822e-05, "loss": 0.7079, "step": 341600 }, { "epoch": 4.555331884656917, "grad_norm": 1.990302324295044, "learning_rate": 4.92447461593943e-05, "loss": 0.7584, "step": 341700 }, { "epoch": 4.556665022463372, "grad_norm": 8.929369926452637, "learning_rate": 4.923808654246089e-05, "loss": 0.6572, "step": 341800 }, { "epoch": 4.557998160269827, "grad_norm": 8.973365783691406, "learning_rate": 4.9231425314975516e-05, "loss": 0.7518, "step": 341900 }, { "epoch": 4.559331298076282, "grad_norm": 58.0665168762207, "learning_rate": 4.922476247749584e-05, "loss": 0.7756, "step": 342000 }, { "epoch": 4.560664435882737, "grad_norm": 10.204224586486816, "learning_rate": 4.9218098030579655e-05, "loss": 0.7127, "step": 342100 }, { "epoch": 4.561997573689192, "grad_norm": 4.43432092666626, "learning_rate": 4.921143197478489e-05, "loss": 0.7446, "step": 342200 }, { "epoch": 4.563330711495647, "grad_norm": 42.690338134765625, "learning_rate": 4.920476431066958e-05, "loss": 0.776, "step": 342300 }, { "epoch": 4.564663849302102, "grad_norm": 4.242959499359131, "learning_rate": 4.9198095038791946e-05, "loss": 0.7497, "step": 342400 }, { "epoch": 4.565996987108558, "grad_norm": 5.707180976867676, "learning_rate": 4.919142415971033e-05, "loss": 0.8605, "step": 342500 }, { "epoch": 4.567330124915013, "grad_norm": 11.943452835083008, "learning_rate": 4.918475167398316e-05, "loss": 0.7911, "step": 342600 }, { "epoch": 4.568663262721468, "grad_norm": 11.268969535827637, "learning_rate": 4.9178077582169065e-05, "loss": 0.8598, "step": 342700 }, { "epoch": 4.5699964005279226, "grad_norm": 8.129239082336426, "learning_rate": 4.917140188482676e-05, "loss": 0.7989, "step": 342800 }, { "epoch": 4.5713295383343775, "grad_norm": 4.5182390213012695, "learning_rate": 4.916479136348102e-05, "loss": 0.8315, "step": 342900 }, { "epoch": 4.572662676140832, 
"grad_norm": 5.6163649559021, "learning_rate": 4.915811247280039e-05, "loss": 0.7367, "step": 343000 }, { "epoch": 4.573995813947287, "grad_norm": 4.058534145355225, "learning_rate": 4.915143197826298e-05, "loss": 0.7375, "step": 343100 }, { "epoch": 4.575328951753743, "grad_norm": 28.256349563598633, "learning_rate": 4.914474988042806e-05, "loss": 0.7703, "step": 343200 }, { "epoch": 4.576662089560198, "grad_norm": 6.928072452545166, "learning_rate": 4.913806617985502e-05, "loss": 0.7688, "step": 343300 }, { "epoch": 4.577995227366653, "grad_norm": 2.4414899349212646, "learning_rate": 4.9131380877103424e-05, "loss": 0.718, "step": 343400 }, { "epoch": 4.579328365173108, "grad_norm": 5.929523944854736, "learning_rate": 4.912469397273291e-05, "loss": 0.8839, "step": 343500 }, { "epoch": 4.580661502979563, "grad_norm": 2.534170150756836, "learning_rate": 4.911800546730332e-05, "loss": 0.7808, "step": 343600 }, { "epoch": 4.581994640786018, "grad_norm": 6.112884044647217, "learning_rate": 4.911131536137457e-05, "loss": 0.7217, "step": 343700 }, { "epoch": 4.583327778592473, "grad_norm": 20.03754425048828, "learning_rate": 4.910462365550673e-05, "loss": 0.7062, "step": 343800 }, { "epoch": 4.584660916398928, "grad_norm": 14.5361909866333, "learning_rate": 4.9097930350260035e-05, "loss": 0.7563, "step": 343900 }, { "epoch": 4.585994054205383, "grad_norm": 11.365541458129883, "learning_rate": 4.90912354461948e-05, "loss": 0.725, "step": 344000 }, { "epoch": 4.587327192011839, "grad_norm": 8.690185546875, "learning_rate": 4.908453894387153e-05, "loss": 0.769, "step": 344100 }, { "epoch": 4.5886603298182935, "grad_norm": 11.677840232849121, "learning_rate": 4.907784084385081e-05, "loss": 0.8046, "step": 344200 }, { "epoch": 4.5899934676247485, "grad_norm": 18.40022850036621, "learning_rate": 4.90711411466934e-05, "loss": 0.661, "step": 344300 }, { "epoch": 4.591326605431203, "grad_norm": 5.203464984893799, "learning_rate": 4.906443985296017e-05, "loss": 0.7323, "step": 
344400 }, { "epoch": 4.592659743237658, "grad_norm": 6.244145393371582, "learning_rate": 4.905773696321212e-05, "loss": 0.7415, "step": 344500 }, { "epoch": 4.593992881044113, "grad_norm": 9.603128433227539, "learning_rate": 4.9051032478010416e-05, "loss": 0.7515, "step": 344600 }, { "epoch": 4.595326018850568, "grad_norm": 10.43336296081543, "learning_rate": 4.904432639791633e-05, "loss": 0.7418, "step": 344700 }, { "epoch": 4.596659156657024, "grad_norm": 11.925375938415527, "learning_rate": 4.9037618723491256e-05, "loss": 0.712, "step": 344800 }, { "epoch": 4.597992294463479, "grad_norm": 46.25681686401367, "learning_rate": 4.9030909455296773e-05, "loss": 0.7407, "step": 344900 }, { "epoch": 4.599325432269934, "grad_norm": 3.5771167278289795, "learning_rate": 4.9024198593894534e-05, "loss": 0.7141, "step": 345000 }, { "epoch": 4.600658570076389, "grad_norm": 5.578309059143066, "learning_rate": 4.901748613984636e-05, "loss": 0.8084, "step": 345100 }, { "epoch": 4.601991707882844, "grad_norm": 24.978837966918945, "learning_rate": 4.901077209371419e-05, "loss": 0.7231, "step": 345200 }, { "epoch": 4.603324845689299, "grad_norm": 252.02291870117188, "learning_rate": 4.900405645606011e-05, "loss": 0.6952, "step": 345300 }, { "epoch": 4.604657983495754, "grad_norm": 8.429513931274414, "learning_rate": 4.899733922744633e-05, "loss": 0.7612, "step": 345400 }, { "epoch": 4.605991121302209, "grad_norm": 27.946741104125977, "learning_rate": 4.899062040843519e-05, "loss": 0.8846, "step": 345500 }, { "epoch": 4.607324259108664, "grad_norm": 7.188169956207275, "learning_rate": 4.898389999958917e-05, "loss": 0.7526, "step": 345600 }, { "epoch": 4.608657396915119, "grad_norm": 16.18622398376465, "learning_rate": 4.897717800147089e-05, "loss": 0.7756, "step": 345700 }, { "epoch": 4.609990534721574, "grad_norm": 1.0614433288574219, "learning_rate": 4.8970454414643094e-05, "loss": 0.724, "step": 345800 }, { "epoch": 4.611323672528029, "grad_norm": 9.703765869140625, 
"learning_rate": 4.896372923966865e-05, "loss": 0.8077, "step": 345900 }, { "epoch": 4.612656810334484, "grad_norm": 14.600996017456055, "learning_rate": 4.895700247711058e-05, "loss": 0.7002, "step": 346000 }, { "epoch": 4.613989948140939, "grad_norm": 39.13536834716797, "learning_rate": 4.8950274127532005e-05, "loss": 0.7219, "step": 346100 }, { "epoch": 4.615323085947394, "grad_norm": 1.5166409015655518, "learning_rate": 4.894354419149622e-05, "loss": 0.7626, "step": 346200 }, { "epoch": 4.616656223753849, "grad_norm": 6.2564592361450195, "learning_rate": 4.893681266956664e-05, "loss": 0.6654, "step": 346300 }, { "epoch": 4.617989361560305, "grad_norm": 7.959069728851318, "learning_rate": 4.893007956230679e-05, "loss": 0.8354, "step": 346400 }, { "epoch": 4.61932249936676, "grad_norm": 4.292284965515137, "learning_rate": 4.8923344870280356e-05, "loss": 0.7272, "step": 346500 }, { "epoch": 4.620655637173215, "grad_norm": 33.2987060546875, "learning_rate": 4.8916608594051135e-05, "loss": 0.7321, "step": 346600 }, { "epoch": 4.62198877497967, "grad_norm": 22.573139190673828, "learning_rate": 4.890987073418308e-05, "loss": 0.5838, "step": 346700 }, { "epoch": 4.623321912786125, "grad_norm": 5.386003494262695, "learning_rate": 4.8903198693504056e-05, "loss": 0.7874, "step": 346800 }, { "epoch": 4.62465505059258, "grad_norm": 3.6820359230041504, "learning_rate": 4.889645768387297e-05, "loss": 0.7712, "step": 346900 }, { "epoch": 4.625988188399035, "grad_norm": 7.857994556427002, "learning_rate": 4.888971509229002e-05, "loss": 0.7297, "step": 347000 }, { "epoch": 4.6273213262054895, "grad_norm": 10.516395568847656, "learning_rate": 4.888297091931968e-05, "loss": 0.703, "step": 347100 }, { "epoch": 4.6286544640119445, "grad_norm": 5.9022650718688965, "learning_rate": 4.887622516552652e-05, "loss": 0.7914, "step": 347200 }, { "epoch": 4.6299876018184, "grad_norm": 2.3113443851470947, "learning_rate": 4.8869477831475304e-05, "loss": 0.782, "step": 347300 }, { "epoch": 
4.631320739624855, "grad_norm": 8.053510665893555, "learning_rate": 4.8862728917730876e-05, "loss": 0.7715, "step": 347400 }, { "epoch": 4.63265387743131, "grad_norm": 2.4778945446014404, "learning_rate": 4.885597842485825e-05, "loss": 0.726, "step": 347500 }, { "epoch": 4.633987015237765, "grad_norm": 16.11250877380371, "learning_rate": 4.884922635342255e-05, "loss": 0.7671, "step": 347600 }, { "epoch": 4.63532015304422, "grad_norm": 11.59731388092041, "learning_rate": 4.8842472703989034e-05, "loss": 0.6908, "step": 347700 }, { "epoch": 4.636653290850675, "grad_norm": 15.83884048461914, "learning_rate": 4.8835717477123106e-05, "loss": 0.7713, "step": 347800 }, { "epoch": 4.63798642865713, "grad_norm": 14.863941192626953, "learning_rate": 4.8828960673390285e-05, "loss": 0.7347, "step": 347900 }, { "epoch": 4.639319566463586, "grad_norm": 6.346035957336426, "learning_rate": 4.882220229335622e-05, "loss": 0.7605, "step": 348000 }, { "epoch": 4.640652704270041, "grad_norm": 40.04359817504883, "learning_rate": 4.881544233758672e-05, "loss": 0.6857, "step": 348100 }, { "epoch": 4.641985842076496, "grad_norm": 2.76633620262146, "learning_rate": 4.880868080664771e-05, "loss": 0.7122, "step": 348200 }, { "epoch": 4.643318979882951, "grad_norm": 5.324151515960693, "learning_rate": 4.8801917701105214e-05, "loss": 0.7201, "step": 348300 }, { "epoch": 4.644652117689406, "grad_norm": 30.705026626586914, "learning_rate": 4.879515302152544e-05, "loss": 0.7388, "step": 348400 }, { "epoch": 4.6459852554958605, "grad_norm": 10.20486068725586, "learning_rate": 4.878838676847471e-05, "loss": 0.7001, "step": 348500 }, { "epoch": 4.647318393302315, "grad_norm": 14.99458122253418, "learning_rate": 4.878161894251946e-05, "loss": 0.6675, "step": 348600 }, { "epoch": 4.64865153110877, "grad_norm": 4.517219543457031, "learning_rate": 4.877484954422628e-05, "loss": 0.7094, "step": 348700 }, { "epoch": 4.649984668915225, "grad_norm": 9.089329719543457, "learning_rate": 4.8768078574161884e-05, 
"loss": 0.7043, "step": 348800 }, { "epoch": 4.651317806721681, "grad_norm": 13.543821334838867, "learning_rate": 4.87613060328931e-05, "loss": 0.6626, "step": 348900 }, { "epoch": 4.652650944528136, "grad_norm": 18.604581832885742, "learning_rate": 4.875453192098693e-05, "loss": 0.7152, "step": 349000 }, { "epoch": 4.653984082334591, "grad_norm": 3.076111316680908, "learning_rate": 4.8747756239010445e-05, "loss": 0.6512, "step": 349100 }, { "epoch": 4.655317220141046, "grad_norm": 14.832231521606445, "learning_rate": 4.8740978987530925e-05, "loss": 0.7224, "step": 349200 }, { "epoch": 4.656650357947501, "grad_norm": 2.9462244510650635, "learning_rate": 4.8734200167115716e-05, "loss": 0.7489, "step": 349300 }, { "epoch": 4.657983495753956, "grad_norm": 2.346763849258423, "learning_rate": 4.872741977833232e-05, "loss": 0.6858, "step": 349400 }, { "epoch": 4.659316633560411, "grad_norm": 19.42036247253418, "learning_rate": 4.872063782174838e-05, "loss": 0.7415, "step": 349500 }, { "epoch": 4.660649771366867, "grad_norm": 9.000970840454102, "learning_rate": 4.8713854297931664e-05, "loss": 0.6378, "step": 349600 }, { "epoch": 4.661982909173322, "grad_norm": 3.6805028915405273, "learning_rate": 4.870706920745004e-05, "loss": 0.7052, "step": 349700 }, { "epoch": 4.6633160469797765, "grad_norm": 3.8843941688537598, "learning_rate": 4.870028255087155e-05, "loss": 0.6608, "step": 349800 }, { "epoch": 4.6646491847862315, "grad_norm": 2.4364981651306152, "learning_rate": 4.869349432876436e-05, "loss": 0.7836, "step": 349900 }, { "epoch": 4.665982322592686, "grad_norm": 8.225556373596191, "learning_rate": 4.868670454169677e-05, "loss": 0.7577, "step": 350000 }, { "epoch": 4.667315460399141, "grad_norm": 3.118438959121704, "learning_rate": 4.867991319023715e-05, "loss": 0.7051, "step": 350100 }, { "epoch": 4.668648598205596, "grad_norm": 16.174610137939453, "learning_rate": 4.867312027495409e-05, "loss": 0.6939, "step": 350200 }, { "epoch": 4.669981736012051, "grad_norm": 
5.446938514709473, "learning_rate": 4.866632579641628e-05, "loss": 0.7179, "step": 350300 }, { "epoch": 4.671314873818506, "grad_norm": 26.019908905029297, "learning_rate": 4.86595297551925e-05, "loss": 0.7219, "step": 350400 }, { "epoch": 4.672648011624962, "grad_norm": 13.563935279846191, "learning_rate": 4.865273215185172e-05, "loss": 0.6853, "step": 350500 }, { "epoch": 4.673981149431417, "grad_norm": 29.26582145690918, "learning_rate": 4.8645932986962996e-05, "loss": 0.7006, "step": 350600 }, { "epoch": 4.675314287237872, "grad_norm": 3.3110671043395996, "learning_rate": 4.8639132261095535e-05, "loss": 0.7685, "step": 350700 }, { "epoch": 4.676647425044327, "grad_norm": 18.275327682495117, "learning_rate": 4.863232997481868e-05, "loss": 0.6534, "step": 350800 }, { "epoch": 4.677980562850782, "grad_norm": 6.892544269561768, "learning_rate": 4.8625526128701894e-05, "loss": 0.8117, "step": 350900 }, { "epoch": 4.679313700657237, "grad_norm": 41.82587432861328, "learning_rate": 4.861872072331478e-05, "loss": 0.7268, "step": 351000 }, { "epoch": 4.680646838463692, "grad_norm": 25.138944625854492, "learning_rate": 4.861191375922705e-05, "loss": 0.7847, "step": 351100 }, { "epoch": 4.6819799762701475, "grad_norm": 2.6358680725097656, "learning_rate": 4.8605105237008574e-05, "loss": 0.6609, "step": 351200 }, { "epoch": 4.6833131140766024, "grad_norm": 3.645737409591675, "learning_rate": 4.859829515722933e-05, "loss": 0.8135, "step": 351300 }, { "epoch": 4.684646251883057, "grad_norm": 7.534621238708496, "learning_rate": 4.859148352045945e-05, "loss": 0.7801, "step": 351400 }, { "epoch": 4.685979389689512, "grad_norm": 5.592484951019287, "learning_rate": 4.8584670327269164e-05, "loss": 0.7278, "step": 351500 }, { "epoch": 4.687312527495967, "grad_norm": 61.4531364440918, "learning_rate": 4.8577855578228876e-05, "loss": 0.764, "step": 351600 }, { "epoch": 4.688645665302422, "grad_norm": 7.7176923751831055, "learning_rate": 4.8571039273909076e-05, "loss": 0.75, "step": 
351700 }, { "epoch": 4.689978803108877, "grad_norm": 8.088235855102539, "learning_rate": 4.8564221414880406e-05, "loss": 0.6754, "step": 351800 }, { "epoch": 4.691311940915332, "grad_norm": 40.99746322631836, "learning_rate": 4.855740200171364e-05, "loss": 0.7884, "step": 351900 }, { "epoch": 4.692645078721787, "grad_norm": 0.7727888822555542, "learning_rate": 4.855058103497968e-05, "loss": 0.7182, "step": 352000 }, { "epoch": 4.693978216528243, "grad_norm": 4.140727996826172, "learning_rate": 4.8543758515249545e-05, "loss": 0.6788, "step": 352100 }, { "epoch": 4.695311354334698, "grad_norm": 17.27607536315918, "learning_rate": 4.8536934443094406e-05, "loss": 0.7347, "step": 352200 }, { "epoch": 4.696644492141153, "grad_norm": 3.008143901824951, "learning_rate": 4.8530108819085556e-05, "loss": 0.6664, "step": 352300 }, { "epoch": 4.697977629947608, "grad_norm": 22.373973846435547, "learning_rate": 4.8523281643794396e-05, "loss": 0.7395, "step": 352400 }, { "epoch": 4.699310767754063, "grad_norm": 3.821472406387329, "learning_rate": 4.851645291779249e-05, "loss": 0.7051, "step": 352500 }, { "epoch": 4.700643905560518, "grad_norm": 17.227523803710938, "learning_rate": 4.850962264165152e-05, "loss": 0.8022, "step": 352600 }, { "epoch": 4.7019770433669725, "grad_norm": 4.224480152130127, "learning_rate": 4.8502790815943286e-05, "loss": 0.7675, "step": 352700 }, { "epoch": 4.703310181173428, "grad_norm": 5.934909820556641, "learning_rate": 4.8495957441239723e-05, "loss": 0.6514, "step": 352800 }, { "epoch": 4.704643318979883, "grad_norm": 21.317581176757812, "learning_rate": 4.848912251811293e-05, "loss": 0.7078, "step": 352900 }, { "epoch": 4.705976456786338, "grad_norm": 36.86885452270508, "learning_rate": 4.848228604713506e-05, "loss": 0.7164, "step": 353000 }, { "epoch": 4.707309594592793, "grad_norm": 11.6904296875, "learning_rate": 4.8475448028878475e-05, "loss": 0.6996, "step": 353100 }, { "epoch": 4.708642732399248, "grad_norm": 8.613327980041504, 
"learning_rate": 4.8468608463915616e-05, "loss": 0.7556, "step": 353200 }, { "epoch": 4.709975870205703, "grad_norm": 24.314313888549805, "learning_rate": 4.846176735281907e-05, "loss": 0.7328, "step": 353300 }, { "epoch": 4.711309008012158, "grad_norm": 4.3498969078063965, "learning_rate": 4.845492469616157e-05, "loss": 0.7027, "step": 353400 }, { "epoch": 4.712642145818613, "grad_norm": 7.972418785095215, "learning_rate": 4.844808049451594e-05, "loss": 0.8546, "step": 353500 }, { "epoch": 4.713975283625068, "grad_norm": 6.2135090827941895, "learning_rate": 4.844130321355876e-05, "loss": 0.6999, "step": 353600 }, { "epoch": 4.715308421431524, "grad_norm": 5.669065475463867, "learning_rate": 4.843445593909152e-05, "loss": 0.6877, "step": 353700 }, { "epoch": 4.716641559237979, "grad_norm": 4.584819793701172, "learning_rate": 4.8427607121349744e-05, "loss": 0.6871, "step": 353800 }, { "epoch": 4.717974697044434, "grad_norm": 1.0254629850387573, "learning_rate": 4.8420756760906795e-05, "loss": 0.7119, "step": 353900 }, { "epoch": 4.719307834850889, "grad_norm": 5.270462989807129, "learning_rate": 4.841390485833616e-05, "loss": 0.7826, "step": 354000 }, { "epoch": 4.7206409726573435, "grad_norm": 4.314075946807861, "learning_rate": 4.840705141421145e-05, "loss": 0.6672, "step": 354100 }, { "epoch": 4.7219741104637984, "grad_norm": 4.074345111846924, "learning_rate": 4.840019642910642e-05, "loss": 0.8416, "step": 354200 }, { "epoch": 4.723307248270253, "grad_norm": 106.37751007080078, "learning_rate": 4.8393339903594944e-05, "loss": 0.6641, "step": 354300 }, { "epoch": 4.724640386076709, "grad_norm": 30.1156063079834, "learning_rate": 4.8386481838251035e-05, "loss": 0.74, "step": 354400 }, { "epoch": 4.725973523883164, "grad_norm": 14.944146156311035, "learning_rate": 4.837962223364883e-05, "loss": 0.7973, "step": 354500 }, { "epoch": 4.727306661689619, "grad_norm": 9.761926651000977, "learning_rate": 4.837276109036259e-05, "loss": 0.6175, "step": 354600 }, { "epoch": 
4.728639799496074, "grad_norm": 22.969524383544922, "learning_rate": 4.8365898408966696e-05, "loss": 0.7327, "step": 354700 }, { "epoch": 4.729972937302529, "grad_norm": 18.5396671295166, "learning_rate": 4.835903419003569e-05, "loss": 0.6897, "step": 354800 }, { "epoch": 4.731306075108984, "grad_norm": 2.4392924308776855, "learning_rate": 4.8352168434144216e-05, "loss": 0.7841, "step": 354900 }, { "epoch": 4.732639212915439, "grad_norm": 11.300447463989258, "learning_rate": 4.834530114186704e-05, "loss": 0.8338, "step": 355000 }, { "epoch": 4.733972350721894, "grad_norm": 8.185022354125977, "learning_rate": 4.833843231377909e-05, "loss": 0.6575, "step": 355100 }, { "epoch": 4.735305488528349, "grad_norm": 2.6484856605529785, "learning_rate": 4.83315619504554e-05, "loss": 0.7792, "step": 355200 }, { "epoch": 4.736638626334805, "grad_norm": 5.242673397064209, "learning_rate": 4.832469005247113e-05, "loss": 0.7107, "step": 355300 }, { "epoch": 4.7379717641412595, "grad_norm": 8.254341125488281, "learning_rate": 4.831781662040156e-05, "loss": 0.6724, "step": 355400 }, { "epoch": 4.7393049019477145, "grad_norm": 3.9205167293548584, "learning_rate": 4.831094165482214e-05, "loss": 0.7642, "step": 355500 }, { "epoch": 4.740638039754169, "grad_norm": 4.706592082977295, "learning_rate": 4.83040651563084e-05, "loss": 0.7226, "step": 355600 }, { "epoch": 4.741971177560624, "grad_norm": 5.377463340759277, "learning_rate": 4.829718712543602e-05, "loss": 0.7734, "step": 355700 }, { "epoch": 4.743304315367079, "grad_norm": 3.717937469482422, "learning_rate": 4.829030756278081e-05, "loss": 0.685, "step": 355800 }, { "epoch": 4.744637453173534, "grad_norm": 10.086644172668457, "learning_rate": 4.828342646891872e-05, "loss": 0.7603, "step": 355900 }, { "epoch": 4.74597059097999, "grad_norm": 4.470726490020752, "learning_rate": 4.827654384442579e-05, "loss": 0.6797, "step": 356000 }, { "epoch": 4.747303728786445, "grad_norm": 8.272162437438965, "learning_rate": 4.826965968987822e-05, 
"loss": 0.659, "step": 356100 }, { "epoch": 4.7486368665929, "grad_norm": 4.880573272705078, "learning_rate": 4.826277400585233e-05, "loss": 0.7184, "step": 356200 }, { "epoch": 4.749970004399355, "grad_norm": 18.457311630249023, "learning_rate": 4.825588679292458e-05, "loss": 0.6843, "step": 356300 }, { "epoch": 4.75130314220581, "grad_norm": 17.73239517211914, "learning_rate": 4.824899805167153e-05, "loss": 0.7813, "step": 356400 }, { "epoch": 4.752636280012265, "grad_norm": 5.944051742553711, "learning_rate": 4.824210778266989e-05, "loss": 0.7549, "step": 356500 }, { "epoch": 4.75396941781872, "grad_norm": 3.5566909313201904, "learning_rate": 4.823521598649649e-05, "loss": 0.6946, "step": 356600 }, { "epoch": 4.755302555625175, "grad_norm": 4.4795451164245605, "learning_rate": 4.822832266372829e-05, "loss": 0.6393, "step": 356700 }, { "epoch": 4.75663569343163, "grad_norm": 5.425159931182861, "learning_rate": 4.822142781494238e-05, "loss": 0.7216, "step": 356800 }, { "epoch": 4.7579688312380854, "grad_norm": 6.791501045227051, "learning_rate": 4.821453144071597e-05, "loss": 0.6759, "step": 356900 }, { "epoch": 4.75930196904454, "grad_norm": 18.312990188598633, "learning_rate": 4.82076335416264e-05, "loss": 0.7543, "step": 357000 }, { "epoch": 4.760635106850995, "grad_norm": 26.571805953979492, "learning_rate": 4.820073411825115e-05, "loss": 0.8165, "step": 357100 }, { "epoch": 4.76196824465745, "grad_norm": 39.767189025878906, "learning_rate": 4.819383317116782e-05, "loss": 0.775, "step": 357200 }, { "epoch": 4.763301382463905, "grad_norm": 7.931504726409912, "learning_rate": 4.818693070095411e-05, "loss": 0.7057, "step": 357300 }, { "epoch": 4.76463452027036, "grad_norm": 9.023282051086426, "learning_rate": 4.818002670818791e-05, "loss": 0.7228, "step": 357400 }, { "epoch": 4.765967658076815, "grad_norm": 6.93164587020874, "learning_rate": 4.817312119344718e-05, "loss": 0.6941, "step": 357500 }, { "epoch": 4.767300795883271, "grad_norm": 12.532785415649414, 
"learning_rate": 4.8166214157310015e-05, "loss": 0.7435, "step": 357600 }, { "epoch": 4.768633933689726, "grad_norm": 3.574777364730835, "learning_rate": 4.8159305600354685e-05, "loss": 0.6719, "step": 357700 }, { "epoch": 4.769967071496181, "grad_norm": 3.05659556388855, "learning_rate": 4.8152395523159515e-05, "loss": 0.7134, "step": 357800 }, { "epoch": 4.771300209302636, "grad_norm": 2.0632898807525635, "learning_rate": 4.814548392630303e-05, "loss": 0.755, "step": 357900 }, { "epoch": 4.772633347109091, "grad_norm": 29.158710479736328, "learning_rate": 4.813863994904077e-05, "loss": 0.7163, "step": 358000 }, { "epoch": 4.773966484915546, "grad_norm": 1.054344654083252, "learning_rate": 4.81317944834871e-05, "loss": 0.7179, "step": 358100 }, { "epoch": 4.775299622722001, "grad_norm": 60.29671859741211, "learning_rate": 4.812487836147165e-05, "loss": 0.7453, "step": 358200 }, { "epoch": 4.7766327605284555, "grad_norm": 2.8855268955230713, "learning_rate": 4.811796072209853e-05, "loss": 0.7193, "step": 358300 }, { "epoch": 4.7779658983349105, "grad_norm": 3.791412830352783, "learning_rate": 4.811104156594684e-05, "loss": 0.7223, "step": 358400 }, { "epoch": 4.779299036141366, "grad_norm": 10.766729354858398, "learning_rate": 4.810412089359584e-05, "loss": 0.6518, "step": 358500 }, { "epoch": 4.780632173947821, "grad_norm": 17.240081787109375, "learning_rate": 4.8097198705624915e-05, "loss": 0.7451, "step": 358600 }, { "epoch": 4.781965311754276, "grad_norm": 80.78507995605469, "learning_rate": 4.809027500261356e-05, "loss": 0.6983, "step": 358700 }, { "epoch": 4.783298449560731, "grad_norm": 30.274831771850586, "learning_rate": 4.80833497851414e-05, "loss": 0.7952, "step": 358800 }, { "epoch": 4.784631587367186, "grad_norm": 8.878092765808105, "learning_rate": 4.8076423053788194e-05, "loss": 0.7242, "step": 358900 }, { "epoch": 4.785964725173641, "grad_norm": 5.916757583618164, "learning_rate": 4.806949480913385e-05, "loss": 0.6803, "step": 359000 }, { "epoch": 
4.787297862980096, "grad_norm": 17.895641326904297, "learning_rate": 4.806256505175835e-05, "loss": 0.6784, "step": 359100 }, { "epoch": 4.788631000786552, "grad_norm": 4.10896635055542, "learning_rate": 4.805563378224184e-05, "loss": 0.7247, "step": 359200 }, { "epoch": 4.789964138593007, "grad_norm": 9.287232398986816, "learning_rate": 4.804870100116459e-05, "loss": 0.7649, "step": 359300 }, { "epoch": 4.791297276399462, "grad_norm": 31.324249267578125, "learning_rate": 4.8041766709106986e-05, "loss": 0.6806, "step": 359400 }, { "epoch": 4.792630414205917, "grad_norm": 34.29448699951172, "learning_rate": 4.803483090664955e-05, "loss": 0.7255, "step": 359500 }, { "epoch": 4.793963552012372, "grad_norm": 5.373016834259033, "learning_rate": 4.8027893594372915e-05, "loss": 0.7302, "step": 359600 }, { "epoch": 4.7952966898188265, "grad_norm": 5.119516372680664, "learning_rate": 4.8020954772857866e-05, "loss": 0.6546, "step": 359700 }, { "epoch": 4.7966298276252815, "grad_norm": 7.827272891998291, "learning_rate": 4.801401444268528e-05, "loss": 0.7022, "step": 359800 }, { "epoch": 4.797962965431736, "grad_norm": 6.303323268890381, "learning_rate": 4.800707260443619e-05, "loss": 0.7376, "step": 359900 }, { "epoch": 4.799296103238191, "grad_norm": 4.92755126953125, "learning_rate": 4.800012925869176e-05, "loss": 0.63, "step": 360000 }, { "epoch": 4.800629241044647, "grad_norm": 10.57948112487793, "learning_rate": 4.799318440603323e-05, "loss": 0.6821, "step": 360100 }, { "epoch": 4.801962378851102, "grad_norm": 2.6355974674224854, "learning_rate": 4.798623804704203e-05, "loss": 0.675, "step": 360200 }, { "epoch": 4.803295516657557, "grad_norm": 5.784335136413574, "learning_rate": 4.797929018229968e-05, "loss": 0.7366, "step": 360300 }, { "epoch": 4.804628654464012, "grad_norm": 32.179908752441406, "learning_rate": 4.797234081238783e-05, "loss": 0.6529, "step": 360400 }, { "epoch": 4.805961792270467, "grad_norm": 3.090625762939453, "learning_rate": 4.796538993788825e-05, 
"loss": 0.6395, "step": 360500 }, { "epoch": 4.807294930076922, "grad_norm": 6.865628719329834, "learning_rate": 4.795843755938287e-05, "loss": 0.6657, "step": 360600 }, { "epoch": 4.808628067883377, "grad_norm": 8.968984603881836, "learning_rate": 4.79514836774537e-05, "loss": 0.7269, "step": 360700 }, { "epoch": 4.809961205689833, "grad_norm": 4.970248222351074, "learning_rate": 4.794452829268291e-05, "loss": 0.5918, "step": 360800 }, { "epoch": 4.811294343496288, "grad_norm": 8.85795783996582, "learning_rate": 4.793757140565277e-05, "loss": 0.644, "step": 360900 }, { "epoch": 4.8126274813027425, "grad_norm": 3.6292691230773926, "learning_rate": 4.793061301694571e-05, "loss": 0.6206, "step": 361000 }, { "epoch": 4.8139606191091975, "grad_norm": 18.496456146240234, "learning_rate": 4.792365312714424e-05, "loss": 0.6619, "step": 361100 }, { "epoch": 4.815293756915652, "grad_norm": 2.22416615486145, "learning_rate": 4.7916761358159786e-05, "loss": 0.703, "step": 361200 }, { "epoch": 4.816626894722107, "grad_norm": 15.912293434143066, "learning_rate": 4.790979848291402e-05, "loss": 0.6732, "step": 361300 }, { "epoch": 4.817960032528562, "grad_norm": 3.025177478790283, "learning_rate": 4.7902834108316396e-05, "loss": 0.6734, "step": 361400 }, { "epoch": 4.819293170335017, "grad_norm": 4.789200305938721, "learning_rate": 4.7895868234949934e-05, "loss": 0.7447, "step": 361500 }, { "epoch": 4.820626308141472, "grad_norm": 7.730136871337891, "learning_rate": 4.788890086339779e-05, "loss": 0.7397, "step": 361600 }, { "epoch": 4.821959445947928, "grad_norm": 55.58611297607422, "learning_rate": 4.788193199424325e-05, "loss": 0.7932, "step": 361700 }, { "epoch": 4.823292583754383, "grad_norm": 5.3307061195373535, "learning_rate": 4.7874961628069745e-05, "loss": 0.666, "step": 361800 }, { "epoch": 4.824625721560838, "grad_norm": 15.665431022644043, "learning_rate": 4.786798976546079e-05, "loss": 0.731, "step": 361900 }, { "epoch": 4.825958859367293, "grad_norm": 
11.322463989257812, "learning_rate": 4.786101640700006e-05, "loss": 0.7263, "step": 362000 }, { "epoch": 4.827291997173748, "grad_norm": 6.2412261962890625, "learning_rate": 4.7854041553271335e-05, "loss": 0.7672, "step": 362100 }, { "epoch": 4.828625134980203, "grad_norm": 6.837485313415527, "learning_rate": 4.784706520485854e-05, "loss": 0.7143, "step": 362200 }, { "epoch": 4.829958272786658, "grad_norm": 6.205463409423828, "learning_rate": 4.78400873623457e-05, "loss": 0.7765, "step": 362300 }, { "epoch": 4.831291410593113, "grad_norm": 19.455333709716797, "learning_rate": 4.7833108026316985e-05, "loss": 0.7102, "step": 362400 }, { "epoch": 4.8326245483995685, "grad_norm": 3.1183738708496094, "learning_rate": 4.7826127197356684e-05, "loss": 0.7334, "step": 362500 }, { "epoch": 4.833957686206023, "grad_norm": 28.288883209228516, "learning_rate": 4.7819144876049204e-05, "loss": 0.7475, "step": 362600 }, { "epoch": 4.835290824012478, "grad_norm": 58.18262481689453, "learning_rate": 4.7812161062979084e-05, "loss": 0.7228, "step": 362700 }, { "epoch": 4.836623961818933, "grad_norm": 18.410337448120117, "learning_rate": 4.780517575873099e-05, "loss": 0.6198, "step": 362800 }, { "epoch": 4.837957099625388, "grad_norm": 54.57406234741211, "learning_rate": 4.779818896388971e-05, "loss": 0.7407, "step": 362900 }, { "epoch": 4.839290237431843, "grad_norm": 2.5735552310943604, "learning_rate": 4.7791200679040155e-05, "loss": 0.6679, "step": 363000 }, { "epoch": 4.840623375238298, "grad_norm": 13.399038314819336, "learning_rate": 4.778421090476736e-05, "loss": 0.7852, "step": 363100 }, { "epoch": 4.841956513044753, "grad_norm": 2.5295279026031494, "learning_rate": 4.7777219641656486e-05, "loss": 0.6847, "step": 363200 }, { "epoch": 4.843289650851208, "grad_norm": 3.119300603866577, "learning_rate": 4.777029682517139e-05, "loss": 0.6623, "step": 363300 }, { "epoch": 4.844622788657664, "grad_norm": 8.298480033874512, "learning_rate": 4.776330260101412e-05, "loss": 0.6629, 
"step": 363400 }, { "epoch": 4.845955926464119, "grad_norm": 2.6391255855560303, "learning_rate": 4.775630688976915e-05, "loss": 0.6596, "step": 363500 }, { "epoch": 4.847289064270574, "grad_norm": 5.807689189910889, "learning_rate": 4.774930969202215e-05, "loss": 0.6989, "step": 363600 }, { "epoch": 4.848622202077029, "grad_norm": 16.139114379882812, "learning_rate": 4.774231100835889e-05, "loss": 0.7483, "step": 363700 }, { "epoch": 4.849955339883484, "grad_norm": 32.55651092529297, "learning_rate": 4.773531083936528e-05, "loss": 0.6751, "step": 363800 }, { "epoch": 4.8512884776899385, "grad_norm": 5.572301864624023, "learning_rate": 4.772830918562735e-05, "loss": 0.7113, "step": 363900 }, { "epoch": 4.8526216154963935, "grad_norm": 5.283978462219238, "learning_rate": 4.7721306047731265e-05, "loss": 0.6942, "step": 364000 }, { "epoch": 4.853954753302849, "grad_norm": 4.71458625793457, "learning_rate": 4.771430142626331e-05, "loss": 0.7556, "step": 364100 }, { "epoch": 4.855287891109304, "grad_norm": 14.10135555267334, "learning_rate": 4.7707295321809856e-05, "loss": 0.6652, "step": 364200 }, { "epoch": 4.856621028915759, "grad_norm": 0.9063433408737183, "learning_rate": 4.770028773495747e-05, "loss": 0.6184, "step": 364300 }, { "epoch": 4.857954166722214, "grad_norm": 4.089513301849365, "learning_rate": 4.769327866629279e-05, "loss": 0.6682, "step": 364400 }, { "epoch": 4.859287304528669, "grad_norm": 6.023785591125488, "learning_rate": 4.7686268116402585e-05, "loss": 0.6961, "step": 364500 }, { "epoch": 4.860620442335124, "grad_norm": 5.443392276763916, "learning_rate": 4.767925608587378e-05, "loss": 0.6753, "step": 364600 }, { "epoch": 4.861953580141579, "grad_norm": 5.221805095672607, "learning_rate": 4.767224257529337e-05, "loss": 0.7246, "step": 364700 }, { "epoch": 4.863286717948034, "grad_norm": 8.830789566040039, "learning_rate": 4.766522758524853e-05, "loss": 0.7576, "step": 364800 }, { "epoch": 4.864619855754489, "grad_norm": 17.445972442626953, 
"learning_rate": 4.765821111632652e-05, "loss": 0.6621, "step": 364900 }, { "epoch": 4.865952993560945, "grad_norm": 33.22230529785156, "learning_rate": 4.765119316911474e-05, "loss": 0.7094, "step": 365000 }, { "epoch": 4.8672861313674, "grad_norm": 2.727012872695923, "learning_rate": 4.764417374420071e-05, "loss": 0.6859, "step": 365100 }, { "epoch": 4.868619269173855, "grad_norm": 26.508281707763672, "learning_rate": 4.763715284217207e-05, "loss": 0.7068, "step": 365200 }, { "epoch": 4.8699524069803095, "grad_norm": 2.6350064277648926, "learning_rate": 4.76301304636166e-05, "loss": 0.6198, "step": 365300 }, { "epoch": 4.8712855447867645, "grad_norm": 8.621742248535156, "learning_rate": 4.7623106609122166e-05, "loss": 0.7048, "step": 365400 }, { "epoch": 4.872618682593219, "grad_norm": 72.86518859863281, "learning_rate": 4.7616081279276806e-05, "loss": 0.7516, "step": 365500 }, { "epoch": 4.873951820399674, "grad_norm": 2.303215742111206, "learning_rate": 4.7609054474668657e-05, "loss": 0.7165, "step": 365600 }, { "epoch": 4.87528495820613, "grad_norm": 6.979100704193115, "learning_rate": 4.760202619588598e-05, "loss": 0.6862, "step": 365700 }, { "epoch": 4.876618096012585, "grad_norm": 0.5533574819564819, "learning_rate": 4.759499644351714e-05, "loss": 0.7081, "step": 365800 }, { "epoch": 4.87795123381904, "grad_norm": 3.3406355381011963, "learning_rate": 4.7587965218150676e-05, "loss": 0.7017, "step": 365900 }, { "epoch": 4.879284371625495, "grad_norm": 32.81590270996094, "learning_rate": 4.75809325203752e-05, "loss": 0.6894, "step": 366000 }, { "epoch": 4.88061750943195, "grad_norm": 26.30389976501465, "learning_rate": 4.7573898350779474e-05, "loss": 0.7442, "step": 366100 }, { "epoch": 4.881950647238405, "grad_norm": 4.530538082122803, "learning_rate": 4.756686270995237e-05, "loss": 0.795, "step": 366200 }, { "epoch": 4.88328378504486, "grad_norm": 103.96440887451172, "learning_rate": 4.7559825598482906e-05, "loss": 0.6349, "step": 366300 }, { "epoch": 
4.884616922851315, "grad_norm": 9.055879592895508, "learning_rate": 4.755278701696019e-05, "loss": 0.7589, "step": 366400 }, { "epoch": 4.88595006065777, "grad_norm": 3.634098768234253, "learning_rate": 4.7545746965973483e-05, "loss": 0.7226, "step": 366500 }, { "epoch": 4.8872831984642255, "grad_norm": 34.180049896240234, "learning_rate": 4.753870544611214e-05, "loss": 0.5928, "step": 366600 }, { "epoch": 4.8886163362706805, "grad_norm": 7.191957950592041, "learning_rate": 4.753166245796566e-05, "loss": 0.7505, "step": 366700 }, { "epoch": 4.889949474077135, "grad_norm": 24.28753089904785, "learning_rate": 4.752461800212368e-05, "loss": 0.6948, "step": 366800 }, { "epoch": 4.89128261188359, "grad_norm": 14.834750175476074, "learning_rate": 4.7517572079175914e-05, "loss": 0.7064, "step": 366900 }, { "epoch": 4.892615749690045, "grad_norm": 15.913505554199219, "learning_rate": 4.7510524689712235e-05, "loss": 0.6213, "step": 367000 }, { "epoch": 4.8939488874965, "grad_norm": 7.547337532043457, "learning_rate": 4.750347583432262e-05, "loss": 0.6825, "step": 367100 }, { "epoch": 4.895282025302955, "grad_norm": 3.794813394546509, "learning_rate": 4.749649602405593e-05, "loss": 0.6779, "step": 367200 }, { "epoch": 4.896615163109411, "grad_norm": 4.698238849639893, "learning_rate": 4.7489444253229444e-05, "loss": 0.6504, "step": 367300 }, { "epoch": 4.897948300915866, "grad_norm": 27.342296600341797, "learning_rate": 4.74823910182418e-05, "loss": 0.7455, "step": 367400 }, { "epoch": 4.899281438722321, "grad_norm": 11.198063850402832, "learning_rate": 4.747540687391183e-05, "loss": 0.6892, "step": 367500 }, { "epoch": 4.900614576528776, "grad_norm": 12.66908073425293, "learning_rate": 4.7468350727000346e-05, "loss": 0.6862, "step": 367600 }, { "epoch": 4.901947714335231, "grad_norm": 3.2306158542633057, "learning_rate": 4.74612931176936e-05, "loss": 0.7121, "step": 367700 }, { "epoch": 4.903280852141686, "grad_norm": 14.168074607849121, "learning_rate": 
4.7454234046582445e-05, "loss": 0.6956, "step": 367800 }, { "epoch": 4.904613989948141, "grad_norm": 4.128460884094238, "learning_rate": 4.744717351425782e-05, "loss": 0.6876, "step": 367900 }, { "epoch": 4.905947127754596, "grad_norm": 7.238423824310303, "learning_rate": 4.7440111521310835e-05, "loss": 0.7247, "step": 368000 }, { "epoch": 4.907280265561051, "grad_norm": 4.656269550323486, "learning_rate": 4.743304806833269e-05, "loss": 0.6959, "step": 368100 }, { "epoch": 4.908613403367506, "grad_norm": 7.745709419250488, "learning_rate": 4.742598315591471e-05, "loss": 0.74, "step": 368200 }, { "epoch": 4.909946541173961, "grad_norm": 32.188594818115234, "learning_rate": 4.741891678464836e-05, "loss": 0.7343, "step": 368300 }, { "epoch": 4.911279678980416, "grad_norm": 3.9012868404388428, "learning_rate": 4.741184895512521e-05, "loss": 0.7241, "step": 368400 }, { "epoch": 4.912612816786871, "grad_norm": 7.739018440246582, "learning_rate": 4.740477966793694e-05, "loss": 0.7699, "step": 368500 }, { "epoch": 4.913945954593326, "grad_norm": 5.864432334899902, "learning_rate": 4.7397708923675386e-05, "loss": 0.7051, "step": 368600 }, { "epoch": 4.915279092399781, "grad_norm": 11.803476333618164, "learning_rate": 4.7390636722932475e-05, "loss": 0.7061, "step": 368700 }, { "epoch": 4.916612230206236, "grad_norm": 11.002632141113281, "learning_rate": 4.738356306630029e-05, "loss": 0.7602, "step": 368800 }, { "epoch": 4.917945368012692, "grad_norm": 1.8805017471313477, "learning_rate": 4.7376487954370996e-05, "loss": 0.6725, "step": 368900 }, { "epoch": 4.919278505819147, "grad_norm": 12.602154731750488, "learning_rate": 4.736941138773691e-05, "loss": 0.7274, "step": 369000 }, { "epoch": 4.920611643625602, "grad_norm": 20.379981994628906, "learning_rate": 4.736233336699045e-05, "loss": 0.6226, "step": 369100 }, { "epoch": 4.921944781432057, "grad_norm": 11.335253715515137, "learning_rate": 4.735525389272417e-05, "loss": 0.6777, "step": 369200 }, { "epoch": 
4.923277919238512, "grad_norm": 19.387662887573242, "learning_rate": 4.734817296553075e-05, "loss": 0.6338, "step": 369300 }, { "epoch": 4.924611057044967, "grad_norm": 6.981071472167969, "learning_rate": 4.734109058600297e-05, "loss": 0.718, "step": 369400 }, { "epoch": 4.9259441948514215, "grad_norm": 16.806865692138672, "learning_rate": 4.733400675473375e-05, "loss": 0.6938, "step": 369500 }, { "epoch": 4.9272773326578765, "grad_norm": 18.903980255126953, "learning_rate": 4.732692147231613e-05, "loss": 0.7521, "step": 369600 }, { "epoch": 4.928610470464331, "grad_norm": 6.866927623748779, "learning_rate": 4.7319834739343266e-05, "loss": 0.7733, "step": 369700 }, { "epoch": 4.929943608270787, "grad_norm": 42.07257080078125, "learning_rate": 4.731274655640843e-05, "loss": 0.6843, "step": 369800 }, { "epoch": 4.931276746077242, "grad_norm": 3.030620574951172, "learning_rate": 4.730565692410503e-05, "loss": 0.6766, "step": 369900 }, { "epoch": 4.932609883883697, "grad_norm": 5.745643615722656, "learning_rate": 4.729856584302658e-05, "loss": 0.6396, "step": 370000 }, { "epoch": 4.933943021690152, "grad_norm": 10.558979034423828, "learning_rate": 4.729147331376674e-05, "loss": 0.7584, "step": 370100 }, { "epoch": 4.935276159496607, "grad_norm": 7.027693748474121, "learning_rate": 4.728437933691924e-05, "loss": 0.7392, "step": 370200 }, { "epoch": 4.936609297303062, "grad_norm": 2.4949848651885986, "learning_rate": 4.7277283913078e-05, "loss": 0.7091, "step": 370300 }, { "epoch": 4.937942435109517, "grad_norm": 8.091155052185059, "learning_rate": 4.727018704283701e-05, "loss": 0.652, "step": 370400 }, { "epoch": 4.939275572915973, "grad_norm": 17.65036964416504, "learning_rate": 4.7263088726790405e-05, "loss": 0.6967, "step": 370500 }, { "epoch": 4.940608710722428, "grad_norm": 6.303225517272949, "learning_rate": 4.7255988965532426e-05, "loss": 0.6813, "step": 370600 }, { "epoch": 4.941941848528883, "grad_norm": 25.230716705322266, "learning_rate": 
4.7248887759657445e-05, "loss": 0.6363, "step": 370700 }, { "epoch": 4.943274986335338, "grad_norm": 3.5167644023895264, "learning_rate": 4.724178510975996e-05, "loss": 0.7261, "step": 370800 }, { "epoch": 4.9446081241417925, "grad_norm": 11.769718170166016, "learning_rate": 4.723468101643457e-05, "loss": 0.7534, "step": 370900 }, { "epoch": 4.9459412619482475, "grad_norm": 17.373716354370117, "learning_rate": 4.7227575480276016e-05, "loss": 0.6959, "step": 371000 }, { "epoch": 4.947274399754702, "grad_norm": 12.874485969543457, "learning_rate": 4.722046850187915e-05, "loss": 0.6949, "step": 371100 }, { "epoch": 4.948607537561157, "grad_norm": 21.5897159576416, "learning_rate": 4.721343117317353e-05, "loss": 0.7197, "step": 371200 }, { "epoch": 4.949940675367612, "grad_norm": 2.1302402019500732, "learning_rate": 4.720632132649261e-05, "loss": 0.6216, "step": 371300 }, { "epoch": 4.951273813174068, "grad_norm": 6.948965549468994, "learning_rate": 4.719921003935271e-05, "loss": 0.7078, "step": 371400 }, { "epoch": 4.952606950980523, "grad_norm": 8.643341064453125, "learning_rate": 4.719209731234916e-05, "loss": 0.7162, "step": 371500 }, { "epoch": 4.953940088786978, "grad_norm": 4.553717136383057, "learning_rate": 4.718498314607743e-05, "loss": 0.7976, "step": 371600 }, { "epoch": 4.955273226593433, "grad_norm": 4.429551124572754, "learning_rate": 4.7177867541133074e-05, "loss": 0.6809, "step": 371700 }, { "epoch": 4.956606364399888, "grad_norm": 17.4309139251709, "learning_rate": 4.71707504981118e-05, "loss": 0.6621, "step": 371800 }, { "epoch": 4.957939502206343, "grad_norm": 22.03120231628418, "learning_rate": 4.716363201760942e-05, "loss": 0.6758, "step": 371900 }, { "epoch": 4.959272640012798, "grad_norm": 7.07007360458374, "learning_rate": 4.7156512100221875e-05, "loss": 0.6511, "step": 372000 }, { "epoch": 4.960605777819254, "grad_norm": 52.58435821533203, "learning_rate": 4.7149390746545206e-05, "loss": 0.7833, "step": 372100 }, { "epoch": 4.9619389156257085, 
"grad_norm": 36.68666076660156, "learning_rate": 4.714226795717561e-05, "loss": 0.6616, "step": 372200 }, { "epoch": 4.9632720534321635, "grad_norm": 19.784639358520508, "learning_rate": 4.7135143732709384e-05, "loss": 0.6691, "step": 372300 }, { "epoch": 4.964605191238618, "grad_norm": 11.184845924377441, "learning_rate": 4.7128018073742935e-05, "loss": 0.8126, "step": 372400 }, { "epoch": 4.965938329045073, "grad_norm": 4.146548748016357, "learning_rate": 4.712089098087281e-05, "loss": 0.7362, "step": 372500 }, { "epoch": 4.967271466851528, "grad_norm": 16.818939208984375, "learning_rate": 4.711376245469565e-05, "loss": 0.7126, "step": 372600 }, { "epoch": 4.968604604657983, "grad_norm": 4.43336296081543, "learning_rate": 4.710663249580826e-05, "loss": 0.6516, "step": 372700 }, { "epoch": 4.969937742464438, "grad_norm": 10.29688549041748, "learning_rate": 4.709950110480751e-05, "loss": 0.614, "step": 372800 }, { "epoch": 4.971270880270893, "grad_norm": 2.8309547901153564, "learning_rate": 4.709236828229043e-05, "loss": 0.6587, "step": 372900 }, { "epoch": 4.972604018077349, "grad_norm": 11.178062438964844, "learning_rate": 4.708523402885416e-05, "loss": 0.7127, "step": 373000 }, { "epoch": 4.973937155883804, "grad_norm": 8.508285522460938, "learning_rate": 4.707809834509596e-05, "loss": 0.6427, "step": 373100 }, { "epoch": 4.975270293690259, "grad_norm": 2.895301342010498, "learning_rate": 4.707096123161319e-05, "loss": 0.681, "step": 373200 }, { "epoch": 4.976603431496714, "grad_norm": 9.906957626342773, "learning_rate": 4.706382268900337e-05, "loss": 0.7181, "step": 373300 }, { "epoch": 4.977936569303169, "grad_norm": 1.370613932609558, "learning_rate": 4.70566827178641e-05, "loss": 0.6343, "step": 373400 }, { "epoch": 4.979269707109624, "grad_norm": 2.7046000957489014, "learning_rate": 4.704954131879311e-05, "loss": 0.6935, "step": 373500 }, { "epoch": 4.980602844916079, "grad_norm": 3.6519720554351807, "learning_rate": 4.704239849238828e-05, "loss": 0.7502, 
"step": 373600 }, { "epoch": 4.9819359827225345, "grad_norm": 11.240161895751953, "learning_rate": 4.703525423924756e-05, "loss": 0.716, "step": 373700 }, { "epoch": 4.983269120528989, "grad_norm": 24.185426712036133, "learning_rate": 4.7028108559969056e-05, "loss": 0.678, "step": 373800 }, { "epoch": 4.984602258335444, "grad_norm": 19.38528823852539, "learning_rate": 4.7020961455150995e-05, "loss": 0.7016, "step": 373900 }, { "epoch": 4.985935396141899, "grad_norm": 4.402939319610596, "learning_rate": 4.701381292539168e-05, "loss": 0.6579, "step": 374000 }, { "epoch": 4.987268533948354, "grad_norm": 6.692856788635254, "learning_rate": 4.700666297128958e-05, "loss": 0.7859, "step": 374100 }, { "epoch": 4.988601671754809, "grad_norm": 7.115968227386475, "learning_rate": 4.6999511593443276e-05, "loss": 0.6465, "step": 374200 }, { "epoch": 4.989934809561264, "grad_norm": 4.114320755004883, "learning_rate": 4.699235879245144e-05, "loss": 0.6451, "step": 374300 }, { "epoch": 4.991267947367719, "grad_norm": 4.521657943725586, "learning_rate": 4.6985204568912894e-05, "loss": 0.6915, "step": 374400 }, { "epoch": 4.992601085174174, "grad_norm": 7.2993364334106445, "learning_rate": 4.697804892342656e-05, "loss": 0.7214, "step": 374500 }, { "epoch": 4.99393422298063, "grad_norm": 26.616865158081055, "learning_rate": 4.697089185659149e-05, "loss": 0.7645, "step": 374600 }, { "epoch": 4.995267360787085, "grad_norm": 7.230391025543213, "learning_rate": 4.696373336900686e-05, "loss": 0.7065, "step": 374700 }, { "epoch": 4.99660049859354, "grad_norm": 19.79372787475586, "learning_rate": 4.6956573461271934e-05, "loss": 0.6598, "step": 374800 }, { "epoch": 4.997933636399995, "grad_norm": 25.93690299987793, "learning_rate": 4.694941213398613e-05, "loss": 0.6619, "step": 374900 }, { "epoch": 4.99926677420645, "grad_norm": 7.309396266937256, "learning_rate": 4.6942249387748976e-05, "loss": 0.7169, "step": 375000 }, { "epoch": 5.0005999120129045, "grad_norm": 12.101661682128906, 
"learning_rate": 4.693508522316011e-05, "loss": 0.695, "step": 375100 }, { "epoch": 5.0019330498193595, "grad_norm": 9.265740394592285, "learning_rate": 4.6927919640819294e-05, "loss": 0.7156, "step": 375200 }, { "epoch": 5.003266187625815, "grad_norm": 8.632291793823242, "learning_rate": 4.6920824318334266e-05, "loss": 0.7522, "step": 375300 }, { "epoch": 5.00459932543227, "grad_norm": 5.537557125091553, "learning_rate": 4.691365591645186e-05, "loss": 0.726, "step": 375400 }, { "epoch": 5.005932463238725, "grad_norm": 19.80573272705078, "learning_rate": 4.6906486098611485e-05, "loss": 0.6994, "step": 375500 }, { "epoch": 5.00726560104518, "grad_norm": 11.960626602172852, "learning_rate": 4.6899314865413405e-05, "loss": 0.7547, "step": 375600 }, { "epoch": 5.008598738851635, "grad_norm": 10.153066635131836, "learning_rate": 4.6892142217457934e-05, "loss": 0.7195, "step": 375700 }, { "epoch": 5.00993187665809, "grad_norm": 9.100223541259766, "learning_rate": 4.688496815534559e-05, "loss": 0.7322, "step": 375800 }, { "epoch": 5.011265014464545, "grad_norm": 13.192870140075684, "learning_rate": 4.687779267967693e-05, "loss": 0.7673, "step": 375900 }, { "epoch": 5.012598152271, "grad_norm": 0.7479863166809082, "learning_rate": 4.687061579105266e-05, "loss": 0.6793, "step": 376000 }, { "epoch": 5.013931290077456, "grad_norm": 4.625082015991211, "learning_rate": 4.686343749007362e-05, "loss": 0.5912, "step": 376100 }, { "epoch": 5.015264427883911, "grad_norm": 5.21481466293335, "learning_rate": 4.685625777734075e-05, "loss": 0.6634, "step": 376200 }, { "epoch": 5.016597565690366, "grad_norm": 3.156416416168213, "learning_rate": 4.684907665345512e-05, "loss": 0.7193, "step": 376300 }, { "epoch": 5.017930703496821, "grad_norm": 158.39492797851562, "learning_rate": 4.684189411901791e-05, "loss": 0.697, "step": 376400 }, { "epoch": 5.0192638413032755, "grad_norm": 5.936325550079346, "learning_rate": 4.6834710174630406e-05, "loss": 0.661, "step": 376500 }, { "epoch": 
5.0205969791097305, "grad_norm": 7.978941440582275, "learning_rate": 4.6827524820894036e-05, "loss": 0.6361, "step": 376600 }, { "epoch": 5.021930116916185, "grad_norm": 28.683258056640625, "learning_rate": 4.6820338058410335e-05, "loss": 0.7356, "step": 376700 }, { "epoch": 5.02326325472264, "grad_norm": 19.610855102539062, "learning_rate": 4.6813149887780946e-05, "loss": 0.7012, "step": 376800 }, { "epoch": 5.024596392529096, "grad_norm": 46.30748748779297, "learning_rate": 4.6805960309607655e-05, "loss": 0.7279, "step": 376900 }, { "epoch": 5.025929530335551, "grad_norm": 6.980889320373535, "learning_rate": 4.679876932449235e-05, "loss": 0.662, "step": 377000 }, { "epoch": 5.027262668142006, "grad_norm": 5.013949871063232, "learning_rate": 4.679157693303702e-05, "loss": 0.6635, "step": 377100 }, { "epoch": 5.028595805948461, "grad_norm": 9.203089714050293, "learning_rate": 4.67843831358438e-05, "loss": 0.7056, "step": 377200 }, { "epoch": 5.029928943754916, "grad_norm": 8.563629150390625, "learning_rate": 4.677718793351495e-05, "loss": 0.7362, "step": 377300 }, { "epoch": 5.031262081561371, "grad_norm": 2.63537859916687, "learning_rate": 4.676999132665279e-05, "loss": 0.807, "step": 377400 }, { "epoch": 5.032595219367826, "grad_norm": 1.2512905597686768, "learning_rate": 4.676279331585983e-05, "loss": 0.6962, "step": 377500 }, { "epoch": 5.033928357174281, "grad_norm": 6.315536975860596, "learning_rate": 4.6755593901738656e-05, "loss": 0.6781, "step": 377600 }, { "epoch": 5.035261494980736, "grad_norm": 8.495819091796875, "learning_rate": 4.674846510000196e-05, "loss": 0.6746, "step": 377700 }, { "epoch": 5.0365946327871915, "grad_norm": 17.508657455444336, "learning_rate": 4.6741262895050854e-05, "loss": 0.7938, "step": 377800 }, { "epoch": 5.0379277705936465, "grad_norm": 31.65326499938965, "learning_rate": 4.673405928857398e-05, "loss": 0.6519, "step": 377900 }, { "epoch": 5.039260908400101, "grad_norm": 6.780392646789551, "learning_rate": 
4.6726854281174426e-05, "loss": 0.6398, "step": 378000 }, { "epoch": 5.040594046206556, "grad_norm": 3.2913100719451904, "learning_rate": 4.671964787345535e-05, "loss": 0.7018, "step": 378100 }, { "epoch": 5.041927184013011, "grad_norm": 12.26225471496582, "learning_rate": 4.6712440066020065e-05, "loss": 0.709, "step": 378200 }, { "epoch": 5.043260321819466, "grad_norm": 10.987298965454102, "learning_rate": 4.6705230859471975e-05, "loss": 0.6589, "step": 378300 }, { "epoch": 5.044593459625921, "grad_norm": 15.300312995910645, "learning_rate": 4.669802025441462e-05, "loss": 0.7163, "step": 378400 }, { "epoch": 5.045926597432376, "grad_norm": 21.61668586730957, "learning_rate": 4.669080825145165e-05, "loss": 0.7544, "step": 378500 }, { "epoch": 5.047259735238832, "grad_norm": 6.998690605163574, "learning_rate": 4.668359485118681e-05, "loss": 0.7494, "step": 378600 }, { "epoch": 5.048592873045287, "grad_norm": 8.446210861206055, "learning_rate": 4.667638005422402e-05, "loss": 0.7716, "step": 378700 }, { "epoch": 5.049926010851742, "grad_norm": 16.098878860473633, "learning_rate": 4.666916386116725e-05, "loss": 0.813, "step": 378800 }, { "epoch": 5.051259148658197, "grad_norm": 9.346399307250977, "learning_rate": 4.666194627262063e-05, "loss": 0.6637, "step": 378900 }, { "epoch": 5.052592286464652, "grad_norm": 8.743783950805664, "learning_rate": 4.665472728918839e-05, "loss": 0.7108, "step": 379000 }, { "epoch": 5.053925424271107, "grad_norm": 5.06964635848999, "learning_rate": 4.664750691147489e-05, "loss": 0.7086, "step": 379100 }, { "epoch": 5.055258562077562, "grad_norm": 10.052847862243652, "learning_rate": 4.664028514008458e-05, "loss": 0.7391, "step": 379200 }, { "epoch": 5.056591699884017, "grad_norm": 14.500021934509277, "learning_rate": 4.663306197562205e-05, "loss": 0.7017, "step": 379300 }, { "epoch": 5.057924837690472, "grad_norm": 2.5486972332000732, "learning_rate": 4.6625837418692e-05, "loss": 0.6826, "step": 379400 }, { "epoch": 5.059257975496927, 
"grad_norm": 8.14146614074707, "learning_rate": 4.661861146989927e-05, "loss": 0.754, "step": 379500 }, { "epoch": 5.060591113303382, "grad_norm": 34.66436004638672, "learning_rate": 4.661138412984876e-05, "loss": 0.7526, "step": 379600 }, { "epoch": 5.061924251109837, "grad_norm": 5.793639183044434, "learning_rate": 4.6604155399145534e-05, "loss": 0.6834, "step": 379700 }, { "epoch": 5.063257388916292, "grad_norm": 88.59083557128906, "learning_rate": 4.659692527839476e-05, "loss": 0.5855, "step": 379800 }, { "epoch": 5.064590526722747, "grad_norm": 14.246593475341797, "learning_rate": 4.658969376820172e-05, "loss": 0.6191, "step": 379900 }, { "epoch": 5.065923664529202, "grad_norm": 5.378897190093994, "learning_rate": 4.658246086917182e-05, "loss": 0.6496, "step": 380000 }, { "epoch": 5.067256802335657, "grad_norm": 24.442899703979492, "learning_rate": 4.657522658191056e-05, "loss": 0.7141, "step": 380100 }, { "epoch": 5.068589940142113, "grad_norm": 4.425895690917969, "learning_rate": 4.656799090702359e-05, "loss": 0.6741, "step": 380200 }, { "epoch": 5.069923077948568, "grad_norm": 5.415194511413574, "learning_rate": 4.6560753845116635e-05, "loss": 0.708, "step": 380300 }, { "epoch": 5.071256215755023, "grad_norm": 3.3870389461517334, "learning_rate": 4.6553587788139545e-05, "loss": 0.617, "step": 380400 }, { "epoch": 5.072589353561478, "grad_norm": 7.214760780334473, "learning_rate": 4.6546347967865443e-05, "loss": 0.7215, "step": 380500 }, { "epoch": 5.073922491367933, "grad_norm": 20.008317947387695, "learning_rate": 4.653910676238325e-05, "loss": 0.6958, "step": 380600 }, { "epoch": 5.0752556291743876, "grad_norm": 8.324637413024902, "learning_rate": 4.6531864172299175e-05, "loss": 0.6593, "step": 380700 }, { "epoch": 5.0765887669808425, "grad_norm": 6.575832366943359, "learning_rate": 4.652462019821953e-05, "loss": 0.6658, "step": 380800 }, { "epoch": 5.077921904787297, "grad_norm": 1.3851919174194336, "learning_rate": 4.651737484075079e-05, "loss": 0.6767, 
"step": 380900 }, { "epoch": 5.079255042593753, "grad_norm": 5.034763336181641, "learning_rate": 4.651012810049948e-05, "loss": 0.7023, "step": 381000 }, { "epoch": 5.080588180400208, "grad_norm": 1.273407220840454, "learning_rate": 4.65028799780723e-05, "loss": 0.6024, "step": 381100 }, { "epoch": 5.081921318206663, "grad_norm": 2.482778787612915, "learning_rate": 4.649563047407603e-05, "loss": 0.7167, "step": 381200 }, { "epoch": 5.083254456013118, "grad_norm": 6.585549354553223, "learning_rate": 4.648837958911758e-05, "loss": 0.6907, "step": 381300 }, { "epoch": 5.084587593819573, "grad_norm": 18.67153549194336, "learning_rate": 4.648112732380395e-05, "loss": 0.6608, "step": 381400 }, { "epoch": 5.085920731626028, "grad_norm": 6.300518989562988, "learning_rate": 4.64738736787423e-05, "loss": 0.704, "step": 381500 }, { "epoch": 5.087253869432483, "grad_norm": 3.612468957901001, "learning_rate": 4.6466618654539874e-05, "loss": 0.6718, "step": 381600 }, { "epoch": 5.088587007238938, "grad_norm": 7.790915012359619, "learning_rate": 4.645936225180405e-05, "loss": 0.6499, "step": 381700 }, { "epoch": 5.089920145045394, "grad_norm": 4.172530174255371, "learning_rate": 4.6452104471142297e-05, "loss": 0.6957, "step": 381800 }, { "epoch": 5.091253282851849, "grad_norm": 2.2806198596954346, "learning_rate": 4.644484531316223e-05, "loss": 0.6617, "step": 381900 }, { "epoch": 5.092586420658304, "grad_norm": 2.412452220916748, "learning_rate": 4.643758477847154e-05, "loss": 0.657, "step": 382000 }, { "epoch": 5.0939195584647585, "grad_norm": 4.017266750335693, "learning_rate": 4.643032286767806e-05, "loss": 0.6651, "step": 382100 }, { "epoch": 5.0952526962712135, "grad_norm": 4.218239784240723, "learning_rate": 4.642305958138977e-05, "loss": 0.6931, "step": 382200 }, { "epoch": 5.096585834077668, "grad_norm": 15.744443893432617, "learning_rate": 4.641579492021468e-05, "loss": 0.6734, "step": 382300 }, { "epoch": 5.097918971884123, "grad_norm": 15.52928352355957, 
"learning_rate": 4.640852888476099e-05, "loss": 0.5802, "step": 382400 }, { "epoch": 5.099252109690578, "grad_norm": 14.481036186218262, "learning_rate": 4.640126147563699e-05, "loss": 0.6489, "step": 382500 }, { "epoch": 5.100585247497034, "grad_norm": 22.862628936767578, "learning_rate": 4.639399269345108e-05, "loss": 0.5993, "step": 382600 }, { "epoch": 5.101918385303489, "grad_norm": 3.9541592597961426, "learning_rate": 4.6386722538811774e-05, "loss": 0.6244, "step": 382700 }, { "epoch": 5.103251523109944, "grad_norm": 13.981147766113281, "learning_rate": 4.6379451012327715e-05, "loss": 0.7025, "step": 382800 }, { "epoch": 5.104584660916399, "grad_norm": 15.84036922454834, "learning_rate": 4.6372178114607653e-05, "loss": 0.6413, "step": 382900 }, { "epoch": 5.105917798722854, "grad_norm": 4.498083591461182, "learning_rate": 4.636497659572653e-05, "loss": 0.7538, "step": 383000 }, { "epoch": 5.107250936529309, "grad_norm": 23.576297760009766, "learning_rate": 4.635770097105832e-05, "loss": 0.6098, "step": 383100 }, { "epoch": 5.108584074335764, "grad_norm": 6.303623199462891, "learning_rate": 4.635042397697496e-05, "loss": 0.6893, "step": 383200 }, { "epoch": 5.109917212142219, "grad_norm": 56.5682487487793, "learning_rate": 4.6343145614085636e-05, "loss": 0.6621, "step": 383300 }, { "epoch": 5.1112503499486746, "grad_norm": 13.77003288269043, "learning_rate": 4.633586588299969e-05, "loss": 0.7776, "step": 383400 }, { "epoch": 5.1125834877551295, "grad_norm": 2.486362934112549, "learning_rate": 4.6328584784326534e-05, "loss": 0.6516, "step": 383500 }, { "epoch": 5.113916625561584, "grad_norm": 4.900931358337402, "learning_rate": 4.632130231867575e-05, "loss": 0.632, "step": 383600 }, { "epoch": 5.115249763368039, "grad_norm": 4.8075852394104, "learning_rate": 4.6314091331738685e-05, "loss": 0.6541, "step": 383700 }, { "epoch": 5.116582901174494, "grad_norm": 98.90145874023438, "learning_rate": 4.6306806147616265e-05, "loss": 0.6433, "step": 383800 }, { "epoch": 
5.117916038980949, "grad_norm": 4.53857946395874, "learning_rate": 4.6299519598339444e-05, "loss": 0.6109, "step": 383900 }, { "epoch": 5.119249176787404, "grad_norm": 3.4575231075286865, "learning_rate": 4.6292231684518226e-05, "loss": 0.5761, "step": 384000 }, { "epoch": 5.120582314593859, "grad_norm": 11.918498992919922, "learning_rate": 4.628494240676273e-05, "loss": 0.7342, "step": 384100 }, { "epoch": 5.121915452400315, "grad_norm": 17.327404022216797, "learning_rate": 4.627765176568318e-05, "loss": 0.6527, "step": 384200 }, { "epoch": 5.12324859020677, "grad_norm": 4.391838550567627, "learning_rate": 4.6270359761889945e-05, "loss": 0.6981, "step": 384300 }, { "epoch": 5.124581728013225, "grad_norm": 25.741546630859375, "learning_rate": 4.626306639599348e-05, "loss": 0.7281, "step": 384400 }, { "epoch": 5.12591486581968, "grad_norm": 7.0436859130859375, "learning_rate": 4.625577166860435e-05, "loss": 0.6342, "step": 384500 }, { "epoch": 5.127248003626135, "grad_norm": 15.220873832702637, "learning_rate": 4.6248475580333275e-05, "loss": 0.7326, "step": 384600 }, { "epoch": 5.12858114143259, "grad_norm": 6.892846584320068, "learning_rate": 4.624117813179104e-05, "loss": 0.701, "step": 384700 }, { "epoch": 5.129914279239045, "grad_norm": 0.9300477504730225, "learning_rate": 4.623387932358856e-05, "loss": 0.6934, "step": 384800 }, { "epoch": 5.1312474170455, "grad_norm": 3.1919214725494385, "learning_rate": 4.622657915633689e-05, "loss": 0.7675, "step": 384900 }, { "epoch": 5.132580554851955, "grad_norm": 5.9980692863464355, "learning_rate": 4.621927763064714e-05, "loss": 0.7, "step": 385000 }, { "epoch": 5.13391369265841, "grad_norm": 1.4437717199325562, "learning_rate": 4.621197474713062e-05, "loss": 0.6525, "step": 385100 }, { "epoch": 5.135246830464865, "grad_norm": 21.761531829833984, "learning_rate": 4.620467050639867e-05, "loss": 0.6751, "step": 385200 }, { "epoch": 5.13657996827132, "grad_norm": 18.468524932861328, "learning_rate": 4.6197364909062783e-05, 
"loss": 0.7104, "step": 385300 }, { "epoch": 5.137913106077775, "grad_norm": 10.693479537963867, "learning_rate": 4.619005795573456e-05, "loss": 0.6708, "step": 385400 }, { "epoch": 5.13924624388423, "grad_norm": 7.598273754119873, "learning_rate": 4.618274964702573e-05, "loss": 0.6509, "step": 385500 }, { "epoch": 5.140579381690685, "grad_norm": 20.799875259399414, "learning_rate": 4.617543998354811e-05, "loss": 0.6504, "step": 385600 }, { "epoch": 5.14191251949714, "grad_norm": 13.92419719696045, "learning_rate": 4.616812896591365e-05, "loss": 0.6141, "step": 385700 }, { "epoch": 5.143245657303596, "grad_norm": 4.2148261070251465, "learning_rate": 4.616081659473438e-05, "loss": 0.679, "step": 385800 }, { "epoch": 5.144578795110051, "grad_norm": 5.601406097412109, "learning_rate": 4.6153502870622506e-05, "loss": 0.7053, "step": 385900 }, { "epoch": 5.145911932916506, "grad_norm": 10.99193286895752, "learning_rate": 4.614618779419029e-05, "loss": 0.7037, "step": 386000 }, { "epoch": 5.147245070722961, "grad_norm": 7.545496940612793, "learning_rate": 4.613887136605013e-05, "loss": 0.6916, "step": 386100 }, { "epoch": 5.148578208529416, "grad_norm": 5.495508670806885, "learning_rate": 4.613155358681452e-05, "loss": 0.688, "step": 386200 }, { "epoch": 5.1499113463358706, "grad_norm": 17.48488426208496, "learning_rate": 4.612423445709609e-05, "loss": 0.6087, "step": 386300 }, { "epoch": 5.1512444841423255, "grad_norm": 9.29713249206543, "learning_rate": 4.6116913977507606e-05, "loss": 0.6473, "step": 386400 }, { "epoch": 5.15257762194878, "grad_norm": 2.595430374145508, "learning_rate": 4.610959214866187e-05, "loss": 0.6393, "step": 386500 }, { "epoch": 5.153910759755236, "grad_norm": 6.447098255157471, "learning_rate": 4.6102268971171866e-05, "loss": 0.6333, "step": 386600 }, { "epoch": 5.155243897561691, "grad_norm": 3.484131336212158, "learning_rate": 4.609494444565066e-05, "loss": 0.6474, "step": 386700 }, { "epoch": 5.156577035368146, "grad_norm": 
2.1649622917175293, "learning_rate": 4.6087618572711446e-05, "loss": 0.685, "step": 386800 }, { "epoch": 5.157910173174601, "grad_norm": 5.332505226135254, "learning_rate": 4.6080291352967515e-05, "loss": 0.6188, "step": 386900 }, { "epoch": 5.159243310981056, "grad_norm": 24.891645431518555, "learning_rate": 4.607303607935326e-05, "loss": 0.7278, "step": 387000 }, { "epoch": 5.160576448787511, "grad_norm": 2.675412654876709, "learning_rate": 4.6065706181293e-05, "loss": 0.6288, "step": 387100 }, { "epoch": 5.161909586593966, "grad_norm": 9.306151390075684, "learning_rate": 4.605837493826244e-05, "loss": 0.6864, "step": 387200 }, { "epoch": 5.163242724400421, "grad_norm": 11.504470825195312, "learning_rate": 4.605104235087537e-05, "loss": 0.6792, "step": 387300 }, { "epoch": 5.164575862206877, "grad_norm": 2.4403188228607178, "learning_rate": 4.604370841974562e-05, "loss": 0.6178, "step": 387400 }, { "epoch": 5.165909000013332, "grad_norm": 11.092242240905762, "learning_rate": 4.603637314548718e-05, "loss": 0.7064, "step": 387500 }, { "epoch": 5.167242137819787, "grad_norm": 7.460960865020752, "learning_rate": 4.6029036528714125e-05, "loss": 0.6966, "step": 387600 }, { "epoch": 5.1685752756262415, "grad_norm": 93.26532745361328, "learning_rate": 4.602169857004066e-05, "loss": 0.66, "step": 387700 }, { "epoch": 5.1699084134326965, "grad_norm": 3.2482728958129883, "learning_rate": 4.601435927008111e-05, "loss": 0.6678, "step": 387800 }, { "epoch": 5.171241551239151, "grad_norm": 1333.623291015625, "learning_rate": 4.6007018629449866e-05, "loss": 0.6978, "step": 387900 }, { "epoch": 5.172574689045606, "grad_norm": 19.102190017700195, "learning_rate": 4.599967664876149e-05, "loss": 0.7231, "step": 388000 }, { "epoch": 5.173907826852061, "grad_norm": 1.838512659072876, "learning_rate": 4.599233332863062e-05, "loss": 0.6888, "step": 388100 }, { "epoch": 5.175240964658517, "grad_norm": 5.5763139724731445, "learning_rate": 4.598498866967201e-05, "loss": 0.6585, "step": 
388200 }, { "epoch": 5.176574102464972, "grad_norm": 68.49230194091797, "learning_rate": 4.597764267250054e-05, "loss": 0.7392, "step": 388300 }, { "epoch": 5.177907240271427, "grad_norm": 26.439729690551758, "learning_rate": 4.5970295337731195e-05, "loss": 0.682, "step": 388400 }, { "epoch": 5.179240378077882, "grad_norm": 95.81089782714844, "learning_rate": 4.5962946665979064e-05, "loss": 0.6955, "step": 388500 }, { "epoch": 5.180573515884337, "grad_norm": 22.40450096130371, "learning_rate": 4.595559665785935e-05, "loss": 0.7035, "step": 388600 }, { "epoch": 5.181906653690792, "grad_norm": 39.630767822265625, "learning_rate": 4.5948245313987393e-05, "loss": 0.8004, "step": 388700 }, { "epoch": 5.183239791497247, "grad_norm": 4.919468879699707, "learning_rate": 4.594089263497862e-05, "loss": 0.7156, "step": 388800 }, { "epoch": 5.184572929303702, "grad_norm": 5.3886213302612305, "learning_rate": 4.5933538621448545e-05, "loss": 0.714, "step": 388900 }, { "epoch": 5.185906067110158, "grad_norm": 10.582456588745117, "learning_rate": 4.592618327401285e-05, "loss": 0.6608, "step": 389000 }, { "epoch": 5.1872392049166125, "grad_norm": 6.0368475914001465, "learning_rate": 4.5918900166692315e-05, "loss": 0.6965, "step": 389100 }, { "epoch": 5.188572342723067, "grad_norm": 2.4600796699523926, "learning_rate": 4.591154216661647e-05, "loss": 0.6714, "step": 389200 }, { "epoch": 5.189905480529522, "grad_norm": 24.531145095825195, "learning_rate": 4.5904182834476474e-05, "loss": 0.7178, "step": 389300 }, { "epoch": 5.191238618335977, "grad_norm": 7.500330448150635, "learning_rate": 4.589682217088842e-05, "loss": 0.7405, "step": 389400 }, { "epoch": 5.192571756142432, "grad_norm": 15.124776840209961, "learning_rate": 4.588946017646853e-05, "loss": 0.6546, "step": 389500 }, { "epoch": 5.193904893948887, "grad_norm": 4.729407787322998, "learning_rate": 4.588209685183312e-05, "loss": 0.6029, "step": 389600 }, { "epoch": 5.195238031755342, "grad_norm": 5.242870807647705, 
"learning_rate": 4.587473219759863e-05, "loss": 0.6039, "step": 389700 }, { "epoch": 5.196571169561798, "grad_norm": 17.06231689453125, "learning_rate": 4.58673662143816e-05, "loss": 0.6957, "step": 389800 }, { "epoch": 5.197904307368253, "grad_norm": 3.050602436065674, "learning_rate": 4.585999890279868e-05, "loss": 0.6577, "step": 389900 }, { "epoch": 5.199237445174708, "grad_norm": 1.6816962957382202, "learning_rate": 4.585263026346666e-05, "loss": 0.6533, "step": 390000 }, { "epoch": 5.200570582981163, "grad_norm": 26.003419876098633, "learning_rate": 4.58452602970024e-05, "loss": 0.6591, "step": 390100 }, { "epoch": 5.201903720787618, "grad_norm": 5.288934230804443, "learning_rate": 4.58378890040229e-05, "loss": 0.7004, "step": 390200 }, { "epoch": 5.203236858594073, "grad_norm": 4.2566046714782715, "learning_rate": 4.583051638514525e-05, "loss": 0.7511, "step": 390300 }, { "epoch": 5.204569996400528, "grad_norm": 6.1485443115234375, "learning_rate": 4.5823142440986684e-05, "loss": 0.6017, "step": 390400 }, { "epoch": 5.205903134206983, "grad_norm": 11.830964088439941, "learning_rate": 4.58157671721645e-05, "loss": 0.6716, "step": 390500 }, { "epoch": 5.207236272013438, "grad_norm": 2.3684258460998535, "learning_rate": 4.5808390579296145e-05, "loss": 0.6638, "step": 390600 }, { "epoch": 5.208569409819893, "grad_norm": 22.062801361083984, "learning_rate": 4.580101266299916e-05, "loss": 0.6731, "step": 390700 }, { "epoch": 5.209902547626348, "grad_norm": 3.4248883724212646, "learning_rate": 4.579363342389122e-05, "loss": 0.7177, "step": 390800 }, { "epoch": 5.211235685432803, "grad_norm": 8.727167129516602, "learning_rate": 4.5786252862590064e-05, "loss": 0.6214, "step": 390900 }, { "epoch": 5.212568823239258, "grad_norm": 16.49262046813965, "learning_rate": 4.577887097971357e-05, "loss": 0.6566, "step": 391000 }, { "epoch": 5.213901961045713, "grad_norm": 36.731380462646484, "learning_rate": 4.577148777587976e-05, "loss": 0.7039, "step": 391100 }, { "epoch": 
5.215235098852168, "grad_norm": 7.481164455413818, "learning_rate": 4.5764103251706695e-05, "loss": 0.7157, "step": 391200 }, { "epoch": 5.216568236658623, "grad_norm": 9.928060531616211, "learning_rate": 4.57567174078126e-05, "loss": 0.6293, "step": 391300 }, { "epoch": 5.217901374465079, "grad_norm": 9.056794166564941, "learning_rate": 4.574940412297329e-05, "loss": 0.6712, "step": 391400 }, { "epoch": 5.219234512271534, "grad_norm": 3.375326156616211, "learning_rate": 4.5742015654673985e-05, "loss": 0.5674, "step": 391500 }, { "epoch": 5.220567650077989, "grad_norm": 7.4716901779174805, "learning_rate": 4.5734625868502755e-05, "loss": 0.6944, "step": 391600 }, { "epoch": 5.221900787884444, "grad_norm": 18.22946548461914, "learning_rate": 4.572723476507824e-05, "loss": 0.6431, "step": 391700 }, { "epoch": 5.223233925690899, "grad_norm": 17.90432357788086, "learning_rate": 4.571984234501921e-05, "loss": 0.6758, "step": 391800 }, { "epoch": 5.224567063497354, "grad_norm": 4.710541725158691, "learning_rate": 4.5712448608944526e-05, "loss": 0.6849, "step": 391900 }, { "epoch": 5.2259002013038085, "grad_norm": 5.855199813842773, "learning_rate": 4.5705053557473184e-05, "loss": 0.7008, "step": 392000 }, { "epoch": 5.227233339110263, "grad_norm": 7.34309196472168, "learning_rate": 4.5697657191224264e-05, "loss": 0.6145, "step": 392100 }, { "epoch": 5.228566476916719, "grad_norm": 5.688935279846191, "learning_rate": 4.569025951081696e-05, "loss": 0.5891, "step": 392200 }, { "epoch": 5.229899614723174, "grad_norm": 9.641427993774414, "learning_rate": 4.5682860516870604e-05, "loss": 0.6993, "step": 392300 }, { "epoch": 5.231232752529629, "grad_norm": 32.69057846069336, "learning_rate": 4.5675460210004584e-05, "loss": 0.6912, "step": 392400 }, { "epoch": 5.232565890336084, "grad_norm": 3.0189526081085205, "learning_rate": 4.566805859083846e-05, "loss": 0.6111, "step": 392500 }, { "epoch": 5.233899028142539, "grad_norm": 11.549250602722168, "learning_rate": 
4.5660655659991855e-05, "loss": 0.6374, "step": 392600 }, { "epoch": 5.235232165948994, "grad_norm": 83.21086120605469, "learning_rate": 4.565332546699132e-05, "loss": 0.6663, "step": 392700 }, { "epoch": 5.236565303755449, "grad_norm": 11.319336891174316, "learning_rate": 4.564591992774447e-05, "loss": 0.6261, "step": 392800 }, { "epoch": 5.237898441561904, "grad_norm": 71.07398986816406, "learning_rate": 4.563851307867051e-05, "loss": 0.6709, "step": 392900 }, { "epoch": 5.23923157936836, "grad_norm": 1.7033123970031738, "learning_rate": 4.563110492038955e-05, "loss": 0.6169, "step": 393000 }, { "epoch": 5.240564717174815, "grad_norm": 3.6524970531463623, "learning_rate": 4.562369545352175e-05, "loss": 0.6235, "step": 393100 }, { "epoch": 5.24189785498127, "grad_norm": 5.434134483337402, "learning_rate": 4.561628467868742e-05, "loss": 0.7033, "step": 393200 }, { "epoch": 5.2432309927877245, "grad_norm": 7.157660961151123, "learning_rate": 4.5608872596506974e-05, "loss": 0.6314, "step": 393300 }, { "epoch": 5.2445641305941795, "grad_norm": 4.246083736419678, "learning_rate": 4.56014592076009e-05, "loss": 0.6175, "step": 393400 }, { "epoch": 5.245897268400634, "grad_norm": 5.224868297576904, "learning_rate": 4.559404451258985e-05, "loss": 0.6226, "step": 393500 }, { "epoch": 5.247230406207089, "grad_norm": 22.76787757873535, "learning_rate": 4.558662851209455e-05, "loss": 0.6382, "step": 393600 }, { "epoch": 5.248563544013544, "grad_norm": 16.433923721313477, "learning_rate": 4.5579211206735835e-05, "loss": 0.6487, "step": 393700 }, { "epoch": 5.24989668182, "grad_norm": 4.987452507019043, "learning_rate": 4.5571792597134675e-05, "loss": 0.7009, "step": 393800 }, { "epoch": 5.251229819626455, "grad_norm": 3.7530863285064697, "learning_rate": 4.5564372683912125e-05, "loss": 0.6461, "step": 393900 }, { "epoch": 5.25256295743291, "grad_norm": 3.399604320526123, "learning_rate": 4.555695146768936e-05, "loss": 0.6209, "step": 394000 }, { "epoch": 5.253896095239365, 
"grad_norm": 24.225019454956055, "learning_rate": 4.554952894908765e-05, "loss": 0.6656, "step": 394100 }, { "epoch": 5.25522923304582, "grad_norm": 3.324496269226074, "learning_rate": 4.554210512872841e-05, "loss": 0.6436, "step": 394200 }, { "epoch": 5.256562370852275, "grad_norm": 4.143369674682617, "learning_rate": 4.55346800072331e-05, "loss": 0.6752, "step": 394300 }, { "epoch": 5.25789550865873, "grad_norm": 4.493373394012451, "learning_rate": 4.552725358522336e-05, "loss": 0.6834, "step": 394400 }, { "epoch": 5.259228646465185, "grad_norm": 15.61578369140625, "learning_rate": 4.5519825863320895e-05, "loss": 0.5956, "step": 394500 }, { "epoch": 5.260561784271641, "grad_norm": 3.9764504432678223, "learning_rate": 4.551239684214753e-05, "loss": 0.653, "step": 394600 }, { "epoch": 5.2618949220780955, "grad_norm": 4.681465148925781, "learning_rate": 4.5504966522325204e-05, "loss": 0.6317, "step": 394700 }, { "epoch": 5.2632280598845504, "grad_norm": 90.2477798461914, "learning_rate": 4.549753490447596e-05, "loss": 0.6653, "step": 394800 }, { "epoch": 5.264561197691005, "grad_norm": 6.780401706695557, "learning_rate": 4.549010198922194e-05, "loss": 0.629, "step": 394900 }, { "epoch": 5.26589433549746, "grad_norm": 13.186914443969727, "learning_rate": 4.5482667777185405e-05, "loss": 0.6024, "step": 395000 }, { "epoch": 5.267227473303915, "grad_norm": 5.380972862243652, "learning_rate": 4.547523226898874e-05, "loss": 0.7219, "step": 395100 }, { "epoch": 5.26856061111037, "grad_norm": 44.96696472167969, "learning_rate": 4.54677954652544e-05, "loss": 0.6971, "step": 395200 }, { "epoch": 5.269893748916825, "grad_norm": 5.073739528656006, "learning_rate": 4.546035736660499e-05, "loss": 0.6196, "step": 395300 }, { "epoch": 5.271226886723281, "grad_norm": 5.531641006469727, "learning_rate": 4.5452917973663193e-05, "loss": 0.618, "step": 395400 }, { "epoch": 5.272560024529736, "grad_norm": 3.478839159011841, "learning_rate": 4.544547728705182e-05, "loss": 0.6179, "step": 
395500 }, { "epoch": 5.273893162336191, "grad_norm": 3.928091049194336, "learning_rate": 4.5438035307393776e-05, "loss": 0.6922, "step": 395600 }, { "epoch": 5.275226300142646, "grad_norm": 5.711125373840332, "learning_rate": 4.5430592035312075e-05, "loss": 0.7039, "step": 395700 }, { "epoch": 5.276559437949101, "grad_norm": 6.168242454528809, "learning_rate": 4.5423147471429865e-05, "loss": 0.6976, "step": 395800 }, { "epoch": 5.277892575755556, "grad_norm": 2.1706361770629883, "learning_rate": 4.5415701616370354e-05, "loss": 0.6314, "step": 395900 }, { "epoch": 5.279225713562011, "grad_norm": 2.7119388580322266, "learning_rate": 4.54082544707569e-05, "loss": 0.5506, "step": 396000 }, { "epoch": 5.280558851368466, "grad_norm": 11.253676414489746, "learning_rate": 4.5400806035212946e-05, "loss": 0.6282, "step": 396100 }, { "epoch": 5.281891989174921, "grad_norm": 5.626124858856201, "learning_rate": 4.539335631036207e-05, "loss": 0.6156, "step": 396200 }, { "epoch": 5.283225126981376, "grad_norm": 19.77374267578125, "learning_rate": 4.538590529682792e-05, "loss": 0.6274, "step": 396300 }, { "epoch": 5.284558264787831, "grad_norm": 12.243189811706543, "learning_rate": 4.5378452995234285e-05, "loss": 0.6313, "step": 396400 }, { "epoch": 5.285891402594286, "grad_norm": 35.9114875793457, "learning_rate": 4.5370999406205045e-05, "loss": 0.6498, "step": 396500 }, { "epoch": 5.287224540400741, "grad_norm": 34.50605010986328, "learning_rate": 4.536354453036418e-05, "loss": 0.6787, "step": 396600 }, { "epoch": 5.288557678207196, "grad_norm": 10.651232719421387, "learning_rate": 4.53560883683358e-05, "loss": 0.6606, "step": 396700 }, { "epoch": 5.289890816013651, "grad_norm": 5.432039737701416, "learning_rate": 4.53486309207441e-05, "loss": 0.633, "step": 396800 }, { "epoch": 5.291223953820106, "grad_norm": 27.208587646484375, "learning_rate": 4.534117218821342e-05, "loss": 0.6188, "step": 396900 }, { "epoch": 5.292557091626562, "grad_norm": 3.2361578941345215, 
"learning_rate": 4.533371217136815e-05, "loss": 0.6853, "step": 397000 }, { "epoch": 5.293890229433017, "grad_norm": 10.430398941040039, "learning_rate": 4.532625087083285e-05, "loss": 0.5682, "step": 397100 }, { "epoch": 5.295223367239472, "grad_norm": 3.249687910079956, "learning_rate": 4.531878828723212e-05, "loss": 0.6712, "step": 397200 }, { "epoch": 5.296556505045927, "grad_norm": 5.11284875869751, "learning_rate": 4.531132442119073e-05, "loss": 0.6293, "step": 397300 }, { "epoch": 5.297889642852382, "grad_norm": 6.479248523712158, "learning_rate": 4.530385927333353e-05, "loss": 0.6633, "step": 397400 }, { "epoch": 5.299222780658837, "grad_norm": 2.319636583328247, "learning_rate": 4.529639284428547e-05, "loss": 0.6642, "step": 397500 }, { "epoch": 5.3005559184652915, "grad_norm": 6.536771297454834, "learning_rate": 4.528892513467162e-05, "loss": 0.6648, "step": 397600 }, { "epoch": 5.3018890562717464, "grad_norm": 4.25121545791626, "learning_rate": 4.5281456145117143e-05, "loss": 0.7088, "step": 397700 }, { "epoch": 5.303222194078202, "grad_norm": 18.035654067993164, "learning_rate": 4.527398587624734e-05, "loss": 0.7424, "step": 397800 }, { "epoch": 5.304555331884657, "grad_norm": 52.32666778564453, "learning_rate": 4.526651432868759e-05, "loss": 0.6525, "step": 397900 }, { "epoch": 5.305888469691112, "grad_norm": 10.371049880981445, "learning_rate": 4.525904150306338e-05, "loss": 0.5888, "step": 398000 }, { "epoch": 5.307221607497567, "grad_norm": 30.31196403503418, "learning_rate": 4.5251567400000313e-05, "loss": 0.6457, "step": 398100 }, { "epoch": 5.308554745304022, "grad_norm": 9.79326057434082, "learning_rate": 4.5244092020124094e-05, "loss": 0.6049, "step": 398200 }, { "epoch": 5.309887883110477, "grad_norm": 5.648993015289307, "learning_rate": 4.523661536406056e-05, "loss": 0.6749, "step": 398300 }, { "epoch": 5.311221020916932, "grad_norm": 4.549161434173584, "learning_rate": 4.522913743243561e-05, "loss": 0.6525, "step": 398400 }, { "epoch": 
5.312554158723387, "grad_norm": 20.981355667114258, "learning_rate": 4.522165822587528e-05, "loss": 0.6539, "step": 398500 }, { "epoch": 5.313887296529842, "grad_norm": 39.74961471557617, "learning_rate": 4.52141777450057e-05, "loss": 0.6213, "step": 398600 }, { "epoch": 5.315220434336298, "grad_norm": 4.05847692489624, "learning_rate": 4.5206695990453144e-05, "loss": 0.6678, "step": 398700 }, { "epoch": 5.316553572142753, "grad_norm": 79.8815689086914, "learning_rate": 4.5199212962843905e-05, "loss": 0.6882, "step": 398800 }, { "epoch": 5.3178867099492075, "grad_norm": 83.30738067626953, "learning_rate": 4.519172866280448e-05, "loss": 0.6512, "step": 398900 }, { "epoch": 5.3192198477556625, "grad_norm": 15.512331008911133, "learning_rate": 4.5184243090961436e-05, "loss": 0.6603, "step": 399000 }, { "epoch": 5.320552985562117, "grad_norm": 3.6768693923950195, "learning_rate": 4.5176756247941394e-05, "loss": 0.6862, "step": 399100 }, { "epoch": 5.321886123368572, "grad_norm": 8.288839340209961, "learning_rate": 4.516934302179405e-05, "loss": 0.601, "step": 399200 }, { "epoch": 5.323219261175027, "grad_norm": 367.9263000488281, "learning_rate": 4.5161853650996655e-05, "loss": 0.7406, "step": 399300 }, { "epoch": 5.324552398981483, "grad_norm": 3.0445611476898193, "learning_rate": 4.515436301089666e-05, "loss": 0.7026, "step": 399400 }, { "epoch": 5.325885536787938, "grad_norm": 6.171147346496582, "learning_rate": 4.5146871102121166e-05, "loss": 0.6457, "step": 399500 }, { "epoch": 5.327218674594393, "grad_norm": 6.8230671882629395, "learning_rate": 4.513937792529737e-05, "loss": 0.638, "step": 399600 }, { "epoch": 5.328551812400848, "grad_norm": 4.248763561248779, "learning_rate": 4.513188348105258e-05, "loss": 0.6024, "step": 399700 }, { "epoch": 5.329884950207303, "grad_norm": 9.656667709350586, "learning_rate": 4.512438777001421e-05, "loss": 0.618, "step": 399800 }, { "epoch": 5.331218088013758, "grad_norm": 5.581717491149902, "learning_rate": 
4.5116890792809755e-05, "loss": 0.6826, "step": 399900 }, { "epoch": 5.332551225820213, "grad_norm": 8.0575532913208, "learning_rate": 4.510939255006687e-05, "loss": 0.6241, "step": 400000 }, { "epoch": 5.333884363626668, "grad_norm": 17.275043487548828, "learning_rate": 4.510189304241327e-05, "loss": 0.7036, "step": 400100 }, { "epoch": 5.335217501433123, "grad_norm": 18.61288070678711, "learning_rate": 4.5094392270476795e-05, "loss": 0.6931, "step": 400200 }, { "epoch": 5.3365506392395785, "grad_norm": 17.410648345947266, "learning_rate": 4.508689023488536e-05, "loss": 0.6851, "step": 400300 }, { "epoch": 5.3378837770460335, "grad_norm": 6.13608980178833, "learning_rate": 4.507938693626705e-05, "loss": 0.683, "step": 400400 }, { "epoch": 5.339216914852488, "grad_norm": 8.328500747680664, "learning_rate": 4.507188237524999e-05, "loss": 0.6213, "step": 400500 }, { "epoch": 5.340550052658943, "grad_norm": 4.511363506317139, "learning_rate": 4.506437655246245e-05, "loss": 0.6435, "step": 400600 }, { "epoch": 5.341883190465398, "grad_norm": 21.86338996887207, "learning_rate": 4.505686946853279e-05, "loss": 0.6642, "step": 400700 }, { "epoch": 5.343216328271853, "grad_norm": 7.742108345031738, "learning_rate": 4.50493611240895e-05, "loss": 0.6087, "step": 400800 }, { "epoch": 5.344549466078308, "grad_norm": 9.26535415649414, "learning_rate": 4.504185151976112e-05, "loss": 0.6816, "step": 400900 }, { "epoch": 5.345882603884764, "grad_norm": 6.352686882019043, "learning_rate": 4.5034340656176346e-05, "loss": 0.6187, "step": 401000 }, { "epoch": 5.347215741691219, "grad_norm": 19.651710510253906, "learning_rate": 4.502682853396397e-05, "loss": 0.6244, "step": 401100 }, { "epoch": 5.348548879497674, "grad_norm": 10.853625297546387, "learning_rate": 4.501931515375287e-05, "loss": 0.7074, "step": 401200 }, { "epoch": 5.349882017304129, "grad_norm": 7.751393795013428, "learning_rate": 4.501180051617206e-05, "loss": 0.7006, "step": 401300 }, { "epoch": 5.351215155110584, 
"grad_norm": 8.728290557861328, "learning_rate": 4.500428462185062e-05, "loss": 0.6687, "step": 401400 }, { "epoch": 5.352548292917039, "grad_norm": 19.222959518432617, "learning_rate": 4.4996767471417785e-05, "loss": 0.6924, "step": 401500 }, { "epoch": 5.353881430723494, "grad_norm": 2.9929661750793457, "learning_rate": 4.498932425577456e-05, "loss": 0.6319, "step": 401600 }, { "epoch": 5.355214568529949, "grad_norm": 5.6110920906066895, "learning_rate": 4.4981804607552346e-05, "loss": 0.7072, "step": 401700 }, { "epoch": 5.3565477063364035, "grad_norm": 6.242130279541016, "learning_rate": 4.4974283705100685e-05, "loss": 0.7063, "step": 401800 }, { "epoch": 5.357880844142859, "grad_norm": 11.956379890441895, "learning_rate": 4.496676154904919e-05, "loss": 0.7228, "step": 401900 }, { "epoch": 5.359213981949314, "grad_norm": 3.9662718772888184, "learning_rate": 4.495923814002759e-05, "loss": 0.5636, "step": 402000 }, { "epoch": 5.360547119755769, "grad_norm": 6.743645191192627, "learning_rate": 4.495171347866573e-05, "loss": 0.6696, "step": 402100 }, { "epoch": 5.361880257562224, "grad_norm": 3.291480302810669, "learning_rate": 4.494418756559356e-05, "loss": 0.6744, "step": 402200 }, { "epoch": 5.363213395368679, "grad_norm": 5.092231273651123, "learning_rate": 4.493666040144111e-05, "loss": 0.7548, "step": 402300 }, { "epoch": 5.364546533175134, "grad_norm": 22.51038932800293, "learning_rate": 4.4929131986838535e-05, "loss": 0.7115, "step": 402400 }, { "epoch": 5.365879670981589, "grad_norm": 5.930697441101074, "learning_rate": 4.4921602322416084e-05, "loss": 0.605, "step": 402500 }, { "epoch": 5.367212808788045, "grad_norm": 5.013140678405762, "learning_rate": 4.491407140880413e-05, "loss": 0.5656, "step": 402600 }, { "epoch": 5.3685459465945, "grad_norm": 8.162553787231445, "learning_rate": 4.490653924663314e-05, "loss": 0.6868, "step": 402700 }, { "epoch": 5.369879084400955, "grad_norm": 18.65629005432129, "learning_rate": 4.489900583653366e-05, "loss": 0.6125, 
"step": 402800 }, { "epoch": 5.37121222220741, "grad_norm": 4.296085357666016, "learning_rate": 4.489147117913638e-05, "loss": 0.6537, "step": 402900 }, { "epoch": 5.372545360013865, "grad_norm": 2.7818734645843506, "learning_rate": 4.488393527507208e-05, "loss": 0.6528, "step": 403000 }, { "epoch": 5.37387849782032, "grad_norm": 15.13791561126709, "learning_rate": 4.487639812497163e-05, "loss": 0.6768, "step": 403100 }, { "epoch": 5.3752116356267745, "grad_norm": 12.757869720458984, "learning_rate": 4.4868859729466036e-05, "loss": 0.6089, "step": 403200 }, { "epoch": 5.3765447734332295, "grad_norm": 10.782398223876953, "learning_rate": 4.486132008918636e-05, "loss": 0.6646, "step": 403300 }, { "epoch": 5.377877911239684, "grad_norm": 3.467517614364624, "learning_rate": 4.485377920476382e-05, "loss": 0.7321, "step": 403400 }, { "epoch": 5.37921104904614, "grad_norm": 10.152444839477539, "learning_rate": 4.4846237076829716e-05, "loss": 0.6451, "step": 403500 }, { "epoch": 5.380544186852595, "grad_norm": 2.7751524448394775, "learning_rate": 4.483869370601543e-05, "loss": 0.7567, "step": 403600 }, { "epoch": 5.38187732465905, "grad_norm": 17.193695068359375, "learning_rate": 4.4831149092952494e-05, "loss": 0.6584, "step": 403700 }, { "epoch": 5.383210462465505, "grad_norm": 5.055281162261963, "learning_rate": 4.482360323827249e-05, "loss": 0.6457, "step": 403800 }, { "epoch": 5.38454360027196, "grad_norm": 51.97506332397461, "learning_rate": 4.481605614260717e-05, "loss": 0.6253, "step": 403900 }, { "epoch": 5.385876738078415, "grad_norm": 11.138448715209961, "learning_rate": 4.480850780658832e-05, "loss": 0.6547, "step": 404000 }, { "epoch": 5.38720987588487, "grad_norm": 20.499366760253906, "learning_rate": 4.480095823084789e-05, "loss": 0.5993, "step": 404100 }, { "epoch": 5.388543013691326, "grad_norm": 3.4233992099761963, "learning_rate": 4.479340741601788e-05, "loss": 0.6327, "step": 404200 }, { "epoch": 5.389876151497781, "grad_norm": 7.976290702819824, 
"learning_rate": 4.478585536273045e-05, "loss": 0.6645, "step": 404300 }, { "epoch": 5.391209289304236, "grad_norm": 7.380827903747559, "learning_rate": 4.4778302071617805e-05, "loss": 0.7144, "step": 404400 }, { "epoch": 5.3925424271106905, "grad_norm": 10.713329315185547, "learning_rate": 4.47707475433123e-05, "loss": 0.6366, "step": 404500 }, { "epoch": 5.3938755649171455, "grad_norm": 2.813164710998535, "learning_rate": 4.476319177844637e-05, "loss": 0.6903, "step": 404600 }, { "epoch": 5.3952087027236, "grad_norm": 1.6767617464065552, "learning_rate": 4.475563477765257e-05, "loss": 0.6839, "step": 404700 }, { "epoch": 5.396541840530055, "grad_norm": 4.362523555755615, "learning_rate": 4.474807654156354e-05, "loss": 0.5991, "step": 404800 }, { "epoch": 5.39787497833651, "grad_norm": 14.73632526397705, "learning_rate": 4.474051707081201e-05, "loss": 0.6326, "step": 404900 }, { "epoch": 5.399208116142965, "grad_norm": 5.111706733703613, "learning_rate": 4.473295636603088e-05, "loss": 0.5986, "step": 405000 }, { "epoch": 5.400541253949421, "grad_norm": 6.905246734619141, "learning_rate": 4.472539442785308e-05, "loss": 0.5648, "step": 405100 }, { "epoch": 5.401874391755876, "grad_norm": 6.888442516326904, "learning_rate": 4.471783125691168e-05, "loss": 0.6044, "step": 405200 }, { "epoch": 5.403207529562331, "grad_norm": 15.986771583557129, "learning_rate": 4.471026685383983e-05, "loss": 0.6945, "step": 405300 }, { "epoch": 5.404540667368786, "grad_norm": 20.617908477783203, "learning_rate": 4.4702701219270813e-05, "loss": 0.5988, "step": 405400 }, { "epoch": 5.405873805175241, "grad_norm": 10.301351547241211, "learning_rate": 4.469513435383801e-05, "loss": 0.6377, "step": 405500 }, { "epoch": 5.407206942981696, "grad_norm": 40.51301574707031, "learning_rate": 4.4687641945219056e-05, "loss": 0.6057, "step": 405600 }, { "epoch": 5.408540080788151, "grad_norm": 8.949335098266602, "learning_rate": 4.4680072632252005e-05, "loss": 0.6724, "step": 405700 }, { "epoch": 
5.409873218594607, "grad_norm": 2.3867766857147217, "learning_rate": 4.467250209031555e-05, "loss": 0.7129, "step": 405800 }, { "epoch": 5.4112063564010615, "grad_norm": 4.8644795417785645, "learning_rate": 4.4664930320043466e-05, "loss": 0.6858, "step": 405900 }, { "epoch": 5.4125394942075165, "grad_norm": 1.8053134679794312, "learning_rate": 4.465735732206965e-05, "loss": 0.7006, "step": 406000 }, { "epoch": 5.413872632013971, "grad_norm": 14.784026145935059, "learning_rate": 4.464978309702807e-05, "loss": 0.6905, "step": 406100 }, { "epoch": 5.415205769820426, "grad_norm": 11.823491096496582, "learning_rate": 4.464220764555284e-05, "loss": 0.6827, "step": 406200 }, { "epoch": 5.416538907626881, "grad_norm": 8.992270469665527, "learning_rate": 4.4634630968278134e-05, "loss": 0.6514, "step": 406300 }, { "epoch": 5.417872045433336, "grad_norm": 7.641887187957764, "learning_rate": 4.462705306583826e-05, "loss": 0.7176, "step": 406400 }, { "epoch": 5.419205183239791, "grad_norm": 2.993445634841919, "learning_rate": 4.461947393886762e-05, "loss": 0.7281, "step": 406500 }, { "epoch": 5.420538321046246, "grad_norm": 26.632389068603516, "learning_rate": 4.461196939756557e-05, "loss": 0.6889, "step": 406600 }, { "epoch": 5.421871458852702, "grad_norm": 16.630962371826172, "learning_rate": 4.460438783566646e-05, "loss": 0.6617, "step": 406700 }, { "epoch": 5.423204596659157, "grad_norm": 9.975518226623535, "learning_rate": 4.4596805051134024e-05, "loss": 0.7486, "step": 406800 }, { "epoch": 5.424537734465612, "grad_norm": 11.810505867004395, "learning_rate": 4.458922104460308e-05, "loss": 0.5995, "step": 406900 }, { "epoch": 5.425870872272067, "grad_norm": 1.4850513935089111, "learning_rate": 4.4581635816708554e-05, "loss": 0.5387, "step": 407000 }, { "epoch": 5.427204010078522, "grad_norm": 6.072740077972412, "learning_rate": 4.457404936808544e-05, "loss": 0.6668, "step": 407100 }, { "epoch": 5.428537147884977, "grad_norm": 3.856480360031128, "learning_rate": 
4.456646169936885e-05, "loss": 0.6761, "step": 407200 }, { "epoch": 5.429870285691432, "grad_norm": 58.48727035522461, "learning_rate": 4.4558872811194e-05, "loss": 0.6586, "step": 407300 }, { "epoch": 5.431203423497887, "grad_norm": 35.67238235473633, "learning_rate": 4.455128270419622e-05, "loss": 0.6436, "step": 407400 }, { "epoch": 5.432536561304342, "grad_norm": 5.9616379737854, "learning_rate": 4.454369137901092e-05, "loss": 0.6285, "step": 407500 }, { "epoch": 5.433869699110797, "grad_norm": 9.789863586425781, "learning_rate": 4.453609883627362e-05, "loss": 0.7426, "step": 407600 }, { "epoch": 5.435202836917252, "grad_norm": 11.56650161743164, "learning_rate": 4.452850507661996e-05, "loss": 0.6821, "step": 407700 }, { "epoch": 5.436535974723707, "grad_norm": 11.469454765319824, "learning_rate": 4.452098605646348e-05, "loss": 0.7058, "step": 407800 }, { "epoch": 5.437869112530162, "grad_norm": 7.823631286621094, "learning_rate": 4.451338987703764e-05, "loss": 0.6348, "step": 407900 }, { "epoch": 5.439202250336617, "grad_norm": 1.3099480867385864, "learning_rate": 4.4505792482596556e-05, "loss": 0.6745, "step": 408000 }, { "epoch": 5.440535388143072, "grad_norm": 3.2304153442382812, "learning_rate": 4.4498193873776245e-05, "loss": 0.6253, "step": 408100 }, { "epoch": 5.441868525949527, "grad_norm": 26.882200241088867, "learning_rate": 4.449059405121285e-05, "loss": 0.6233, "step": 408200 }, { "epoch": 5.443201663755983, "grad_norm": 9.468321800231934, "learning_rate": 4.44829930155426e-05, "loss": 0.6939, "step": 408300 }, { "epoch": 5.444534801562438, "grad_norm": 7.079102993011475, "learning_rate": 4.447546679588287e-05, "loss": 0.7183, "step": 408400 }, { "epoch": 5.445867939368893, "grad_norm": 8.519906997680664, "learning_rate": 4.44678633480232e-05, "loss": 0.6775, "step": 408500 }, { "epoch": 5.447201077175348, "grad_norm": 66.43767547607422, "learning_rate": 4.446025868895962e-05, "loss": 0.6462, "step": 408600 }, { "epoch": 5.448534214981803, 
"grad_norm": 12.10456657409668, "learning_rate": 4.445265281932877e-05, "loss": 0.6137, "step": 408700 }, { "epoch": 5.4498673527882575, "grad_norm": 9.586686134338379, "learning_rate": 4.444504573976737e-05, "loss": 0.6498, "step": 408800 }, { "epoch": 5.4512004905947125, "grad_norm": 11.725483894348145, "learning_rate": 4.443743745091228e-05, "loss": 0.7048, "step": 408900 }, { "epoch": 5.452533628401168, "grad_norm": 2.5212960243225098, "learning_rate": 4.4429827953400444e-05, "loss": 0.6547, "step": 409000 }, { "epoch": 5.453866766207623, "grad_norm": 13.945954322814941, "learning_rate": 4.442221724786891e-05, "loss": 0.7274, "step": 409100 }, { "epoch": 5.455199904014078, "grad_norm": 305.59942626953125, "learning_rate": 4.441460533495479e-05, "loss": 0.6932, "step": 409200 }, { "epoch": 5.456533041820533, "grad_norm": 18.676836013793945, "learning_rate": 4.440699221529535e-05, "loss": 0.6552, "step": 409300 }, { "epoch": 5.457866179626988, "grad_norm": 325.3246765136719, "learning_rate": 4.439937788952796e-05, "loss": 0.7276, "step": 409400 }, { "epoch": 5.459199317433443, "grad_norm": 10.295157432556152, "learning_rate": 4.439176235829001e-05, "loss": 0.596, "step": 409500 }, { "epoch": 5.460532455239898, "grad_norm": 30.979759216308594, "learning_rate": 4.4384145622219104e-05, "loss": 0.6196, "step": 409600 }, { "epoch": 5.461865593046353, "grad_norm": 52.773006439208984, "learning_rate": 4.437652768195287e-05, "loss": 0.7173, "step": 409700 }, { "epoch": 5.463198730852808, "grad_norm": 107.91571807861328, "learning_rate": 4.4368908538129054e-05, "loss": 0.6375, "step": 409800 }, { "epoch": 5.464531868659264, "grad_norm": 23.086641311645508, "learning_rate": 4.436128819138551e-05, "loss": 0.639, "step": 409900 }, { "epoch": 5.465865006465719, "grad_norm": 9.32318115234375, "learning_rate": 4.435366664236018e-05, "loss": 0.7421, "step": 410000 }, { "epoch": 5.4671981442721735, "grad_norm": 8.018279075622559, "learning_rate": 4.434604389169114e-05, "loss": 
0.6805, "step": 410100 }, { "epoch": 5.4685312820786285, "grad_norm": 5.691890716552734, "learning_rate": 4.433841994001651e-05, "loss": 0.7584, "step": 410200 }, { "epoch": 5.469864419885083, "grad_norm": 2.6475841999053955, "learning_rate": 4.4330794787974574e-05, "loss": 0.6081, "step": 410300 }, { "epoch": 5.471197557691538, "grad_norm": 12.921601295471191, "learning_rate": 4.4323168436203674e-05, "loss": 0.5786, "step": 410400 }, { "epoch": 5.472530695497993, "grad_norm": 6.770762920379639, "learning_rate": 4.4315540885342264e-05, "loss": 0.5758, "step": 410500 }, { "epoch": 5.473863833304449, "grad_norm": 4.356102466583252, "learning_rate": 4.430791213602889e-05, "loss": 0.6633, "step": 410600 }, { "epoch": 5.475196971110904, "grad_norm": 8.124466896057129, "learning_rate": 4.4300282188902216e-05, "loss": 0.6067, "step": 410700 }, { "epoch": 5.476530108917359, "grad_norm": 2.906402111053467, "learning_rate": 4.4292651044601004e-05, "loss": 0.7162, "step": 410800 }, { "epoch": 5.477863246723814, "grad_norm": 4.314637184143066, "learning_rate": 4.428501870376408e-05, "loss": 0.664, "step": 410900 }, { "epoch": 5.479196384530269, "grad_norm": 7.596796035766602, "learning_rate": 4.427738516703044e-05, "loss": 0.6975, "step": 411000 }, { "epoch": 5.480529522336724, "grad_norm": 72.24333190917969, "learning_rate": 4.426975043503912e-05, "loss": 0.6579, "step": 411100 }, { "epoch": 5.481862660143179, "grad_norm": 1.2582861185073853, "learning_rate": 4.426211450842927e-05, "loss": 0.6397, "step": 411200 }, { "epoch": 5.483195797949634, "grad_norm": 13.958908081054688, "learning_rate": 4.425447738784014e-05, "loss": 0.7465, "step": 411300 }, { "epoch": 5.484528935756089, "grad_norm": 17.9923095703125, "learning_rate": 4.424683907391112e-05, "loss": 0.6891, "step": 411400 }, { "epoch": 5.4858620735625445, "grad_norm": 1.9312050342559814, "learning_rate": 4.423919956728163e-05, "loss": 0.6948, "step": 411500 }, { "epoch": 5.4871952113689995, "grad_norm": 
8.822339057922363, "learning_rate": 4.423155886859124e-05, "loss": 0.7184, "step": 411600 }, { "epoch": 5.488528349175454, "grad_norm": 7.917006015777588, "learning_rate": 4.42239169784796e-05, "loss": 0.6937, "step": 411700 }, { "epoch": 5.489861486981909, "grad_norm": 5.7538347244262695, "learning_rate": 4.4216273897586474e-05, "loss": 0.7038, "step": 411800 }, { "epoch": 5.491194624788364, "grad_norm": 6.244414329528809, "learning_rate": 4.42086296265517e-05, "loss": 0.6626, "step": 411900 }, { "epoch": 5.492527762594819, "grad_norm": 12.879645347595215, "learning_rate": 4.4200984166015255e-05, "loss": 0.678, "step": 412000 }, { "epoch": 5.493860900401274, "grad_norm": 32.10919952392578, "learning_rate": 4.4193337516617175e-05, "loss": 0.7684, "step": 412100 }, { "epoch": 5.49519403820773, "grad_norm": 6.319097995758057, "learning_rate": 4.4185689678997626e-05, "loss": 0.6766, "step": 412200 }, { "epoch": 5.496527176014185, "grad_norm": 48.49184036254883, "learning_rate": 4.4178040653796847e-05, "loss": 0.7964, "step": 412300 }, { "epoch": 5.49786031382064, "grad_norm": 14.318880081176758, "learning_rate": 4.417039044165521e-05, "loss": 0.6612, "step": 412400 }, { "epoch": 5.499193451627095, "grad_norm": 1.6690994501113892, "learning_rate": 4.416273904321313e-05, "loss": 0.6397, "step": 412500 }, { "epoch": 5.50052658943355, "grad_norm": 18.70594596862793, "learning_rate": 4.41550864591112e-05, "loss": 0.6499, "step": 412600 }, { "epoch": 5.501859727240005, "grad_norm": 7.334005832672119, "learning_rate": 4.414743268999005e-05, "loss": 0.6657, "step": 412700 }, { "epoch": 5.50319286504646, "grad_norm": 3.9514925479888916, "learning_rate": 4.413977773649043e-05, "loss": 0.6114, "step": 412800 }, { "epoch": 5.504526002852915, "grad_norm": 6.986052513122559, "learning_rate": 4.413212159925319e-05, "loss": 0.7338, "step": 412900 }, { "epoch": 5.5058591406593695, "grad_norm": 3.7732105255126953, "learning_rate": 4.412446427891928e-05, "loss": 0.652, "step": 413000 }, 
{ "epoch": 5.507192278465825, "grad_norm": 3.885742664337158, "learning_rate": 4.411680577612975e-05, "loss": 0.6632, "step": 413100 }, { "epoch": 5.50852541627228, "grad_norm": 19.451154708862305, "learning_rate": 4.410914609152573e-05, "loss": 0.6724, "step": 413200 }, { "epoch": 5.509858554078735, "grad_norm": 4.355400562286377, "learning_rate": 4.4101485225748473e-05, "loss": 0.6714, "step": 413300 }, { "epoch": 5.51119169188519, "grad_norm": 34.14997100830078, "learning_rate": 4.409382317943933e-05, "loss": 0.6241, "step": 413400 }, { "epoch": 5.512524829691645, "grad_norm": 6.7019805908203125, "learning_rate": 4.4086159953239745e-05, "loss": 0.6758, "step": 413500 }, { "epoch": 5.5138579674981, "grad_norm": 10.110997200012207, "learning_rate": 4.4078495547791245e-05, "loss": 0.6355, "step": 413600 }, { "epoch": 5.515191105304555, "grad_norm": 17.276533126831055, "learning_rate": 4.407082996373548e-05, "loss": 0.7142, "step": 413700 }, { "epoch": 5.516524243111011, "grad_norm": 3.8590517044067383, "learning_rate": 4.4063163201714175e-05, "loss": 0.6627, "step": 413800 }, { "epoch": 5.517857380917466, "grad_norm": 12.28921890258789, "learning_rate": 4.405549526236919e-05, "loss": 0.671, "step": 413900 }, { "epoch": 5.519190518723921, "grad_norm": 10.402558326721191, "learning_rate": 4.404782614634245e-05, "loss": 0.6449, "step": 414000 }, { "epoch": 5.520523656530376, "grad_norm": 3.045224666595459, "learning_rate": 4.404023256301592e-05, "loss": 0.7191, "step": 414100 }, { "epoch": 5.521856794336831, "grad_norm": 16.223459243774414, "learning_rate": 4.403256110730267e-05, "loss": 0.8171, "step": 414200 }, { "epoch": 5.523189932143286, "grad_norm": 12.944986343383789, "learning_rate": 4.402488847682762e-05, "loss": 0.6791, "step": 414300 }, { "epoch": 5.5245230699497405, "grad_norm": 16.285036087036133, "learning_rate": 4.401721467223313e-05, "loss": 0.6129, "step": 414400 }, { "epoch": 5.5258562077561955, "grad_norm": 26.968135833740234, "learning_rate": 
4.400953969416161e-05, "loss": 0.6133, "step": 414500 }, { "epoch": 5.52718934556265, "grad_norm": 24.814590454101562, "learning_rate": 4.400186354325559e-05, "loss": 0.7593, "step": 414600 }, { "epoch": 5.528522483369106, "grad_norm": 10.729105949401855, "learning_rate": 4.399418622015769e-05, "loss": 0.6843, "step": 414700 }, { "epoch": 5.529855621175561, "grad_norm": 1.8162351846694946, "learning_rate": 4.398650772551062e-05, "loss": 0.6327, "step": 414800 }, { "epoch": 5.531188758982016, "grad_norm": 6.358514308929443, "learning_rate": 4.397882805995723e-05, "loss": 0.6494, "step": 414900 }, { "epoch": 5.532521896788471, "grad_norm": 14.238226890563965, "learning_rate": 4.3971147224140414e-05, "loss": 0.6677, "step": 415000 }, { "epoch": 5.533855034594926, "grad_norm": 6.020965099334717, "learning_rate": 4.396346521870319e-05, "loss": 0.6307, "step": 415100 }, { "epoch": 5.535188172401381, "grad_norm": 11.726151466369629, "learning_rate": 4.3955782044288684e-05, "loss": 0.6686, "step": 415200 }, { "epoch": 5.536521310207836, "grad_norm": 5.943363666534424, "learning_rate": 4.394809770154009e-05, "loss": 0.6834, "step": 415300 }, { "epoch": 5.537854448014292, "grad_norm": 5.6667962074279785, "learning_rate": 4.394041219110073e-05, "loss": 0.6946, "step": 415400 }, { "epoch": 5.539187585820747, "grad_norm": 63.961578369140625, "learning_rate": 4.3932725513614e-05, "loss": 0.6827, "step": 415500 }, { "epoch": 5.540520723627202, "grad_norm": 5.102651119232178, "learning_rate": 4.392503766972342e-05, "loss": 0.7289, "step": 415600 }, { "epoch": 5.5418538614336565, "grad_norm": 5.462428569793701, "learning_rate": 4.391734866007257e-05, "loss": 0.6796, "step": 415700 }, { "epoch": 5.5431869992401115, "grad_norm": 13.651060104370117, "learning_rate": 4.3909658485305165e-05, "loss": 0.7517, "step": 415800 }, { "epoch": 5.544520137046566, "grad_norm": 5.4450297355651855, "learning_rate": 4.3901967146064995e-05, "loss": 0.7339, "step": 415900 }, { "epoch": 
5.545853274853021, "grad_norm": 8.46572494506836, "learning_rate": 4.3894274642995964e-05, "loss": 0.7332, "step": 416000 }, { "epoch": 5.547186412659476, "grad_norm": 14.802889823913574, "learning_rate": 4.388658097674205e-05, "loss": 0.5882, "step": 416100 }, { "epoch": 5.548519550465931, "grad_norm": 15.547002792358398, "learning_rate": 4.387888614794736e-05, "loss": 0.7156, "step": 416200 }, { "epoch": 5.549852688272387, "grad_norm": 37.4771614074707, "learning_rate": 4.3871190157256065e-05, "loss": 0.681, "step": 416300 }, { "epoch": 5.551185826078842, "grad_norm": 13.495577812194824, "learning_rate": 4.386349300531246e-05, "loss": 0.6811, "step": 416400 }, { "epoch": 5.552518963885297, "grad_norm": 38.21630859375, "learning_rate": 4.3855794692760924e-05, "loss": 0.6656, "step": 416500 }, { "epoch": 5.553852101691752, "grad_norm": 10.823563575744629, "learning_rate": 4.384809522024593e-05, "loss": 0.7169, "step": 416600 }, { "epoch": 5.555185239498207, "grad_norm": 9.992149353027344, "learning_rate": 4.384047160046691e-05, "loss": 0.6865, "step": 416700 }, { "epoch": 5.556518377304662, "grad_norm": 10.326072692871094, "learning_rate": 4.3832769821542385e-05, "loss": 0.6413, "step": 416800 }, { "epoch": 5.557851515111117, "grad_norm": 17.17520523071289, "learning_rate": 4.382506688458197e-05, "loss": 0.6699, "step": 416900 }, { "epoch": 5.559184652917573, "grad_norm": 7.558578968048096, "learning_rate": 4.381736279023053e-05, "loss": 0.7115, "step": 417000 }, { "epoch": 5.5605177907240275, "grad_norm": 4.429562568664551, "learning_rate": 4.380965753913305e-05, "loss": 0.6979, "step": 417100 }, { "epoch": 5.5618509285304825, "grad_norm": 6.902691841125488, "learning_rate": 4.380195113193457e-05, "loss": 0.6365, "step": 417200 }, { "epoch": 5.563184066336937, "grad_norm": 34.63185119628906, "learning_rate": 4.3794243569280245e-05, "loss": 0.6444, "step": 417300 }, { "epoch": 5.564517204143392, "grad_norm": 10.910612106323242, "learning_rate": 
4.378653485181532e-05, "loss": 0.6257, "step": 417400 }, { "epoch": 5.565850341949847, "grad_norm": 70.61425018310547, "learning_rate": 4.3778824980185165e-05, "loss": 0.719, "step": 417500 }, { "epoch": 5.567183479756302, "grad_norm": 10.188260078430176, "learning_rate": 4.3771113955035226e-05, "loss": 0.5957, "step": 417600 }, { "epoch": 5.568516617562757, "grad_norm": 2.6471476554870605, "learning_rate": 4.376340177701102e-05, "loss": 0.6508, "step": 417700 }, { "epoch": 5.569849755369212, "grad_norm": 2.300189733505249, "learning_rate": 4.375568844675821e-05, "loss": 0.6999, "step": 417800 }, { "epoch": 5.571182893175668, "grad_norm": 7.0720038414001465, "learning_rate": 4.374797396492254e-05, "loss": 0.6604, "step": 417900 }, { "epoch": 5.572516030982123, "grad_norm": 11.232669830322266, "learning_rate": 4.374025833214981e-05, "loss": 0.667, "step": 418000 }, { "epoch": 5.573849168788578, "grad_norm": 7.996166706085205, "learning_rate": 4.373254154908597e-05, "loss": 0.763, "step": 418100 }, { "epoch": 5.575182306595033, "grad_norm": 17.639564514160156, "learning_rate": 4.372482361637704e-05, "loss": 0.732, "step": 418200 }, { "epoch": 5.576515444401488, "grad_norm": 10.613512992858887, "learning_rate": 4.3717104534669144e-05, "loss": 0.7249, "step": 418300 }, { "epoch": 5.577848582207943, "grad_norm": 51.044410705566406, "learning_rate": 4.370938430460849e-05, "loss": 0.6626, "step": 418400 }, { "epoch": 5.579181720014398, "grad_norm": 76.06254577636719, "learning_rate": 4.37016629268414e-05, "loss": 0.7411, "step": 418500 }, { "epoch": 5.580514857820853, "grad_norm": 13.600457191467285, "learning_rate": 4.369394040201429e-05, "loss": 0.7279, "step": 418600 }, { "epoch": 5.581847995627308, "grad_norm": 4.891232967376709, "learning_rate": 4.368621673077365e-05, "loss": 0.7303, "step": 418700 }, { "epoch": 5.583181133433763, "grad_norm": 20.757631301879883, "learning_rate": 4.367849191376608e-05, "loss": 0.657, "step": 418800 }, { "epoch": 5.584514271240218, 
"grad_norm": 98.83023071289062, "learning_rate": 4.36707659516383e-05, "loss": 0.7766, "step": 418900 }, { "epoch": 5.585847409046673, "grad_norm": 67.65653228759766, "learning_rate": 4.3663038845037076e-05, "loss": 0.7221, "step": 419000 }, { "epoch": 5.587180546853128, "grad_norm": 3.442941188812256, "learning_rate": 4.365531059460931e-05, "loss": 0.6577, "step": 419100 }, { "epoch": 5.588513684659583, "grad_norm": 35.14106750488281, "learning_rate": 4.3647581201001996e-05, "loss": 0.6044, "step": 419200 }, { "epoch": 5.589846822466038, "grad_norm": 42.340450286865234, "learning_rate": 4.36398506648622e-05, "loss": 0.6252, "step": 419300 }, { "epoch": 5.591179960272493, "grad_norm": 2.010157823562622, "learning_rate": 4.3632118986837105e-05, "loss": 0.7507, "step": 419400 }, { "epoch": 5.592513098078949, "grad_norm": 7.497711658477783, "learning_rate": 4.362438616757398e-05, "loss": 0.7074, "step": 419500 }, { "epoch": 5.593846235885404, "grad_norm": 51.63133239746094, "learning_rate": 4.361672955296252e-05, "loss": 0.696, "step": 419600 }, { "epoch": 5.595179373691859, "grad_norm": 14.767909049987793, "learning_rate": 4.360899446456176e-05, "loss": 0.7531, "step": 419700 }, { "epoch": 5.596512511498314, "grad_norm": 16.32986831665039, "learning_rate": 4.360125823685888e-05, "loss": 0.7799, "step": 419800 }, { "epoch": 5.597845649304769, "grad_norm": 8.152706146240234, "learning_rate": 4.359352087050153e-05, "loss": 0.6855, "step": 419900 }, { "epoch": 5.5991787871112235, "grad_norm": 15.423282623291016, "learning_rate": 4.3585782366137475e-05, "loss": 0.6989, "step": 420000 }, { "epoch": 5.6005119249176785, "grad_norm": 8.031996726989746, "learning_rate": 4.3578042724414544e-05, "loss": 0.7006, "step": 420100 }, { "epoch": 5.601845062724133, "grad_norm": 1.9699705839157104, "learning_rate": 4.357030194598066e-05, "loss": 0.6721, "step": 420200 }, { "epoch": 5.603178200530589, "grad_norm": 49.09469985961914, "learning_rate": 4.35625600314839e-05, "loss": 0.6827, 
"step": 420300 }, { "epoch": 5.604511338337044, "grad_norm": 23.257831573486328, "learning_rate": 4.3554816981572356e-05, "loss": 0.7425, "step": 420400 }, { "epoch": 5.605844476143499, "grad_norm": 192.9647674560547, "learning_rate": 4.354707279689427e-05, "loss": 0.6275, "step": 420500 }, { "epoch": 5.607177613949954, "grad_norm": 17.2260684967041, "learning_rate": 4.353932747809794e-05, "loss": 0.6863, "step": 420600 }, { "epoch": 5.608510751756409, "grad_norm": 4.285545825958252, "learning_rate": 4.35315810258318e-05, "loss": 0.7791, "step": 420700 }, { "epoch": 5.609843889562864, "grad_norm": 53.589439392089844, "learning_rate": 4.3523833440744346e-05, "loss": 0.7024, "step": 420800 }, { "epoch": 5.611177027369319, "grad_norm": 130.67405700683594, "learning_rate": 4.351608472348419e-05, "loss": 0.6717, "step": 420900 }, { "epoch": 5.612510165175774, "grad_norm": 6.692306995391846, "learning_rate": 4.3508334874700036e-05, "loss": 0.6895, "step": 421000 }, { "epoch": 5.613843302982229, "grad_norm": 9.857467651367188, "learning_rate": 4.3500583895040655e-05, "loss": 0.774, "step": 421100 }, { "epoch": 5.615176440788685, "grad_norm": 7.702019691467285, "learning_rate": 4.3492831785154964e-05, "loss": 0.8476, "step": 421200 }, { "epoch": 5.6165095785951396, "grad_norm": 11.480873107910156, "learning_rate": 4.348507854569191e-05, "loss": 0.6319, "step": 421300 }, { "epoch": 5.6178427164015945, "grad_norm": 4.873190402984619, "learning_rate": 4.347732417730061e-05, "loss": 0.7861, "step": 421400 }, { "epoch": 5.619175854208049, "grad_norm": 5.330602645874023, "learning_rate": 4.346956868063019e-05, "loss": 0.758, "step": 421500 }, { "epoch": 5.620508992014504, "grad_norm": 7.756630897521973, "learning_rate": 4.346181205632996e-05, "loss": 0.7292, "step": 421600 }, { "epoch": 5.621842129820959, "grad_norm": 33.33749771118164, "learning_rate": 4.345405430504925e-05, "loss": 0.6291, "step": 421700 }, { "epoch": 5.623175267627414, "grad_norm": 31.02823829650879, 
"learning_rate": 4.344629542743753e-05, "loss": 0.6297, "step": 421800 }, { "epoch": 5.62450840543387, "grad_norm": 2.190993547439575, "learning_rate": 4.343853542414434e-05, "loss": 0.6411, "step": 421900 }, { "epoch": 5.625841543240325, "grad_norm": 11.056086540222168, "learning_rate": 4.343077429581932e-05, "loss": 0.6319, "step": 422000 }, { "epoch": 5.62717468104678, "grad_norm": 9.554079055786133, "learning_rate": 4.342301204311222e-05, "loss": 0.6892, "step": 422100 }, { "epoch": 5.628507818853235, "grad_norm": 5.624352931976318, "learning_rate": 4.341524866667286e-05, "loss": 0.6762, "step": 422200 }, { "epoch": 5.62984095665969, "grad_norm": 6.303808212280273, "learning_rate": 4.340756181770352e-05, "loss": 0.6881, "step": 422300 }, { "epoch": 5.631174094466145, "grad_norm": 13.666826248168945, "learning_rate": 4.3399796206970623e-05, "loss": 0.7772, "step": 422400 }, { "epoch": 5.6325072322726, "grad_norm": 60.99671173095703, "learning_rate": 4.339202947444903e-05, "loss": 0.6442, "step": 422500 }, { "epoch": 5.633840370079055, "grad_norm": 4.731222629547119, "learning_rate": 4.338426162078896e-05, "loss": 0.6519, "step": 422600 }, { "epoch": 5.63517350788551, "grad_norm": 5.5145697593688965, "learning_rate": 4.3376492646640685e-05, "loss": 0.6242, "step": 422700 }, { "epoch": 5.6365066456919655, "grad_norm": 15.273177146911621, "learning_rate": 4.336872255265461e-05, "loss": 0.6691, "step": 422800 }, { "epoch": 5.63783978349842, "grad_norm": 7.13780403137207, "learning_rate": 4.336095133948123e-05, "loss": 0.6439, "step": 422900 }, { "epoch": 5.639172921304875, "grad_norm": 2.735966920852661, "learning_rate": 4.335317900777111e-05, "loss": 0.6491, "step": 423000 }, { "epoch": 5.64050605911133, "grad_norm": 23.67481231689453, "learning_rate": 4.334540555817495e-05, "loss": 0.5954, "step": 423100 }, { "epoch": 5.641839196917785, "grad_norm": 10.585471153259277, "learning_rate": 4.33376309913435e-05, "loss": 0.7002, "step": 423200 }, { "epoch": 
5.64317233472424, "grad_norm": 7.624035835266113, "learning_rate": 4.332985530792764e-05, "loss": 0.6585, "step": 423300 }, { "epoch": 5.644505472530695, "grad_norm": 6.8290839195251465, "learning_rate": 4.332207850857829e-05, "loss": 0.6881, "step": 423400 }, { "epoch": 5.645838610337151, "grad_norm": 6.438426971435547, "learning_rate": 4.3314300593946536e-05, "loss": 0.7006, "step": 423500 }, { "epoch": 5.647171748143606, "grad_norm": 11.449352264404297, "learning_rate": 4.330652156468351e-05, "loss": 0.6781, "step": 423600 }, { "epoch": 5.648504885950061, "grad_norm": 6.316861152648926, "learning_rate": 4.329874142144043e-05, "loss": 0.6084, "step": 423700 }, { "epoch": 5.649838023756516, "grad_norm": 1.6105644702911377, "learning_rate": 4.329096016486864e-05, "loss": 0.6591, "step": 423800 }, { "epoch": 5.651171161562971, "grad_norm": 11.113757133483887, "learning_rate": 4.328317779561957e-05, "loss": 0.6358, "step": 423900 }, { "epoch": 5.652504299369426, "grad_norm": 51.40912628173828, "learning_rate": 4.3275394314344734e-05, "loss": 0.6443, "step": 424000 }, { "epoch": 5.653837437175881, "grad_norm": 13.745274543762207, "learning_rate": 4.326760972169572e-05, "loss": 0.6911, "step": 424100 }, { "epoch": 5.6551705749823356, "grad_norm": 9.780818939208984, "learning_rate": 4.325982401832425e-05, "loss": 0.6388, "step": 424200 }, { "epoch": 5.6565037127887905, "grad_norm": 4.4882073402404785, "learning_rate": 4.325203720488212e-05, "loss": 0.6771, "step": 424300 }, { "epoch": 5.657836850595246, "grad_norm": 7.0416646003723145, "learning_rate": 4.3244249282021194e-05, "loss": 0.706, "step": 424400 }, { "epoch": 5.659169988401701, "grad_norm": 15.392812728881836, "learning_rate": 4.3236460250393465e-05, "loss": 0.6658, "step": 424500 }, { "epoch": 5.660503126208156, "grad_norm": 43.76149368286133, "learning_rate": 4.322867011065102e-05, "loss": 0.6638, "step": 424600 }, { "epoch": 5.661836264014611, "grad_norm": 5.118747711181641, "learning_rate": 
4.322087886344602e-05, "loss": 0.7244, "step": 424700 }, { "epoch": 5.663169401821066, "grad_norm": 4.682455062866211, "learning_rate": 4.321308650943069e-05, "loss": 0.6861, "step": 424800 }, { "epoch": 5.664502539627521, "grad_norm": 6.995842456817627, "learning_rate": 4.320529304925743e-05, "loss": 0.6813, "step": 424900 }, { "epoch": 5.665835677433976, "grad_norm": 13.321234703063965, "learning_rate": 4.319749848357866e-05, "loss": 0.6177, "step": 425000 }, { "epoch": 5.667168815240432, "grad_norm": 8.946671485900879, "learning_rate": 4.3189702813046926e-05, "loss": 0.657, "step": 425100 }, { "epoch": 5.668501953046887, "grad_norm": 5.538520336151123, "learning_rate": 4.318190603831484e-05, "loss": 0.6671, "step": 425200 }, { "epoch": 5.669835090853342, "grad_norm": 4.879834175109863, "learning_rate": 4.317410816003513e-05, "loss": 0.7236, "step": 425300 }, { "epoch": 5.671168228659797, "grad_norm": 3.5781710147857666, "learning_rate": 4.316630917886062e-05, "loss": 0.6697, "step": 425400 }, { "epoch": 5.672501366466252, "grad_norm": 75.71310424804688, "learning_rate": 4.31585090954442e-05, "loss": 0.6659, "step": 425500 }, { "epoch": 5.6738345042727065, "grad_norm": 6.2209248542785645, "learning_rate": 4.31507079104389e-05, "loss": 0.6315, "step": 425600 }, { "epoch": 5.6751676420791615, "grad_norm": 7.338298320770264, "learning_rate": 4.314290562449777e-05, "loss": 0.6585, "step": 425700 }, { "epoch": 5.676500779885616, "grad_norm": 5.965504169464111, "learning_rate": 4.313510223827402e-05, "loss": 0.6645, "step": 425800 }, { "epoch": 5.677833917692071, "grad_norm": 24.16714096069336, "learning_rate": 4.3127297752420913e-05, "loss": 0.6746, "step": 425900 }, { "epoch": 5.679167055498527, "grad_norm": 6.5098161697387695, "learning_rate": 4.3119492167591815e-05, "loss": 0.6214, "step": 426000 }, { "epoch": 5.680500193304982, "grad_norm": 4.593982696533203, "learning_rate": 4.3111685484440194e-05, "loss": 0.6719, "step": 426100 }, { "epoch": 5.681833331111437, 
"grad_norm": 7.863062381744385, "learning_rate": 4.3103877703619586e-05, "loss": 0.6846, "step": 426200 }, { "epoch": 5.683166468917892, "grad_norm": 10.252492904663086, "learning_rate": 4.309606882578366e-05, "loss": 0.7265, "step": 426300 }, { "epoch": 5.684499606724347, "grad_norm": 31.949848175048828, "learning_rate": 4.3088258851586126e-05, "loss": 0.6697, "step": 426400 }, { "epoch": 5.685832744530802, "grad_norm": 10.743928909301758, "learning_rate": 4.3080447781680815e-05, "loss": 0.6906, "step": 426500 }, { "epoch": 5.687165882337257, "grad_norm": 4.329451084136963, "learning_rate": 4.307263561672164e-05, "loss": 0.601, "step": 426600 }, { "epoch": 5.688499020143713, "grad_norm": 3.9495413303375244, "learning_rate": 4.306482235736264e-05, "loss": 0.6615, "step": 426700 }, { "epoch": 5.689832157950168, "grad_norm": 2.1944479942321777, "learning_rate": 4.3057008004257885e-05, "loss": 0.6532, "step": 426800 }, { "epoch": 5.6911652957566226, "grad_norm": 2.20268177986145, "learning_rate": 4.3049192558061585e-05, "loss": 0.5961, "step": 426900 }, { "epoch": 5.6924984335630775, "grad_norm": 7.502962589263916, "learning_rate": 4.304137601942802e-05, "loss": 0.631, "step": 427000 }, { "epoch": 5.693831571369532, "grad_norm": 31.912446975708008, "learning_rate": 4.3033558389011556e-05, "loss": 0.6219, "step": 427100 }, { "epoch": 5.695164709175987, "grad_norm": 146.8378143310547, "learning_rate": 4.3025739667466676e-05, "loss": 0.6316, "step": 427200 }, { "epoch": 5.696497846982442, "grad_norm": 11.60507869720459, "learning_rate": 4.301791985544793e-05, "loss": 0.6397, "step": 427300 }, { "epoch": 5.697830984788897, "grad_norm": 7.527518272399902, "learning_rate": 4.3010098953609965e-05, "loss": 0.7123, "step": 427400 }, { "epoch": 5.699164122595352, "grad_norm": 0.9635306596755981, "learning_rate": 4.300227696260753e-05, "loss": 0.6844, "step": 427500 }, { "epoch": 5.700497260401808, "grad_norm": 18.64783477783203, "learning_rate": 4.299445388309545e-05, "loss": 
0.6898, "step": 427600 }, { "epoch": 5.701830398208263, "grad_norm": 6.448761940002441, "learning_rate": 4.298662971572866e-05, "loss": 0.6428, "step": 427700 }, { "epoch": 5.703163536014718, "grad_norm": 13.172983169555664, "learning_rate": 4.297880446116217e-05, "loss": 0.6436, "step": 427800 }, { "epoch": 5.704496673821173, "grad_norm": 3.3579354286193848, "learning_rate": 4.297097812005107e-05, "loss": 0.6987, "step": 427900 }, { "epoch": 5.705829811627628, "grad_norm": 20.228771209716797, "learning_rate": 4.296315069305058e-05, "loss": 0.5873, "step": 428000 }, { "epoch": 5.707162949434083, "grad_norm": 7.655489444732666, "learning_rate": 4.295532218081598e-05, "loss": 0.6626, "step": 428100 }, { "epoch": 5.708496087240538, "grad_norm": 14.288227081298828, "learning_rate": 4.294749258400264e-05, "loss": 0.7753, "step": 428200 }, { "epoch": 5.7098292250469935, "grad_norm": 5.298830032348633, "learning_rate": 4.293966190326604e-05, "loss": 0.7165, "step": 428300 }, { "epoch": 5.7111623628534485, "grad_norm": 7.341174602508545, "learning_rate": 4.2931830139261734e-05, "loss": 0.5883, "step": 428400 }, { "epoch": 5.712495500659903, "grad_norm": 13.550023078918457, "learning_rate": 4.2923997292645366e-05, "loss": 0.7749, "step": 428500 }, { "epoch": 5.713828638466358, "grad_norm": 9.775358200073242, "learning_rate": 4.2916163364072684e-05, "loss": 0.6956, "step": 428600 }, { "epoch": 5.715161776272813, "grad_norm": 4.837075710296631, "learning_rate": 4.2908328354199526e-05, "loss": 0.651, "step": 428700 }, { "epoch": 5.716494914079268, "grad_norm": 3.8919849395751953, "learning_rate": 4.2900492263681816e-05, "loss": 0.6389, "step": 428800 }, { "epoch": 5.717828051885723, "grad_norm": 4.904496192932129, "learning_rate": 4.289265509317555e-05, "loss": 0.6348, "step": 428900 }, { "epoch": 5.719161189692178, "grad_norm": 2.9453723430633545, "learning_rate": 4.2884816843336845e-05, "loss": 0.6525, "step": 429000 }, { "epoch": 5.720494327498633, "grad_norm": 
7.691165924072266, "learning_rate": 4.287697751482188e-05, "loss": 0.6915, "step": 429100 }, { "epoch": 5.721827465305089, "grad_norm": 2.128624200820923, "learning_rate": 4.286913710828697e-05, "loss": 0.6834, "step": 429200 }, { "epoch": 5.723160603111544, "grad_norm": 29.221759796142578, "learning_rate": 4.286129562438845e-05, "loss": 0.6806, "step": 429300 }, { "epoch": 5.724493740917999, "grad_norm": 24.317840576171875, "learning_rate": 4.2853453063782804e-05, "loss": 0.7076, "step": 429400 }, { "epoch": 5.725826878724454, "grad_norm": 19.400407791137695, "learning_rate": 4.2845609427126596e-05, "loss": 0.6303, "step": 429500 }, { "epoch": 5.727160016530909, "grad_norm": 65.45232391357422, "learning_rate": 4.283776471507645e-05, "loss": 0.7348, "step": 429600 }, { "epoch": 5.728493154337364, "grad_norm": 10.147842407226562, "learning_rate": 4.2829918928289106e-05, "loss": 0.6588, "step": 429700 }, { "epoch": 5.7298262921438186, "grad_norm": 27.722270965576172, "learning_rate": 4.2822072067421405e-05, "loss": 0.6165, "step": 429800 }, { "epoch": 5.731159429950274, "grad_norm": 28.017690658569336, "learning_rate": 4.281422413313024e-05, "loss": 0.7277, "step": 429900 }, { "epoch": 5.732492567756729, "grad_norm": 4.029597282409668, "learning_rate": 4.280637512607264e-05, "loss": 0.6316, "step": 430000 }, { "epoch": 5.733825705563184, "grad_norm": 36.75868225097656, "learning_rate": 4.279860355300211e-05, "loss": 0.6994, "step": 430100 }, { "epoch": 5.735158843369639, "grad_norm": 3.5336461067199707, "learning_rate": 4.279075241309425e-05, "loss": 0.6887, "step": 430200 }, { "epoch": 5.736491981176094, "grad_norm": 167.57528686523438, "learning_rate": 4.27829002023849e-05, "loss": 0.6097, "step": 430300 }, { "epoch": 5.737825118982549, "grad_norm": 12.209819793701172, "learning_rate": 4.277504692153145e-05, "loss": 0.6344, "step": 430400 }, { "epoch": 5.739158256789004, "grad_norm": 3.996966600418091, "learning_rate": 4.276719257119133e-05, "loss": 0.6449, "step": 
430500 }, { "epoch": 5.740491394595459, "grad_norm": 12.835688591003418, "learning_rate": 4.27593371520221e-05, "loss": 0.6307, "step": 430600 }, { "epoch": 5.741824532401914, "grad_norm": 3.2151477336883545, "learning_rate": 4.2751480664681395e-05, "loss": 0.6665, "step": 430700 }, { "epoch": 5.74315767020837, "grad_norm": 3.7861196994781494, "learning_rate": 4.2743623109826913e-05, "loss": 0.6428, "step": 430800 }, { "epoch": 5.744490808014825, "grad_norm": 3.743710994720459, "learning_rate": 4.2735764488116485e-05, "loss": 0.6012, "step": 430900 }, { "epoch": 5.74582394582128, "grad_norm": 2.785374879837036, "learning_rate": 4.272790480020799e-05, "loss": 0.621, "step": 431000 }, { "epoch": 5.747157083627735, "grad_norm": 9.325956344604492, "learning_rate": 4.2720044046759435e-05, "loss": 0.6881, "step": 431100 }, { "epoch": 5.7484902214341895, "grad_norm": 15.9859037399292, "learning_rate": 4.271218222842888e-05, "loss": 0.6684, "step": 431200 }, { "epoch": 5.7498233592406445, "grad_norm": 19.793010711669922, "learning_rate": 4.27043193458745e-05, "loss": 0.63, "step": 431300 }, { "epoch": 5.751156497047099, "grad_norm": 32.926734924316406, "learning_rate": 4.2696455399754574e-05, "loss": 0.6276, "step": 431400 }, { "epoch": 5.752489634853555, "grad_norm": 6.736769676208496, "learning_rate": 4.2688590390727404e-05, "loss": 0.6176, "step": 431500 }, { "epoch": 5.75382277266001, "grad_norm": 3.617908477783203, "learning_rate": 4.268072431945145e-05, "loss": 0.6835, "step": 431600 }, { "epoch": 5.755155910466465, "grad_norm": 4.804955959320068, "learning_rate": 4.267285718658522e-05, "loss": 0.6295, "step": 431700 }, { "epoch": 5.75648904827292, "grad_norm": 3.4297802448272705, "learning_rate": 4.266498899278735e-05, "loss": 0.6505, "step": 431800 }, { "epoch": 5.757822186079375, "grad_norm": 3.6572773456573486, "learning_rate": 4.2657119738716516e-05, "loss": 0.6504, "step": 431900 }, { "epoch": 5.75915532388583, "grad_norm": 13.93950080871582, "learning_rate": 
4.2649328133411296e-05, "loss": 0.6678, "step": 432000 }, { "epoch": 5.760488461692285, "grad_norm": 24.760507583618164, "learning_rate": 4.2641456771357304e-05, "loss": 0.6736, "step": 432100 }, { "epoch": 5.76182159949874, "grad_norm": 2.0764455795288086, "learning_rate": 4.26335843510004e-05, "loss": 0.6807, "step": 432200 }, { "epoch": 5.763154737305195, "grad_norm": 5.20115852355957, "learning_rate": 4.262571087299963e-05, "loss": 0.6147, "step": 432300 }, { "epoch": 5.764487875111651, "grad_norm": 7.1416335105896, "learning_rate": 4.261783633801416e-05, "loss": 0.6158, "step": 432400 }, { "epoch": 5.765821012918106, "grad_norm": 4.898864269256592, "learning_rate": 4.26099607467032e-05, "loss": 0.7192, "step": 432500 }, { "epoch": 5.7671541507245605, "grad_norm": 3.15116810798645, "learning_rate": 4.2602084099726074e-05, "loss": 0.709, "step": 432600 }, { "epoch": 5.7684872885310154, "grad_norm": 11.445701599121094, "learning_rate": 4.259420639774219e-05, "loss": 0.6356, "step": 432700 }, { "epoch": 5.76982042633747, "grad_norm": 21.207609176635742, "learning_rate": 4.2586327641411054e-05, "loss": 0.6341, "step": 432800 }, { "epoch": 5.771153564143925, "grad_norm": 4.832655906677246, "learning_rate": 4.257844783139224e-05, "loss": 0.6914, "step": 432900 }, { "epoch": 5.77248670195038, "grad_norm": 13.064532279968262, "learning_rate": 4.257056696834542e-05, "loss": 0.5659, "step": 433000 }, { "epoch": 5.773819839756836, "grad_norm": 6.103123188018799, "learning_rate": 4.256268505293036e-05, "loss": 0.6362, "step": 433100 }, { "epoch": 5.775152977563291, "grad_norm": 3.5692999362945557, "learning_rate": 4.255480208580692e-05, "loss": 0.7301, "step": 433200 }, { "epoch": 5.776486115369746, "grad_norm": 32.66421890258789, "learning_rate": 4.254691806763501e-05, "loss": 0.7389, "step": 433300 }, { "epoch": 5.777819253176201, "grad_norm": 22.563106536865234, "learning_rate": 4.253903299907468e-05, "loss": 0.6462, "step": 433400 }, { "epoch": 5.779152390982656, 
"grad_norm": 8.388107299804688, "learning_rate": 4.2531146880786034e-05, "loss": 0.677, "step": 433500 }, { "epoch": 5.780485528789111, "grad_norm": 3.0752434730529785, "learning_rate": 4.252325971342927e-05, "loss": 0.7102, "step": 433600 }, { "epoch": 5.781818666595566, "grad_norm": 2.3621644973754883, "learning_rate": 4.251537149766467e-05, "loss": 0.6094, "step": 433700 }, { "epoch": 5.783151804402021, "grad_norm": 10.542299270629883, "learning_rate": 4.250748223415263e-05, "loss": 0.68, "step": 433800 }, { "epoch": 5.784484942208476, "grad_norm": 13.410025596618652, "learning_rate": 4.2499591923553617e-05, "loss": 0.6903, "step": 433900 }, { "epoch": 5.7858180800149315, "grad_norm": 12.583968162536621, "learning_rate": 4.249170056652815e-05, "loss": 0.6098, "step": 434000 }, { "epoch": 5.787151217821386, "grad_norm": 4.826545238494873, "learning_rate": 4.24838081637369e-05, "loss": 0.6556, "step": 434100 }, { "epoch": 5.788484355627841, "grad_norm": 23.027456283569336, "learning_rate": 4.247591471584058e-05, "loss": 0.582, "step": 434200 }, { "epoch": 5.789817493434296, "grad_norm": 19.795759201049805, "learning_rate": 4.2468020223500006e-05, "loss": 0.6809, "step": 434300 }, { "epoch": 5.791150631240751, "grad_norm": 3.1282238960266113, "learning_rate": 4.2460124687376076e-05, "loss": 0.6697, "step": 434400 }, { "epoch": 5.792483769047206, "grad_norm": 17.862478256225586, "learning_rate": 4.245230707908354e-05, "loss": 0.6478, "step": 434500 }, { "epoch": 5.793816906853661, "grad_norm": 2.336348533630371, "learning_rate": 4.244440946779731e-05, "loss": 0.6703, "step": 434600 }, { "epoch": 5.795150044660117, "grad_norm": 10.32851791381836, "learning_rate": 4.243651081470435e-05, "loss": 0.6017, "step": 434700 }, { "epoch": 5.796483182466572, "grad_norm": 14.73315715789795, "learning_rate": 4.242861112046591e-05, "loss": 0.5935, "step": 434800 }, { "epoch": 5.797816320273027, "grad_norm": 19.100849151611328, "learning_rate": 4.2420710385743316e-05, "loss": 
0.5731, "step": 434900 }, { "epoch": 5.799149458079482, "grad_norm": 43.60984802246094, "learning_rate": 4.2412808611198006e-05, "loss": 0.646, "step": 435000 }, { "epoch": 5.800482595885937, "grad_norm": 46.4980583190918, "learning_rate": 4.24049057974915e-05, "loss": 0.6217, "step": 435100 }, { "epoch": 5.801815733692392, "grad_norm": 3.5465285778045654, "learning_rate": 4.2397001945285364e-05, "loss": 0.688, "step": 435200 }, { "epoch": 5.803148871498847, "grad_norm": 11.086283683776855, "learning_rate": 4.2389097055241316e-05, "loss": 0.6067, "step": 435300 }, { "epoch": 5.804482009305302, "grad_norm": 2.042062520980835, "learning_rate": 4.238127019242516e-05, "loss": 0.6728, "step": 435400 }, { "epoch": 5.8058151471117565, "grad_norm": 10.197444915771484, "learning_rate": 4.237336323905254e-05, "loss": 0.6268, "step": 435500 }, { "epoch": 5.807148284918212, "grad_norm": 3.7472751140594482, "learning_rate": 4.2365455249820946e-05, "loss": 0.593, "step": 435600 }, { "epoch": 5.808481422724667, "grad_norm": 6.520091533660889, "learning_rate": 4.235754622539242e-05, "loss": 0.5683, "step": 435700 }, { "epoch": 5.809814560531122, "grad_norm": 8.908949851989746, "learning_rate": 4.234963616642907e-05, "loss": 0.5825, "step": 435800 }, { "epoch": 5.811147698337577, "grad_norm": 3.5790622234344482, "learning_rate": 4.234172507359312e-05, "loss": 0.6022, "step": 435900 }, { "epoch": 5.812480836144032, "grad_norm": 3.9826812744140625, "learning_rate": 4.2333812947546834e-05, "loss": 0.6282, "step": 436000 }, { "epoch": 5.813813973950487, "grad_norm": 2.479769468307495, "learning_rate": 4.232589978895262e-05, "loss": 0.6793, "step": 436100 }, { "epoch": 5.815147111756942, "grad_norm": 5.488991737365723, "learning_rate": 4.231798559847294e-05, "loss": 0.6594, "step": 436200 }, { "epoch": 5.816480249563398, "grad_norm": 5.350215435028076, "learning_rate": 4.231007037677032e-05, "loss": 0.5693, "step": 436300 }, { "epoch": 5.817813387369853, "grad_norm": 0.5469276905059814, 
"learning_rate": 4.230215412450742e-05, "loss": 0.64, "step": 436400 }, { "epoch": 5.819146525176308, "grad_norm": 10.203537940979004, "learning_rate": 4.2294236842346955e-05, "loss": 0.7055, "step": 436500 }, { "epoch": 5.820479662982763, "grad_norm": 3.5535881519317627, "learning_rate": 4.2286318530951745e-05, "loss": 0.6276, "step": 436600 }, { "epoch": 5.821812800789218, "grad_norm": 120.23602294921875, "learning_rate": 4.2278399190984654e-05, "loss": 0.6756, "step": 436700 }, { "epoch": 5.8231459385956725, "grad_norm": 5.2718825340271, "learning_rate": 4.227047882310871e-05, "loss": 0.6474, "step": 436800 }, { "epoch": 5.8244790764021275, "grad_norm": 5.036118030548096, "learning_rate": 4.226255742798695e-05, "loss": 0.7047, "step": 436900 }, { "epoch": 5.825812214208582, "grad_norm": 8.679299354553223, "learning_rate": 4.225463500628254e-05, "loss": 0.6238, "step": 437000 }, { "epoch": 5.827145352015037, "grad_norm": 4.50606107711792, "learning_rate": 4.224671155865871e-05, "loss": 0.6725, "step": 437100 }, { "epoch": 5.828478489821493, "grad_norm": 11.045022010803223, "learning_rate": 4.223878708577881e-05, "loss": 0.7106, "step": 437200 }, { "epoch": 5.829811627627948, "grad_norm": 4.972891807556152, "learning_rate": 4.223086158830622e-05, "loss": 0.6416, "step": 437300 }, { "epoch": 5.831144765434403, "grad_norm": 3.699489116668701, "learning_rate": 4.222293506690445e-05, "loss": 0.696, "step": 437400 }, { "epoch": 5.832477903240858, "grad_norm": 3.839280605316162, "learning_rate": 4.2215007522237094e-05, "loss": 0.5831, "step": 437500 }, { "epoch": 5.833811041047313, "grad_norm": 3.8336424827575684, "learning_rate": 4.220707895496781e-05, "loss": 0.6535, "step": 437600 }, { "epoch": 5.835144178853768, "grad_norm": 6.657586574554443, "learning_rate": 4.219914936576036e-05, "loss": 0.6033, "step": 437700 }, { "epoch": 5.836477316660223, "grad_norm": 8.09968376159668, "learning_rate": 4.219121875527858e-05, "loss": 0.5907, "step": 437800 }, { "epoch": 
5.837810454466679, "grad_norm": 16.950136184692383, "learning_rate": 4.218328712418639e-05, "loss": 0.5721, "step": 437900 }, { "epoch": 5.839143592273134, "grad_norm": 5.8412184715271, "learning_rate": 4.217535447314781e-05, "loss": 0.6167, "step": 438000 }, { "epoch": 5.840476730079589, "grad_norm": 3.7598869800567627, "learning_rate": 4.216750014457341e-05, "loss": 0.587, "step": 438100 }, { "epoch": 5.8418098678860435, "grad_norm": 2.6844887733459473, "learning_rate": 4.215956546581731e-05, "loss": 0.6348, "step": 438200 }, { "epoch": 5.8431430056924984, "grad_norm": 10.893294334411621, "learning_rate": 4.215162976910073e-05, "loss": 0.7148, "step": 438300 }, { "epoch": 5.844476143498953, "grad_norm": 2.3723292350769043, "learning_rate": 4.2143693055088e-05, "loss": 0.5867, "step": 438400 }, { "epoch": 5.845809281305408, "grad_norm": 5.815936088562012, "learning_rate": 4.2135755324443584e-05, "loss": 0.6633, "step": 438500 }, { "epoch": 5.847142419111863, "grad_norm": 3.325727701187134, "learning_rate": 4.2127816577831955e-05, "loss": 0.5549, "step": 438600 }, { "epoch": 5.848475556918318, "grad_norm": 6.110037803649902, "learning_rate": 4.2119876815917786e-05, "loss": 0.6299, "step": 438700 }, { "epoch": 5.849808694724774, "grad_norm": 9.84129810333252, "learning_rate": 4.211193603936572e-05, "loss": 0.5923, "step": 438800 }, { "epoch": 5.851141832531229, "grad_norm": 13.741397857666016, "learning_rate": 4.2103994248840545e-05, "loss": 0.6478, "step": 438900 }, { "epoch": 5.852474970337684, "grad_norm": 5.0437541007995605, "learning_rate": 4.209605144500713e-05, "loss": 0.7128, "step": 439000 }, { "epoch": 5.853808108144139, "grad_norm": 3.6007285118103027, "learning_rate": 4.2088107628530424e-05, "loss": 0.68, "step": 439100 }, { "epoch": 5.855141245950594, "grad_norm": 3.739156723022461, "learning_rate": 4.2080162800075447e-05, "loss": 0.7129, "step": 439200 }, { "epoch": 5.856474383757049, "grad_norm": 11.589164733886719, "learning_rate": 
4.207221696030732e-05, "loss": 0.7143, "step": 439300 }, { "epoch": 5.857807521563504, "grad_norm": 6.602013111114502, "learning_rate": 4.2064270109891243e-05, "loss": 0.6426, "step": 439400 }, { "epoch": 5.8591406593699595, "grad_norm": 78.14620208740234, "learning_rate": 4.2056322249492504e-05, "loss": 0.6202, "step": 439500 }, { "epoch": 5.8604737971764145, "grad_norm": 10.149581909179688, "learning_rate": 4.204837337977647e-05, "loss": 0.6302, "step": 439600 }, { "epoch": 5.861806934982869, "grad_norm": 8.088441848754883, "learning_rate": 4.2040423501408595e-05, "loss": 0.6598, "step": 439700 }, { "epoch": 5.863140072789324, "grad_norm": 34.94643020629883, "learning_rate": 4.2032472615054426e-05, "loss": 0.6877, "step": 439800 }, { "epoch": 5.864473210595779, "grad_norm": 4.139352321624756, "learning_rate": 4.202452072137958e-05, "loss": 0.6898, "step": 439900 }, { "epoch": 5.865806348402234, "grad_norm": 4.314764976501465, "learning_rate": 4.201656782104975e-05, "loss": 0.6918, "step": 440000 }, { "epoch": 5.867139486208689, "grad_norm": 9.628056526184082, "learning_rate": 4.200861391473075e-05, "loss": 0.609, "step": 440100 }, { "epoch": 5.868472624015144, "grad_norm": 8.81982421875, "learning_rate": 4.200065900308846e-05, "loss": 0.6134, "step": 440200 }, { "epoch": 5.869805761821599, "grad_norm": 14.178510665893555, "learning_rate": 4.199270308678881e-05, "loss": 0.6261, "step": 440300 }, { "epoch": 5.871138899628055, "grad_norm": 1.7849794626235962, "learning_rate": 4.198474616649786e-05, "loss": 0.6238, "step": 440400 }, { "epoch": 5.87247203743451, "grad_norm": 7.889785289764404, "learning_rate": 4.197678824288175e-05, "loss": 0.6676, "step": 440500 }, { "epoch": 5.873805175240965, "grad_norm": 10.12877368927002, "learning_rate": 4.196882931660668e-05, "loss": 0.6212, "step": 440600 }, { "epoch": 5.87513831304742, "grad_norm": 12.31745719909668, "learning_rate": 4.1960869388338947e-05, "loss": 0.6328, "step": 440700 }, { "epoch": 5.876471450853875, 
"grad_norm": 45.700531005859375, "learning_rate": 4.195290845874494e-05, "loss": 0.7074, "step": 440800 }, { "epoch": 5.87780458866033, "grad_norm": 48.234519958496094, "learning_rate": 4.194494652849111e-05, "loss": 0.6819, "step": 440900 }, { "epoch": 5.879137726466785, "grad_norm": 7.0586347579956055, "learning_rate": 4.193698359824401e-05, "loss": 0.6665, "step": 441000 }, { "epoch": 5.88047086427324, "grad_norm": 6.5600666999816895, "learning_rate": 4.1929019668670275e-05, "loss": 0.5773, "step": 441100 }, { "epoch": 5.881804002079695, "grad_norm": 5.913551330566406, "learning_rate": 4.192105474043662e-05, "loss": 0.6339, "step": 441200 }, { "epoch": 5.88313713988615, "grad_norm": 3.2300429344177246, "learning_rate": 4.191308881420982e-05, "loss": 0.5862, "step": 441300 }, { "epoch": 5.884470277692605, "grad_norm": 18.505617141723633, "learning_rate": 4.1905121890656795e-05, "loss": 0.5879, "step": 441400 }, { "epoch": 5.88580341549906, "grad_norm": 6.218980312347412, "learning_rate": 4.189715397044449e-05, "loss": 0.6203, "step": 441500 }, { "epoch": 5.887136553305515, "grad_norm": 10.94021987915039, "learning_rate": 4.188918505423996e-05, "loss": 0.6585, "step": 441600 }, { "epoch": 5.88846969111197, "grad_norm": 3.4819507598876953, "learning_rate": 4.1881215142710336e-05, "loss": 0.6406, "step": 441700 }, { "epoch": 5.889802828918425, "grad_norm": 3.1938529014587402, "learning_rate": 4.187324423652283e-05, "loss": 0.6915, "step": 441800 }, { "epoch": 5.89113596672488, "grad_norm": 44.17326354980469, "learning_rate": 4.1865272336344746e-05, "loss": 0.6575, "step": 441900 }, { "epoch": 5.892469104531336, "grad_norm": 2.548140048980713, "learning_rate": 4.185729944284346e-05, "loss": 0.6495, "step": 442000 }, { "epoch": 5.893802242337791, "grad_norm": 20.97295379638672, "learning_rate": 4.184932555668645e-05, "loss": 0.6702, "step": 442100 }, { "epoch": 5.895135380144246, "grad_norm": 3.3644466400146484, "learning_rate": 4.1841350678541254e-05, "loss": 0.6223, 
"step": 442200 }, { "epoch": 5.896468517950701, "grad_norm": 21.442096710205078, "learning_rate": 4.1833374809075504e-05, "loss": 0.6486, "step": 442300 }, { "epoch": 5.8978016557571555, "grad_norm": 1.6843045949935913, "learning_rate": 4.1825397948956916e-05, "loss": 0.6984, "step": 442400 }, { "epoch": 5.8991347935636105, "grad_norm": 5.764869689941406, "learning_rate": 4.181742009885329e-05, "loss": 0.6701, "step": 442500 }, { "epoch": 5.900467931370065, "grad_norm": 5.192907333374023, "learning_rate": 4.1809441259432516e-05, "loss": 0.7254, "step": 442600 }, { "epoch": 5.901801069176521, "grad_norm": 4.541945934295654, "learning_rate": 4.180146143136252e-05, "loss": 0.6353, "step": 442700 }, { "epoch": 5.903134206982976, "grad_norm": 9.023366928100586, "learning_rate": 4.1793480615311395e-05, "loss": 0.6312, "step": 442800 }, { "epoch": 5.904467344789431, "grad_norm": 3.6089377403259277, "learning_rate": 4.178549881194724e-05, "loss": 0.615, "step": 442900 }, { "epoch": 5.905800482595886, "grad_norm": 9.880529403686523, "learning_rate": 4.1777516021938284e-05, "loss": 0.5434, "step": 443000 }, { "epoch": 5.907133620402341, "grad_norm": 11.673579216003418, "learning_rate": 4.1769612088591054e-05, "loss": 0.6163, "step": 443100 }, { "epoch": 5.908466758208796, "grad_norm": 11.04975414276123, "learning_rate": 4.1761627337147205e-05, "loss": 0.6862, "step": 443200 }, { "epoch": 5.909799896015251, "grad_norm": 3.2332088947296143, "learning_rate": 4.1753641601057004e-05, "loss": 0.6162, "step": 443300 }, { "epoch": 5.911133033821706, "grad_norm": 29.528974533081055, "learning_rate": 4.174565488098897e-05, "loss": 0.6074, "step": 443400 }, { "epoch": 5.912466171628161, "grad_norm": 18.118911743164062, "learning_rate": 4.173766717761174e-05, "loss": 0.701, "step": 443500 }, { "epoch": 5.913799309434617, "grad_norm": 6.715007305145264, "learning_rate": 4.172967849159402e-05, "loss": 0.6964, "step": 443600 }, { "epoch": 5.915132447241072, "grad_norm": 1.7359768152236938, 
"learning_rate": 4.1721688823604576e-05, "loss": 0.6519, "step": 443700 }, { "epoch": 5.9164655850475265, "grad_norm": 6.094398021697998, "learning_rate": 4.171369817431231e-05, "loss": 0.64, "step": 443800 }, { "epoch": 5.9177987228539815, "grad_norm": 21.838485717773438, "learning_rate": 4.170570654438614e-05, "loss": 0.6434, "step": 443900 }, { "epoch": 5.919131860660436, "grad_norm": 8.843774795532227, "learning_rate": 4.1697713934495124e-05, "loss": 0.7397, "step": 444000 }, { "epoch": 5.920464998466891, "grad_norm": 5.193586349487305, "learning_rate": 4.168972034530837e-05, "loss": 0.6213, "step": 444100 }, { "epoch": 5.921798136273346, "grad_norm": 125.79238891601562, "learning_rate": 4.168172577749509e-05, "loss": 0.7303, "step": 444200 }, { "epoch": 5.923131274079802, "grad_norm": 7.809467315673828, "learning_rate": 4.167373023172455e-05, "loss": 0.7063, "step": 444300 }, { "epoch": 5.924464411886257, "grad_norm": 4.541965007781982, "learning_rate": 4.1665733708666104e-05, "loss": 0.6338, "step": 444400 }, { "epoch": 5.925797549692712, "grad_norm": 9.99629020690918, "learning_rate": 4.1657736208989216e-05, "loss": 0.7479, "step": 444500 }, { "epoch": 5.927130687499167, "grad_norm": 8.95327377319336, "learning_rate": 4.1649737733363386e-05, "loss": 0.6848, "step": 444600 }, { "epoch": 5.928463825305622, "grad_norm": 27.323503494262695, "learning_rate": 4.164173828245825e-05, "loss": 0.5321, "step": 444700 }, { "epoch": 5.929796963112077, "grad_norm": 3.351954460144043, "learning_rate": 4.163373785694348e-05, "loss": 0.6325, "step": 444800 }, { "epoch": 5.931130100918532, "grad_norm": 15.084319114685059, "learning_rate": 4.162573645748884e-05, "loss": 0.6461, "step": 444900 }, { "epoch": 5.932463238724987, "grad_norm": 3.8359580039978027, "learning_rate": 4.16177340847642e-05, "loss": 0.6986, "step": 445000 }, { "epoch": 5.933796376531442, "grad_norm": 88.71365356445312, "learning_rate": 4.1609730739439486e-05, "loss": 0.6582, "step": 445100 }, { "epoch": 
5.9351295143378975, "grad_norm": 4.85537052154541, "learning_rate": 4.160172642218471e-05, "loss": 0.6055, "step": 445200 }, { "epoch": 5.936462652144352, "grad_norm": 10.230713844299316, "learning_rate": 4.159372113366996e-05, "loss": 0.648, "step": 445300 }, { "epoch": 5.937795789950807, "grad_norm": 3.2732346057891846, "learning_rate": 4.158571487456544e-05, "loss": 0.6788, "step": 445400 }, { "epoch": 5.939128927757262, "grad_norm": 39.60625457763672, "learning_rate": 4.157770764554138e-05, "loss": 0.6576, "step": 445500 }, { "epoch": 5.940462065563717, "grad_norm": 5.866606712341309, "learning_rate": 4.156969944726813e-05, "loss": 0.6851, "step": 445600 }, { "epoch": 5.941795203370172, "grad_norm": 4.0486578941345215, "learning_rate": 4.1561690280416124e-05, "loss": 0.5921, "step": 445700 }, { "epoch": 5.943128341176627, "grad_norm": 7.522919178009033, "learning_rate": 4.155368014565585e-05, "loss": 0.6092, "step": 445800 }, { "epoch": 5.944461478983083, "grad_norm": 11.60612964630127, "learning_rate": 4.154566904365789e-05, "loss": 0.6246, "step": 445900 }, { "epoch": 5.945794616789538, "grad_norm": 5.283799171447754, "learning_rate": 4.153765697509292e-05, "loss": 0.5683, "step": 446000 }, { "epoch": 5.947127754595993, "grad_norm": 7.415313243865967, "learning_rate": 4.1529643940631674e-05, "loss": 0.6463, "step": 446100 }, { "epoch": 5.948460892402448, "grad_norm": 27.783851623535156, "learning_rate": 4.152162994094497e-05, "loss": 0.61, "step": 446200 }, { "epoch": 5.949794030208903, "grad_norm": 1.4003441333770752, "learning_rate": 4.151361497670374e-05, "loss": 0.613, "step": 446300 }, { "epoch": 5.951127168015358, "grad_norm": 6.847833633422852, "learning_rate": 4.1505599048578954e-05, "loss": 0.5672, "step": 446400 }, { "epoch": 5.952460305821813, "grad_norm": 21.29709243774414, "learning_rate": 4.14975821572417e-05, "loss": 0.7732, "step": 446500 }, { "epoch": 5.953793443628268, "grad_norm": 4.739384651184082, "learning_rate": 4.1489564303363086e-05, 
"loss": 0.6494, "step": 446600 }, { "epoch": 5.9551265814347225, "grad_norm": 2.1034741401672363, "learning_rate": 4.148154548761438e-05, "loss": 0.6169, "step": 446700 }, { "epoch": 5.956459719241178, "grad_norm": 8.352067947387695, "learning_rate": 4.1473525710666874e-05, "loss": 0.697, "step": 446800 }, { "epoch": 5.957792857047633, "grad_norm": 10.264455795288086, "learning_rate": 4.1465504973191975e-05, "loss": 0.6652, "step": 446900 }, { "epoch": 5.959125994854088, "grad_norm": 1.8495851755142212, "learning_rate": 4.145748327586113e-05, "loss": 0.6025, "step": 447000 }, { "epoch": 5.960459132660543, "grad_norm": 22.94900131225586, "learning_rate": 4.1449460619345904e-05, "loss": 0.5882, "step": 447100 }, { "epoch": 5.961792270466998, "grad_norm": 12.662360191345215, "learning_rate": 4.144143700431793e-05, "loss": 0.7127, "step": 447200 }, { "epoch": 5.963125408273453, "grad_norm": 17.605131149291992, "learning_rate": 4.143349268191671e-05, "loss": 0.6861, "step": 447300 }, { "epoch": 5.964458546079908, "grad_norm": 20.784313201904297, "learning_rate": 4.1425467161446805e-05, "loss": 0.6103, "step": 447400 }, { "epoch": 5.965791683886364, "grad_norm": 7.9689788818359375, "learning_rate": 4.141744068447282e-05, "loss": 0.6742, "step": 447500 }, { "epoch": 5.967124821692819, "grad_norm": 3.117894172668457, "learning_rate": 4.140941325166669e-05, "loss": 0.6157, "step": 447600 }, { "epoch": 5.968457959499274, "grad_norm": 4.692608833312988, "learning_rate": 4.1401384863700444e-05, "loss": 0.7164, "step": 447700 }, { "epoch": 5.969791097305729, "grad_norm": 23.05195426940918, "learning_rate": 4.139335552124619e-05, "loss": 0.6489, "step": 447800 }, { "epoch": 5.971124235112184, "grad_norm": 5.763943672180176, "learning_rate": 4.1385325224976145e-05, "loss": 0.6543, "step": 447900 }, { "epoch": 5.9724573729186385, "grad_norm": 3.089848279953003, "learning_rate": 4.137729397556255e-05, "loss": 0.6354, "step": 448000 }, { "epoch": 5.9737905107250935, "grad_norm": 
8.928787231445312, "learning_rate": 4.136926177367776e-05, "loss": 0.6213, "step": 448100 }, { "epoch": 5.975123648531548, "grad_norm": 8.942693710327148, "learning_rate": 4.136122861999423e-05, "loss": 0.6609, "step": 448200 }, { "epoch": 5.976456786338003, "grad_norm": 4.093536853790283, "learning_rate": 4.1353194515184445e-05, "loss": 0.6297, "step": 448300 }, { "epoch": 5.977789924144459, "grad_norm": 3.121781826019287, "learning_rate": 4.1345159459921014e-05, "loss": 0.5845, "step": 448400 }, { "epoch": 5.979123061950914, "grad_norm": 5.2029829025268555, "learning_rate": 4.133712345487657e-05, "loss": 0.6664, "step": 448500 }, { "epoch": 5.980456199757369, "grad_norm": 10.037812232971191, "learning_rate": 4.132908650072392e-05, "loss": 0.5826, "step": 448600 }, { "epoch": 5.981789337563824, "grad_norm": 1.846273422241211, "learning_rate": 4.132104859813584e-05, "loss": 0.6394, "step": 448700 }, { "epoch": 5.983122475370279, "grad_norm": 5.575175762176514, "learning_rate": 4.131300974778526e-05, "loss": 0.6254, "step": 448800 }, { "epoch": 5.984455613176734, "grad_norm": 5.613738536834717, "learning_rate": 4.130496995034517e-05, "loss": 0.6188, "step": 448900 }, { "epoch": 5.985788750983189, "grad_norm": 114.28770446777344, "learning_rate": 4.129692920648863e-05, "loss": 0.6602, "step": 449000 }, { "epoch": 5.987121888789644, "grad_norm": 4.436588287353516, "learning_rate": 4.128888751688879e-05, "loss": 0.6727, "step": 449100 }, { "epoch": 5.9884550265961, "grad_norm": 7.1050848960876465, "learning_rate": 4.1280844882218865e-05, "loss": 0.6584, "step": 449200 }, { "epoch": 5.989788164402555, "grad_norm": 11.090825080871582, "learning_rate": 4.127280130315218e-05, "loss": 0.6661, "step": 449300 }, { "epoch": 5.9911213022090095, "grad_norm": 5.105791091918945, "learning_rate": 4.1264756780362094e-05, "loss": 0.6228, "step": 449400 }, { "epoch": 5.9924544400154645, "grad_norm": 8.99405574798584, "learning_rate": 4.125671131452208e-05, "loss": 0.6866, "step": 
449500 }, { "epoch": 5.993787577821919, "grad_norm": 3.5911002159118652, "learning_rate": 4.1248664906305696e-05, "loss": 0.6337, "step": 449600 }, { "epoch": 5.995120715628374, "grad_norm": 21.020673751831055, "learning_rate": 4.1240617556386533e-05, "loss": 0.596, "step": 449700 }, { "epoch": 5.996453853434829, "grad_norm": 2.2099075317382812, "learning_rate": 4.12325692654383e-05, "loss": 0.5661, "step": 449800 }, { "epoch": 5.997786991241284, "grad_norm": 4.5307488441467285, "learning_rate": 4.122452003413477e-05, "loss": 0.5982, "step": 449900 }, { "epoch": 5.999120129047739, "grad_norm": 7.764740943908691, "learning_rate": 4.121646986314982e-05, "loss": 0.6528, "step": 450000 }, { "epoch": 6.000453266854195, "grad_norm": 2.108865261077881, "learning_rate": 4.1208418753157356e-05, "loss": 0.5298, "step": 450100 }, { "epoch": 6.00178640466065, "grad_norm": 9.326233863830566, "learning_rate": 4.1200366704831416e-05, "loss": 0.6402, "step": 450200 }, { "epoch": 6.003119542467105, "grad_norm": 32.88676834106445, "learning_rate": 4.1192313718846075e-05, "loss": 0.5025, "step": 450300 }, { "epoch": 6.00445268027356, "grad_norm": 1.3335766792297363, "learning_rate": 4.118434033974109e-05, "loss": 0.6278, "step": 450400 }, { "epoch": 6.005785818080015, "grad_norm": 2.6932361125946045, "learning_rate": 4.1176285489819326e-05, "loss": 0.6312, "step": 450500 }, { "epoch": 6.00711895588647, "grad_norm": 4.644631385803223, "learning_rate": 4.1168229704254176e-05, "loss": 0.5537, "step": 450600 }, { "epoch": 6.008452093692925, "grad_norm": 11.823433876037598, "learning_rate": 4.116017298372003e-05, "loss": 0.6609, "step": 450700 }, { "epoch": 6.0097852314993805, "grad_norm": 4.785359859466553, "learning_rate": 4.11521153288914e-05, "loss": 0.6522, "step": 450800 }, { "epoch": 6.011118369305835, "grad_norm": 7.0220561027526855, "learning_rate": 4.1144056740442813e-05, "loss": 0.6976, "step": 450900 }, { "epoch": 6.01245150711229, "grad_norm": 4.7116923332214355, 
"learning_rate": 4.1135997219048926e-05, "loss": 0.5809, "step": 451000 }, { "epoch": 6.013784644918745, "grad_norm": 6.590474605560303, "learning_rate": 4.1127936765384455e-05, "loss": 0.6199, "step": 451100 }, { "epoch": 6.0151177827252, "grad_norm": 3.754201889038086, "learning_rate": 4.111987538012421e-05, "loss": 0.5336, "step": 451200 }, { "epoch": 6.016450920531655, "grad_norm": 7.238857746124268, "learning_rate": 4.1111813063943036e-05, "loss": 0.6364, "step": 451300 }, { "epoch": 6.01778405833811, "grad_norm": 2.267613410949707, "learning_rate": 4.110374981751591e-05, "loss": 0.5996, "step": 451400 }, { "epoch": 6.019117196144565, "grad_norm": 7.83193302154541, "learning_rate": 4.109568564151784e-05, "loss": 0.5607, "step": 451500 }, { "epoch": 6.020450333951021, "grad_norm": 35.57539367675781, "learning_rate": 4.108762053662396e-05, "loss": 0.6191, "step": 451600 }, { "epoch": 6.021783471757476, "grad_norm": 2.3180205821990967, "learning_rate": 4.107955450350943e-05, "loss": 0.6481, "step": 451700 }, { "epoch": 6.023116609563931, "grad_norm": 10.868805885314941, "learning_rate": 4.107148754284952e-05, "loss": 0.5749, "step": 451800 }, { "epoch": 6.024449747370386, "grad_norm": 1.979588508605957, "learning_rate": 4.1063419655319585e-05, "loss": 0.5714, "step": 451900 }, { "epoch": 6.025782885176841, "grad_norm": 3.850559711456299, "learning_rate": 4.105535084159501e-05, "loss": 0.6075, "step": 452000 }, { "epoch": 6.027116022983296, "grad_norm": 3.8373680114746094, "learning_rate": 4.104728110235133e-05, "loss": 0.6164, "step": 452100 }, { "epoch": 6.028449160789751, "grad_norm": 4.722270965576172, "learning_rate": 4.1039210438264096e-05, "loss": 0.5828, "step": 452200 }, { "epoch": 6.0297822985962055, "grad_norm": 1.8505243062973022, "learning_rate": 4.103113885000897e-05, "loss": 0.5564, "step": 452300 }, { "epoch": 6.031115436402661, "grad_norm": 25.854843139648438, "learning_rate": 4.102306633826166e-05, "loss": 0.6248, "step": 452400 }, { "epoch": 
6.032448574209116, "grad_norm": 57.89518356323242, "learning_rate": 4.1014992903697985e-05, "loss": 0.5458, "step": 452500 }, { "epoch": 6.033781712015571, "grad_norm": 10.737422943115234, "learning_rate": 4.1006918546993845e-05, "loss": 0.6366, "step": 452600 }, { "epoch": 6.035114849822026, "grad_norm": 21.173381805419922, "learning_rate": 4.099884326882516e-05, "loss": 0.5457, "step": 452700 }, { "epoch": 6.036447987628481, "grad_norm": 5.1578545570373535, "learning_rate": 4.099076706986799e-05, "loss": 0.5978, "step": 452800 }, { "epoch": 6.037781125434936, "grad_norm": 2.011068105697632, "learning_rate": 4.0982689950798454e-05, "loss": 0.6131, "step": 452900 }, { "epoch": 6.039114263241391, "grad_norm": 10.530591011047363, "learning_rate": 4.097461191229273e-05, "loss": 0.5781, "step": 453000 }, { "epoch": 6.040447401047846, "grad_norm": 1.7441167831420898, "learning_rate": 4.0966532955027096e-05, "loss": 0.6843, "step": 453100 }, { "epoch": 6.041780538854302, "grad_norm": 4.044055938720703, "learning_rate": 4.095845307967789e-05, "loss": 0.5653, "step": 453200 }, { "epoch": 6.043113676660757, "grad_norm": 9.510551452636719, "learning_rate": 4.0950372286921545e-05, "loss": 0.5884, "step": 453300 }, { "epoch": 6.044446814467212, "grad_norm": 12.045501708984375, "learning_rate": 4.094229057743454e-05, "loss": 0.6042, "step": 453400 }, { "epoch": 6.045779952273667, "grad_norm": 157.23789978027344, "learning_rate": 4.093420795189346e-05, "loss": 0.6048, "step": 453500 }, { "epoch": 6.0471130900801215, "grad_norm": 3.6838996410369873, "learning_rate": 4.092620525091305e-05, "loss": 0.625, "step": 453600 }, { "epoch": 6.0484462278865765, "grad_norm": 10.588725090026855, "learning_rate": 4.091812080443751e-05, "loss": 0.6329, "step": 453700 }, { "epoch": 6.049779365693031, "grad_norm": 3.044553279876709, "learning_rate": 4.0910035443931325e-05, "loss": 0.5778, "step": 453800 }, { "epoch": 6.051112503499486, "grad_norm": 21.554136276245117, "learning_rate": 
4.0901949170071344e-05, "loss": 0.5912, "step": 453900 }, { "epoch": 6.052445641305942, "grad_norm": 10.638510704040527, "learning_rate": 4.089386198353456e-05, "loss": 0.6289, "step": 454000 }, { "epoch": 6.053778779112397, "grad_norm": 60.776180267333984, "learning_rate": 4.0885773884997994e-05, "loss": 0.6095, "step": 454100 }, { "epoch": 6.055111916918852, "grad_norm": 21.036142349243164, "learning_rate": 4.087768487513874e-05, "loss": 0.6118, "step": 454200 }, { "epoch": 6.056445054725307, "grad_norm": 3.3872926235198975, "learning_rate": 4.086959495463402e-05, "loss": 0.528, "step": 454300 }, { "epoch": 6.057778192531762, "grad_norm": 6.419023036956787, "learning_rate": 4.086150412416105e-05, "loss": 0.6224, "step": 454400 }, { "epoch": 6.059111330338217, "grad_norm": 3.5076026916503906, "learning_rate": 4.085341238439721e-05, "loss": 0.6172, "step": 454500 }, { "epoch": 6.060444468144672, "grad_norm": 21.38725471496582, "learning_rate": 4.0845319736019886e-05, "loss": 0.5628, "step": 454600 }, { "epoch": 6.061777605951127, "grad_norm": 20.05513572692871, "learning_rate": 4.083722617970659e-05, "loss": 0.6165, "step": 454700 }, { "epoch": 6.063110743757583, "grad_norm": 25.610963821411133, "learning_rate": 4.082913171613487e-05, "loss": 0.6091, "step": 454800 }, { "epoch": 6.064443881564038, "grad_norm": 7.4083757400512695, "learning_rate": 4.0821036345982384e-05, "loss": 0.5615, "step": 454900 }, { "epoch": 6.0657770193704925, "grad_norm": 7.495591640472412, "learning_rate": 4.081294006992685e-05, "loss": 0.5953, "step": 455000 }, { "epoch": 6.0671101571769475, "grad_norm": 6.0958685874938965, "learning_rate": 4.0804842888646064e-05, "loss": 0.5669, "step": 455100 }, { "epoch": 6.068443294983402, "grad_norm": 14.746123313903809, "learning_rate": 4.079674480281788e-05, "loss": 0.5291, "step": 455200 }, { "epoch": 6.069776432789857, "grad_norm": 79.93061828613281, "learning_rate": 4.0788645813120256e-05, "loss": 0.6328, "step": 455300 }, { "epoch": 
6.071109570596312, "grad_norm": 6.493475437164307, "learning_rate": 4.078054592023122e-05, "loss": 0.6124, "step": 455400 }, { "epoch": 6.072442708402767, "grad_norm": 6.314487934112549, "learning_rate": 4.077244512482886e-05, "loss": 0.61, "step": 455500 }, { "epoch": 6.073775846209223, "grad_norm": 3.6200265884399414, "learning_rate": 4.076434342759135e-05, "loss": 0.6288, "step": 455600 }, { "epoch": 6.075108984015678, "grad_norm": 11.119216918945312, "learning_rate": 4.075624082919693e-05, "loss": 0.6376, "step": 455700 }, { "epoch": 6.076442121822133, "grad_norm": 4.5258917808532715, "learning_rate": 4.0748137330323955e-05, "loss": 0.5428, "step": 455800 }, { "epoch": 6.077775259628588, "grad_norm": 6.138638019561768, "learning_rate": 4.0740032931650794e-05, "loss": 0.5935, "step": 455900 }, { "epoch": 6.079108397435043, "grad_norm": 63.47269821166992, "learning_rate": 4.0731927633855926e-05, "loss": 0.5445, "step": 456000 }, { "epoch": 6.080441535241498, "grad_norm": 2.337386131286621, "learning_rate": 4.072382143761792e-05, "loss": 0.5821, "step": 456100 }, { "epoch": 6.081774673047953, "grad_norm": 17.324554443359375, "learning_rate": 4.071571434361538e-05, "loss": 0.6095, "step": 456200 }, { "epoch": 6.083107810854408, "grad_norm": 10.8655424118042, "learning_rate": 4.070760635252701e-05, "loss": 0.5804, "step": 456300 }, { "epoch": 6.0844409486608635, "grad_norm": 11.605944633483887, "learning_rate": 4.069949746503159e-05, "loss": 0.603, "step": 456400 }, { "epoch": 6.085774086467318, "grad_norm": 8.366158485412598, "learning_rate": 4.0691387681807976e-05, "loss": 0.6159, "step": 456500 }, { "epoch": 6.087107224273773, "grad_norm": 7.795595169067383, "learning_rate": 4.068327700353508e-05, "loss": 0.6333, "step": 456600 }, { "epoch": 6.088440362080228, "grad_norm": 4.940967559814453, "learning_rate": 4.0675165430891926e-05, "loss": 0.6253, "step": 456700 }, { "epoch": 6.089773499886683, "grad_norm": 7.40712833404541, "learning_rate": 
4.066705296455755e-05, "loss": 0.6051, "step": 456800 }, { "epoch": 6.091106637693138, "grad_norm": 20.48149299621582, "learning_rate": 4.0658939605211144e-05, "loss": 0.6005, "step": 456900 }, { "epoch": 6.092439775499593, "grad_norm": 2.2032151222229004, "learning_rate": 4.06508253535319e-05, "loss": 0.6096, "step": 457000 }, { "epoch": 6.093772913306048, "grad_norm": 11.091002464294434, "learning_rate": 4.0642710210199144e-05, "loss": 0.5796, "step": 457100 }, { "epoch": 6.095106051112504, "grad_norm": 3.9435582160949707, "learning_rate": 4.0634594175892235e-05, "loss": 0.6645, "step": 457200 }, { "epoch": 6.096439188918959, "grad_norm": 5.626891136169434, "learning_rate": 4.062647725129062e-05, "loss": 0.5807, "step": 457300 }, { "epoch": 6.097772326725414, "grad_norm": 19.646684646606445, "learning_rate": 4.061835943707383e-05, "loss": 0.559, "step": 457400 }, { "epoch": 6.099105464531869, "grad_norm": 5.219512939453125, "learning_rate": 4.061024073392146e-05, "loss": 0.5884, "step": 457500 }, { "epoch": 6.100438602338324, "grad_norm": 3.383549451828003, "learning_rate": 4.0602202342821896e-05, "loss": 0.6576, "step": 457600 }, { "epoch": 6.101771740144779, "grad_norm": 28.47482681274414, "learning_rate": 4.0594081872709856e-05, "loss": 0.5484, "step": 457700 }, { "epoch": 6.103104877951234, "grad_norm": 35.747962951660156, "learning_rate": 4.0585960515694676e-05, "loss": 0.5592, "step": 457800 }, { "epoch": 6.1044380157576885, "grad_norm": 9.790647506713867, "learning_rate": 4.0577838272456245e-05, "loss": 0.579, "step": 457900 }, { "epoch": 6.105771153564144, "grad_norm": 4.270594120025635, "learning_rate": 4.056971514367454e-05, "loss": 0.5767, "step": 458000 }, { "epoch": 6.107104291370599, "grad_norm": 11.232253074645996, "learning_rate": 4.05615911300296e-05, "loss": 0.6198, "step": 458100 }, { "epoch": 6.108437429177054, "grad_norm": 6.391158103942871, "learning_rate": 4.055346623220156e-05, "loss": 0.536, "step": 458200 }, { "epoch": 6.109770566983509, 
"grad_norm": 4.921675205230713, "learning_rate": 4.054534045087058e-05, "loss": 0.5149, "step": 458300 }, { "epoch": 6.111103704789964, "grad_norm": 10.093210220336914, "learning_rate": 4.053721378671694e-05, "loss": 0.6016, "step": 458400 }, { "epoch": 6.112436842596419, "grad_norm": 5.30582857131958, "learning_rate": 4.052908624042097e-05, "loss": 0.5897, "step": 458500 }, { "epoch": 6.113769980402874, "grad_norm": 1.4267001152038574, "learning_rate": 4.0521039101301685e-05, "loss": 0.5887, "step": 458600 }, { "epoch": 6.115103118209329, "grad_norm": 2.9306540489196777, "learning_rate": 4.051290980156684e-05, "loss": 0.5676, "step": 458700 }, { "epoch": 6.116436256015785, "grad_norm": 19.26532554626465, "learning_rate": 4.050477962172432e-05, "loss": 0.5824, "step": 458800 }, { "epoch": 6.11776939382224, "grad_norm": 10.912606239318848, "learning_rate": 4.049664856245477e-05, "loss": 0.5953, "step": 458900 }, { "epoch": 6.119102531628695, "grad_norm": 5.142520427703857, "learning_rate": 4.048851662443889e-05, "loss": 0.5777, "step": 459000 }, { "epoch": 6.12043566943515, "grad_norm": 22.671274185180664, "learning_rate": 4.048038380835746e-05, "loss": 0.6358, "step": 459100 }, { "epoch": 6.1217688072416045, "grad_norm": 10.199555397033691, "learning_rate": 4.047225011489134e-05, "loss": 0.5758, "step": 459200 }, { "epoch": 6.1231019450480595, "grad_norm": 31.276296615600586, "learning_rate": 4.046411554472143e-05, "loss": 0.5368, "step": 459300 }, { "epoch": 6.124435082854514, "grad_norm": 3.5606820583343506, "learning_rate": 4.045606145732478e-05, "loss": 0.6293, "step": 459400 }, { "epoch": 6.125768220660969, "grad_norm": 16.02019691467285, "learning_rate": 4.0447925144540465e-05, "loss": 0.5702, "step": 459500 }, { "epoch": 6.127101358467425, "grad_norm": 6.261847496032715, "learning_rate": 4.0439787957088795e-05, "loss": 0.5901, "step": 459600 }, { "epoch": 6.12843449627388, "grad_norm": 7.852921962738037, "learning_rate": 4.0431649895650993e-05, "loss": 
0.5485, "step": 459700 }, { "epoch": 6.129767634080335, "grad_norm": 7.822122097015381, "learning_rate": 4.042351096090833e-05, "loss": 0.6194, "step": 459800 }, { "epoch": 6.13110077188679, "grad_norm": 5.073729515075684, "learning_rate": 4.04153711535422e-05, "loss": 0.5552, "step": 459900 }, { "epoch": 6.132433909693245, "grad_norm": 2.9680535793304443, "learning_rate": 4.0407230474234025e-05, "loss": 0.6308, "step": 460000 }, { "epoch": 6.1337670474997, "grad_norm": 0.18814823031425476, "learning_rate": 4.039908892366532e-05, "loss": 0.581, "step": 460100 }, { "epoch": 6.135100185306155, "grad_norm": 2.7393736839294434, "learning_rate": 4.039094650251768e-05, "loss": 0.5529, "step": 460200 }, { "epoch": 6.13643332311261, "grad_norm": 14.693556785583496, "learning_rate": 4.038280321147274e-05, "loss": 0.5658, "step": 460300 }, { "epoch": 6.137766460919066, "grad_norm": 2.5369772911071777, "learning_rate": 4.0374659051212255e-05, "loss": 0.5796, "step": 460400 }, { "epoch": 6.139099598725521, "grad_norm": 1.356995940208435, "learning_rate": 4.0366514022418006e-05, "loss": 0.6901, "step": 460500 }, { "epoch": 6.1404327365319755, "grad_norm": 31.331300735473633, "learning_rate": 4.03583681257719e-05, "loss": 0.5929, "step": 460600 }, { "epoch": 6.1417658743384305, "grad_norm": 4.066425323486328, "learning_rate": 4.035022136195585e-05, "loss": 0.5387, "step": 460700 }, { "epoch": 6.143099012144885, "grad_norm": 9.317974090576172, "learning_rate": 4.0342073731651895e-05, "loss": 0.6059, "step": 460800 }, { "epoch": 6.14443214995134, "grad_norm": 37.56455993652344, "learning_rate": 4.0333925235542134e-05, "loss": 0.5915, "step": 460900 }, { "epoch": 6.145765287757795, "grad_norm": 3.979360342025757, "learning_rate": 4.0325775874308734e-05, "loss": 0.6335, "step": 461000 }, { "epoch": 6.14709842556425, "grad_norm": 27.24138069152832, "learning_rate": 4.031762564863392e-05, "loss": 0.6185, "step": 461100 }, { "epoch": 6.148431563370705, "grad_norm": 8.841344833374023, 
"learning_rate": 4.0309556074367715e-05, "loss": 0.5536, "step": 461200 }, { "epoch": 6.149764701177161, "grad_norm": 35.73392868041992, "learning_rate": 4.030140413048448e-05, "loss": 0.63, "step": 461300 }, { "epoch": 6.151097838983616, "grad_norm": 4.242180347442627, "learning_rate": 4.0293251324200175e-05, "loss": 0.5784, "step": 461400 }, { "epoch": 6.152430976790071, "grad_norm": 5.618804931640625, "learning_rate": 4.028509765619731e-05, "loss": 0.6362, "step": 461500 }, { "epoch": 6.153764114596526, "grad_norm": 11.44865608215332, "learning_rate": 4.027694312715849e-05, "loss": 0.5837, "step": 461600 }, { "epoch": 6.155097252402981, "grad_norm": 3.3295974731445312, "learning_rate": 4.026878773776638e-05, "loss": 0.546, "step": 461700 }, { "epoch": 6.156430390209436, "grad_norm": 8.629931449890137, "learning_rate": 4.026063148870374e-05, "loss": 0.5511, "step": 461800 }, { "epoch": 6.157763528015891, "grad_norm": 1.4733139276504517, "learning_rate": 4.025247438065337e-05, "loss": 0.5433, "step": 461900 }, { "epoch": 6.1590966658223465, "grad_norm": 15.2240629196167, "learning_rate": 4.024431641429817e-05, "loss": 0.6442, "step": 462000 }, { "epoch": 6.160429803628801, "grad_norm": 41.66719055175781, "learning_rate": 4.023615759032109e-05, "loss": 0.6007, "step": 462100 }, { "epoch": 6.161762941435256, "grad_norm": 10.496927261352539, "learning_rate": 4.022799790940516e-05, "loss": 0.5301, "step": 462200 }, { "epoch": 6.163096079241711, "grad_norm": 43.33152389526367, "learning_rate": 4.0219837372233486e-05, "loss": 0.6735, "step": 462300 }, { "epoch": 6.164429217048166, "grad_norm": 5.178104877471924, "learning_rate": 4.021167597948925e-05, "loss": 0.5392, "step": 462400 }, { "epoch": 6.165762354854621, "grad_norm": 4.455805778503418, "learning_rate": 4.020351373185569e-05, "loss": 0.6039, "step": 462500 }, { "epoch": 6.167095492661076, "grad_norm": 8.598963737487793, "learning_rate": 4.019535063001612e-05, "loss": 0.5634, "step": 462600 }, { "epoch": 
6.168428630467531, "grad_norm": 14.846790313720703, "learning_rate": 4.018718667465393e-05, "loss": 0.5965, "step": 462700 }, { "epoch": 6.169761768273986, "grad_norm": 19.668725967407227, "learning_rate": 4.017902186645259e-05, "loss": 0.6538, "step": 462800 }, { "epoch": 6.171094906080442, "grad_norm": 3.2465779781341553, "learning_rate": 4.017085620609564e-05, "loss": 0.6583, "step": 462900 }, { "epoch": 6.172428043886897, "grad_norm": 26.27853775024414, "learning_rate": 4.016268969426664e-05, "loss": 0.5784, "step": 463000 }, { "epoch": 6.173761181693352, "grad_norm": 29.819656372070312, "learning_rate": 4.0154522331649316e-05, "loss": 0.6655, "step": 463100 }, { "epoch": 6.175094319499807, "grad_norm": 15.813342094421387, "learning_rate": 4.0146435805260374e-05, "loss": 0.5887, "step": 463200 }, { "epoch": 6.176427457306262, "grad_norm": 10.496512413024902, "learning_rate": 4.013826675160849e-05, "loss": 0.6136, "step": 463300 }, { "epoch": 6.177760595112717, "grad_norm": 13.340502738952637, "learning_rate": 4.0130096849212864e-05, "loss": 0.5924, "step": 463400 }, { "epoch": 6.1790937329191715, "grad_norm": 2.7559633255004883, "learning_rate": 4.0121926098757466e-05, "loss": 0.6392, "step": 463500 }, { "epoch": 6.180426870725627, "grad_norm": 5.92676305770874, "learning_rate": 4.011375450092632e-05, "loss": 0.6665, "step": 463600 }, { "epoch": 6.181760008532082, "grad_norm": 4.230922222137451, "learning_rate": 4.010558205640353e-05, "loss": 0.6821, "step": 463700 }, { "epoch": 6.183093146338537, "grad_norm": 8.553509712219238, "learning_rate": 4.009740876587327e-05, "loss": 0.6169, "step": 463800 }, { "epoch": 6.184426284144992, "grad_norm": 9.539249420166016, "learning_rate": 4.0089234630019774e-05, "loss": 0.6263, "step": 463900 }, { "epoch": 6.185759421951447, "grad_norm": 12.65579891204834, "learning_rate": 4.0081059649527354e-05, "loss": 0.5538, "step": 464000 }, { "epoch": 6.187092559757902, "grad_norm": 5.399196147918701, "learning_rate": 
4.0072883825080415e-05, "loss": 0.6155, "step": 464100 }, { "epoch": 6.188425697564357, "grad_norm": 52.469970703125, "learning_rate": 4.006470715736337e-05, "loss": 0.5853, "step": 464200 }, { "epoch": 6.189758835370812, "grad_norm": 5.266059398651123, "learning_rate": 4.0056529647060785e-05, "loss": 0.6, "step": 464300 }, { "epoch": 6.191091973177267, "grad_norm": 4.567556381225586, "learning_rate": 4.004835129485724e-05, "loss": 0.5703, "step": 464400 }, { "epoch": 6.192425110983723, "grad_norm": 12.040631294250488, "learning_rate": 4.004017210143738e-05, "loss": 0.6007, "step": 464500 }, { "epoch": 6.193758248790178, "grad_norm": 4.474602222442627, "learning_rate": 4.0031992067485985e-05, "loss": 0.5964, "step": 464600 }, { "epoch": 6.195091386596633, "grad_norm": 5.22589111328125, "learning_rate": 4.0023811193687824e-05, "loss": 0.6088, "step": 464700 }, { "epoch": 6.1964245244030876, "grad_norm": 1.8676801919937134, "learning_rate": 4.00156294807278e-05, "loss": 0.6601, "step": 464800 }, { "epoch": 6.1977576622095425, "grad_norm": 5.619805812835693, "learning_rate": 4.000744692929084e-05, "loss": 0.5796, "step": 464900 }, { "epoch": 6.199090800015997, "grad_norm": 7.300234794616699, "learning_rate": 3.999926354006197e-05, "loss": 0.55, "step": 465000 }, { "epoch": 6.200423937822452, "grad_norm": 11.903410911560059, "learning_rate": 3.999107931372627e-05, "loss": 0.6436, "step": 465100 }, { "epoch": 6.201757075628908, "grad_norm": 14.651102066040039, "learning_rate": 3.9982894250968906e-05, "loss": 0.6514, "step": 465200 }, { "epoch": 6.203090213435363, "grad_norm": 2.6765849590301514, "learning_rate": 3.997470835247511e-05, "loss": 0.5415, "step": 465300 }, { "epoch": 6.204423351241818, "grad_norm": 15.059951782226562, "learning_rate": 3.9966521618930175e-05, "loss": 0.5742, "step": 465400 }, { "epoch": 6.205756489048273, "grad_norm": 7.867154598236084, "learning_rate": 3.995833405101947e-05, "loss": 0.6173, "step": 465500 }, { "epoch": 6.207089626854728, 
"grad_norm": 42.62480926513672, "learning_rate": 3.995014564942841e-05, "loss": 0.6144, "step": 465600 }, { "epoch": 6.208422764661183, "grad_norm": 10.291913986206055, "learning_rate": 3.994195641484254e-05, "loss": 0.6493, "step": 465700 }, { "epoch": 6.209755902467638, "grad_norm": 18.732885360717773, "learning_rate": 3.9933766347947414e-05, "loss": 0.5941, "step": 465800 }, { "epoch": 6.211089040274093, "grad_norm": 8.874728202819824, "learning_rate": 3.992557544942869e-05, "loss": 0.6368, "step": 465900 }, { "epoch": 6.212422178080548, "grad_norm": 2.6219089031219482, "learning_rate": 3.9917383719972074e-05, "loss": 0.6769, "step": 466000 }, { "epoch": 6.213755315887004, "grad_norm": 35.29193878173828, "learning_rate": 3.9909191160263346e-05, "loss": 0.6128, "step": 466100 }, { "epoch": 6.2150884536934585, "grad_norm": 3.088637351989746, "learning_rate": 3.9900997770988384e-05, "loss": 0.6335, "step": 466200 }, { "epoch": 6.2164215914999135, "grad_norm": 0.639113187789917, "learning_rate": 3.989280355283308e-05, "loss": 0.6999, "step": 466300 }, { "epoch": 6.217754729306368, "grad_norm": 6.344577789306641, "learning_rate": 3.988460850648347e-05, "loss": 0.6323, "step": 466400 }, { "epoch": 6.219087867112823, "grad_norm": 4.878941535949707, "learning_rate": 3.987641263262558e-05, "loss": 0.6356, "step": 466500 }, { "epoch": 6.220421004919278, "grad_norm": 38.66592788696289, "learning_rate": 3.986821593194556e-05, "loss": 0.64, "step": 466600 }, { "epoch": 6.221754142725733, "grad_norm": 7.470267295837402, "learning_rate": 3.9860018405129615e-05, "loss": 0.6137, "step": 466700 }, { "epoch": 6.223087280532189, "grad_norm": 22.658525466918945, "learning_rate": 3.985190204047038e-05, "loss": 0.6661, "step": 466800 }, { "epoch": 6.224420418338644, "grad_norm": 3.630197525024414, "learning_rate": 3.98437028716857e-05, "loss": 0.5808, "step": 466900 }, { "epoch": 6.225753556145099, "grad_norm": 5.552822589874268, "learning_rate": 3.983550287881722e-05, "loss": 0.6681, 
"step": 467000 }, { "epoch": 6.227086693951554, "grad_norm": 4.355082988739014, "learning_rate": 3.982730206255146e-05, "loss": 0.5625, "step": 467100 }, { "epoch": 6.228419831758009, "grad_norm": 5.797217845916748, "learning_rate": 3.981910042357495e-05, "loss": 0.6371, "step": 467200 }, { "epoch": 6.229752969564464, "grad_norm": 16.29414176940918, "learning_rate": 3.9810897962574306e-05, "loss": 0.6156, "step": 467300 }, { "epoch": 6.231086107370919, "grad_norm": 25.59995460510254, "learning_rate": 3.98026946802362e-05, "loss": 0.6802, "step": 467400 }, { "epoch": 6.232419245177374, "grad_norm": 15.936580657958984, "learning_rate": 3.979449057724741e-05, "loss": 0.5798, "step": 467500 }, { "epoch": 6.233752382983829, "grad_norm": 8.128181457519531, "learning_rate": 3.9786285654294735e-05, "loss": 0.6125, "step": 467600 }, { "epoch": 6.235085520790284, "grad_norm": 27.85443115234375, "learning_rate": 3.9778079912065074e-05, "loss": 0.5987, "step": 467700 }, { "epoch": 6.236418658596739, "grad_norm": 3.0277416706085205, "learning_rate": 3.9769873351245386e-05, "loss": 0.547, "step": 467800 }, { "epoch": 6.237751796403194, "grad_norm": 8.554011344909668, "learning_rate": 3.976166597252271e-05, "loss": 0.6387, "step": 467900 }, { "epoch": 6.239084934209649, "grad_norm": 1.0940027236938477, "learning_rate": 3.975353986258646e-05, "loss": 0.6338, "step": 468000 }, { "epoch": 6.240418072016104, "grad_norm": 77.7574462890625, "learning_rate": 3.974533085828102e-05, "loss": 0.6608, "step": 468100 }, { "epoch": 6.241751209822559, "grad_norm": 5.065624237060547, "learning_rate": 3.973720314036493e-05, "loss": 0.6357, "step": 468200 }, { "epoch": 6.243084347629014, "grad_norm": 9.015028953552246, "learning_rate": 3.972899251319824e-05, "loss": 0.6186, "step": 468300 }, { "epoch": 6.244417485435469, "grad_norm": 28.347087860107422, "learning_rate": 3.9720781071550956e-05, "loss": 0.5579, "step": 468400 }, { "epoch": 6.245750623241925, "grad_norm": 8.766159057617188, 
"learning_rate": 3.971256881611053e-05, "loss": 0.6417, "step": 468500 }, { "epoch": 6.24708376104838, "grad_norm": 10.221138000488281, "learning_rate": 3.970435574756446e-05, "loss": 0.6458, "step": 468600 }, { "epoch": 6.248416898854835, "grad_norm": 8.272504806518555, "learning_rate": 3.9696141866600316e-05, "loss": 0.6019, "step": 468700 }, { "epoch": 6.24975003666129, "grad_norm": 5.809655666351318, "learning_rate": 3.9687927173905734e-05, "loss": 0.5784, "step": 468800 }, { "epoch": 6.251083174467745, "grad_norm": 20.37782859802246, "learning_rate": 3.967971167016844e-05, "loss": 0.6349, "step": 468900 }, { "epoch": 6.2524163122742, "grad_norm": 15.834842681884766, "learning_rate": 3.9671495356076186e-05, "loss": 0.6219, "step": 469000 }, { "epoch": 6.2537494500806545, "grad_norm": 41.069183349609375, "learning_rate": 3.9663278232316834e-05, "loss": 0.6022, "step": 469100 }, { "epoch": 6.2550825878871095, "grad_norm": 3.657179832458496, "learning_rate": 3.965506029957829e-05, "loss": 0.6537, "step": 469200 }, { "epoch": 6.256415725693565, "grad_norm": 7.273163318634033, "learning_rate": 3.9646841558548535e-05, "loss": 0.6247, "step": 469300 }, { "epoch": 6.25774886350002, "grad_norm": 1.3581888675689697, "learning_rate": 3.963862200991561e-05, "loss": 0.605, "step": 469400 }, { "epoch": 6.259082001306475, "grad_norm": 6.163660049438477, "learning_rate": 3.9630401654367635e-05, "loss": 0.6668, "step": 469500 }, { "epoch": 6.26041513911293, "grad_norm": 9.422931671142578, "learning_rate": 3.962218049259279e-05, "loss": 0.6357, "step": 469600 }, { "epoch": 6.261748276919385, "grad_norm": 8.264527320861816, "learning_rate": 3.9613958525279326e-05, "loss": 0.6367, "step": 469700 }, { "epoch": 6.26308141472584, "grad_norm": 6.846439361572266, "learning_rate": 3.960573575311556e-05, "loss": 0.579, "step": 469800 }, { "epoch": 6.264414552532295, "grad_norm": 10.42089557647705, "learning_rate": 3.959751217678988e-05, "loss": 0.6037, "step": 469900 }, { "epoch": 
6.265747690338751, "grad_norm": 9.849461555480957, "learning_rate": 3.958928779699074e-05, "loss": 0.5867, "step": 470000 }, { "epoch": 6.267080828145206, "grad_norm": 10.43415641784668, "learning_rate": 3.958106261440664e-05, "loss": 0.6676, "step": 470100 }, { "epoch": 6.268413965951661, "grad_norm": 42.065513610839844, "learning_rate": 3.957283662972619e-05, "loss": 0.6512, "step": 470200 }, { "epoch": 6.269747103758116, "grad_norm": 25.454185485839844, "learning_rate": 3.956460984363803e-05, "loss": 0.5701, "step": 470300 }, { "epoch": 6.271080241564571, "grad_norm": 4.742605686187744, "learning_rate": 3.9556382256830876e-05, "loss": 0.6022, "step": 470400 }, { "epoch": 6.2724133793710255, "grad_norm": 29.2418270111084, "learning_rate": 3.954815386999354e-05, "loss": 0.5939, "step": 470500 }, { "epoch": 6.27374651717748, "grad_norm": 4.83158540725708, "learning_rate": 3.9539924683814845e-05, "loss": 0.608, "step": 470600 }, { "epoch": 6.275079654983935, "grad_norm": 12.325703620910645, "learning_rate": 3.953169469898374e-05, "loss": 0.5792, "step": 470700 }, { "epoch": 6.27641279279039, "grad_norm": 1.3989875316619873, "learning_rate": 3.95234639161892e-05, "loss": 0.606, "step": 470800 }, { "epoch": 6.277745930596846, "grad_norm": 31.365114212036133, "learning_rate": 3.9515232336120276e-05, "loss": 0.5653, "step": 470900 }, { "epoch": 6.279079068403301, "grad_norm": 18.865577697753906, "learning_rate": 3.9506999959466105e-05, "loss": 0.6036, "step": 471000 }, { "epoch": 6.280412206209756, "grad_norm": 8.867795944213867, "learning_rate": 3.949876678691584e-05, "loss": 0.537, "step": 471100 }, { "epoch": 6.281745344016211, "grad_norm": 3.554948329925537, "learning_rate": 3.949053281915879e-05, "loss": 0.6366, "step": 471200 }, { "epoch": 6.283078481822666, "grad_norm": 0.21393392980098724, "learning_rate": 3.948229805688423e-05, "loss": 0.6456, "step": 471300 }, { "epoch": 6.284411619629121, "grad_norm": 5.505984783172607, "learning_rate": 
3.9474062500781584e-05, "loss": 0.5876, "step": 471400 }, { "epoch": 6.285744757435576, "grad_norm": 2.548428535461426, "learning_rate": 3.946582615154028e-05, "loss": 0.596, "step": 471500 }, { "epoch": 6.287077895242031, "grad_norm": 2.968925952911377, "learning_rate": 3.945758900984986e-05, "loss": 0.602, "step": 471600 }, { "epoch": 6.288411033048487, "grad_norm": 3.3091678619384766, "learning_rate": 3.944935107639989e-05, "loss": 0.5812, "step": 471700 }, { "epoch": 6.2897441708549415, "grad_norm": 13.938187599182129, "learning_rate": 3.944111235188004e-05, "loss": 0.5459, "step": 471800 }, { "epoch": 6.2910773086613965, "grad_norm": 7.687440395355225, "learning_rate": 3.9432872836980025e-05, "loss": 0.5983, "step": 471900 }, { "epoch": 6.292410446467851, "grad_norm": 7.2985029220581055, "learning_rate": 3.9424632532389626e-05, "loss": 0.607, "step": 472000 }, { "epoch": 6.293743584274306, "grad_norm": 12.205863952636719, "learning_rate": 3.941639143879871e-05, "loss": 0.6831, "step": 472100 }, { "epoch": 6.295076722080761, "grad_norm": 2.756765604019165, "learning_rate": 3.940814955689718e-05, "loss": 0.6216, "step": 472200 }, { "epoch": 6.296409859887216, "grad_norm": 6.154998779296875, "learning_rate": 3.939990688737503e-05, "loss": 0.6638, "step": 472300 }, { "epoch": 6.297742997693671, "grad_norm": 9.55738353729248, "learning_rate": 3.9391663430922296e-05, "loss": 0.5579, "step": 472400 }, { "epoch": 6.299076135500127, "grad_norm": 8.784103393554688, "learning_rate": 3.938341918822911e-05, "loss": 0.6421, "step": 472500 }, { "epoch": 6.300409273306582, "grad_norm": 8.336384773254395, "learning_rate": 3.9375174159985656e-05, "loss": 0.5916, "step": 472600 }, { "epoch": 6.301742411113037, "grad_norm": 21.672563552856445, "learning_rate": 3.9366928346882165e-05, "loss": 0.6744, "step": 472700 }, { "epoch": 6.303075548919492, "grad_norm": 6.939993858337402, "learning_rate": 3.935868174960896e-05, "loss": 0.5917, "step": 472800 }, { "epoch": 6.304408686725947, 
"grad_norm": 2.4854636192321777, "learning_rate": 3.935043436885642e-05, "loss": 0.6057, "step": 472900 }, { "epoch": 6.305741824532402, "grad_norm": 3.5319457054138184, "learning_rate": 3.9342186205315e-05, "loss": 0.6708, "step": 473000 }, { "epoch": 6.307074962338857, "grad_norm": 11.375289916992188, "learning_rate": 3.933393725967519e-05, "loss": 0.5778, "step": 473100 }, { "epoch": 6.308408100145312, "grad_norm": 3.7730801105499268, "learning_rate": 3.932568753262758e-05, "loss": 0.6148, "step": 473200 }, { "epoch": 6.3097412379517674, "grad_norm": 131.075927734375, "learning_rate": 3.93174370248628e-05, "loss": 0.6219, "step": 473300 }, { "epoch": 6.311074375758222, "grad_norm": 4.5515947341918945, "learning_rate": 3.9309185737071576e-05, "loss": 0.5481, "step": 473400 }, { "epoch": 6.312407513564677, "grad_norm": 3.862903118133545, "learning_rate": 3.9300933669944655e-05, "loss": 0.6406, "step": 473500 }, { "epoch": 6.313740651371132, "grad_norm": 7.815329074859619, "learning_rate": 3.929268082417289e-05, "loss": 0.615, "step": 473600 }, { "epoch": 6.315073789177587, "grad_norm": 5.643088340759277, "learning_rate": 3.928442720044719e-05, "loss": 0.6255, "step": 473700 }, { "epoch": 6.316406926984042, "grad_norm": 10.78684139251709, "learning_rate": 3.92761727994585e-05, "loss": 0.5906, "step": 473800 }, { "epoch": 6.317740064790497, "grad_norm": 5.234546661376953, "learning_rate": 3.926791762189787e-05, "loss": 0.6016, "step": 473900 }, { "epoch": 6.319073202596952, "grad_norm": 8.153946876525879, "learning_rate": 3.925966166845638e-05, "loss": 0.6733, "step": 474000 }, { "epoch": 6.320406340403408, "grad_norm": 15.368525505065918, "learning_rate": 3.9251404939825216e-05, "loss": 0.5521, "step": 474100 }, { "epoch": 6.321739478209863, "grad_norm": 15.222819328308105, "learning_rate": 3.924314743669559e-05, "loss": 0.6296, "step": 474200 }, { "epoch": 6.323072616016318, "grad_norm": 9.287569999694824, "learning_rate": 3.92348891597588e-05, "loss": 0.5833, 
"step": 474300 }, { "epoch": 6.324405753822773, "grad_norm": 11.641810417175293, "learning_rate": 3.922663010970619e-05, "loss": 0.6556, "step": 474400 }, { "epoch": 6.325738891629228, "grad_norm": 15.159187316894531, "learning_rate": 3.921837028722921e-05, "loss": 0.5987, "step": 474500 }, { "epoch": 6.327072029435683, "grad_norm": 31.5465087890625, "learning_rate": 3.9210109693019315e-05, "loss": 0.5321, "step": 474600 }, { "epoch": 6.3284051672421375, "grad_norm": 17.468887329101562, "learning_rate": 3.9201848327768065e-05, "loss": 0.5908, "step": 474700 }, { "epoch": 6.3297383050485925, "grad_norm": 7.328719139099121, "learning_rate": 3.9193586192167095e-05, "loss": 0.5739, "step": 474800 }, { "epoch": 6.331071442855048, "grad_norm": 5.152555465698242, "learning_rate": 3.918532328690806e-05, "loss": 0.6455, "step": 474900 }, { "epoch": 6.332404580661503, "grad_norm": 3.1104156970977783, "learning_rate": 3.917705961268272e-05, "loss": 0.569, "step": 475000 }, { "epoch": 6.333737718467958, "grad_norm": 4.590539455413818, "learning_rate": 3.9168795170182875e-05, "loss": 0.5795, "step": 475100 }, { "epoch": 6.335070856274413, "grad_norm": 3.0560734272003174, "learning_rate": 3.9160529960100385e-05, "loss": 0.5758, "step": 475200 }, { "epoch": 6.336403994080868, "grad_norm": 12.805692672729492, "learning_rate": 3.915226398312723e-05, "loss": 0.5821, "step": 475300 }, { "epoch": 6.337737131887323, "grad_norm": 2.9514551162719727, "learning_rate": 3.9143997239955376e-05, "loss": 0.5772, "step": 475400 }, { "epoch": 6.339070269693778, "grad_norm": 0.0959504023194313, "learning_rate": 3.9135729731276894e-05, "loss": 0.5773, "step": 475500 }, { "epoch": 6.340403407500233, "grad_norm": 3.681762933731079, "learning_rate": 3.9127461457783904e-05, "loss": 0.5531, "step": 475600 }, { "epoch": 6.341736545306689, "grad_norm": 5.260800838470459, "learning_rate": 3.9119192420168635e-05, "loss": 0.6524, "step": 475700 }, { "epoch": 6.343069683113144, "grad_norm": 
25.374462127685547, "learning_rate": 3.911092261912331e-05, "loss": 0.552, "step": 475800 }, { "epoch": 6.344402820919599, "grad_norm": 4.2164306640625, "learning_rate": 3.9102652055340264e-05, "loss": 0.5722, "step": 475900 }, { "epoch": 6.345735958726054, "grad_norm": 6.3367018699646, "learning_rate": 3.90943807295119e-05, "loss": 0.6739, "step": 476000 }, { "epoch": 6.3470690965325085, "grad_norm": 22.52724266052246, "learning_rate": 3.9086108642330624e-05, "loss": 0.5336, "step": 476100 }, { "epoch": 6.3484022343389634, "grad_norm": 26.844547271728516, "learning_rate": 3.907783579448899e-05, "loss": 0.6155, "step": 476200 }, { "epoch": 6.349735372145418, "grad_norm": 23.536094665527344, "learning_rate": 3.906956218667955e-05, "loss": 0.5607, "step": 476300 }, { "epoch": 6.351068509951873, "grad_norm": 12.474503517150879, "learning_rate": 3.906128781959496e-05, "loss": 0.6496, "step": 476400 }, { "epoch": 6.352401647758329, "grad_norm": 6.507882595062256, "learning_rate": 3.905301269392792e-05, "loss": 0.6209, "step": 476500 }, { "epoch": 6.353734785564784, "grad_norm": 7.934950828552246, "learning_rate": 3.904473681037119e-05, "loss": 0.5424, "step": 476600 }, { "epoch": 6.355067923371239, "grad_norm": 5.596549987792969, "learning_rate": 3.9036460169617604e-05, "loss": 0.5349, "step": 476700 }, { "epoch": 6.356401061177694, "grad_norm": 5.833059310913086, "learning_rate": 3.902818277236006e-05, "loss": 0.565, "step": 476800 }, { "epoch": 6.357734198984149, "grad_norm": 7.084552764892578, "learning_rate": 3.901990461929152e-05, "loss": 0.6634, "step": 476900 }, { "epoch": 6.359067336790604, "grad_norm": 6.751688480377197, "learning_rate": 3.9011625711104975e-05, "loss": 0.5278, "step": 477000 }, { "epoch": 6.360400474597059, "grad_norm": 3.6957833766937256, "learning_rate": 3.900334604849356e-05, "loss": 0.6276, "step": 477100 }, { "epoch": 6.361733612403514, "grad_norm": 11.691972732543945, "learning_rate": 3.8995065632150385e-05, "loss": 0.6508, "step": 477200 
}, { "epoch": 6.36306675020997, "grad_norm": 5.751698017120361, "learning_rate": 3.898678446276866e-05, "loss": 0.5219, "step": 477300 }, { "epoch": 6.3643998880164245, "grad_norm": 10.255271911621094, "learning_rate": 3.897850254104168e-05, "loss": 0.6114, "step": 477400 }, { "epoch": 6.3657330258228795, "grad_norm": 6.527185916900635, "learning_rate": 3.897021986766277e-05, "loss": 0.4942, "step": 477500 }, { "epoch": 6.367066163629334, "grad_norm": 2.5724024772644043, "learning_rate": 3.896193644332533e-05, "loss": 0.5311, "step": 477600 }, { "epoch": 6.368399301435789, "grad_norm": 4.320896625518799, "learning_rate": 3.895365226872282e-05, "loss": 0.5329, "step": 477700 }, { "epoch": 6.369732439242244, "grad_norm": 5.178065776824951, "learning_rate": 3.894536734454876e-05, "loss": 0.5542, "step": 477800 }, { "epoch": 6.371065577048699, "grad_norm": 8.886308670043945, "learning_rate": 3.893708167149675e-05, "loss": 0.5611, "step": 477900 }, { "epoch": 6.372398714855154, "grad_norm": 4.620723247528076, "learning_rate": 3.8928795250260437e-05, "loss": 0.5592, "step": 478000 }, { "epoch": 6.37373185266161, "grad_norm": 2.786424160003662, "learning_rate": 3.892050808153352e-05, "loss": 0.5684, "step": 478100 }, { "epoch": 6.375064990468065, "grad_norm": 8.199150085449219, "learning_rate": 3.89122201660098e-05, "loss": 0.5715, "step": 478200 }, { "epoch": 6.37639812827452, "grad_norm": 22.7852783203125, "learning_rate": 3.890393150438309e-05, "loss": 0.6235, "step": 478300 }, { "epoch": 6.377731266080975, "grad_norm": 20.38633155822754, "learning_rate": 3.889572499510516e-05, "loss": 0.5907, "step": 478400 }, { "epoch": 6.37906440388743, "grad_norm": 8.211111068725586, "learning_rate": 3.8887434850797984e-05, "loss": 0.6392, "step": 478500 }, { "epoch": 6.380397541693885, "grad_norm": 9.335923194885254, "learning_rate": 3.887914396246277e-05, "loss": 0.6486, "step": 478600 }, { "epoch": 6.38173067950034, "grad_norm": 18.137126922607422, "learning_rate": 
3.887085233079362e-05, "loss": 0.5761, "step": 478700 }, { "epoch": 6.383063817306795, "grad_norm": 28.404748916625977, "learning_rate": 3.886255995648467e-05, "loss": 0.6307, "step": 478800 }, { "epoch": 6.3843969551132504, "grad_norm": 12.14786148071289, "learning_rate": 3.885426684023014e-05, "loss": 0.4945, "step": 478900 }, { "epoch": 6.385730092919705, "grad_norm": 3.20278000831604, "learning_rate": 3.88459729827243e-05, "loss": 0.6195, "step": 479000 }, { "epoch": 6.38706323072616, "grad_norm": 3.5984911918640137, "learning_rate": 3.8837678384661496e-05, "loss": 0.594, "step": 479100 }, { "epoch": 6.388396368532615, "grad_norm": 2.266150712966919, "learning_rate": 3.88293830467361e-05, "loss": 0.5918, "step": 479200 }, { "epoch": 6.38972950633907, "grad_norm": 22.12363624572754, "learning_rate": 3.8821086969642594e-05, "loss": 0.5514, "step": 479300 }, { "epoch": 6.391062644145525, "grad_norm": 2.192633867263794, "learning_rate": 3.88127901540755e-05, "loss": 0.5682, "step": 479400 }, { "epoch": 6.39239578195198, "grad_norm": 6.61533260345459, "learning_rate": 3.880449260072939e-05, "loss": 0.608, "step": 479500 }, { "epoch": 6.393728919758435, "grad_norm": 24.32050132751465, "learning_rate": 3.879619431029891e-05, "loss": 0.6581, "step": 479600 }, { "epoch": 6.395062057564891, "grad_norm": 8.225729942321777, "learning_rate": 3.878789528347878e-05, "loss": 0.5746, "step": 479700 }, { "epoch": 6.396395195371346, "grad_norm": 12.72215747833252, "learning_rate": 3.877959552096374e-05, "loss": 0.7048, "step": 479800 }, { "epoch": 6.397728333177801, "grad_norm": 7.101632118225098, "learning_rate": 3.8771295023448656e-05, "loss": 0.6014, "step": 479900 }, { "epoch": 6.399061470984256, "grad_norm": 0.8958373069763184, "learning_rate": 3.8762993791628396e-05, "loss": 0.5702, "step": 480000 }, { "epoch": 6.400394608790711, "grad_norm": 1.252549409866333, "learning_rate": 3.875469182619793e-05, "loss": 0.5167, "step": 480100 }, { "epoch": 6.401727746597166, 
"grad_norm": 7.66314172744751, "learning_rate": 3.874655518899723e-05, "loss": 0.5347, "step": 480200 }, { "epoch": 6.4030608844036205, "grad_norm": 1.623230218887329, "learning_rate": 3.873825177306902e-05, "loss": 0.623, "step": 480300 }, { "epoch": 6.4043940222100755, "grad_norm": 2.751742124557495, "learning_rate": 3.872994762560191e-05, "loss": 0.5553, "step": 480400 }, { "epoch": 6.405727160016531, "grad_norm": 9.183104515075684, "learning_rate": 3.872164274729111e-05, "loss": 0.5825, "step": 480500 }, { "epoch": 6.407060297822986, "grad_norm": 14.026739120483398, "learning_rate": 3.871333713883187e-05, "loss": 0.6117, "step": 480600 }, { "epoch": 6.408393435629441, "grad_norm": 11.535308837890625, "learning_rate": 3.870503080091951e-05, "loss": 0.6237, "step": 480700 }, { "epoch": 6.409726573435896, "grad_norm": 2.93013858795166, "learning_rate": 3.8696723734249414e-05, "loss": 0.593, "step": 480800 }, { "epoch": 6.411059711242351, "grad_norm": 15.244665145874023, "learning_rate": 3.868841593951702e-05, "loss": 0.6538, "step": 480900 }, { "epoch": 6.412392849048806, "grad_norm": 14.288681983947754, "learning_rate": 3.868010741741783e-05, "loss": 0.6049, "step": 481000 }, { "epoch": 6.413725986855261, "grad_norm": 9.27951717376709, "learning_rate": 3.86717981686474e-05, "loss": 0.6553, "step": 481100 }, { "epoch": 6.415059124661716, "grad_norm": 2.805110454559326, "learning_rate": 3.866348819390139e-05, "loss": 0.5304, "step": 481200 }, { "epoch": 6.416392262468172, "grad_norm": 37.74892044067383, "learning_rate": 3.8655177493875434e-05, "loss": 0.6195, "step": 481300 }, { "epoch": 6.417725400274627, "grad_norm": 2.527395725250244, "learning_rate": 3.864686606926531e-05, "loss": 0.596, "step": 481400 }, { "epoch": 6.419058538081082, "grad_norm": 5.156349182128906, "learning_rate": 3.863855392076681e-05, "loss": 0.5478, "step": 481500 }, { "epoch": 6.420391675887537, "grad_norm": 52.58755874633789, "learning_rate": 3.8630241049075815e-05, "loss": 0.5763, 
"step": 481600 }, { "epoch": 6.4217248136939915, "grad_norm": 6.154906272888184, "learning_rate": 3.862192745488824e-05, "loss": 0.5836, "step": 481700 }, { "epoch": 6.4230579515004464, "grad_norm": 12.950711250305176, "learning_rate": 3.8613613138900076e-05, "loss": 0.6718, "step": 481800 }, { "epoch": 6.424391089306901, "grad_norm": 4.836496353149414, "learning_rate": 3.860529810180738e-05, "loss": 0.5074, "step": 481900 }, { "epoch": 6.425724227113356, "grad_norm": 4.349376201629639, "learning_rate": 3.859698234430625e-05, "loss": 0.5902, "step": 482000 }, { "epoch": 6.427057364919812, "grad_norm": 27.5339298248291, "learning_rate": 3.858866586709286e-05, "loss": 0.5449, "step": 482100 }, { "epoch": 6.428390502726267, "grad_norm": 6.423632621765137, "learning_rate": 3.858034867086342e-05, "loss": 0.6063, "step": 482200 }, { "epoch": 6.429723640532722, "grad_norm": 10.763943672180176, "learning_rate": 3.857203075631426e-05, "loss": 0.5481, "step": 482300 }, { "epoch": 6.431056778339177, "grad_norm": 4.594361782073975, "learning_rate": 3.8563712124141695e-05, "loss": 0.5912, "step": 482400 }, { "epoch": 6.432389916145632, "grad_norm": 8.313197135925293, "learning_rate": 3.8555392775042145e-05, "loss": 0.5138, "step": 482500 }, { "epoch": 6.433723053952087, "grad_norm": 33.50289535522461, "learning_rate": 3.854707270971209e-05, "loss": 0.5465, "step": 482600 }, { "epoch": 6.435056191758542, "grad_norm": 3.40399432182312, "learning_rate": 3.853875192884804e-05, "loss": 0.5038, "step": 482700 }, { "epoch": 6.436389329564997, "grad_norm": 3.2637102603912354, "learning_rate": 3.85304304331466e-05, "loss": 0.5341, "step": 482800 }, { "epoch": 6.437722467371453, "grad_norm": 4.448236465454102, "learning_rate": 3.852210822330441e-05, "loss": 0.5867, "step": 482900 }, { "epoch": 6.4390556051779075, "grad_norm": 7.628891468048096, "learning_rate": 3.851378530001819e-05, "loss": 0.669, "step": 483000 }, { "epoch": 6.4403887429843625, "grad_norm": 5.7996320724487305, 
"learning_rate": 3.8505461663984695e-05, "loss": 0.5748, "step": 483100 }, { "epoch": 6.441721880790817, "grad_norm": 1.9295412302017212, "learning_rate": 3.849713731590078e-05, "loss": 0.5236, "step": 483200 }, { "epoch": 6.443055018597272, "grad_norm": 7.651988983154297, "learning_rate": 3.848881225646329e-05, "loss": 0.55, "step": 483300 }, { "epoch": 6.444388156403727, "grad_norm": 7.101032257080078, "learning_rate": 3.8480486486369205e-05, "loss": 0.5708, "step": 483400 }, { "epoch": 6.445721294210182, "grad_norm": 5.707064628601074, "learning_rate": 3.847216000631553e-05, "loss": 0.6201, "step": 483500 }, { "epoch": 6.447054432016637, "grad_norm": 41.275386810302734, "learning_rate": 3.846383281699932e-05, "loss": 0.5892, "step": 483600 }, { "epoch": 6.448387569823093, "grad_norm": 31.642446517944336, "learning_rate": 3.845550491911771e-05, "loss": 0.5741, "step": 483700 }, { "epoch": 6.449720707629548, "grad_norm": 9.510377883911133, "learning_rate": 3.8447176313367883e-05, "loss": 0.6272, "step": 483800 }, { "epoch": 6.451053845436003, "grad_norm": 414.42034912109375, "learning_rate": 3.8438847000447075e-05, "loss": 0.5608, "step": 483900 }, { "epoch": 6.452386983242458, "grad_norm": 159.27871704101562, "learning_rate": 3.843051698105261e-05, "loss": 0.5478, "step": 484000 }, { "epoch": 6.453720121048913, "grad_norm": 6.871500492095947, "learning_rate": 3.842218625588183e-05, "loss": 0.5823, "step": 484100 }, { "epoch": 6.455053258855368, "grad_norm": 27.118358612060547, "learning_rate": 3.841385482563217e-05, "loss": 0.584, "step": 484200 }, { "epoch": 6.456386396661823, "grad_norm": 2.586226224899292, "learning_rate": 3.84055226910011e-05, "loss": 0.5835, "step": 484300 }, { "epoch": 6.457719534468278, "grad_norm": 5.341148853302002, "learning_rate": 3.8397189852686176e-05, "loss": 0.6684, "step": 484400 }, { "epoch": 6.4590526722747335, "grad_norm": 6.492839813232422, "learning_rate": 3.8388856311384986e-05, "loss": 0.607, "step": 484500 }, { "epoch": 
6.460385810081188, "grad_norm": 11.86131763458252, "learning_rate": 3.838052206779518e-05, "loss": 0.6024, "step": 484600 }, { "epoch": 6.461718947887643, "grad_norm": 1.4674550294876099, "learning_rate": 3.8372187122614486e-05, "loss": 0.5493, "step": 484700 }, { "epoch": 6.463052085694098, "grad_norm": 100.32569885253906, "learning_rate": 3.8363851476540695e-05, "loss": 0.5732, "step": 484800 }, { "epoch": 6.464385223500553, "grad_norm": 0.9998890161514282, "learning_rate": 3.8355515130271604e-05, "loss": 0.591, "step": 484900 }, { "epoch": 6.465718361307008, "grad_norm": 3.0915207862854004, "learning_rate": 3.834717808450513e-05, "loss": 0.6598, "step": 485000 }, { "epoch": 6.467051499113463, "grad_norm": 15.293728828430176, "learning_rate": 3.833884033993923e-05, "loss": 0.5816, "step": 485100 }, { "epoch": 6.468384636919918, "grad_norm": 6.245082378387451, "learning_rate": 3.833050189727189e-05, "loss": 0.551, "step": 485200 }, { "epoch": 6.469717774726373, "grad_norm": 0.9097810983657837, "learning_rate": 3.832216275720119e-05, "loss": 0.5578, "step": 485300 }, { "epoch": 6.471050912532829, "grad_norm": 21.075885772705078, "learning_rate": 3.831382292042527e-05, "loss": 0.6227, "step": 485400 }, { "epoch": 6.472384050339284, "grad_norm": 6.253649711608887, "learning_rate": 3.8305482387642296e-05, "loss": 0.5873, "step": 485500 }, { "epoch": 6.473717188145739, "grad_norm": 9.290946006774902, "learning_rate": 3.829714115955051e-05, "loss": 0.5885, "step": 485600 }, { "epoch": 6.475050325952194, "grad_norm": 34.982479095458984, "learning_rate": 3.828879923684823e-05, "loss": 0.6372, "step": 485700 }, { "epoch": 6.476383463758649, "grad_norm": 14.240558624267578, "learning_rate": 3.8280540049832524e-05, "loss": 0.5938, "step": 485800 }, { "epoch": 6.4777166015651035, "grad_norm": 5.185649394989014, "learning_rate": 3.8272196746933045e-05, "loss": 0.6623, "step": 485900 }, { "epoch": 6.4790497393715585, "grad_norm": 5.514410018920898, "learning_rate": 
3.826385275151135e-05, "loss": 0.6604, "step": 486000 }, { "epoch": 6.480382877178014, "grad_norm": 8.452677726745605, "learning_rate": 3.8255508064265937e-05, "loss": 0.5131, "step": 486100 }, { "epoch": 6.481716014984469, "grad_norm": 17.7845458984375, "learning_rate": 3.824716268589542e-05, "loss": 0.6084, "step": 486200 }, { "epoch": 6.483049152790924, "grad_norm": 41.748870849609375, "learning_rate": 3.8238816617098435e-05, "loss": 0.5407, "step": 486300 }, { "epoch": 6.484382290597379, "grad_norm": 13.58513069152832, "learning_rate": 3.823046985857369e-05, "loss": 0.5879, "step": 486400 }, { "epoch": 6.485715428403834, "grad_norm": 2.68489146232605, "learning_rate": 3.822212241101996e-05, "loss": 0.6293, "step": 486500 }, { "epoch": 6.487048566210289, "grad_norm": 5.09619665145874, "learning_rate": 3.821377427513606e-05, "loss": 0.4914, "step": 486600 }, { "epoch": 6.488381704016744, "grad_norm": 3.5115208625793457, "learning_rate": 3.820542545162087e-05, "loss": 0.6262, "step": 486700 }, { "epoch": 6.489714841823199, "grad_norm": 3.3194472789764404, "learning_rate": 3.819707594117332e-05, "loss": 0.5956, "step": 486800 }, { "epoch": 6.491047979629654, "grad_norm": 4.698775291442871, "learning_rate": 3.8188725744492424e-05, "loss": 0.5476, "step": 486900 }, { "epoch": 6.49238111743611, "grad_norm": 6.213549613952637, "learning_rate": 3.818037486227721e-05, "loss": 0.5217, "step": 487000 }, { "epoch": 6.493714255242565, "grad_norm": 6.4075117111206055, "learning_rate": 3.8172023295226804e-05, "loss": 0.5599, "step": 487100 }, { "epoch": 6.49504739304902, "grad_norm": 11.316558837890625, "learning_rate": 3.816367104404037e-05, "loss": 0.6005, "step": 487200 }, { "epoch": 6.4963805308554745, "grad_norm": 8.69881820678711, "learning_rate": 3.815531810941713e-05, "loss": 0.6216, "step": 487300 }, { "epoch": 6.4977136686619295, "grad_norm": 2.8535337448120117, "learning_rate": 3.814696449205636e-05, "loss": 0.6129, "step": 487400 }, { "epoch": 6.499046806468384, 
"grad_norm": 13.262742042541504, "learning_rate": 3.813861019265741e-05, "loss": 0.5551, "step": 487500 }, { "epoch": 6.500379944274839, "grad_norm": 4.928792476654053, "learning_rate": 3.813025521191967e-05, "loss": 0.6354, "step": 487600 }, { "epoch": 6.501713082081295, "grad_norm": 59.262596130371094, "learning_rate": 3.8121899550542585e-05, "loss": 0.6204, "step": 487700 }, { "epoch": 6.50304621988775, "grad_norm": 4.063572883605957, "learning_rate": 3.811354320922568e-05, "loss": 0.5757, "step": 487800 }, { "epoch": 6.504379357694205, "grad_norm": 5.8684210777282715, "learning_rate": 3.8105186188668516e-05, "loss": 0.6001, "step": 487900 }, { "epoch": 6.50571249550066, "grad_norm": 9.22815990447998, "learning_rate": 3.809691206991816e-05, "loss": 0.639, "step": 488000 }, { "epoch": 6.507045633307115, "grad_norm": 143.49085998535156, "learning_rate": 3.808855369975434e-05, "loss": 0.5867, "step": 488100 }, { "epoch": 6.50837877111357, "grad_norm": 3.5404460430145264, "learning_rate": 3.8080194652442305e-05, "loss": 0.538, "step": 488200 }, { "epoch": 6.509711908920025, "grad_norm": 1.182723045349121, "learning_rate": 3.8071834928681845e-05, "loss": 0.605, "step": 488300 }, { "epoch": 6.51104504672648, "grad_norm": 33.982112884521484, "learning_rate": 3.806347452917281e-05, "loss": 0.5705, "step": 488400 }, { "epoch": 6.512378184532935, "grad_norm": 9.720298767089844, "learning_rate": 3.80551134546151e-05, "loss": 0.6097, "step": 488500 }, { "epoch": 6.5137113223393905, "grad_norm": 4.50303316116333, "learning_rate": 3.804675170570869e-05, "loss": 0.5844, "step": 488600 }, { "epoch": 6.5150444601458455, "grad_norm": 10.216618537902832, "learning_rate": 3.803838928315358e-05, "loss": 0.5799, "step": 488700 }, { "epoch": 6.5163775979523, "grad_norm": 2.348006248474121, "learning_rate": 3.8030026187649875e-05, "loss": 0.5766, "step": 488800 }, { "epoch": 6.517710735758755, "grad_norm": 10.410428047180176, "learning_rate": 3.8021662419897676e-05, "loss": 0.4832, 
"step": 488900 }, { "epoch": 6.51904387356521, "grad_norm": 4.931389808654785, "learning_rate": 3.80132979805972e-05, "loss": 0.5388, "step": 489000 }, { "epoch": 6.520377011371665, "grad_norm": 33.87925720214844, "learning_rate": 3.8004932870448665e-05, "loss": 0.5869, "step": 489100 }, { "epoch": 6.52171014917812, "grad_norm": 4.8617634773254395, "learning_rate": 3.799656709015239e-05, "loss": 0.5871, "step": 489200 }, { "epoch": 6.523043286984576, "grad_norm": 2.393244743347168, "learning_rate": 3.798820064040872e-05, "loss": 0.5362, "step": 489300 }, { "epoch": 6.524376424791031, "grad_norm": 2.051729917526245, "learning_rate": 3.797983352191808e-05, "loss": 0.5791, "step": 489400 }, { "epoch": 6.525709562597486, "grad_norm": 3.106987953186035, "learning_rate": 3.797146573538093e-05, "loss": 0.5677, "step": 489500 }, { "epoch": 6.527042700403941, "grad_norm": 3.3447682857513428, "learning_rate": 3.79630972814978e-05, "loss": 0.6346, "step": 489600 }, { "epoch": 6.528375838210396, "grad_norm": 1.1795947551727295, "learning_rate": 3.795472816096926e-05, "loss": 0.6576, "step": 489700 }, { "epoch": 6.529708976016851, "grad_norm": 3.860607385635376, "learning_rate": 3.7946358374495965e-05, "loss": 0.5866, "step": 489800 }, { "epoch": 6.531042113823306, "grad_norm": 2.416433811187744, "learning_rate": 3.793798792277858e-05, "loss": 0.6007, "step": 489900 }, { "epoch": 6.532375251629761, "grad_norm": 9.810206413269043, "learning_rate": 3.792961680651788e-05, "loss": 0.6678, "step": 490000 }, { "epoch": 6.533708389436216, "grad_norm": 1.5317442417144775, "learning_rate": 3.792124502641465e-05, "loss": 0.502, "step": 490100 }, { "epoch": 6.535041527242671, "grad_norm": 2.328298807144165, "learning_rate": 3.7912956310882465e-05, "loss": 0.5684, "step": 490200 }, { "epoch": 6.536374665049126, "grad_norm": 3.041088104248047, "learning_rate": 3.790458321181777e-05, "loss": 0.5255, "step": 490300 }, { "epoch": 6.537707802855581, "grad_norm": 3.0651426315307617, 
"learning_rate": 3.7896209451006273e-05, "loss": 0.5898, "step": 490400 }, { "epoch": 6.539040940662036, "grad_norm": 2.3224849700927734, "learning_rate": 3.7887835029149015e-05, "loss": 0.4605, "step": 490500 }, { "epoch": 6.540374078468491, "grad_norm": 21.429927825927734, "learning_rate": 3.787945994694708e-05, "loss": 0.5476, "step": 490600 }, { "epoch": 6.541707216274946, "grad_norm": 66.49769592285156, "learning_rate": 3.7871084205101595e-05, "loss": 0.5295, "step": 490700 }, { "epoch": 6.543040354081401, "grad_norm": 2.5005156993865967, "learning_rate": 3.7862791571581094e-05, "loss": 0.5287, "step": 490800 }, { "epoch": 6.544373491887857, "grad_norm": 7.489987850189209, "learning_rate": 3.785441451913108e-05, "loss": 0.5618, "step": 490900 }, { "epoch": 6.545706629694312, "grad_norm": 6.17756986618042, "learning_rate": 3.7846036809134235e-05, "loss": 0.5585, "step": 491000 }, { "epoch": 6.547039767500767, "grad_norm": 8.228737831115723, "learning_rate": 3.7837658442291925e-05, "loss": 0.4862, "step": 491100 }, { "epoch": 6.548372905307222, "grad_norm": 58.088741302490234, "learning_rate": 3.782927941930556e-05, "loss": 0.5663, "step": 491200 }, { "epoch": 6.549706043113677, "grad_norm": 2.5811121463775635, "learning_rate": 3.78208997408766e-05, "loss": 0.5251, "step": 491300 }, { "epoch": 6.551039180920132, "grad_norm": 1.9767931699752808, "learning_rate": 3.7812519407706566e-05, "loss": 0.528, "step": 491400 }, { "epoch": 6.5523723187265865, "grad_norm": 5.638088703155518, "learning_rate": 3.7804138420497035e-05, "loss": 0.6218, "step": 491500 }, { "epoch": 6.5537054565330415, "grad_norm": 4.919740200042725, "learning_rate": 3.7795756779949626e-05, "loss": 0.573, "step": 491600 }, { "epoch": 6.555038594339496, "grad_norm": 4.114835739135742, "learning_rate": 3.7787374486766056e-05, "loss": 0.633, "step": 491700 }, { "epoch": 6.556371732145952, "grad_norm": 7.462393283843994, "learning_rate": 3.7778991541648026e-05, "loss": 0.5448, "step": 491800 }, { 
"epoch": 6.557704869952407, "grad_norm": 18.910118103027344, "learning_rate": 3.7770607945297354e-05, "loss": 0.6002, "step": 491900 }, { "epoch": 6.559038007758862, "grad_norm": 17.85862159729004, "learning_rate": 3.776222369841587e-05, "loss": 0.5901, "step": 492000 }, { "epoch": 6.560371145565317, "grad_norm": 1.898429036140442, "learning_rate": 3.7753838801705485e-05, "loss": 0.6088, "step": 492100 }, { "epoch": 6.561704283371772, "grad_norm": 1.514121651649475, "learning_rate": 3.774545325586818e-05, "loss": 0.585, "step": 492200 }, { "epoch": 6.563037421178227, "grad_norm": 10.902660369873047, "learning_rate": 3.773706706160592e-05, "loss": 0.6224, "step": 492300 }, { "epoch": 6.564370558984682, "grad_norm": 5.640266418457031, "learning_rate": 3.7728680219620805e-05, "loss": 0.5154, "step": 492400 }, { "epoch": 6.565703696791138, "grad_norm": 15.78943157196045, "learning_rate": 3.772029273061494e-05, "loss": 0.5499, "step": 492500 }, { "epoch": 6.567036834597593, "grad_norm": 4.270752429962158, "learning_rate": 3.771190459529051e-05, "loss": 0.6386, "step": 492600 }, { "epoch": 6.568369972404048, "grad_norm": 1.5695980787277222, "learning_rate": 3.770351581434973e-05, "loss": 0.5738, "step": 492700 }, { "epoch": 6.569703110210503, "grad_norm": 1.43257474899292, "learning_rate": 3.7695126388494896e-05, "loss": 0.6397, "step": 492800 }, { "epoch": 6.5710362480169575, "grad_norm": 17.14685821533203, "learning_rate": 3.768673631842834e-05, "loss": 0.6358, "step": 492900 }, { "epoch": 6.5723693858234125, "grad_norm": 37.33661651611328, "learning_rate": 3.767834560485244e-05, "loss": 0.638, "step": 493000 }, { "epoch": 6.573702523629867, "grad_norm": 4.7771148681640625, "learning_rate": 3.7669954248469666e-05, "loss": 0.5657, "step": 493100 }, { "epoch": 6.575035661436322, "grad_norm": 6.037445068359375, "learning_rate": 3.766156224998249e-05, "loss": 0.5345, "step": 493200 }, { "epoch": 6.576368799242777, "grad_norm": 0.4939862787723541, "learning_rate": 
3.765316961009348e-05, "loss": 0.5771, "step": 493300 }, { "epoch": 6.577701937049233, "grad_norm": 12.034183502197266, "learning_rate": 3.764477632950522e-05, "loss": 0.5533, "step": 493400 }, { "epoch": 6.579035074855688, "grad_norm": 13.978435516357422, "learning_rate": 3.76363824089204e-05, "loss": 0.544, "step": 493500 }, { "epoch": 6.580368212662143, "grad_norm": 24.36769676208496, "learning_rate": 3.762798784904171e-05, "loss": 0.5749, "step": 493600 }, { "epoch": 6.581701350468598, "grad_norm": 4.946450233459473, "learning_rate": 3.761959265057192e-05, "loss": 0.6049, "step": 493700 }, { "epoch": 6.583034488275053, "grad_norm": 37.237548828125, "learning_rate": 3.7611196814213855e-05, "loss": 0.6961, "step": 493800 }, { "epoch": 6.584367626081508, "grad_norm": 5.601282119750977, "learning_rate": 3.760280034067039e-05, "loss": 0.5938, "step": 493900 }, { "epoch": 6.585700763887963, "grad_norm": 2.4893288612365723, "learning_rate": 3.7594403230644444e-05, "loss": 0.6059, "step": 494000 }, { "epoch": 6.587033901694419, "grad_norm": 11.300849914550781, "learning_rate": 3.7586005484838985e-05, "loss": 0.6236, "step": 494100 }, { "epoch": 6.5883670395008735, "grad_norm": 7.165943145751953, "learning_rate": 3.757760710395708e-05, "loss": 0.5151, "step": 494200 }, { "epoch": 6.5897001773073285, "grad_norm": 29.983787536621094, "learning_rate": 3.756920808870179e-05, "loss": 0.5777, "step": 494300 }, { "epoch": 6.591033315113783, "grad_norm": 1.9129250049591064, "learning_rate": 3.756080843977625e-05, "loss": 0.579, "step": 494400 }, { "epoch": 6.592366452920238, "grad_norm": 5.337916851043701, "learning_rate": 3.7552408157883674e-05, "loss": 0.6191, "step": 494500 }, { "epoch": 6.593699590726693, "grad_norm": 7.665248870849609, "learning_rate": 3.754400724372729e-05, "loss": 0.5821, "step": 494600 }, { "epoch": 6.595032728533148, "grad_norm": 15.814468383789062, "learning_rate": 3.753560569801039e-05, "loss": 0.6458, "step": 494700 }, { "epoch": 6.596365866339603, 
"grad_norm": 13.837919235229492, "learning_rate": 3.7527203521436346e-05, "loss": 0.5672, "step": 494800 }, { "epoch": 6.597699004146058, "grad_norm": 13.340038299560547, "learning_rate": 3.7518884745892775e-05, "loss": 0.568, "step": 494900 }, { "epoch": 6.599032141952514, "grad_norm": 5.077045440673828, "learning_rate": 3.751048131600571e-05, "loss": 0.5138, "step": 495000 }, { "epoch": 6.600365279758969, "grad_norm": 4.9030351638793945, "learning_rate": 3.750207725736481e-05, "loss": 0.5774, "step": 495100 }, { "epoch": 6.601698417565424, "grad_norm": 93.8594970703125, "learning_rate": 3.749367257067366e-05, "loss": 0.6228, "step": 495200 }, { "epoch": 6.603031555371879, "grad_norm": 83.38473510742188, "learning_rate": 3.748526725663587e-05, "loss": 0.6121, "step": 495300 }, { "epoch": 6.604364693178334, "grad_norm": 28.170652389526367, "learning_rate": 3.7476861315955094e-05, "loss": 0.6161, "step": 495400 }, { "epoch": 6.605697830984789, "grad_norm": 19.88453483581543, "learning_rate": 3.7468538818097346e-05, "loss": 0.5675, "step": 495500 }, { "epoch": 6.607030968791244, "grad_norm": 4.5780415534973145, "learning_rate": 3.746013163249069e-05, "loss": 0.5537, "step": 495600 }, { "epoch": 6.6083641065976995, "grad_norm": 43.33326721191406, "learning_rate": 3.745180790353594e-05, "loss": 0.6255, "step": 495700 }, { "epoch": 6.609697244404154, "grad_norm": 2.2098309993743896, "learning_rate": 3.7443399475790616e-05, "loss": 0.5832, "step": 495800 }, { "epoch": 6.611030382210609, "grad_norm": 13.327832221984863, "learning_rate": 3.743499042490735e-05, "loss": 0.6272, "step": 495900 }, { "epoch": 6.612363520017064, "grad_norm": 15.190401077270508, "learning_rate": 3.742658075159013e-05, "loss": 0.56, "step": 496000 }, { "epoch": 6.613696657823519, "grad_norm": 14.588624954223633, "learning_rate": 3.741817045654298e-05, "loss": 0.5231, "step": 496100 }, { "epoch": 6.615029795629974, "grad_norm": 6.420891761779785, "learning_rate": 3.740975954046997e-05, "loss": 
0.5904, "step": 496200 }, { "epoch": 6.616362933436429, "grad_norm": 16.129959106445312, "learning_rate": 3.740134800407526e-05, "loss": 0.5983, "step": 496300 }, { "epoch": 6.617696071242884, "grad_norm": 25.316152572631836, "learning_rate": 3.739293584806303e-05, "loss": 0.6077, "step": 496400 }, { "epoch": 6.619029209049339, "grad_norm": 9.538970947265625, "learning_rate": 3.7384523073137504e-05, "loss": 0.5937, "step": 496500 }, { "epoch": 6.620362346855795, "grad_norm": 15.142566680908203, "learning_rate": 3.737610968000299e-05, "loss": 0.5768, "step": 496600 }, { "epoch": 6.62169548466225, "grad_norm": 16.06135368347168, "learning_rate": 3.7367695669363834e-05, "loss": 0.5969, "step": 496700 }, { "epoch": 6.623028622468705, "grad_norm": 12.728251457214355, "learning_rate": 3.7359281041924405e-05, "loss": 0.5578, "step": 496800 }, { "epoch": 6.62436176027516, "grad_norm": 2.7756221294403076, "learning_rate": 3.735086579838917e-05, "loss": 0.6138, "step": 496900 }, { "epoch": 6.625694898081615, "grad_norm": 13.417173385620117, "learning_rate": 3.734244993946262e-05, "loss": 0.5734, "step": 497000 }, { "epoch": 6.6270280358880695, "grad_norm": 8.358712196350098, "learning_rate": 3.733403346584932e-05, "loss": 0.5859, "step": 497100 }, { "epoch": 6.6283611736945245, "grad_norm": 4.100179195404053, "learning_rate": 3.732561637825385e-05, "loss": 0.5959, "step": 497200 }, { "epoch": 6.62969431150098, "grad_norm": 13.071290016174316, "learning_rate": 3.7317198677380864e-05, "loss": 0.5648, "step": 497300 }, { "epoch": 6.631027449307435, "grad_norm": 30.108652114868164, "learning_rate": 3.730878036393508e-05, "loss": 0.5752, "step": 497400 }, { "epoch": 6.63236058711389, "grad_norm": 12.962210655212402, "learning_rate": 3.730036143862123e-05, "loss": 0.6459, "step": 497500 }, { "epoch": 6.633693724920345, "grad_norm": 4.222620964050293, "learning_rate": 3.729194190214414e-05, "loss": 0.63, "step": 497600 }, { "epoch": 6.6350268627268, "grad_norm": 3.8428990840911865, 
"learning_rate": 3.728352175520866e-05, "loss": 0.5936, "step": 497700 }, { "epoch": 6.636360000533255, "grad_norm": 2.3009228706359863, "learning_rate": 3.7275100998519706e-05, "loss": 0.5913, "step": 497800 }, { "epoch": 6.63769313833971, "grad_norm": 11.166458129882812, "learning_rate": 3.726667963278222e-05, "loss": 0.5664, "step": 497900 }, { "epoch": 6.639026276146165, "grad_norm": 7.120764255523682, "learning_rate": 3.725825765870122e-05, "loss": 0.5897, "step": 498000 }, { "epoch": 6.64035941395262, "grad_norm": 22.27203941345215, "learning_rate": 3.724983507698178e-05, "loss": 0.5889, "step": 498100 }, { "epoch": 6.641692551759076, "grad_norm": 8.136771202087402, "learning_rate": 3.7241411888328984e-05, "loss": 0.6147, "step": 498200 }, { "epoch": 6.643025689565531, "grad_norm": 0.7602436542510986, "learning_rate": 3.7232988093448024e-05, "loss": 0.6148, "step": 498300 }, { "epoch": 6.644358827371986, "grad_norm": 1.1563128232955933, "learning_rate": 3.72245636930441e-05, "loss": 0.6407, "step": 498400 }, { "epoch": 6.6456919651784405, "grad_norm": 2.162611246109009, "learning_rate": 3.721613868782249e-05, "loss": 0.6356, "step": 498500 }, { "epoch": 6.6470251029848955, "grad_norm": 1.4765493869781494, "learning_rate": 3.720771307848847e-05, "loss": 0.6078, "step": 498600 }, { "epoch": 6.64835824079135, "grad_norm": 21.997703552246094, "learning_rate": 3.7199286865747445e-05, "loss": 0.6542, "step": 498700 }, { "epoch": 6.649691378597805, "grad_norm": 25.29610824584961, "learning_rate": 3.719086005030481e-05, "loss": 0.6244, "step": 498800 }, { "epoch": 6.651024516404261, "grad_norm": 8.553815841674805, "learning_rate": 3.7182432632866046e-05, "loss": 0.5635, "step": 498900 }, { "epoch": 6.652357654210716, "grad_norm": 3.5350630283355713, "learning_rate": 3.717400461413667e-05, "loss": 0.4979, "step": 499000 }, { "epoch": 6.653690792017171, "grad_norm": 1.075107455253601, "learning_rate": 3.716557599482223e-05, "loss": 0.5824, "step": 499100 }, { "epoch": 
6.655023929823626, "grad_norm": 8.163947105407715, "learning_rate": 3.715714677562837e-05, "loss": 0.5976, "step": 499200 }, { "epoch": 6.656357067630081, "grad_norm": 10.42969036102295, "learning_rate": 3.714871695726073e-05, "loss": 0.581, "step": 499300 }, { "epoch": 6.657690205436536, "grad_norm": 41.52098083496094, "learning_rate": 3.714028654042505e-05, "loss": 0.6149, "step": 499400 }, { "epoch": 6.659023343242991, "grad_norm": 6.312849044799805, "learning_rate": 3.7131855525827076e-05, "loss": 0.5585, "step": 499500 }, { "epoch": 6.660356481049446, "grad_norm": 3.697704553604126, "learning_rate": 3.712342391417265e-05, "loss": 0.6056, "step": 499600 }, { "epoch": 6.661689618855901, "grad_norm": 5.775289535522461, "learning_rate": 3.711499170616763e-05, "loss": 0.5945, "step": 499700 }, { "epoch": 6.6630227566623565, "grad_norm": 35.35733413696289, "learning_rate": 3.710655890251792e-05, "loss": 0.62, "step": 499800 }, { "epoch": 6.6643558944688115, "grad_norm": 7.542510986328125, "learning_rate": 3.709812550392952e-05, "loss": 0.6013, "step": 499900 }, { "epoch": 6.665689032275266, "grad_norm": 6.348344802856445, "learning_rate": 3.708969151110841e-05, "loss": 0.5865, "step": 500000 }, { "epoch": 6.667022170081721, "grad_norm": 2.9983153343200684, "learning_rate": 3.708125692476068e-05, "loss": 0.6462, "step": 500100 }, { "epoch": 6.668355307888176, "grad_norm": 13.2160062789917, "learning_rate": 3.7072821745592443e-05, "loss": 0.5479, "step": 500200 }, { "epoch": 6.669688445694631, "grad_norm": 2.68845272064209, "learning_rate": 3.706438597430987e-05, "loss": 0.5705, "step": 500300 }, { "epoch": 6.671021583501086, "grad_norm": 8.279550552368164, "learning_rate": 3.705594961161917e-05, "loss": 0.5828, "step": 500400 }, { "epoch": 6.672354721307542, "grad_norm": 53.9633674621582, "learning_rate": 3.7047512658226606e-05, "loss": 0.6363, "step": 500500 }, { "epoch": 6.673687859113997, "grad_norm": 11.723766326904297, "learning_rate": 3.70390751148385e-05, 
"loss": 0.5254, "step": 500600 }, { "epoch": 6.675020996920452, "grad_norm": 17.990562438964844, "learning_rate": 3.7030636982161214e-05, "loss": 0.5773, "step": 500700 }, { "epoch": 6.676354134726907, "grad_norm": 1.9920156002044678, "learning_rate": 3.702219826090117e-05, "loss": 0.5717, "step": 500800 }, { "epoch": 6.677687272533362, "grad_norm": 151.25303649902344, "learning_rate": 3.701375895176482e-05, "loss": 0.5778, "step": 500900 }, { "epoch": 6.679020410339817, "grad_norm": 7.195647716522217, "learning_rate": 3.700531905545868e-05, "loss": 0.666, "step": 501000 }, { "epoch": 6.680353548146272, "grad_norm": 6.81307315826416, "learning_rate": 3.6996878572689294e-05, "loss": 0.5372, "step": 501100 }, { "epoch": 6.681686685952727, "grad_norm": 4.247907638549805, "learning_rate": 3.6988437504163315e-05, "loss": 0.5688, "step": 501200 }, { "epoch": 6.683019823759182, "grad_norm": 2.163771629333496, "learning_rate": 3.697999585058736e-05, "loss": 0.5161, "step": 501300 }, { "epoch": 6.684352961565637, "grad_norm": 3.6900224685668945, "learning_rate": 3.697155361266817e-05, "loss": 0.5372, "step": 501400 }, { "epoch": 6.685686099372092, "grad_norm": 5.487834453582764, "learning_rate": 3.696311079111248e-05, "loss": 0.6066, "step": 501500 }, { "epoch": 6.687019237178547, "grad_norm": 4.315130710601807, "learning_rate": 3.69546673866271e-05, "loss": 0.5639, "step": 501600 }, { "epoch": 6.688352374985002, "grad_norm": 9.684117317199707, "learning_rate": 3.69462233999189e-05, "loss": 0.5322, "step": 501700 }, { "epoch": 6.689685512791457, "grad_norm": 4.002140522003174, "learning_rate": 3.6937778831694757e-05, "loss": 0.5823, "step": 501800 }, { "epoch": 6.691018650597912, "grad_norm": 13.80837345123291, "learning_rate": 3.692933368266166e-05, "loss": 0.6894, "step": 501900 }, { "epoch": 6.692351788404367, "grad_norm": NaN, "learning_rate": 3.6920972413687115e-05, "loss": 0.5469, "step": 502000 }, { "epoch": 6.693684926210823, "grad_norm": 7.20200252532959, 
"learning_rate": 3.6912610576840676e-05, "loss": 0.5931, "step": 502100 }, { "epoch": 6.695018064017278, "grad_norm": 11.81999397277832, "learning_rate": 3.690416370118967e-05, "loss": 0.6019, "step": 502200 }, { "epoch": 6.696351201823733, "grad_norm": 4.914088249206543, "learning_rate": 3.689571624754385e-05, "loss": 0.5803, "step": 502300 }, { "epoch": 6.697684339630188, "grad_norm": 55.003387451171875, "learning_rate": 3.6887268216610395e-05, "loss": 0.5569, "step": 502400 }, { "epoch": 6.699017477436643, "grad_norm": 3.7128467559814453, "learning_rate": 3.687881960909657e-05, "loss": 0.5472, "step": 502500 }, { "epoch": 6.700350615243098, "grad_norm": 4.234442234039307, "learning_rate": 3.687037042570964e-05, "loss": 0.6027, "step": 502600 }, { "epoch": 6.7016837530495525, "grad_norm": 10.250554084777832, "learning_rate": 3.686192066715696e-05, "loss": 0.4648, "step": 502700 }, { "epoch": 6.7030168908560075, "grad_norm": 58.951393127441406, "learning_rate": 3.685347033414591e-05, "loss": 0.5757, "step": 502800 }, { "epoch": 6.704350028662462, "grad_norm": 24.702699661254883, "learning_rate": 3.684501942738393e-05, "loss": 0.6123, "step": 502900 }, { "epoch": 6.705683166468918, "grad_norm": 2.310415506362915, "learning_rate": 3.68365679475785e-05, "loss": 0.5379, "step": 503000 }, { "epoch": 6.707016304275373, "grad_norm": 31.10016441345215, "learning_rate": 3.682811589543715e-05, "loss": 0.5379, "step": 503100 }, { "epoch": 6.708349442081828, "grad_norm": 3.240618944168091, "learning_rate": 3.681966327166746e-05, "loss": 0.555, "step": 503200 }, { "epoch": 6.709682579888283, "grad_norm": 2.3255598545074463, "learning_rate": 3.681121007697706e-05, "loss": 0.6411, "step": 503300 }, { "epoch": 6.711015717694738, "grad_norm": 7.23580265045166, "learning_rate": 3.680275631207363e-05, "loss": 0.5557, "step": 503400 }, { "epoch": 6.712348855501193, "grad_norm": 7.1564154624938965, "learning_rate": 3.679430197766487e-05, "loss": 0.5802, "step": 503500 }, { "epoch": 
6.713681993307648, "grad_norm": 6.67889404296875, "learning_rate": 3.6785847074458585e-05, "loss": 0.5263, "step": 503600 }, { "epoch": 6.715015131114104, "grad_norm": 9.329055786132812, "learning_rate": 3.677739160316256e-05, "loss": 0.5338, "step": 503700 }, { "epoch": 6.716348268920559, "grad_norm": 2.809394359588623, "learning_rate": 3.676893556448468e-05, "loss": 0.597, "step": 503800 }, { "epoch": 6.717681406727014, "grad_norm": 40.70637893676758, "learning_rate": 3.676047895913285e-05, "loss": 0.5953, "step": 503900 }, { "epoch": 6.719014544533469, "grad_norm": 7.4031243324279785, "learning_rate": 3.675202178781503e-05, "loss": 0.5853, "step": 504000 }, { "epoch": 6.7203476823399235, "grad_norm": 5.593477725982666, "learning_rate": 3.674356405123923e-05, "loss": 0.5774, "step": 504100 }, { "epoch": 6.7216808201463785, "grad_norm": 12.693502426147461, "learning_rate": 3.673510575011351e-05, "loss": 0.5377, "step": 504200 }, { "epoch": 6.723013957952833, "grad_norm": 8.583974838256836, "learning_rate": 3.672664688514596e-05, "loss": 0.5742, "step": 504300 }, { "epoch": 6.724347095759288, "grad_norm": 2.3690438270568848, "learning_rate": 3.671827205411095e-05, "loss": 0.6257, "step": 504400 }, { "epoch": 6.725680233565743, "grad_norm": 10.415632247924805, "learning_rate": 3.6709812069205e-05, "loss": 0.6174, "step": 504500 }, { "epoch": 6.727013371372199, "grad_norm": 17.71700668334961, "learning_rate": 3.670135152257475e-05, "loss": 0.5679, "step": 504600 }, { "epoch": 6.728346509178654, "grad_norm": 19.213443756103516, "learning_rate": 3.669289041492846e-05, "loss": 0.6382, "step": 504700 }, { "epoch": 6.729679646985109, "grad_norm": 4.597209453582764, "learning_rate": 3.6684428746974484e-05, "loss": 0.5017, "step": 504800 }, { "epoch": 6.731012784791564, "grad_norm": 3.568174123764038, "learning_rate": 3.667596651942121e-05, "loss": 0.5148, "step": 504900 }, { "epoch": 6.732345922598019, "grad_norm": 2.7243897914886475, "learning_rate": 
3.666750373297706e-05, "loss": 0.5746, "step": 505000 }, { "epoch": 6.733679060404474, "grad_norm": 5.635130405426025, "learning_rate": 3.6659040388350524e-05, "loss": 0.6204, "step": 505100 }, { "epoch": 6.735012198210929, "grad_norm": 8.522517204284668, "learning_rate": 3.66505764862501e-05, "loss": 0.547, "step": 505200 }, { "epoch": 6.736345336017385, "grad_norm": 15.139245986938477, "learning_rate": 3.664211202738439e-05, "loss": 0.5781, "step": 505300 }, { "epoch": 6.7376784738238396, "grad_norm": 3.977376699447632, "learning_rate": 3.6633647012462e-05, "loss": 0.5785, "step": 505400 }, { "epoch": 6.7390116116302945, "grad_norm": 135.60845947265625, "learning_rate": 3.662518144219159e-05, "loss": 0.6341, "step": 505500 }, { "epoch": 6.740344749436749, "grad_norm": 6.724357604980469, "learning_rate": 3.661671531728187e-05, "loss": 0.6212, "step": 505600 }, { "epoch": 6.741677887243204, "grad_norm": 29.01312255859375, "learning_rate": 3.660824863844161e-05, "loss": 0.6049, "step": 505700 }, { "epoch": 6.743011025049659, "grad_norm": 94.20352935791016, "learning_rate": 3.6599781406379604e-05, "loss": 0.6049, "step": 505800 }, { "epoch": 6.744344162856114, "grad_norm": 49.40647888183594, "learning_rate": 3.659131362180469e-05, "loss": 0.6709, "step": 505900 }, { "epoch": 6.745677300662569, "grad_norm": 18.107297897338867, "learning_rate": 3.658284528542579e-05, "loss": 0.6602, "step": 506000 }, { "epoch": 6.747010438469024, "grad_norm": 6.625262260437012, "learning_rate": 3.657437639795183e-05, "loss": 0.6041, "step": 506100 }, { "epoch": 6.74834357627548, "grad_norm": 9.960132598876953, "learning_rate": 3.656590696009179e-05, "loss": 0.5512, "step": 506200 }, { "epoch": 6.749676714081935, "grad_norm": 21.308109283447266, "learning_rate": 3.655743697255474e-05, "loss": 0.6199, "step": 506300 }, { "epoch": 6.75100985188839, "grad_norm": 9.931486129760742, "learning_rate": 3.654896643604972e-05, "loss": 0.6084, "step": 506400 }, { "epoch": 6.752342989694845, 
"grad_norm": 25.39606285095215, "learning_rate": 3.654049535128588e-05, "loss": 0.5461, "step": 506500 }, { "epoch": 6.7536761275013, "grad_norm": 4.851017951965332, "learning_rate": 3.6532023718972385e-05, "loss": 0.6007, "step": 506600 }, { "epoch": 6.755009265307755, "grad_norm": 6.3049211502075195, "learning_rate": 3.6523551539818464e-05, "loss": 0.5132, "step": 506700 }, { "epoch": 6.75634240311421, "grad_norm": 23.033784866333008, "learning_rate": 3.651507881453336e-05, "loss": 0.5627, "step": 506800 }, { "epoch": 6.757675540920665, "grad_norm": 5.335298538208008, "learning_rate": 3.65066055438264e-05, "loss": 0.5433, "step": 506900 }, { "epoch": 6.75900867872712, "grad_norm": 4.642467975616455, "learning_rate": 3.649813172840693e-05, "loss": 0.5869, "step": 507000 }, { "epoch": 6.760341816533575, "grad_norm": 2.724982500076294, "learning_rate": 3.6489657368984363e-05, "loss": 0.6264, "step": 507100 }, { "epoch": 6.76167495434003, "grad_norm": 11.752636909484863, "learning_rate": 3.6481182466268134e-05, "loss": 0.6074, "step": 507200 }, { "epoch": 6.763008092146485, "grad_norm": 10.046971321105957, "learning_rate": 3.647270702096774e-05, "loss": 0.5647, "step": 507300 }, { "epoch": 6.76434122995294, "grad_norm": 18.642501831054688, "learning_rate": 3.6464231033792725e-05, "loss": 0.6165, "step": 507400 }, { "epoch": 6.765674367759395, "grad_norm": 6.1983418464660645, "learning_rate": 3.645575450545266e-05, "loss": 0.6063, "step": 507500 }, { "epoch": 6.76700750556585, "grad_norm": 19.403432846069336, "learning_rate": 3.644727743665717e-05, "loss": 0.6286, "step": 507600 }, { "epoch": 6.768340643372305, "grad_norm": 2.892193555831909, "learning_rate": 3.643879982811594e-05, "loss": 0.6279, "step": 507700 }, { "epoch": 6.76967378117876, "grad_norm": 5.38682222366333, "learning_rate": 3.64303216805387e-05, "loss": 0.6508, "step": 507800 }, { "epoch": 6.771006918985216, "grad_norm": 4.938145637512207, "learning_rate": 3.6421842994635187e-05, "loss": 0.5741, 
"step": 507900 }, { "epoch": 6.772340056791671, "grad_norm": 8.384328842163086, "learning_rate": 3.641336377111521e-05, "loss": 0.6152, "step": 508000 }, { "epoch": 6.773673194598126, "grad_norm": 3.0560243129730225, "learning_rate": 3.640488401068865e-05, "loss": 0.4914, "step": 508100 }, { "epoch": 6.775006332404581, "grad_norm": 4.938547134399414, "learning_rate": 3.6396403714065394e-05, "loss": 0.6092, "step": 508200 }, { "epoch": 6.7763394702110356, "grad_norm": 9.059401512145996, "learning_rate": 3.6387922881955375e-05, "loss": 0.557, "step": 508300 }, { "epoch": 6.7776726080174905, "grad_norm": 4.276464462280273, "learning_rate": 3.6379441515068584e-05, "loss": 0.5774, "step": 508400 }, { "epoch": 6.779005745823945, "grad_norm": 5.6417317390441895, "learning_rate": 3.637095961411507e-05, "loss": 0.6023, "step": 508500 }, { "epoch": 6.780338883630401, "grad_norm": 3.9768097400665283, "learning_rate": 3.636247717980488e-05, "loss": 0.5701, "step": 508600 }, { "epoch": 6.781672021436856, "grad_norm": 3.776472568511963, "learning_rate": 3.635399421284818e-05, "loss": 0.5981, "step": 508700 }, { "epoch": 6.783005159243311, "grad_norm": 4.690443992614746, "learning_rate": 3.63455107139551e-05, "loss": 0.5326, "step": 508800 }, { "epoch": 6.784338297049766, "grad_norm": 0.5611951351165771, "learning_rate": 3.6337026683835874e-05, "loss": 0.6514, "step": 508900 }, { "epoch": 6.785671434856221, "grad_norm": 27.394384384155273, "learning_rate": 3.632854212320076e-05, "loss": 0.5627, "step": 509000 }, { "epoch": 6.787004572662676, "grad_norm": 12.076654434204102, "learning_rate": 3.632005703276002e-05, "loss": 0.5803, "step": 509100 }, { "epoch": 6.788337710469131, "grad_norm": 29.236679077148438, "learning_rate": 3.6311571413224055e-05, "loss": 0.5866, "step": 509200 }, { "epoch": 6.789670848275586, "grad_norm": 3.4961800575256348, "learning_rate": 3.63030852653032e-05, "loss": 0.5573, "step": 509300 }, { "epoch": 6.791003986082041, "grad_norm": 8.339653015136719, 
"learning_rate": 3.629459858970794e-05, "loss": 0.5812, "step": 509400 }, { "epoch": 6.792337123888497, "grad_norm": 19.613727569580078, "learning_rate": 3.628611138714872e-05, "loss": 0.6114, "step": 509500 }, { "epoch": 6.793670261694952, "grad_norm": 6.038331031799316, "learning_rate": 3.627762365833607e-05, "loss": 0.5806, "step": 509600 }, { "epoch": 6.7950033995014065, "grad_norm": 4.12071418762207, "learning_rate": 3.626913540398056e-05, "loss": 0.5336, "step": 509700 }, { "epoch": 6.7963365373078615, "grad_norm": 1.5722440481185913, "learning_rate": 3.626064662479279e-05, "loss": 0.5199, "step": 509800 }, { "epoch": 6.797669675114316, "grad_norm": 3.2980334758758545, "learning_rate": 3.625215732148343e-05, "loss": 0.5918, "step": 509900 }, { "epoch": 6.799002812920771, "grad_norm": 14.371454238891602, "learning_rate": 3.624366749476315e-05, "loss": 0.5702, "step": 510000 }, { "epoch": 6.800335950727226, "grad_norm": 3.682321071624756, "learning_rate": 3.623517714534272e-05, "loss": 0.5861, "step": 510100 }, { "epoch": 6.801669088533682, "grad_norm": 7.099031448364258, "learning_rate": 3.6226686273932906e-05, "loss": 0.5944, "step": 510200 }, { "epoch": 6.803002226340137, "grad_norm": 1.9103493690490723, "learning_rate": 3.6218194881244556e-05, "loss": 0.5707, "step": 510300 }, { "epoch": 6.804335364146592, "grad_norm": 16.150428771972656, "learning_rate": 3.620970296798854e-05, "loss": 0.6166, "step": 510400 }, { "epoch": 6.805668501953047, "grad_norm": 15.692450523376465, "learning_rate": 3.620121053487574e-05, "loss": 0.5663, "step": 510500 }, { "epoch": 6.807001639759502, "grad_norm": 17.789390563964844, "learning_rate": 3.619271758261716e-05, "loss": 0.5895, "step": 510600 }, { "epoch": 6.808334777565957, "grad_norm": 3.482804536819458, "learning_rate": 3.618422411192378e-05, "loss": 0.6017, "step": 510700 }, { "epoch": 6.809667915372412, "grad_norm": 3.7041842937469482, "learning_rate": 3.617573012350666e-05, "loss": 0.5295, "step": 510800 }, { 
"epoch": 6.811001053178867, "grad_norm": 7.072526454925537, "learning_rate": 3.616723561807687e-05, "loss": 0.5868, "step": 510900 }, { "epoch": 6.812334190985322, "grad_norm": 14.383550643920898, "learning_rate": 3.615874059634556e-05, "loss": 0.5558, "step": 511000 }, { "epoch": 6.8136673287917775, "grad_norm": 42.016632080078125, "learning_rate": 3.615024505902389e-05, "loss": 0.5657, "step": 511100 }, { "epoch": 6.815000466598232, "grad_norm": 5.2471747398376465, "learning_rate": 3.6141749006823115e-05, "loss": 0.5809, "step": 511200 }, { "epoch": 6.816333604404687, "grad_norm": 4.617633819580078, "learning_rate": 3.613325244045446e-05, "loss": 0.5806, "step": 511300 }, { "epoch": 6.817666742211142, "grad_norm": 4.8213419914245605, "learning_rate": 3.6124755360629245e-05, "loss": 0.5047, "step": 511400 }, { "epoch": 6.818999880017597, "grad_norm": 10.439311981201172, "learning_rate": 3.6116257768058815e-05, "loss": 0.5223, "step": 511500 }, { "epoch": 6.820333017824052, "grad_norm": 3.1680331230163574, "learning_rate": 3.610775966345457e-05, "loss": 0.6243, "step": 511600 }, { "epoch": 6.821666155630507, "grad_norm": 5.828176498413086, "learning_rate": 3.6099261047527935e-05, "loss": 0.4916, "step": 511700 }, { "epoch": 6.822999293436963, "grad_norm": 2.771164655685425, "learning_rate": 3.609076192099039e-05, "loss": 0.5687, "step": 511800 }, { "epoch": 6.824332431243418, "grad_norm": 6.286619186401367, "learning_rate": 3.608226228455347e-05, "loss": 0.5951, "step": 511900 }, { "epoch": 6.825665569049873, "grad_norm": 4.480233669281006, "learning_rate": 3.6073762138928714e-05, "loss": 0.5536, "step": 512000 }, { "epoch": 6.826998706856328, "grad_norm": 8.313863754272461, "learning_rate": 3.6065346493883364e-05, "loss": 0.5507, "step": 512100 }, { "epoch": 6.828331844662783, "grad_norm": 2.528449773788452, "learning_rate": 3.605684533709194e-05, "loss": 0.7322, "step": 512200 }, { "epoch": 6.829664982469238, "grad_norm": 3.406731128692627, "learning_rate": 
3.604834367324051e-05, "loss": 0.5501, "step": 512300 }, { "epoch": 6.830998120275693, "grad_norm": 2.8972771167755127, "learning_rate": 3.603984150304083e-05, "loss": 0.5724, "step": 512400 }, { "epoch": 6.832331258082148, "grad_norm": 2.2659294605255127, "learning_rate": 3.603133882720465e-05, "loss": 0.5998, "step": 512500 }, { "epoch": 6.8336643958886025, "grad_norm": 6.505931377410889, "learning_rate": 3.6022835646443794e-05, "loss": 0.533, "step": 512600 }, { "epoch": 6.834997533695058, "grad_norm": 47.172603607177734, "learning_rate": 3.6014331961470124e-05, "loss": 0.5501, "step": 512700 }, { "epoch": 6.836330671501513, "grad_norm": 6.570284843444824, "learning_rate": 3.600582777299554e-05, "loss": 0.5525, "step": 512800 }, { "epoch": 6.837663809307968, "grad_norm": 4.044145107269287, "learning_rate": 3.599732308173199e-05, "loss": 0.6212, "step": 512900 }, { "epoch": 6.838996947114423, "grad_norm": 6.948278427124023, "learning_rate": 3.598881788839145e-05, "loss": 0.59, "step": 513000 }, { "epoch": 6.840330084920878, "grad_norm": 3.4936840534210205, "learning_rate": 3.598031219368597e-05, "loss": 0.5106, "step": 513100 }, { "epoch": 6.841663222727333, "grad_norm": 12.665050506591797, "learning_rate": 3.597180599832759e-05, "loss": 0.5385, "step": 513200 }, { "epoch": 6.842996360533788, "grad_norm": 9.987471580505371, "learning_rate": 3.5963299303028435e-05, "loss": 0.5477, "step": 513300 }, { "epoch": 6.844329498340244, "grad_norm": 177.96990966796875, "learning_rate": 3.5954792108500667e-05, "loss": 0.5893, "step": 513400 }, { "epoch": 6.845662636146699, "grad_norm": 0.8200234174728394, "learning_rate": 3.5946284415456475e-05, "loss": 0.5817, "step": 513500 }, { "epoch": 6.846995773953154, "grad_norm": 7.2018141746521, "learning_rate": 3.5937776224608094e-05, "loss": 0.5509, "step": 513600 }, { "epoch": 6.848328911759609, "grad_norm": 5.158544540405273, "learning_rate": 3.59292675366678e-05, "loss": 0.5599, "step": 513700 }, { "epoch": 6.849662049566064, 
"grad_norm": 4.030385971069336, "learning_rate": 3.5920758352347934e-05, "loss": 0.624, "step": 513800 }, { "epoch": 6.850995187372519, "grad_norm": 6.083106517791748, "learning_rate": 3.5912248672360824e-05, "loss": 0.5632, "step": 513900 }, { "epoch": 6.8523283251789735, "grad_norm": 3.855465888977051, "learning_rate": 3.590382360161601e-05, "loss": 0.5198, "step": 514000 }, { "epoch": 6.853661462985428, "grad_norm": 5.4366607666015625, "learning_rate": 3.5895312937370605e-05, "loss": 0.4921, "step": 514100 }, { "epoch": 6.854994600791883, "grad_norm": 2.2461366653442383, "learning_rate": 3.5886801779588185e-05, "loss": 0.4879, "step": 514200 }, { "epoch": 6.856327738598339, "grad_norm": 11.376131057739258, "learning_rate": 3.587829012898129e-05, "loss": 0.5682, "step": 514300 }, { "epoch": 6.857660876404794, "grad_norm": 26.13758087158203, "learning_rate": 3.586977798626246e-05, "loss": 0.5639, "step": 514400 }, { "epoch": 6.858994014211249, "grad_norm": 3.319314479827881, "learning_rate": 3.5861265352144325e-05, "loss": 0.5696, "step": 514500 }, { "epoch": 6.860327152017704, "grad_norm": 2.4655356407165527, "learning_rate": 3.5852752227339535e-05, "loss": 0.5744, "step": 514600 }, { "epoch": 6.861660289824159, "grad_norm": 9.87103271484375, "learning_rate": 3.584423861256076e-05, "loss": 0.538, "step": 514700 }, { "epoch": 6.862993427630614, "grad_norm": 6.244855880737305, "learning_rate": 3.583572450852077e-05, "loss": 0.633, "step": 514800 }, { "epoch": 6.864326565437069, "grad_norm": 5.27488374710083, "learning_rate": 3.5827209915932314e-05, "loss": 0.5351, "step": 514900 }, { "epoch": 6.865659703243525, "grad_norm": 30.03072166442871, "learning_rate": 3.581869483550822e-05, "loss": 0.5505, "step": 515000 }, { "epoch": 6.86699284104998, "grad_norm": 0.18866348266601562, "learning_rate": 3.581017926796132e-05, "loss": 0.5415, "step": 515100 }, { "epoch": 6.868325978856435, "grad_norm": 1.9249593019485474, "learning_rate": 3.580166321400454e-05, "loss": 
0.4965, "step": 515200 }, { "epoch": 6.8696591166628895, "grad_norm": 5.582560062408447, "learning_rate": 3.579314667435081e-05, "loss": 0.6088, "step": 515300 }, { "epoch": 6.8709922544693445, "grad_norm": 8.988249778747559, "learning_rate": 3.578462964971308e-05, "loss": 0.5745, "step": 515400 }, { "epoch": 6.872325392275799, "grad_norm": 3.4143474102020264, "learning_rate": 3.5776112140804405e-05, "loss": 0.578, "step": 515500 }, { "epoch": 6.873658530082254, "grad_norm": 3.875229835510254, "learning_rate": 3.576759414833783e-05, "loss": 0.5272, "step": 515600 }, { "epoch": 6.874991667888709, "grad_norm": 5.563320159912109, "learning_rate": 3.575907567302645e-05, "loss": 0.5466, "step": 515700 }, { "epoch": 6.876324805695164, "grad_norm": 4.062938213348389, "learning_rate": 3.5750556715583404e-05, "loss": 0.5331, "step": 515800 }, { "epoch": 6.87765794350162, "grad_norm": 12.176918983459473, "learning_rate": 3.574203727672189e-05, "loss": 0.6379, "step": 515900 }, { "epoch": 6.878991081308075, "grad_norm": 2.777564287185669, "learning_rate": 3.573351735715511e-05, "loss": 0.5242, "step": 516000 }, { "epoch": 6.88032421911453, "grad_norm": 6.691666603088379, "learning_rate": 3.5724996957596324e-05, "loss": 0.5527, "step": 516100 }, { "epoch": 6.881657356920985, "grad_norm": 11.568758010864258, "learning_rate": 3.571647607875885e-05, "loss": 0.5665, "step": 516200 }, { "epoch": 6.88299049472744, "grad_norm": 10.684115409851074, "learning_rate": 3.570795472135601e-05, "loss": 0.5505, "step": 516300 }, { "epoch": 6.884323632533895, "grad_norm": 1.1123751401901245, "learning_rate": 3.56994328861012e-05, "loss": 0.5177, "step": 516400 }, { "epoch": 6.88565677034035, "grad_norm": 2.859241485595703, "learning_rate": 3.5690910573707824e-05, "loss": 0.5277, "step": 516500 }, { "epoch": 6.886989908146806, "grad_norm": 2.5711493492126465, "learning_rate": 3.5682387784889366e-05, "loss": 0.5087, "step": 516600 }, { "epoch": 6.8883230459532605, "grad_norm": 7.751403331756592, 
"learning_rate": 3.567386452035931e-05, "loss": 0.5151, "step": 516700 }, { "epoch": 6.8896561837597154, "grad_norm": 29.294710159301758, "learning_rate": 3.5665340780831205e-05, "loss": 0.5811, "step": 516800 }, { "epoch": 6.89098932156617, "grad_norm": 8.154168128967285, "learning_rate": 3.565681656701863e-05, "loss": 0.5311, "step": 516900 }, { "epoch": 6.892322459372625, "grad_norm": 10.849458694458008, "learning_rate": 3.56482918796352e-05, "loss": 0.529, "step": 517000 }, { "epoch": 6.89365559717908, "grad_norm": 3.9119999408721924, "learning_rate": 3.563976671939457e-05, "loss": 0.5447, "step": 517100 }, { "epoch": 6.894988734985535, "grad_norm": 13.959806442260742, "learning_rate": 3.563124108701046e-05, "loss": 0.5597, "step": 517200 }, { "epoch": 6.89632187279199, "grad_norm": 5.460824489593506, "learning_rate": 3.56227149831966e-05, "loss": 0.5886, "step": 517300 }, { "epoch": 6.897655010598445, "grad_norm": 1.4982329607009888, "learning_rate": 3.5614188408666755e-05, "loss": 0.5748, "step": 517400 }, { "epoch": 6.898988148404901, "grad_norm": 2.9313178062438965, "learning_rate": 3.5605661364134765e-05, "loss": 0.6522, "step": 517500 }, { "epoch": 6.900321286211356, "grad_norm": 4.493814945220947, "learning_rate": 3.559713385031447e-05, "loss": 0.5613, "step": 517600 }, { "epoch": 6.901654424017811, "grad_norm": 2.5985898971557617, "learning_rate": 3.558860586791978e-05, "loss": 0.5722, "step": 517700 }, { "epoch": 6.902987561824266, "grad_norm": 2.0376155376434326, "learning_rate": 3.558007741766461e-05, "loss": 0.5468, "step": 517800 }, { "epoch": 6.904320699630721, "grad_norm": 4.3804731369018555, "learning_rate": 3.557154850026296e-05, "loss": 0.5223, "step": 517900 }, { "epoch": 6.905653837437176, "grad_norm": 3.089755058288574, "learning_rate": 3.556301911642883e-05, "loss": 0.5178, "step": 518000 }, { "epoch": 6.906986975243631, "grad_norm": 15.4813232421875, "learning_rate": 3.5554489266876266e-05, "loss": 0.5645, "step": 518100 }, { "epoch": 
6.908320113050086, "grad_norm": 3.7314631938934326, "learning_rate": 3.554595895231938e-05, "loss": 0.5236, "step": 518200 }, { "epoch": 6.909653250856541, "grad_norm": 3.522817373275757, "learning_rate": 3.5537428173472286e-05, "loss": 0.4892, "step": 518300 }, { "epoch": 6.910986388662996, "grad_norm": 3.101320743560791, "learning_rate": 3.552889693104918e-05, "loss": 0.5095, "step": 518400 }, { "epoch": 6.912319526469451, "grad_norm": 1.5165181159973145, "learning_rate": 3.552036522576423e-05, "loss": 0.5596, "step": 518500 }, { "epoch": 6.913652664275906, "grad_norm": 1.7059400081634521, "learning_rate": 3.551183305833171e-05, "loss": 0.4973, "step": 518600 }, { "epoch": 6.914985802082361, "grad_norm": 4.421256065368652, "learning_rate": 3.550330042946591e-05, "loss": 0.5332, "step": 518700 }, { "epoch": 6.916318939888816, "grad_norm": 10.123547554016113, "learning_rate": 3.549476733988113e-05, "loss": 0.6022, "step": 518800 }, { "epoch": 6.917652077695271, "grad_norm": 6.680559158325195, "learning_rate": 3.5486233790291765e-05, "loss": 0.5993, "step": 518900 }, { "epoch": 6.918985215501726, "grad_norm": 2.2649314403533936, "learning_rate": 3.547769978141219e-05, "loss": 0.516, "step": 519000 }, { "epoch": 6.920318353308182, "grad_norm": 25.790287017822266, "learning_rate": 3.546916531395687e-05, "loss": 0.575, "step": 519100 }, { "epoch": 6.921651491114637, "grad_norm": 5.675275802612305, "learning_rate": 3.546063038864025e-05, "loss": 0.5584, "step": 519200 }, { "epoch": 6.922984628921092, "grad_norm": 1.9639452695846558, "learning_rate": 3.545209500617689e-05, "loss": 0.5426, "step": 519300 }, { "epoch": 6.924317766727547, "grad_norm": 4.706270217895508, "learning_rate": 3.544355916728131e-05, "loss": 0.5272, "step": 519400 }, { "epoch": 6.925650904534002, "grad_norm": 8.33592414855957, "learning_rate": 3.543502287266811e-05, "loss": 0.5988, "step": 519500 }, { "epoch": 6.9269840423404565, "grad_norm": 2.559537887573242, "learning_rate": 
3.542648612305194e-05, "loss": 0.5646, "step": 519600 }, { "epoch": 6.9283171801469114, "grad_norm": 4.32254695892334, "learning_rate": 3.541794891914745e-05, "loss": 0.5084, "step": 519700 }, { "epoch": 6.929650317953367, "grad_norm": 4.777199745178223, "learning_rate": 3.540941126166936e-05, "loss": 0.5684, "step": 519800 }, { "epoch": 6.930983455759822, "grad_norm": 9.597325325012207, "learning_rate": 3.5400873151332405e-05, "loss": 0.572, "step": 519900 }, { "epoch": 6.932316593566277, "grad_norm": 4.334796905517578, "learning_rate": 3.539233458885138e-05, "loss": 0.5527, "step": 520000 }, { "epoch": 6.933649731372732, "grad_norm": 3.6706254482269287, "learning_rate": 3.53837955749411e-05, "loss": 0.5205, "step": 520100 }, { "epoch": 6.934982869179187, "grad_norm": 2.562386989593506, "learning_rate": 3.5375256110316425e-05, "loss": 0.5481, "step": 520200 }, { "epoch": 6.936316006985642, "grad_norm": 2.243443250656128, "learning_rate": 3.5366716195692256e-05, "loss": 0.5579, "step": 520300 }, { "epoch": 6.937649144792097, "grad_norm": 2.179516315460205, "learning_rate": 3.5358175831783516e-05, "loss": 0.5189, "step": 520400 }, { "epoch": 6.938982282598552, "grad_norm": 8.43565845489502, "learning_rate": 3.5349635019305196e-05, "loss": 0.5397, "step": 520500 }, { "epoch": 6.940315420405007, "grad_norm": 1.1112585067749023, "learning_rate": 3.5341093758972285e-05, "loss": 0.5453, "step": 520600 }, { "epoch": 6.941648558211463, "grad_norm": 3.6866612434387207, "learning_rate": 3.533255205149985e-05, "loss": 0.5746, "step": 520700 }, { "epoch": 6.942981696017918, "grad_norm": 6.985304355621338, "learning_rate": 3.532400989760296e-05, "loss": 0.5309, "step": 520800 }, { "epoch": 6.9443148338243725, "grad_norm": 5.410575866699219, "learning_rate": 3.5315467297996756e-05, "loss": 0.5544, "step": 520900 }, { "epoch": 6.9456479716308275, "grad_norm": 48.73355484008789, "learning_rate": 3.530692425339638e-05, "loss": 0.5038, "step": 521000 }, { "epoch": 6.946981109437282, 
"grad_norm": 3.5745351314544678, "learning_rate": 3.529838076451703e-05, "loss": 0.5229, "step": 521100 }, { "epoch": 6.948314247243737, "grad_norm": 0.997814416885376, "learning_rate": 3.5289836832073946e-05, "loss": 0.5751, "step": 521200 }, { "epoch": 6.949647385050192, "grad_norm": 2.3989226818084717, "learning_rate": 3.5281292456782394e-05, "loss": 0.5692, "step": 521300 }, { "epoch": 6.950980522856648, "grad_norm": 2.503087043762207, "learning_rate": 3.5272747639357694e-05, "loss": 0.5043, "step": 521400 }, { "epoch": 6.952313660663103, "grad_norm": 2.7885379791259766, "learning_rate": 3.526428783528628e-05, "loss": 0.5744, "step": 521500 }, { "epoch": 6.953646798469558, "grad_norm": 33.212928771972656, "learning_rate": 3.5255742140144817e-05, "loss": 0.5894, "step": 521600 }, { "epoch": 6.954979936276013, "grad_norm": 3.667573928833008, "learning_rate": 3.524719600500918e-05, "loss": 0.5659, "step": 521700 }, { "epoch": 6.956313074082468, "grad_norm": 5.081122875213623, "learning_rate": 3.523873489851108e-05, "loss": 0.5805, "step": 521800 }, { "epoch": 6.957646211888923, "grad_norm": 6.070621490478516, "learning_rate": 3.523018788991561e-05, "loss": 0.4853, "step": 521900 }, { "epoch": 6.958979349695378, "grad_norm": 18.78815269470215, "learning_rate": 3.522164044346531e-05, "loss": 0.5736, "step": 522000 }, { "epoch": 6.960312487501833, "grad_norm": 6.329167366027832, "learning_rate": 3.521309255987573e-05, "loss": 0.5267, "step": 522100 }, { "epoch": 6.961645625308288, "grad_norm": 5.618947505950928, "learning_rate": 3.520454423986247e-05, "loss": 0.5856, "step": 522200 }, { "epoch": 6.9629787631147435, "grad_norm": 16.801740646362305, "learning_rate": 3.5195995484141176e-05, "loss": 0.5893, "step": 522300 }, { "epoch": 6.9643119009211985, "grad_norm": 6.6290507316589355, "learning_rate": 3.5187446293427514e-05, "loss": 0.5649, "step": 522400 }, { "epoch": 6.965645038727653, "grad_norm": 3.076630115509033, "learning_rate": 3.517889666843721e-05, "loss": 
0.5377, "step": 522500 }, { "epoch": 6.966978176534108, "grad_norm": 6.794808864593506, "learning_rate": 3.517034660988599e-05, "loss": 0.6315, "step": 522600 }, { "epoch": 6.968311314340563, "grad_norm": 1.9769830703735352, "learning_rate": 3.5161796118489665e-05, "loss": 0.5194, "step": 522700 }, { "epoch": 6.969644452147018, "grad_norm": 2.7840726375579834, "learning_rate": 3.5153245194964024e-05, "loss": 0.554, "step": 522800 }, { "epoch": 6.970977589953473, "grad_norm": 3.1112232208251953, "learning_rate": 3.5144693840024946e-05, "loss": 0.5442, "step": 522900 }, { "epoch": 6.972310727759929, "grad_norm": 4.145390033721924, "learning_rate": 3.513614205438832e-05, "loss": 0.5394, "step": 523000 }, { "epoch": 6.973643865566384, "grad_norm": 4.8225932121276855, "learning_rate": 3.5127589838770075e-05, "loss": 0.5855, "step": 523100 }, { "epoch": 6.974977003372839, "grad_norm": 7.845808506011963, "learning_rate": 3.511903719388617e-05, "loss": 0.5384, "step": 523200 }, { "epoch": 6.976310141179294, "grad_norm": 10.111106872558594, "learning_rate": 3.5110484120452596e-05, "loss": 0.5267, "step": 523300 }, { "epoch": 6.977643278985749, "grad_norm": 2.7185423374176025, "learning_rate": 3.510193061918542e-05, "loss": 0.538, "step": 523400 }, { "epoch": 6.978976416792204, "grad_norm": 5.793938636779785, "learning_rate": 3.509337669080069e-05, "loss": 0.4882, "step": 523500 }, { "epoch": 6.980309554598659, "grad_norm": 3.148698091506958, "learning_rate": 3.50848223360145e-05, "loss": 0.5904, "step": 523600 }, { "epoch": 6.981642692405114, "grad_norm": 14.319392204284668, "learning_rate": 3.507626755554303e-05, "loss": 0.5603, "step": 523700 }, { "epoch": 6.9829758302115685, "grad_norm": 3.572408676147461, "learning_rate": 3.506771235010245e-05, "loss": 0.5892, "step": 523800 }, { "epoch": 6.984308968018024, "grad_norm": 1.9129207134246826, "learning_rate": 3.5059156720408946e-05, "loss": 0.5541, "step": 523900 }, { "epoch": 6.985642105824479, "grad_norm": 
4.161052703857422, "learning_rate": 3.50506006671788e-05, "loss": 0.5864, "step": 524000 }, { "epoch": 6.986975243630934, "grad_norm": 6.6114397048950195, "learning_rate": 3.50420441911283e-05, "loss": 0.5072, "step": 524100 }, { "epoch": 6.988308381437389, "grad_norm": 4.58620023727417, "learning_rate": 3.5033487292973735e-05, "loss": 0.5664, "step": 524200 }, { "epoch": 6.989641519243844, "grad_norm": 5.1788530349731445, "learning_rate": 3.502492997343148e-05, "loss": 0.5498, "step": 524300 }, { "epoch": 6.990974657050299, "grad_norm": 34.82984161376953, "learning_rate": 3.5016372233217934e-05, "loss": 0.5304, "step": 524400 }, { "epoch": 6.992307794856754, "grad_norm": 10.946674346923828, "learning_rate": 3.500781407304952e-05, "loss": 0.5176, "step": 524500 }, { "epoch": 6.99364093266321, "grad_norm": 6.220682144165039, "learning_rate": 3.499925549364269e-05, "loss": 0.4984, "step": 524600 }, { "epoch": 6.994974070469665, "grad_norm": 39.239418029785156, "learning_rate": 3.499069649571396e-05, "loss": 0.5327, "step": 524700 }, { "epoch": 6.99630720827612, "grad_norm": 4.555315971374512, "learning_rate": 3.498222267620297e-05, "loss": 0.4696, "step": 524800 }, { "epoch": 6.997640346082575, "grad_norm": 16.109966278076172, "learning_rate": 3.4973662847547384e-05, "loss": 0.5354, "step": 524900 }, { "epoch": 6.99897348388903, "grad_norm": 3.6269750595092773, "learning_rate": 3.4965102602512425e-05, "loss": 0.6383, "step": 525000 }, { "epoch": 7.000306621695485, "grad_norm": 6.785662651062012, "learning_rate": 3.495654194181473e-05, "loss": 0.4835, "step": 525100 }, { "epoch": 7.0016397595019395, "grad_norm": 17.872159957885742, "learning_rate": 3.494798086617096e-05, "loss": 0.5784, "step": 525200 }, { "epoch": 7.0029728973083945, "grad_norm": 9.819953918457031, "learning_rate": 3.493941937629785e-05, "loss": 0.5242, "step": 525300 }, { "epoch": 7.004306035114849, "grad_norm": 5.189639091491699, "learning_rate": 3.4930857472912085e-05, "loss": 0.4964, "step": 
525400 }, { "epoch": 7.005639172921305, "grad_norm": 2.5513057708740234, "learning_rate": 3.49222951567305e-05, "loss": 0.5334, "step": 525500 }, { "epoch": 7.00697231072776, "grad_norm": 3.839682102203369, "learning_rate": 3.491373242846988e-05, "loss": 0.5317, "step": 525600 }, { "epoch": 7.008305448534215, "grad_norm": 2.246230363845825, "learning_rate": 3.4905169288847066e-05, "loss": 0.504, "step": 525700 }, { "epoch": 7.00963858634067, "grad_norm": 3.850083589553833, "learning_rate": 3.489660573857894e-05, "loss": 0.5475, "step": 525800 }, { "epoch": 7.010971724147125, "grad_norm": 5.648358345031738, "learning_rate": 3.4888041778382425e-05, "loss": 0.4299, "step": 525900 }, { "epoch": 7.01230486195358, "grad_norm": 4.7591352462768555, "learning_rate": 3.487947740897445e-05, "loss": 0.498, "step": 526000 }, { "epoch": 7.013637999760035, "grad_norm": 53.73900604248047, "learning_rate": 3.487091263107201e-05, "loss": 0.4758, "step": 526100 }, { "epoch": 7.01497113756649, "grad_norm": 2.7673444747924805, "learning_rate": 3.486234744539213e-05, "loss": 0.5229, "step": 526200 }, { "epoch": 7.016304275372946, "grad_norm": 5.067116737365723, "learning_rate": 3.4853781852651835e-05, "loss": 0.4677, "step": 526300 }, { "epoch": 7.017637413179401, "grad_norm": 0.9026546478271484, "learning_rate": 3.484521585356823e-05, "loss": 0.5467, "step": 526400 }, { "epoch": 7.0189705509858555, "grad_norm": 10.349634170532227, "learning_rate": 3.4836649448858424e-05, "loss": 0.521, "step": 526500 }, { "epoch": 7.0203036887923105, "grad_norm": 10.469524383544922, "learning_rate": 3.482808263923958e-05, "loss": 0.4931, "step": 526600 }, { "epoch": 7.021636826598765, "grad_norm": 2.8093602657318115, "learning_rate": 3.4819515425428866e-05, "loss": 0.4717, "step": 526700 }, { "epoch": 7.02296996440522, "grad_norm": 5.223283290863037, "learning_rate": 3.481094780814352e-05, "loss": 0.5417, "step": 526800 }, { "epoch": 7.024303102211675, "grad_norm": 116.31926727294922, "learning_rate": 
3.480237978810079e-05, "loss": 0.5319, "step": 526900 }, { "epoch": 7.02563624001813, "grad_norm": 6.81387186050415, "learning_rate": 3.479381136601796e-05, "loss": 0.5141, "step": 527000 }, { "epoch": 7.026969377824586, "grad_norm": 4.433990955352783, "learning_rate": 3.478524254261236e-05, "loss": 0.5509, "step": 527100 }, { "epoch": 7.028302515631041, "grad_norm": 8.535418510437012, "learning_rate": 3.477675901282208e-05, "loss": 0.4678, "step": 527200 }, { "epoch": 7.029635653437496, "grad_norm": 6.015069484710693, "learning_rate": 3.476818939291835e-05, "loss": 0.5154, "step": 527300 }, { "epoch": 7.030968791243951, "grad_norm": 5.7658891677856445, "learning_rate": 3.475970507600123e-05, "loss": 0.5432, "step": 527400 }, { "epoch": 7.032301929050406, "grad_norm": 3.3929123878479004, "learning_rate": 3.475113466244044e-05, "loss": 0.5143, "step": 527500 }, { "epoch": 7.033635066856861, "grad_norm": 2.486142158508301, "learning_rate": 3.474256385112962e-05, "loss": 0.5227, "step": 527600 }, { "epoch": 7.034968204663316, "grad_norm": 8.65333080291748, "learning_rate": 3.473399264278632e-05, "loss": 0.456, "step": 527700 }, { "epoch": 7.036301342469771, "grad_norm": 24.284351348876953, "learning_rate": 3.4725421038128065e-05, "loss": 0.4861, "step": 527800 }, { "epoch": 7.0376344802762265, "grad_norm": 5.016411781311035, "learning_rate": 3.471684903787246e-05, "loss": 0.5506, "step": 527900 }, { "epoch": 7.0389676180826815, "grad_norm": 10.789039611816406, "learning_rate": 3.470827664273713e-05, "loss": 0.4708, "step": 528000 }, { "epoch": 7.040300755889136, "grad_norm": 9.437943458557129, "learning_rate": 3.469970385343971e-05, "loss": 0.4837, "step": 528100 }, { "epoch": 7.041633893695591, "grad_norm": 3.3966877460479736, "learning_rate": 3.4691130670697905e-05, "loss": 0.4506, "step": 528200 }, { "epoch": 7.042967031502046, "grad_norm": 3.3040950298309326, "learning_rate": 3.4682557095229435e-05, "loss": 0.5255, "step": 528300 }, { "epoch": 7.044300169308501, 
"grad_norm": 2.8988194465637207, "learning_rate": 3.467398312775203e-05, "loss": 0.502, "step": 528400 }, { "epoch": 7.045633307114956, "grad_norm": 2.8604543209075928, "learning_rate": 3.46654087689835e-05, "loss": 0.5301, "step": 528500 }, { "epoch": 7.046966444921411, "grad_norm": 4.998002529144287, "learning_rate": 3.465683401964166e-05, "loss": 0.5304, "step": 528600 }, { "epoch": 7.048299582727867, "grad_norm": 8.341509819030762, "learning_rate": 3.464825888044435e-05, "loss": 0.4665, "step": 528700 }, { "epoch": 7.049632720534322, "grad_norm": 8.080341339111328, "learning_rate": 3.463968335210946e-05, "loss": 0.5095, "step": 528800 }, { "epoch": 7.050965858340777, "grad_norm": 1.7280389070510864, "learning_rate": 3.4631107435354905e-05, "loss": 0.4963, "step": 528900 }, { "epoch": 7.052298996147232, "grad_norm": 1.8475470542907715, "learning_rate": 3.4622531130898636e-05, "loss": 0.5376, "step": 529000 }, { "epoch": 7.053632133953687, "grad_norm": 23.43145751953125, "learning_rate": 3.461395443945863e-05, "loss": 0.5168, "step": 529100 }, { "epoch": 7.054965271760142, "grad_norm": 12.027759552001953, "learning_rate": 3.4605377361752894e-05, "loss": 0.4847, "step": 529200 }, { "epoch": 7.056298409566597, "grad_norm": 3.5446078777313232, "learning_rate": 3.45967998984995e-05, "loss": 0.5295, "step": 529300 }, { "epoch": 7.0576315473730515, "grad_norm": 2.1728758811950684, "learning_rate": 3.4588222050416494e-05, "loss": 0.5076, "step": 529400 }, { "epoch": 7.058964685179507, "grad_norm": 35.347965240478516, "learning_rate": 3.4579643818222004e-05, "loss": 0.5412, "step": 529500 }, { "epoch": 7.060297822985962, "grad_norm": 2.769711494445801, "learning_rate": 3.457106520263418e-05, "loss": 0.4807, "step": 529600 }, { "epoch": 7.061630960792417, "grad_norm": 17.962360382080078, "learning_rate": 3.4562486204371176e-05, "loss": 0.517, "step": 529700 }, { "epoch": 7.062964098598872, "grad_norm": 2.419487953186035, "learning_rate": 3.455390682415121e-05, "loss": 
0.4857, "step": 529800 }, { "epoch": 7.064297236405327, "grad_norm": 3.392069101333618, "learning_rate": 3.4545327062692524e-05, "loss": 0.541, "step": 529900 }, { "epoch": 7.065630374211782, "grad_norm": 3.2789876461029053, "learning_rate": 3.4536746920713386e-05, "loss": 0.4713, "step": 530000 }, { "epoch": 7.066963512018237, "grad_norm": 2.0944957733154297, "learning_rate": 3.4528166398932094e-05, "loss": 0.4921, "step": 530100 }, { "epoch": 7.068296649824692, "grad_norm": 13.489361763000488, "learning_rate": 3.451958549806698e-05, "loss": 0.5014, "step": 530200 }, { "epoch": 7.069629787631148, "grad_norm": 4.812258720397949, "learning_rate": 3.4511004218836417e-05, "loss": 0.5229, "step": 530300 }, { "epoch": 7.070962925437603, "grad_norm": 7.772181034088135, "learning_rate": 3.450242256195881e-05, "loss": 0.5698, "step": 530400 }, { "epoch": 7.072296063244058, "grad_norm": 17.286436080932617, "learning_rate": 3.449384052815256e-05, "loss": 0.5086, "step": 530500 }, { "epoch": 7.073629201050513, "grad_norm": 1.1970148086547852, "learning_rate": 3.4485258118136165e-05, "loss": 0.4656, "step": 530600 }, { "epoch": 7.074962338856968, "grad_norm": 2.434269428253174, "learning_rate": 3.4476675332628097e-05, "loss": 0.505, "step": 530700 }, { "epoch": 7.0762954766634225, "grad_norm": 4.057621002197266, "learning_rate": 3.4468178005802456e-05, "loss": 0.5171, "step": 530800 }, { "epoch": 7.0776286144698775, "grad_norm": 2.6249568462371826, "learning_rate": 3.445959447520364e-05, "loss": 0.5152, "step": 530900 }, { "epoch": 7.078961752276332, "grad_norm": 3.117245674133301, "learning_rate": 3.4451010571261625e-05, "loss": 0.4654, "step": 531000 }, { "epoch": 7.080294890082788, "grad_norm": 2.6018664836883545, "learning_rate": 3.444242629469503e-05, "loss": 0.4489, "step": 531100 }, { "epoch": 7.081628027889243, "grad_norm": 2.9379687309265137, "learning_rate": 3.443384164622253e-05, "loss": 0.5018, "step": 531200 }, { "epoch": 7.082961165695698, "grad_norm": 
2.515641689300537, "learning_rate": 3.442525662656276e-05, "loss": 0.5025, "step": 531300 }, { "epoch": 7.084294303502153, "grad_norm": 1.6263753175735474, "learning_rate": 3.441667123643447e-05, "loss": 0.5278, "step": 531400 }, { "epoch": 7.085627441308608, "grad_norm": 1.6185115575790405, "learning_rate": 3.4408085476556375e-05, "loss": 0.4501, "step": 531500 }, { "epoch": 7.086960579115063, "grad_norm": 5.5535078048706055, "learning_rate": 3.439949934764728e-05, "loss": 0.5181, "step": 531600 }, { "epoch": 7.088293716921518, "grad_norm": 7.866011142730713, "learning_rate": 3.439091285042594e-05, "loss": 0.4178, "step": 531700 }, { "epoch": 7.089626854727973, "grad_norm": 4.405153274536133, "learning_rate": 3.438232598561124e-05, "loss": 0.4826, "step": 531800 }, { "epoch": 7.090959992534429, "grad_norm": 6.777457237243652, "learning_rate": 3.437373875392201e-05, "loss": 0.4482, "step": 531900 }, { "epoch": 7.092293130340884, "grad_norm": 0.5330533385276794, "learning_rate": 3.4365151156077164e-05, "loss": 0.4578, "step": 532000 }, { "epoch": 7.0936262681473385, "grad_norm": 9.67872142791748, "learning_rate": 3.435656319279563e-05, "loss": 0.5062, "step": 532100 }, { "epoch": 7.0949594059537935, "grad_norm": 6.73573112487793, "learning_rate": 3.4347974864796346e-05, "loss": 0.4767, "step": 532200 }, { "epoch": 7.096292543760248, "grad_norm": 10.730883598327637, "learning_rate": 3.433938617279832e-05, "loss": 0.5003, "step": 532300 }, { "epoch": 7.097625681566703, "grad_norm": 1.7137328386306763, "learning_rate": 3.4330797117520555e-05, "loss": 0.382, "step": 532400 }, { "epoch": 7.098958819373158, "grad_norm": 0.6396743059158325, "learning_rate": 3.432220769968212e-05, "loss": 0.5424, "step": 532500 }, { "epoch": 7.100291957179613, "grad_norm": 7.218337059020996, "learning_rate": 3.431361792000207e-05, "loss": 0.4731, "step": 532600 }, { "epoch": 7.101625094986069, "grad_norm": 0.6520601511001587, "learning_rate": 3.4305027779199525e-05, "loss": 0.4072, "step": 
532700 }, { "epoch": 7.102958232792524, "grad_norm": 8.085028648376465, "learning_rate": 3.429643727799363e-05, "loss": 0.5037, "step": 532800 }, { "epoch": 7.104291370598979, "grad_norm": 2.455787420272827, "learning_rate": 3.4287846417103546e-05, "loss": 0.4354, "step": 532900 }, { "epoch": 7.105624508405434, "grad_norm": 3.8372461795806885, "learning_rate": 3.427925519724848e-05, "loss": 0.4957, "step": 533000 }, { "epoch": 7.106957646211889, "grad_norm": 7.299431324005127, "learning_rate": 3.427066361914766e-05, "loss": 0.5348, "step": 533100 }, { "epoch": 7.108290784018344, "grad_norm": 6.151542663574219, "learning_rate": 3.4262071683520344e-05, "loss": 0.4909, "step": 533200 }, { "epoch": 7.109623921824799, "grad_norm": 15.689681053161621, "learning_rate": 3.425347939108582e-05, "loss": 0.5173, "step": 533300 }, { "epoch": 7.110957059631254, "grad_norm": 8.039664268493652, "learning_rate": 3.42448867425634e-05, "loss": 0.4708, "step": 533400 }, { "epoch": 7.1122901974377095, "grad_norm": 15.383574485778809, "learning_rate": 3.423637967046808e-05, "loss": 0.4842, "step": 533500 }, { "epoch": 7.1136233352441645, "grad_norm": 33.60174560546875, "learning_rate": 3.4227786315470895e-05, "loss": 0.5436, "step": 533600 }, { "epoch": 7.114956473050619, "grad_norm": 3.1990060806274414, "learning_rate": 3.4219192606536784e-05, "loss": 0.5486, "step": 533700 }, { "epoch": 7.116289610857074, "grad_norm": 5.694830894470215, "learning_rate": 3.4210598544385157e-05, "loss": 0.4982, "step": 533800 }, { "epoch": 7.117622748663529, "grad_norm": 9.844496726989746, "learning_rate": 3.4202004129735496e-05, "loss": 0.4963, "step": 533900 }, { "epoch": 7.118955886469984, "grad_norm": 3.299334764480591, "learning_rate": 3.4193409363307307e-05, "loss": 0.492, "step": 534000 }, { "epoch": 7.120289024276439, "grad_norm": 0.36375948786735535, "learning_rate": 3.418481424582009e-05, "loss": 0.4656, "step": 534100 }, { "epoch": 7.121622162082894, "grad_norm": 3.3907690048217773, 
"learning_rate": 3.4176218777993415e-05, "loss": 0.5116, "step": 534200 }, { "epoch": 7.12295529988935, "grad_norm": 2.703964948654175, "learning_rate": 3.416762296054687e-05, "loss": 0.4966, "step": 534300 }, { "epoch": 7.124288437695805, "grad_norm": 4.282016754150391, "learning_rate": 3.4159026794200074e-05, "loss": 0.5428, "step": 534400 }, { "epoch": 7.12562157550226, "grad_norm": 1.8443034887313843, "learning_rate": 3.415043027967266e-05, "loss": 0.5229, "step": 534500 }, { "epoch": 7.126954713308715, "grad_norm": 9.784838676452637, "learning_rate": 3.414183341768431e-05, "loss": 0.5159, "step": 534600 }, { "epoch": 7.12828785111517, "grad_norm": 4.474815368652344, "learning_rate": 3.413323620895471e-05, "loss": 0.5037, "step": 534700 }, { "epoch": 7.129620988921625, "grad_norm": 2.1088297367095947, "learning_rate": 3.4124638654203626e-05, "loss": 0.4008, "step": 534800 }, { "epoch": 7.13095412672808, "grad_norm": 6.249424934387207, "learning_rate": 3.411604075415078e-05, "loss": 0.5544, "step": 534900 }, { "epoch": 7.1322872645345345, "grad_norm": 3.7024760246276855, "learning_rate": 3.410744250951598e-05, "loss": 0.5196, "step": 535000 }, { "epoch": 7.13362040234099, "grad_norm": 3.4760079383850098, "learning_rate": 3.409884392101905e-05, "loss": 0.5126, "step": 535100 }, { "epoch": 7.134953540147445, "grad_norm": 4.623532295227051, "learning_rate": 3.4090244989379803e-05, "loss": 0.4949, "step": 535200 }, { "epoch": 7.1362866779539, "grad_norm": 92.08438873291016, "learning_rate": 3.4081645715318164e-05, "loss": 0.5247, "step": 535300 }, { "epoch": 7.137619815760355, "grad_norm": 4.145254611968994, "learning_rate": 3.407304609955401e-05, "loss": 0.5005, "step": 535400 }, { "epoch": 7.13895295356681, "grad_norm": 7.940629959106445, "learning_rate": 3.406444614280727e-05, "loss": 0.4592, "step": 535500 }, { "epoch": 7.140286091373265, "grad_norm": 5.880749225616455, "learning_rate": 3.405584584579791e-05, "loss": 0.5273, "step": 535600 }, { "epoch": 
7.14161922917972, "grad_norm": 1.7838255167007446, "learning_rate": 3.4047245209245915e-05, "loss": 0.5031, "step": 535700 }, { "epoch": 7.142952366986175, "grad_norm": 9.6378812789917, "learning_rate": 3.403864423387132e-05, "loss": 0.47, "step": 535800 }, { "epoch": 7.144285504792631, "grad_norm": 3.762119770050049, "learning_rate": 3.403004292039416e-05, "loss": 0.5273, "step": 535900 }, { "epoch": 7.145618642599086, "grad_norm": 2.4380006790161133, "learning_rate": 3.40214412695345e-05, "loss": 0.5032, "step": 536000 }, { "epoch": 7.146951780405541, "grad_norm": 2.104259729385376, "learning_rate": 3.4012925303551796e-05, "loss": 0.4578, "step": 536100 }, { "epoch": 7.148284918211996, "grad_norm": 8.128244400024414, "learning_rate": 3.4004322983443364e-05, "loss": 0.4831, "step": 536200 }, { "epoch": 7.149618056018451, "grad_norm": 2.216965675354004, "learning_rate": 3.399572032810563e-05, "loss": 0.4994, "step": 536300 }, { "epoch": 7.1509511938249055, "grad_norm": 3.370816946029663, "learning_rate": 3.398711733825878e-05, "loss": 0.4495, "step": 536400 }, { "epoch": 7.1522843316313605, "grad_norm": 3.8091647624969482, "learning_rate": 3.397851401462304e-05, "loss": 0.4305, "step": 536500 }, { "epoch": 7.153617469437815, "grad_norm": 2.168558120727539, "learning_rate": 3.396991035791865e-05, "loss": 0.4783, "step": 536600 }, { "epoch": 7.154950607244271, "grad_norm": 4.056833744049072, "learning_rate": 3.396130636886588e-05, "loss": 0.4863, "step": 536700 }, { "epoch": 7.156283745050726, "grad_norm": 9.839364051818848, "learning_rate": 3.3952702048185024e-05, "loss": 0.4803, "step": 536800 }, { "epoch": 7.157616882857181, "grad_norm": 5.685489654541016, "learning_rate": 3.3944097396596416e-05, "loss": 0.5402, "step": 536900 }, { "epoch": 7.158950020663636, "grad_norm": 6.181981086730957, "learning_rate": 3.393549241482041e-05, "loss": 0.5529, "step": 537000 }, { "epoch": 7.160283158470091, "grad_norm": 17.106294631958008, "learning_rate": 3.392688710357737e-05, 
"loss": 0.4744, "step": 537100 }, { "epoch": 7.161616296276546, "grad_norm": 13.790250778198242, "learning_rate": 3.391828146358774e-05, "loss": 0.5416, "step": 537200 }, { "epoch": 7.162949434083001, "grad_norm": 5.341979026794434, "learning_rate": 3.390967549557192e-05, "loss": 0.5612, "step": 537300 }, { "epoch": 7.164282571889456, "grad_norm": 2.736506462097168, "learning_rate": 3.39010692002504e-05, "loss": 0.464, "step": 537400 }, { "epoch": 7.165615709695912, "grad_norm": 6.395495414733887, "learning_rate": 3.3892462578343665e-05, "loss": 0.4725, "step": 537500 }, { "epoch": 7.166948847502367, "grad_norm": 49.85559844970703, "learning_rate": 3.388385563057223e-05, "loss": 0.4817, "step": 537600 }, { "epoch": 7.1682819853088215, "grad_norm": 7.758186340332031, "learning_rate": 3.387524835765665e-05, "loss": 0.3966, "step": 537700 }, { "epoch": 7.1696151231152765, "grad_norm": 1.9723485708236694, "learning_rate": 3.386664076031748e-05, "loss": 0.4675, "step": 537800 }, { "epoch": 7.170948260921731, "grad_norm": 2.6645448207855225, "learning_rate": 3.3858032839275346e-05, "loss": 0.4668, "step": 537900 }, { "epoch": 7.172281398728186, "grad_norm": 5.570096492767334, "learning_rate": 3.384942459525086e-05, "loss": 0.565, "step": 538000 }, { "epoch": 7.173614536534641, "grad_norm": 11.215906143188477, "learning_rate": 3.3840816028964685e-05, "loss": 0.508, "step": 538100 }, { "epoch": 7.174947674341096, "grad_norm": 6.685400009155273, "learning_rate": 3.383220714113749e-05, "loss": 0.5075, "step": 538200 }, { "epoch": 7.176280812147552, "grad_norm": 5.798212051391602, "learning_rate": 3.382359793249001e-05, "loss": 0.4809, "step": 538300 }, { "epoch": 7.177613949954007, "grad_norm": 8.628254890441895, "learning_rate": 3.381498840374295e-05, "loss": 0.5418, "step": 538400 }, { "epoch": 7.178947087760462, "grad_norm": 2.542196750640869, "learning_rate": 3.380637855561709e-05, "loss": 0.5063, "step": 538500 }, { "epoch": 7.180280225566917, "grad_norm": 
0.5916475653648376, "learning_rate": 3.379776838883322e-05, "loss": 0.4557, "step": 538600 }, { "epoch": 7.181613363373372, "grad_norm": 8.707090377807617, "learning_rate": 3.378915790411215e-05, "loss": 0.4665, "step": 538700 }, { "epoch": 7.182946501179827, "grad_norm": 8.665145874023438, "learning_rate": 3.378054710217472e-05, "loss": 0.482, "step": 538800 }, { "epoch": 7.184279638986282, "grad_norm": 1.345489263534546, "learning_rate": 3.377193598374181e-05, "loss": 0.5875, "step": 538900 }, { "epoch": 7.185612776792737, "grad_norm": 0.1115158200263977, "learning_rate": 3.376332454953432e-05, "loss": 0.499, "step": 539000 }, { "epoch": 7.1869459145991925, "grad_norm": 8.91795825958252, "learning_rate": 3.375471280027314e-05, "loss": 0.4575, "step": 539100 }, { "epoch": 7.1882790524056475, "grad_norm": 4.545896053314209, "learning_rate": 3.3746100736679244e-05, "loss": 0.4779, "step": 539200 }, { "epoch": 7.189612190212102, "grad_norm": 3.2655608654022217, "learning_rate": 3.3737488359473614e-05, "loss": 0.4692, "step": 539300 }, { "epoch": 7.190945328018557, "grad_norm": 4.2214789390563965, "learning_rate": 3.372887566937724e-05, "loss": 0.4698, "step": 539400 }, { "epoch": 7.192278465825012, "grad_norm": 2.324394464492798, "learning_rate": 3.372026266711114e-05, "loss": 0.4807, "step": 539500 }, { "epoch": 7.193611603631467, "grad_norm": 1.3778387308120728, "learning_rate": 3.371164935339638e-05, "loss": 0.4574, "step": 539600 }, { "epoch": 7.194944741437922, "grad_norm": 1.7862271070480347, "learning_rate": 3.370303572895404e-05, "loss": 0.4974, "step": 539700 }, { "epoch": 7.196277879244377, "grad_norm": 4.64741849899292, "learning_rate": 3.369442179450522e-05, "loss": 0.4809, "step": 539800 }, { "epoch": 7.197611017050833, "grad_norm": 4.208517551422119, "learning_rate": 3.3685807550771046e-05, "loss": 0.5015, "step": 539900 }, { "epoch": 7.198944154857288, "grad_norm": 6.950897693634033, "learning_rate": 3.36771929984727e-05, "loss": 0.4559, "step": 540000 
}, { "epoch": 7.200277292663743, "grad_norm": 1.6794062852859497, "learning_rate": 3.3668578138331334e-05, "loss": 0.4477, "step": 540100 }, { "epoch": 7.201610430470198, "grad_norm": 22.357358932495117, "learning_rate": 3.365996297106817e-05, "loss": 0.5047, "step": 540200 }, { "epoch": 7.202943568276653, "grad_norm": 7.029762268066406, "learning_rate": 3.365134749740444e-05, "loss": 0.4874, "step": 540300 }, { "epoch": 7.204276706083108, "grad_norm": 9.424796104431152, "learning_rate": 3.364273171806142e-05, "loss": 0.5041, "step": 540400 }, { "epoch": 7.205609843889563, "grad_norm": 2.8380632400512695, "learning_rate": 3.363411563376037e-05, "loss": 0.4313, "step": 540500 }, { "epoch": 7.2069429816960175, "grad_norm": 3.183107614517212, "learning_rate": 3.362549924522263e-05, "loss": 0.4415, "step": 540600 }, { "epoch": 7.208276119502473, "grad_norm": 3.438152313232422, "learning_rate": 3.361688255316951e-05, "loss": 0.4717, "step": 540700 }, { "epoch": 7.209609257308928, "grad_norm": 3.1127781867980957, "learning_rate": 3.360826555832239e-05, "loss": 0.4703, "step": 540800 }, { "epoch": 7.210942395115383, "grad_norm": 3.058256149291992, "learning_rate": 3.359964826140265e-05, "loss": 0.4877, "step": 540900 }, { "epoch": 7.212275532921838, "grad_norm": 3.486358165740967, "learning_rate": 3.3591030663131706e-05, "loss": 0.4692, "step": 541000 }, { "epoch": 7.213608670728293, "grad_norm": 3.658869504928589, "learning_rate": 3.3582412764230996e-05, "loss": 0.5235, "step": 541100 }, { "epoch": 7.214941808534748, "grad_norm": 3.9727659225463867, "learning_rate": 3.3573794565421974e-05, "loss": 0.448, "step": 541200 }, { "epoch": 7.216274946341203, "grad_norm": 1.9414142370224, "learning_rate": 3.356517606742614e-05, "loss": 0.4652, "step": 541300 }, { "epoch": 7.217608084147658, "grad_norm": 4.259581565856934, "learning_rate": 3.3556557270965005e-05, "loss": 0.4915, "step": 541400 }, { "epoch": 7.218941221954114, "grad_norm": 5.014084339141846, "learning_rate": 
3.354793817676012e-05, "loss": 0.5472, "step": 541500 }, { "epoch": 7.220274359760569, "grad_norm": 9.89235782623291, "learning_rate": 3.353931878553302e-05, "loss": 0.4548, "step": 541600 }, { "epoch": 7.221607497567024, "grad_norm": 2.642542839050293, "learning_rate": 3.353078529634489e-05, "loss": 0.4868, "step": 541700 }, { "epoch": 7.222940635373479, "grad_norm": 21.95177459716797, "learning_rate": 3.352216531619041e-05, "loss": 0.4922, "step": 541800 }, { "epoch": 7.224273773179934, "grad_norm": 2.494025468826294, "learning_rate": 3.351354504117134e-05, "loss": 0.4794, "step": 541900 }, { "epoch": 7.2256069109863885, "grad_norm": 7.4589715003967285, "learning_rate": 3.350492447200935e-05, "loss": 0.5213, "step": 542000 }, { "epoch": 7.2269400487928435, "grad_norm": 1.4107142686843872, "learning_rate": 3.3496303609426146e-05, "loss": 0.4644, "step": 542100 }, { "epoch": 7.228273186599298, "grad_norm": 6.052897930145264, "learning_rate": 3.3487682454143415e-05, "loss": 0.5259, "step": 542200 }, { "epoch": 7.229606324405754, "grad_norm": 4.7726149559021, "learning_rate": 3.347906100688291e-05, "loss": 0.5255, "step": 542300 }, { "epoch": 7.230939462212209, "grad_norm": 5.671655654907227, "learning_rate": 3.3470439268366374e-05, "loss": 0.4745, "step": 542400 }, { "epoch": 7.232272600018664, "grad_norm": 6.841306686401367, "learning_rate": 3.346181723931561e-05, "loss": 0.4698, "step": 542500 }, { "epoch": 7.233605737825119, "grad_norm": 6.21930456161499, "learning_rate": 3.345319492045242e-05, "loss": 0.5136, "step": 542600 }, { "epoch": 7.234938875631574, "grad_norm": 7.085097789764404, "learning_rate": 3.3444572312498635e-05, "loss": 0.5728, "step": 542700 }, { "epoch": 7.236272013438029, "grad_norm": 73.59645080566406, "learning_rate": 3.343594941617611e-05, "loss": 0.5262, "step": 542800 }, { "epoch": 7.237605151244484, "grad_norm": 11.47912311553955, "learning_rate": 3.342732623220674e-05, "loss": 0.5506, "step": 542900 }, { "epoch": 7.238938289050939, 
"grad_norm": 18.596389770507812, "learning_rate": 3.341870276131242e-05, "loss": 0.564, "step": 543000 }, { "epoch": 7.240271426857395, "grad_norm": 2.418718099594116, "learning_rate": 3.341007900421507e-05, "loss": 0.4694, "step": 543100 }, { "epoch": 7.24160456466385, "grad_norm": 26.543588638305664, "learning_rate": 3.3401454961636664e-05, "loss": 0.5481, "step": 543200 }, { "epoch": 7.2429377024703046, "grad_norm": 7.658297061920166, "learning_rate": 3.339283063429917e-05, "loss": 0.4825, "step": 543300 }, { "epoch": 7.2442708402767595, "grad_norm": 2.7056233882904053, "learning_rate": 3.338420602292459e-05, "loss": 0.4721, "step": 543400 }, { "epoch": 7.245603978083214, "grad_norm": 8.233970642089844, "learning_rate": 3.337558112823495e-05, "loss": 0.4783, "step": 543500 }, { "epoch": 7.246937115889669, "grad_norm": 4.493114948272705, "learning_rate": 3.33669559509523e-05, "loss": 0.4701, "step": 543600 }, { "epoch": 7.248270253696124, "grad_norm": 5.809004783630371, "learning_rate": 3.3358330491798704e-05, "loss": 0.4975, "step": 543700 }, { "epoch": 7.249603391502579, "grad_norm": 4.477581977844238, "learning_rate": 3.334970475149627e-05, "loss": 0.4966, "step": 543800 }, { "epoch": 7.250936529309035, "grad_norm": 1.9204438924789429, "learning_rate": 3.334107873076712e-05, "loss": 0.5208, "step": 543900 }, { "epoch": 7.25226966711549, "grad_norm": 13.70842170715332, "learning_rate": 3.333245243033339e-05, "loss": 0.506, "step": 544000 }, { "epoch": 7.253602804921945, "grad_norm": 18.10363006591797, "learning_rate": 3.332382585091724e-05, "loss": 0.523, "step": 544100 }, { "epoch": 7.2549359427284, "grad_norm": 2.447766065597534, "learning_rate": 3.3315198993240867e-05, "loss": 0.5074, "step": 544200 }, { "epoch": 7.256269080534855, "grad_norm": 3.014385461807251, "learning_rate": 3.33065718580265e-05, "loss": 0.4405, "step": 544300 }, { "epoch": 7.25760221834131, "grad_norm": 16.783906936645508, "learning_rate": 3.329794444599634e-05, "loss": 0.4966, "step": 
544400 }, { "epoch": 7.258935356147765, "grad_norm": 4.635498046875, "learning_rate": 3.328931675787268e-05, "loss": 0.4875, "step": 544500 }, { "epoch": 7.26026849395422, "grad_norm": 3.564997673034668, "learning_rate": 3.328068879437779e-05, "loss": 0.5428, "step": 544600 }, { "epoch": 7.261601631760675, "grad_norm": 4.979461669921875, "learning_rate": 3.327206055623396e-05, "loss": 0.4734, "step": 544700 }, { "epoch": 7.2629347695671305, "grad_norm": 3.052924156188965, "learning_rate": 3.3263432044163544e-05, "loss": 0.4921, "step": 544800 }, { "epoch": 7.264267907373585, "grad_norm": 3.741163730621338, "learning_rate": 3.3254803258888885e-05, "loss": 0.4585, "step": 544900 }, { "epoch": 7.26560104518004, "grad_norm": 7.637219429016113, "learning_rate": 3.324617420113236e-05, "loss": 0.4953, "step": 545000 }, { "epoch": 7.266934182986495, "grad_norm": 2.6368701457977295, "learning_rate": 3.3237544871616366e-05, "loss": 0.5214, "step": 545100 }, { "epoch": 7.26826732079295, "grad_norm": 13.536201477050781, "learning_rate": 3.322891527106332e-05, "loss": 0.4996, "step": 545200 }, { "epoch": 7.269600458599405, "grad_norm": 6.21090841293335, "learning_rate": 3.322028540019566e-05, "loss": 0.4872, "step": 545300 }, { "epoch": 7.27093359640586, "grad_norm": 3.5104808807373047, "learning_rate": 3.321165525973586e-05, "loss": 0.5354, "step": 545400 }, { "epoch": 7.272266734212316, "grad_norm": 4.006621360778809, "learning_rate": 3.320302485040641e-05, "loss": 0.4875, "step": 545500 }, { "epoch": 7.273599872018771, "grad_norm": 3.0566792488098145, "learning_rate": 3.3194394172929805e-05, "loss": 0.5096, "step": 545600 }, { "epoch": 7.274933009825226, "grad_norm": 10.93087387084961, "learning_rate": 3.3185763228028603e-05, "loss": 0.4689, "step": 545700 }, { "epoch": 7.276266147631681, "grad_norm": 6.396059513092041, "learning_rate": 3.317713201642533e-05, "loss": 0.5022, "step": 545800 }, { "epoch": 7.277599285438136, "grad_norm": 4.613943099975586, "learning_rate": 
3.31685005388426e-05, "loss": 0.514, "step": 545900 }, { "epoch": 7.278932423244591, "grad_norm": 2.552232503890991, "learning_rate": 3.315995511474202e-05, "loss": 0.4316, "step": 546000 }, { "epoch": 7.280265561051046, "grad_norm": 11.303472518920898, "learning_rate": 3.3151323110009914e-05, "loss": 0.4437, "step": 546100 }, { "epoch": 7.2815986988575006, "grad_norm": 3.5309202671051025, "learning_rate": 3.314269084145897e-05, "loss": 0.5579, "step": 546200 }, { "epoch": 7.2829318366639555, "grad_norm": 5.667788505554199, "learning_rate": 3.313405830981186e-05, "loss": 0.5182, "step": 546300 }, { "epoch": 7.284264974470411, "grad_norm": 4.04608678817749, "learning_rate": 3.312542551579126e-05, "loss": 0.4789, "step": 546400 }, { "epoch": 7.285598112276866, "grad_norm": 14.245905876159668, "learning_rate": 3.31167924601199e-05, "loss": 0.5059, "step": 546500 }, { "epoch": 7.286931250083321, "grad_norm": 16.5080623626709, "learning_rate": 3.310815914352048e-05, "loss": 0.4538, "step": 546600 }, { "epoch": 7.288264387889776, "grad_norm": 1.0551958084106445, "learning_rate": 3.309952556671579e-05, "loss": 0.4756, "step": 546700 }, { "epoch": 7.289597525696231, "grad_norm": 2.1075265407562256, "learning_rate": 3.309089173042858e-05, "loss": 0.4612, "step": 546800 }, { "epoch": 7.290930663502686, "grad_norm": 14.180752754211426, "learning_rate": 3.3082257635381676e-05, "loss": 0.5216, "step": 546900 }, { "epoch": 7.292263801309141, "grad_norm": 3.1690757274627686, "learning_rate": 3.307362328229786e-05, "loss": 0.5348, "step": 547000 }, { "epoch": 7.293596939115597, "grad_norm": 9.80104923248291, "learning_rate": 3.30649886719e-05, "loss": 0.477, "step": 547100 }, { "epoch": 7.294930076922052, "grad_norm": 9.483053207397461, "learning_rate": 3.305635380491094e-05, "loss": 0.4983, "step": 547200 }, { "epoch": 7.296263214728507, "grad_norm": 4.983628749847412, "learning_rate": 3.304771868205359e-05, "loss": 0.4383, "step": 547300 }, { "epoch": 7.297596352534962, 
"grad_norm": 5.614069938659668, "learning_rate": 3.3039169659091456e-05, "loss": 0.5048, "step": 547400 }, { "epoch": 7.298929490341417, "grad_norm": 3.3809895515441895, "learning_rate": 3.303053402920687e-05, "loss": 0.5429, "step": 547500 }, { "epoch": 7.3002626281478715, "grad_norm": 22.61895751953125, "learning_rate": 3.302189814561552e-05, "loss": 0.5396, "step": 547600 }, { "epoch": 7.3015957659543265, "grad_norm": 30.598873138427734, "learning_rate": 3.30132620090404e-05, "loss": 0.5087, "step": 547700 }, { "epoch": 7.302928903760781, "grad_norm": 4.045563220977783, "learning_rate": 3.3004625620204464e-05, "loss": 0.4775, "step": 547800 }, { "epoch": 7.304262041567236, "grad_norm": 5.071800708770752, "learning_rate": 3.2995988979830753e-05, "loss": 0.4515, "step": 547900 }, { "epoch": 7.305595179373692, "grad_norm": 1.8803786039352417, "learning_rate": 3.298735208864229e-05, "loss": 0.4818, "step": 548000 }, { "epoch": 7.306928317180147, "grad_norm": 42.71408462524414, "learning_rate": 3.297871494736213e-05, "loss": 0.4611, "step": 548100 }, { "epoch": 7.308261454986602, "grad_norm": 7.4912495613098145, "learning_rate": 3.2970077556713335e-05, "loss": 0.5389, "step": 548200 }, { "epoch": 7.309594592793057, "grad_norm": 10.320737838745117, "learning_rate": 3.296143991741901e-05, "loss": 0.4586, "step": 548300 }, { "epoch": 7.310927730599512, "grad_norm": 5.330075740814209, "learning_rate": 3.2952802030202274e-05, "loss": 0.5393, "step": 548400 }, { "epoch": 7.312260868405967, "grad_norm": 3.1133549213409424, "learning_rate": 3.294416389578626e-05, "loss": 0.5708, "step": 548500 }, { "epoch": 7.313594006212422, "grad_norm": 44.09694290161133, "learning_rate": 3.293552551489413e-05, "loss": 0.4609, "step": 548600 }, { "epoch": 7.314927144018878, "grad_norm": 4.625251770019531, "learning_rate": 3.2926886888249055e-05, "loss": 0.5646, "step": 548700 }, { "epoch": 7.316260281825333, "grad_norm": 7.563993453979492, "learning_rate": 3.291824801657423e-05, "loss": 
0.5227, "step": 548800 }, { "epoch": 7.3175934196317876, "grad_norm": 97.40321350097656, "learning_rate": 3.290969529295965e-05, "loss": 0.511, "step": 548900 }, { "epoch": 7.3189265574382425, "grad_norm": 25.26805305480957, "learning_rate": 3.2901055935827255e-05, "loss": 0.4727, "step": 549000 }, { "epoch": 7.320259695244697, "grad_norm": 4.708499908447266, "learning_rate": 3.2892416335827626e-05, "loss": 0.5122, "step": 549100 }, { "epoch": 7.321592833051152, "grad_norm": 9.763969421386719, "learning_rate": 3.288377649368401e-05, "loss": 0.4941, "step": 549200 }, { "epoch": 7.322925970857607, "grad_norm": 3.4517853260040283, "learning_rate": 3.287513641011972e-05, "loss": 0.4537, "step": 549300 }, { "epoch": 7.324259108664062, "grad_norm": 2.856166124343872, "learning_rate": 3.286649608585807e-05, "loss": 0.4906, "step": 549400 }, { "epoch": 7.325592246470517, "grad_norm": 1.861594319343567, "learning_rate": 3.285785552162241e-05, "loss": 0.5272, "step": 549500 }, { "epoch": 7.326925384276973, "grad_norm": 2.368558883666992, "learning_rate": 3.284921471813608e-05, "loss": 0.4316, "step": 549600 }, { "epoch": 7.328258522083428, "grad_norm": 4.553182125091553, "learning_rate": 3.2840573676122486e-05, "loss": 0.4931, "step": 549700 }, { "epoch": 7.329591659889883, "grad_norm": 6.806911468505859, "learning_rate": 3.2831932396305026e-05, "loss": 0.5014, "step": 549800 }, { "epoch": 7.330924797696338, "grad_norm": 8.354768753051758, "learning_rate": 3.28232908794071e-05, "loss": 0.4927, "step": 549900 }, { "epoch": 7.332257935502793, "grad_norm": 4.227484226226807, "learning_rate": 3.2814649126152164e-05, "loss": 0.4952, "step": 550000 }, { "epoch": 7.333591073309248, "grad_norm": 2.829970359802246, "learning_rate": 3.280600713726367e-05, "loss": 0.4661, "step": 550100 }, { "epoch": 7.334924211115703, "grad_norm": 1.229657530784607, "learning_rate": 3.279736491346512e-05, "loss": 0.4307, "step": 550200 }, { "epoch": 7.3362573489221585, "grad_norm": 6.027162075042725, 
"learning_rate": 3.278872245547999e-05, "loss": 0.5342, "step": 550300 }, { "epoch": 7.3375904867286135, "grad_norm": 3.344967842102051, "learning_rate": 3.278016619209955e-05, "loss": 0.5086, "step": 550400 }, { "epoch": 7.338923624535068, "grad_norm": 1.1451328992843628, "learning_rate": 3.277152327023566e-05, "loss": 0.5289, "step": 550500 }, { "epoch": 7.340256762341523, "grad_norm": 5.569802761077881, "learning_rate": 3.276288011634859e-05, "loss": 0.5381, "step": 550600 }, { "epoch": 7.341589900147978, "grad_norm": 4.534663677215576, "learning_rate": 3.27542367311619e-05, "loss": 0.5694, "step": 550700 }, { "epoch": 7.342923037954433, "grad_norm": 3.586465358734131, "learning_rate": 3.274559311539919e-05, "loss": 0.5075, "step": 550800 }, { "epoch": 7.344256175760888, "grad_norm": 5.728774547576904, "learning_rate": 3.273694926978407e-05, "loss": 0.5069, "step": 550900 }, { "epoch": 7.345589313567343, "grad_norm": 3.022101640701294, "learning_rate": 3.272830519504021e-05, "loss": 0.4587, "step": 551000 }, { "epoch": 7.346922451373798, "grad_norm": 11.4894380569458, "learning_rate": 3.271966089189122e-05, "loss": 0.5113, "step": 551100 }, { "epoch": 7.348255589180254, "grad_norm": 6.795777797698975, "learning_rate": 3.2711016361060793e-05, "loss": 0.5241, "step": 551200 }, { "epoch": 7.349588726986709, "grad_norm": 134.3616485595703, "learning_rate": 3.270237160327262e-05, "loss": 0.5143, "step": 551300 }, { "epoch": 7.350921864793164, "grad_norm": 15.281147956848145, "learning_rate": 3.269372661925043e-05, "loss": 0.5276, "step": 551400 }, { "epoch": 7.352255002599619, "grad_norm": 7.208462238311768, "learning_rate": 3.268508140971793e-05, "loss": 0.4709, "step": 551500 }, { "epoch": 7.353588140406074, "grad_norm": 7.378382682800293, "learning_rate": 3.267643597539888e-05, "loss": 0.4553, "step": 551600 }, { "epoch": 7.354921278212529, "grad_norm": 1.3508652448654175, "learning_rate": 3.266779031701705e-05, "loss": 0.444, "step": 551700 }, { "epoch": 
7.3562544160189836, "grad_norm": 2.5676138401031494, "learning_rate": 3.2659144435296216e-05, "loss": 0.4332, "step": 551800 }, { "epoch": 7.357587553825439, "grad_norm": 8.625097274780273, "learning_rate": 3.26504983309602e-05, "loss": 0.4809, "step": 551900 }, { "epoch": 7.358920691631894, "grad_norm": 13.948102951049805, "learning_rate": 3.264185200473281e-05, "loss": 0.5076, "step": 552000 }, { "epoch": 7.360253829438349, "grad_norm": 1.5371419191360474, "learning_rate": 3.2633205457337915e-05, "loss": 0.4865, "step": 552100 }, { "epoch": 7.361586967244804, "grad_norm": 7.765728950500488, "learning_rate": 3.2624558689499345e-05, "loss": 0.4428, "step": 552200 }, { "epoch": 7.362920105051259, "grad_norm": 7.625241756439209, "learning_rate": 3.261591170194099e-05, "loss": 0.5148, "step": 552300 }, { "epoch": 7.364253242857714, "grad_norm": 17.920583724975586, "learning_rate": 3.2607264495386776e-05, "loss": 0.4563, "step": 552400 }, { "epoch": 7.365586380664169, "grad_norm": 2.153799057006836, "learning_rate": 3.259861707056057e-05, "loss": 0.4557, "step": 552500 }, { "epoch": 7.366919518470624, "grad_norm": 9.306193351745605, "learning_rate": 3.258996942818635e-05, "loss": 0.472, "step": 552600 }, { "epoch": 7.368252656277079, "grad_norm": 3.0704102516174316, "learning_rate": 3.258132156898805e-05, "loss": 0.5633, "step": 552700 }, { "epoch": 7.369585794083535, "grad_norm": 13.731754302978516, "learning_rate": 3.257267349368965e-05, "loss": 0.466, "step": 552800 }, { "epoch": 7.37091893188999, "grad_norm": 4.596365928649902, "learning_rate": 3.2564025203015126e-05, "loss": 0.4546, "step": 552900 }, { "epoch": 7.372252069696445, "grad_norm": 3.440836191177368, "learning_rate": 3.25553766976885e-05, "loss": 0.5973, "step": 553000 }, { "epoch": 7.3735852075029, "grad_norm": 3.145191192626953, "learning_rate": 3.25467279784338e-05, "loss": 0.4867, "step": 553100 }, { "epoch": 7.3749183453093545, "grad_norm": 4.799244403839111, "learning_rate": 
3.2538079045975046e-05, "loss": 0.4395, "step": 553200 }, { "epoch": 7.3762514831158095, "grad_norm": 2.4353911876678467, "learning_rate": 3.2529429901036326e-05, "loss": 0.4631, "step": 553300 }, { "epoch": 7.377584620922264, "grad_norm": 8.16669750213623, "learning_rate": 3.2520780544341706e-05, "loss": 0.5516, "step": 553400 }, { "epoch": 7.37891775872872, "grad_norm": 3.736971139907837, "learning_rate": 3.2512130976615285e-05, "loss": 0.494, "step": 553500 }, { "epoch": 7.380250896535175, "grad_norm": 2.711426019668579, "learning_rate": 3.250348119858117e-05, "loss": 0.4959, "step": 553600 }, { "epoch": 7.38158403434163, "grad_norm": 5.785237789154053, "learning_rate": 3.2494831210963514e-05, "loss": 0.4956, "step": 553700 }, { "epoch": 7.382917172148085, "grad_norm": 4.614128589630127, "learning_rate": 3.248618101448644e-05, "loss": 0.4287, "step": 553800 }, { "epoch": 7.38425030995454, "grad_norm": 6.4147186279296875, "learning_rate": 3.2477530609874134e-05, "loss": 0.4754, "step": 553900 }, { "epoch": 7.385583447760995, "grad_norm": 4.260115146636963, "learning_rate": 3.2468879997850775e-05, "loss": 0.5638, "step": 554000 }, { "epoch": 7.38691658556745, "grad_norm": 2.016692638397217, "learning_rate": 3.246022917914057e-05, "loss": 0.5147, "step": 554100 }, { "epoch": 7.388249723373905, "grad_norm": 4.008019924163818, "learning_rate": 3.245157815446773e-05, "loss": 0.4618, "step": 554200 }, { "epoch": 7.38958286118036, "grad_norm": 0.054884783923625946, "learning_rate": 3.244292692455649e-05, "loss": 0.4792, "step": 554300 }, { "epoch": 7.390915998986816, "grad_norm": 4.636067867279053, "learning_rate": 3.243427549013112e-05, "loss": 0.5306, "step": 554400 }, { "epoch": 7.392249136793271, "grad_norm": 3.648001194000244, "learning_rate": 3.242562385191587e-05, "loss": 0.4594, "step": 554500 }, { "epoch": 7.3935822745997255, "grad_norm": 1.9000425338745117, "learning_rate": 3.2416972010635036e-05, "loss": 0.4615, "step": 554600 }, { "epoch": 7.39491541240618, 
"grad_norm": 3.580611228942871, "learning_rate": 3.240831996701293e-05, "loss": 0.5182, "step": 554700 }, { "epoch": 7.396248550212635, "grad_norm": 1.786407232284546, "learning_rate": 3.239966772177387e-05, "loss": 0.5203, "step": 554800 }, { "epoch": 7.39758168801909, "grad_norm": 2.926112174987793, "learning_rate": 3.239101527564219e-05, "loss": 0.4856, "step": 554900 }, { "epoch": 7.398914825825545, "grad_norm": 3.5790023803710938, "learning_rate": 3.238236262934224e-05, "loss": 0.4943, "step": 555000 }, { "epoch": 7.400247963632001, "grad_norm": 4.762354373931885, "learning_rate": 3.2373709783598415e-05, "loss": 0.4886, "step": 555100 }, { "epoch": 7.401581101438456, "grad_norm": 6.9148736000061035, "learning_rate": 3.236505673913508e-05, "loss": 0.4496, "step": 555200 }, { "epoch": 7.402914239244911, "grad_norm": 2.754721164703369, "learning_rate": 3.235640349667665e-05, "loss": 0.4853, "step": 555300 }, { "epoch": 7.404247377051366, "grad_norm": 0.6670891046524048, "learning_rate": 3.234775005694755e-05, "loss": 0.5078, "step": 555400 }, { "epoch": 7.405580514857821, "grad_norm": 3.09795880317688, "learning_rate": 3.233909642067222e-05, "loss": 0.486, "step": 555500 }, { "epoch": 7.406913652664276, "grad_norm": 1.6908621788024902, "learning_rate": 3.23304425885751e-05, "loss": 0.49, "step": 555600 }, { "epoch": 7.408246790470731, "grad_norm": 1.963610291481018, "learning_rate": 3.232178856138068e-05, "loss": 0.4967, "step": 555700 }, { "epoch": 7.409579928277186, "grad_norm": 3.8315539360046387, "learning_rate": 3.2313134339813444e-05, "loss": 0.5213, "step": 555800 }, { "epoch": 7.410913066083641, "grad_norm": 4.705081462860107, "learning_rate": 3.230447992459789e-05, "loss": 0.522, "step": 555900 }, { "epoch": 7.4122462038900965, "grad_norm": 4.728804588317871, "learning_rate": 3.2295825316458534e-05, "loss": 0.5215, "step": 556000 }, { "epoch": 7.413579341696551, "grad_norm": 2.502100706100464, "learning_rate": 3.2287170516119925e-05, "loss": 0.5175, 
"step": 556100 }, { "epoch": 7.414912479503006, "grad_norm": 10.121981620788574, "learning_rate": 3.227851552430661e-05, "loss": 0.4688, "step": 556200 }, { "epoch": 7.416245617309461, "grad_norm": 8.269742965698242, "learning_rate": 3.2269860341743166e-05, "loss": 0.5518, "step": 556300 }, { "epoch": 7.417578755115916, "grad_norm": 9.906683921813965, "learning_rate": 3.226120496915415e-05, "loss": 0.4801, "step": 556400 }, { "epoch": 7.418911892922371, "grad_norm": 8.468199729919434, "learning_rate": 3.2252549407264206e-05, "loss": 0.5886, "step": 556500 }, { "epoch": 7.420245030728826, "grad_norm": 4.652136325836182, "learning_rate": 3.224389365679791e-05, "loss": 0.4767, "step": 556600 }, { "epoch": 7.421578168535282, "grad_norm": 30.477828979492188, "learning_rate": 3.223541083908253e-05, "loss": 0.5484, "step": 556700 }, { "epoch": 7.422911306341737, "grad_norm": 11.408888816833496, "learning_rate": 3.222675471737293e-05, "loss": 0.4613, "step": 556800 }, { "epoch": 7.424244444148192, "grad_norm": 4.714017868041992, "learning_rate": 3.221809840924642e-05, "loss": 0.5107, "step": 556900 }, { "epoch": 7.425577581954647, "grad_norm": 1.4211876392364502, "learning_rate": 3.2209441915427716e-05, "loss": 0.5131, "step": 557000 }, { "epoch": 7.426910719761102, "grad_norm": 8.081600189208984, "learning_rate": 3.220078523664151e-05, "loss": 0.4933, "step": 557100 }, { "epoch": 7.428243857567557, "grad_norm": 7.737690448760986, "learning_rate": 3.219212837361249e-05, "loss": 0.5115, "step": 557200 }, { "epoch": 7.429576995374012, "grad_norm": 12.499587059020996, "learning_rate": 3.218347132706539e-05, "loss": 0.5318, "step": 557300 }, { "epoch": 7.430910133180467, "grad_norm": 4.802304744720459, "learning_rate": 3.2174814097724956e-05, "loss": 0.4993, "step": 557400 }, { "epoch": 7.4322432709869215, "grad_norm": 1.798892617225647, "learning_rate": 3.2166156686315944e-05, "loss": 0.4281, "step": 557500 }, { "epoch": 7.433576408793377, "grad_norm": 8.262333869934082, 
"learning_rate": 3.215749909356311e-05, "loss": 0.4849, "step": 557600 }, { "epoch": 7.434909546599832, "grad_norm": 4.335300922393799, "learning_rate": 3.214884132019127e-05, "loss": 0.5266, "step": 557700 }, { "epoch": 7.436242684406287, "grad_norm": 2.971168041229248, "learning_rate": 3.21401833669252e-05, "loss": 0.4933, "step": 557800 }, { "epoch": 7.437575822212742, "grad_norm": 2.496009111404419, "learning_rate": 3.213152523448973e-05, "loss": 0.4358, "step": 557900 }, { "epoch": 7.438908960019197, "grad_norm": 22.461322784423828, "learning_rate": 3.212286692360968e-05, "loss": 0.5157, "step": 558000 }, { "epoch": 7.440242097825652, "grad_norm": 2.8346128463745117, "learning_rate": 3.2114208435009904e-05, "loss": 0.4854, "step": 558100 }, { "epoch": 7.441575235632107, "grad_norm": 3.2523632049560547, "learning_rate": 3.210554976941527e-05, "loss": 0.4723, "step": 558200 }, { "epoch": 7.442908373438562, "grad_norm": 0.9231402277946472, "learning_rate": 3.209689092755064e-05, "loss": 0.4697, "step": 558300 }, { "epoch": 7.444241511245018, "grad_norm": 4.067203521728516, "learning_rate": 3.208831850118159e-05, "loss": 0.4642, "step": 558400 }, { "epoch": 7.445574649051473, "grad_norm": 7.839169979095459, "learning_rate": 3.207965931069628e-05, "loss": 0.4451, "step": 558500 }, { "epoch": 7.446907786857928, "grad_norm": 5.8508830070495605, "learning_rate": 3.207099994610845e-05, "loss": 0.4534, "step": 558600 }, { "epoch": 7.448240924664383, "grad_norm": 6.2296462059021, "learning_rate": 3.206234040814303e-05, "loss": 0.4219, "step": 558700 }, { "epoch": 7.4495740624708375, "grad_norm": 3.8961620330810547, "learning_rate": 3.205368069752498e-05, "loss": 0.5355, "step": 558800 }, { "epoch": 7.4509072002772925, "grad_norm": 23.649045944213867, "learning_rate": 3.2045020814979246e-05, "loss": 0.5188, "step": 558900 }, { "epoch": 7.452240338083747, "grad_norm": 14.44454574584961, "learning_rate": 3.203636076123083e-05, "loss": 0.4901, "step": 559000 }, { "epoch": 
7.453573475890202, "grad_norm": 4.973350524902344, "learning_rate": 3.20277005370047e-05, "loss": 0.5168, "step": 559100 }, { "epoch": 7.454906613696658, "grad_norm": 4.3107099533081055, "learning_rate": 3.201904014302587e-05, "loss": 0.5736, "step": 559200 }, { "epoch": 7.456239751503113, "grad_norm": 4.448544025421143, "learning_rate": 3.2010379580019374e-05, "loss": 0.5056, "step": 559300 }, { "epoch": 7.457572889309568, "grad_norm": 4.712948799133301, "learning_rate": 3.200171884871024e-05, "loss": 0.4981, "step": 559400 }, { "epoch": 7.458906027116023, "grad_norm": 1.4938583374023438, "learning_rate": 3.199305794982349e-05, "loss": 0.4497, "step": 559500 }, { "epoch": 7.460239164922478, "grad_norm": 2.660489320755005, "learning_rate": 3.198439688408424e-05, "loss": 0.386, "step": 559600 }, { "epoch": 7.461572302728933, "grad_norm": 6.87302827835083, "learning_rate": 3.197573565221752e-05, "loss": 0.49, "step": 559700 }, { "epoch": 7.462905440535388, "grad_norm": 15.871296882629395, "learning_rate": 3.196707425494845e-05, "loss": 0.4229, "step": 559800 }, { "epoch": 7.464238578341843, "grad_norm": 5.759802341461182, "learning_rate": 3.1958412693002115e-05, "loss": 0.5046, "step": 559900 }, { "epoch": 7.465571716148299, "grad_norm": 74.49104309082031, "learning_rate": 3.194975096710365e-05, "loss": 0.5616, "step": 560000 }, { "epoch": 7.466904853954754, "grad_norm": 2.1783180236816406, "learning_rate": 3.194108907797818e-05, "loss": 0.485, "step": 560100 }, { "epoch": 7.4682379917612085, "grad_norm": 38.31270217895508, "learning_rate": 3.193242702635084e-05, "loss": 0.4922, "step": 560200 }, { "epoch": 7.4695711295676634, "grad_norm": 2.6193079948425293, "learning_rate": 3.192376481294682e-05, "loss": 0.4839, "step": 560300 }, { "epoch": 7.470904267374118, "grad_norm": 18.853029251098633, "learning_rate": 3.191510243849125e-05, "loss": 0.4859, "step": 560400 }, { "epoch": 7.472237405180573, "grad_norm": 3.370605945587158, "learning_rate": 3.1906439903709355e-05, 
"loss": 0.4567, "step": 560500 }, { "epoch": 7.473570542987028, "grad_norm": 1.0544633865356445, "learning_rate": 3.1897777209326326e-05, "loss": 0.4814, "step": 560600 }, { "epoch": 7.474903680793483, "grad_norm": 8.769233703613281, "learning_rate": 3.188911435606736e-05, "loss": 0.5183, "step": 560700 }, { "epoch": 7.476236818599939, "grad_norm": 3.344935655593872, "learning_rate": 3.1880451344657706e-05, "loss": 0.5147, "step": 560800 }, { "epoch": 7.477569956406394, "grad_norm": 0.3057209849357605, "learning_rate": 3.187178817582258e-05, "loss": 0.5302, "step": 560900 }, { "epoch": 7.478903094212849, "grad_norm": 9.279345512390137, "learning_rate": 3.1863124850287264e-05, "loss": 0.4629, "step": 561000 }, { "epoch": 7.480236232019304, "grad_norm": 2.971364974975586, "learning_rate": 3.1854461368777e-05, "loss": 0.4706, "step": 561100 }, { "epoch": 7.481569369825759, "grad_norm": 2.1664445400238037, "learning_rate": 3.184579773201707e-05, "loss": 0.4416, "step": 561200 }, { "epoch": 7.482902507632214, "grad_norm": 0.956703782081604, "learning_rate": 3.183722057940814e-05, "loss": 0.4862, "step": 561300 }, { "epoch": 7.484235645438669, "grad_norm": 25.786457061767578, "learning_rate": 3.1828556635859185e-05, "loss": 0.4843, "step": 561400 }, { "epoch": 7.485568783245124, "grad_norm": 2.9556751251220703, "learning_rate": 3.181989253922924e-05, "loss": 0.515, "step": 561500 }, { "epoch": 7.4869019210515795, "grad_norm": 12.277351379394531, "learning_rate": 3.181122829024362e-05, "loss": 0.5297, "step": 561600 }, { "epoch": 7.488235058858034, "grad_norm": 3.6087470054626465, "learning_rate": 3.180256388962767e-05, "loss": 0.472, "step": 561700 }, { "epoch": 7.489568196664489, "grad_norm": 16.58281898498535, "learning_rate": 3.179389933810677e-05, "loss": 0.5054, "step": 561800 }, { "epoch": 7.490901334470944, "grad_norm": 3.9621849060058594, "learning_rate": 3.178523463640627e-05, "loss": 0.55, "step": 561900 }, { "epoch": 7.492234472277399, "grad_norm": 
5.312218189239502, "learning_rate": 3.177656978525155e-05, "loss": 0.5097, "step": 562000 }, { "epoch": 7.493567610083854, "grad_norm": 1.5677305459976196, "learning_rate": 3.1767904785368e-05, "loss": 0.5198, "step": 562100 }, { "epoch": 7.494900747890309, "grad_norm": 5.477270603179932, "learning_rate": 3.175923963748105e-05, "loss": 0.4463, "step": 562200 }, { "epoch": 7.496233885696764, "grad_norm": 3.210580348968506, "learning_rate": 3.175057434231609e-05, "loss": 0.517, "step": 562300 }, { "epoch": 7.49756702350322, "grad_norm": 3.91300892829895, "learning_rate": 3.174190890059856e-05, "loss": 0.522, "step": 562400 }, { "epoch": 7.498900161309675, "grad_norm": 10.02996826171875, "learning_rate": 3.173324331305391e-05, "loss": 0.4535, "step": 562500 }, { "epoch": 7.50023329911613, "grad_norm": 10.525004386901855, "learning_rate": 3.172457758040761e-05, "loss": 0.5073, "step": 562600 }, { "epoch": 7.501566436922585, "grad_norm": 1.0644351243972778, "learning_rate": 3.17159117033851e-05, "loss": 0.4348, "step": 562700 }, { "epoch": 7.50289957472904, "grad_norm": 4.631031036376953, "learning_rate": 3.170724568271187e-05, "loss": 0.4467, "step": 562800 }, { "epoch": 7.504232712535495, "grad_norm": 1.735280156135559, "learning_rate": 3.169857951911342e-05, "loss": 0.5295, "step": 562900 }, { "epoch": 7.50556585034195, "grad_norm": 27.37042999267578, "learning_rate": 3.1689913213315244e-05, "loss": 0.5172, "step": 563000 }, { "epoch": 7.506898988148405, "grad_norm": 2.7515034675598145, "learning_rate": 3.168124676604285e-05, "loss": 0.5429, "step": 563100 }, { "epoch": 7.50823212595486, "grad_norm": 5.367389678955078, "learning_rate": 3.1672580178021786e-05, "loss": 0.4811, "step": 563200 }, { "epoch": 7.509565263761315, "grad_norm": 24.70200538635254, "learning_rate": 3.166391344997759e-05, "loss": 0.5339, "step": 563300 }, { "epoch": 7.51089840156777, "grad_norm": 3.332958221435547, "learning_rate": 3.1655246582635794e-05, "loss": 0.5652, "step": 563400 }, { 
"epoch": 7.512231539374225, "grad_norm": 2.5279202461242676, "learning_rate": 3.164657957672198e-05, "loss": 0.4645, "step": 563500 }, { "epoch": 7.51356467718068, "grad_norm": 1.448872685432434, "learning_rate": 3.163791243296171e-05, "loss": 0.4676, "step": 563600 }, { "epoch": 7.514897814987135, "grad_norm": 4.677046298980713, "learning_rate": 3.1629245152080566e-05, "loss": 0.5459, "step": 563700 }, { "epoch": 7.51623095279359, "grad_norm": 5.3716840744018555, "learning_rate": 3.162057773480417e-05, "loss": 0.4829, "step": 563800 }, { "epoch": 7.517564090600045, "grad_norm": 8.784220695495605, "learning_rate": 3.16119101818581e-05, "loss": 0.4504, "step": 563900 }, { "epoch": 7.518897228406501, "grad_norm": 3.4041426181793213, "learning_rate": 3.1603242493967996e-05, "loss": 0.373, "step": 564000 }, { "epoch": 7.520230366212956, "grad_norm": 3.6045989990234375, "learning_rate": 3.159457467185948e-05, "loss": 0.5268, "step": 564100 }, { "epoch": 7.521563504019411, "grad_norm": 0.724831223487854, "learning_rate": 3.158590671625821e-05, "loss": 0.4652, "step": 564200 }, { "epoch": 7.522896641825866, "grad_norm": 2.555330276489258, "learning_rate": 3.1577238627889813e-05, "loss": 0.4873, "step": 564300 }, { "epoch": 7.5242297796323205, "grad_norm": 5.517434120178223, "learning_rate": 3.1568570407479984e-05, "loss": 0.4588, "step": 564400 }, { "epoch": 7.5255629174387755, "grad_norm": 2.885845422744751, "learning_rate": 3.155990205575438e-05, "loss": 0.521, "step": 564500 }, { "epoch": 7.52689605524523, "grad_norm": 1.139628291130066, "learning_rate": 3.155123357343869e-05, "loss": 0.4921, "step": 564600 }, { "epoch": 7.528229193051686, "grad_norm": 6.321044445037842, "learning_rate": 3.154256496125863e-05, "loss": 0.4915, "step": 564700 }, { "epoch": 7.529562330858141, "grad_norm": 5.3132429122924805, "learning_rate": 3.153389621993987e-05, "loss": 0.5569, "step": 564800 }, { "epoch": 7.530895468664596, "grad_norm": 2.787217617034912, "learning_rate": 
3.152522735020817e-05, "loss": 0.5065, "step": 564900 }, { "epoch": 7.532228606471051, "grad_norm": 3.265589952468872, "learning_rate": 3.1516558352789244e-05, "loss": 0.4483, "step": 565000 }, { "epoch": 7.533561744277506, "grad_norm": 1.9324654340744019, "learning_rate": 3.150788922840883e-05, "loss": 0.4616, "step": 565100 }, { "epoch": 7.534894882083961, "grad_norm": 2.4086954593658447, "learning_rate": 3.1499219977792686e-05, "loss": 0.5126, "step": 565200 }, { "epoch": 7.536228019890416, "grad_norm": 8.562496185302734, "learning_rate": 3.149055060166656e-05, "loss": 0.4656, "step": 565300 }, { "epoch": 7.537561157696871, "grad_norm": 2.014115810394287, "learning_rate": 3.148188110075625e-05, "loss": 0.5372, "step": 565400 }, { "epoch": 7.538894295503326, "grad_norm": 2.387145757675171, "learning_rate": 3.147321147578752e-05, "loss": 0.5069, "step": 565500 }, { "epoch": 7.540227433309781, "grad_norm": 4.241783618927002, "learning_rate": 3.146454172748617e-05, "loss": 0.5456, "step": 565600 }, { "epoch": 7.541560571116237, "grad_norm": 3.0603442192077637, "learning_rate": 3.1455871856578e-05, "loss": 0.513, "step": 565700 }, { "epoch": 7.5428937089226915, "grad_norm": 4.7452216148376465, "learning_rate": 3.144720186378883e-05, "loss": 0.5217, "step": 565800 }, { "epoch": 7.5442268467291465, "grad_norm": 7.838562965393066, "learning_rate": 3.143853174984449e-05, "loss": 0.4686, "step": 565900 }, { "epoch": 7.545559984535601, "grad_norm": 2.6580917835235596, "learning_rate": 3.142986151547079e-05, "loss": 0.5016, "step": 566000 }, { "epoch": 7.546893122342056, "grad_norm": 14.499248504638672, "learning_rate": 3.14211911613936e-05, "loss": 0.4526, "step": 566100 }, { "epoch": 7.548226260148511, "grad_norm": 7.736771583557129, "learning_rate": 3.141252068833876e-05, "loss": 0.5122, "step": 566200 }, { "epoch": 7.549559397954966, "grad_norm": 6.750231742858887, "learning_rate": 3.140385009703215e-05, "loss": 0.4537, "step": 566300 }, { "epoch": 7.550892535761422, 
"grad_norm": 5.453547954559326, "learning_rate": 3.1395179388199635e-05, "loss": 0.5039, "step": 566400 }, { "epoch": 7.552225673567877, "grad_norm": 10.871614456176758, "learning_rate": 3.138650856256709e-05, "loss": 0.4244, "step": 566500 }, { "epoch": 7.553558811374332, "grad_norm": 31.592899322509766, "learning_rate": 3.137783762086043e-05, "loss": 0.5223, "step": 566600 }, { "epoch": 7.554891949180787, "grad_norm": 5.725273609161377, "learning_rate": 3.1369166563805545e-05, "loss": 0.4962, "step": 566700 }, { "epoch": 7.556225086987242, "grad_norm": 10.98172378540039, "learning_rate": 3.136049539212835e-05, "loss": 0.5645, "step": 566800 }, { "epoch": 7.557558224793697, "grad_norm": 5.36147928237915, "learning_rate": 3.135182410655477e-05, "loss": 0.4641, "step": 566900 }, { "epoch": 7.558891362600152, "grad_norm": 2.8910727500915527, "learning_rate": 3.134315270781073e-05, "loss": 0.5489, "step": 567000 }, { "epoch": 7.560224500406607, "grad_norm": 1.04542076587677, "learning_rate": 3.133448119662219e-05, "loss": 0.4779, "step": 567100 }, { "epoch": 7.561557638213062, "grad_norm": 4.056291103363037, "learning_rate": 3.132580957371508e-05, "loss": 0.4727, "step": 567200 }, { "epoch": 7.562890776019517, "grad_norm": 1.9435396194458008, "learning_rate": 3.131713783981538e-05, "loss": 0.5128, "step": 567300 }, { "epoch": 7.564223913825972, "grad_norm": 8.585222244262695, "learning_rate": 3.130846599564905e-05, "loss": 0.5353, "step": 567400 }, { "epoch": 7.565557051632427, "grad_norm": 7.906815528869629, "learning_rate": 3.1299794041942075e-05, "loss": 0.4854, "step": 567500 }, { "epoch": 7.566890189438882, "grad_norm": 2.3343331813812256, "learning_rate": 3.129112197942043e-05, "loss": 0.4856, "step": 567600 }, { "epoch": 7.568223327245337, "grad_norm": 3.151662588119507, "learning_rate": 3.1282449808810136e-05, "loss": 0.3902, "step": 567700 }, { "epoch": 7.569556465051792, "grad_norm": 0.7186216115951538, "learning_rate": 3.127377753083718e-05, "loss": 0.4512, 
"step": 567800 }, { "epoch": 7.570889602858247, "grad_norm": 4.100468635559082, "learning_rate": 3.126510514622758e-05, "loss": 0.4452, "step": 567900 }, { "epoch": 7.572222740664703, "grad_norm": 3.822277069091797, "learning_rate": 3.125643265570737e-05, "loss": 0.4562, "step": 568000 }, { "epoch": 7.573555878471158, "grad_norm": 4.551997661590576, "learning_rate": 3.124776006000258e-05, "loss": 0.4556, "step": 568100 }, { "epoch": 7.574889016277613, "grad_norm": 1.8984633684158325, "learning_rate": 3.123908735983924e-05, "loss": 0.5046, "step": 568200 }, { "epoch": 7.576222154084068, "grad_norm": 10.463957786560059, "learning_rate": 3.1230414555943424e-05, "loss": 0.5223, "step": 568300 }, { "epoch": 7.577555291890523, "grad_norm": 3.0663743019104004, "learning_rate": 3.122174164904118e-05, "loss": 0.4847, "step": 568400 }, { "epoch": 7.578888429696978, "grad_norm": 7.082832336425781, "learning_rate": 3.121306863985857e-05, "loss": 0.4583, "step": 568500 }, { "epoch": 7.580221567503433, "grad_norm": 30.962383270263672, "learning_rate": 3.1204395529121684e-05, "loss": 0.5351, "step": 568600 }, { "epoch": 7.5815547053098875, "grad_norm": 1.5664913654327393, "learning_rate": 3.11957223175566e-05, "loss": 0.5041, "step": 568700 }, { "epoch": 7.5828878431163425, "grad_norm": 1.6064164638519287, "learning_rate": 3.118704900588941e-05, "loss": 0.4639, "step": 568800 }, { "epoch": 7.584220980922798, "grad_norm": 8.797924041748047, "learning_rate": 3.117837559484622e-05, "loss": 0.4885, "step": 568900 }, { "epoch": 7.585554118729253, "grad_norm": 18.040985107421875, "learning_rate": 3.116970208515314e-05, "loss": 0.4941, "step": 569000 }, { "epoch": 7.586887256535708, "grad_norm": 5.558032035827637, "learning_rate": 3.116111521409479e-05, "loss": 0.4917, "step": 569100 }, { "epoch": 7.588220394342163, "grad_norm": 3.492983818054199, "learning_rate": 3.115244151024867e-05, "loss": 0.4807, "step": 569200 }, { "epoch": 7.589553532148618, "grad_norm": 1.5618106126785278, 
"learning_rate": 3.114376770992379e-05, "loss": 0.5293, "step": 569300 }, { "epoch": 7.590886669955073, "grad_norm": 5.749027729034424, "learning_rate": 3.113509381384628e-05, "loss": 0.505, "step": 569400 }, { "epoch": 7.592219807761528, "grad_norm": 4.939303874969482, "learning_rate": 3.112641982274229e-05, "loss": 0.5495, "step": 569500 }, { "epoch": 7.593552945567984, "grad_norm": 3.74672532081604, "learning_rate": 3.111774573733799e-05, "loss": 0.508, "step": 569600 }, { "epoch": 7.594886083374439, "grad_norm": 3.6204020977020264, "learning_rate": 3.110907155835953e-05, "loss": 0.4434, "step": 569700 }, { "epoch": 7.596219221180894, "grad_norm": 5.873044490814209, "learning_rate": 3.11003972865331e-05, "loss": 0.5118, "step": 569800 }, { "epoch": 7.597552358987349, "grad_norm": 7.176312446594238, "learning_rate": 3.1091722922584884e-05, "loss": 0.5107, "step": 569900 }, { "epoch": 7.5988854967938035, "grad_norm": 3.972724676132202, "learning_rate": 3.108304846724107e-05, "loss": 0.504, "step": 570000 }, { "epoch": 7.6002186346002585, "grad_norm": 3.756805658340454, "learning_rate": 3.107437392122783e-05, "loss": 0.5007, "step": 570100 }, { "epoch": 7.601551772406713, "grad_norm": 3.1476054191589355, "learning_rate": 3.1065699285271413e-05, "loss": 0.4425, "step": 570200 }, { "epoch": 7.602884910213168, "grad_norm": 14.639533996582031, "learning_rate": 3.1057024560098e-05, "loss": 0.464, "step": 570300 }, { "epoch": 7.604218048019623, "grad_norm": 3.318371057510376, "learning_rate": 3.1048349746433825e-05, "loss": 0.4935, "step": 570400 }, { "epoch": 7.605551185826079, "grad_norm": 4.884493827819824, "learning_rate": 3.1039674845005126e-05, "loss": 0.4554, "step": 570500 }, { "epoch": 7.606884323632534, "grad_norm": 5.313143253326416, "learning_rate": 3.103099985653811e-05, "loss": 0.4863, "step": 570600 }, { "epoch": 7.608217461438989, "grad_norm": 4.4491496086120605, "learning_rate": 3.102232478175904e-05, "loss": 0.4999, "step": 570700 }, { "epoch": 
7.609550599245444, "grad_norm": 4.233879566192627, "learning_rate": 3.101364962139416e-05, "loss": 0.5291, "step": 570800 }, { "epoch": 7.610883737051899, "grad_norm": 4.7617926597595215, "learning_rate": 3.100497437616974e-05, "loss": 0.4314, "step": 570900 }, { "epoch": 7.612216874858354, "grad_norm": 3.3471949100494385, "learning_rate": 3.0996299046812025e-05, "loss": 0.4333, "step": 571000 }, { "epoch": 7.613550012664809, "grad_norm": 0.7505238056182861, "learning_rate": 3.0987623634047296e-05, "loss": 0.4703, "step": 571100 }, { "epoch": 7.614883150471265, "grad_norm": 3.482792615890503, "learning_rate": 3.097903489396316e-05, "loss": 0.5324, "step": 571200 }, { "epoch": 7.61621628827772, "grad_norm": 3.277397871017456, "learning_rate": 3.09703593173792e-05, "loss": 0.541, "step": 571300 }, { "epoch": 7.6175494260841745, "grad_norm": 3.3899550437927246, "learning_rate": 3.096168365955981e-05, "loss": 0.5045, "step": 571400 }, { "epoch": 7.6188825638906295, "grad_norm": 18.776580810546875, "learning_rate": 3.0953007921231284e-05, "loss": 0.5355, "step": 571500 }, { "epoch": 7.620215701697084, "grad_norm": 3.4625847339630127, "learning_rate": 3.094433210311995e-05, "loss": 0.4999, "step": 571600 }, { "epoch": 7.621548839503539, "grad_norm": 4.860867977142334, "learning_rate": 3.0935656205952106e-05, "loss": 0.4684, "step": 571700 }, { "epoch": 7.622881977309994, "grad_norm": 1.486431360244751, "learning_rate": 3.092698023045408e-05, "loss": 0.4277, "step": 571800 }, { "epoch": 7.624215115116449, "grad_norm": 7.497514724731445, "learning_rate": 3.091830417735218e-05, "loss": 0.533, "step": 571900 }, { "epoch": 7.625548252922904, "grad_norm": 3.7486674785614014, "learning_rate": 3.090962804737275e-05, "loss": 0.4816, "step": 572000 }, { "epoch": 7.62688139072936, "grad_norm": 5.796505928039551, "learning_rate": 3.0901038603678e-05, "loss": 0.4622, "step": 572100 }, { "epoch": 7.628214528535815, "grad_norm": 6.024917125701904, "learning_rate": 3.08923623228732e-05, 
"loss": 0.4903, "step": 572200 }, { "epoch": 7.62954766634227, "grad_norm": 27.370920181274414, "learning_rate": 3.088368596736262e-05, "loss": 0.4524, "step": 572300 }, { "epoch": 7.630880804148725, "grad_norm": 0.8891161680221558, "learning_rate": 3.087500953787264e-05, "loss": 0.4435, "step": 572400 }, { "epoch": 7.63221394195518, "grad_norm": 5.172715663909912, "learning_rate": 3.0866333035129624e-05, "loss": 0.4846, "step": 572500 }, { "epoch": 7.633547079761635, "grad_norm": 42.74033737182617, "learning_rate": 3.085765645985994e-05, "loss": 0.5426, "step": 572600 }, { "epoch": 7.63488021756809, "grad_norm": 3.633256673812866, "learning_rate": 3.084897981278996e-05, "loss": 0.5108, "step": 572700 }, { "epoch": 7.6362133553745455, "grad_norm": 2.3511743545532227, "learning_rate": 3.084030309464606e-05, "loss": 0.5741, "step": 572800 }, { "epoch": 7.637546493181, "grad_norm": 3.7329914569854736, "learning_rate": 3.0831626306154634e-05, "loss": 0.4602, "step": 572900 }, { "epoch": 7.638879630987455, "grad_norm": 1.966962218284607, "learning_rate": 3.0822949448042075e-05, "loss": 0.4429, "step": 573000 }, { "epoch": 7.64021276879391, "grad_norm": 9.183927536010742, "learning_rate": 3.081427252103479e-05, "loss": 0.4791, "step": 573100 }, { "epoch": 7.641545906600365, "grad_norm": 12.800286293029785, "learning_rate": 3.080559552585916e-05, "loss": 0.4914, "step": 573200 }, { "epoch": 7.64287904440682, "grad_norm": 1.2299764156341553, "learning_rate": 3.0796918463241634e-05, "loss": 0.4876, "step": 573300 }, { "epoch": 7.644212182213275, "grad_norm": 7.018562316894531, "learning_rate": 3.078824133390859e-05, "loss": 0.4649, "step": 573400 }, { "epoch": 7.64554532001973, "grad_norm": 2.661900281906128, "learning_rate": 3.0779564138586476e-05, "loss": 0.4236, "step": 573500 }, { "epoch": 7.646878457826185, "grad_norm": 5.682485580444336, "learning_rate": 3.077088687800171e-05, "loss": 0.4963, "step": 573600 }, { "epoch": 7.648211595632641, "grad_norm": 
5.368531703948975, "learning_rate": 3.076220955288072e-05, "loss": 0.5123, "step": 573700 }, { "epoch": 7.649544733439096, "grad_norm": 4.174295902252197, "learning_rate": 3.0753532163949946e-05, "loss": 0.4141, "step": 573800 }, { "epoch": 7.650877871245551, "grad_norm": 4.616021633148193, "learning_rate": 3.074485471193583e-05, "loss": 0.4682, "step": 573900 }, { "epoch": 7.652211009052006, "grad_norm": 10.815668106079102, "learning_rate": 3.073617719756484e-05, "loss": 0.4855, "step": 574000 }, { "epoch": 7.653544146858461, "grad_norm": 5.005978584289551, "learning_rate": 3.07274996215634e-05, "loss": 0.423, "step": 574100 }, { "epoch": 7.654877284664916, "grad_norm": 8.9451265335083, "learning_rate": 3.0718821984658e-05, "loss": 0.4753, "step": 574200 }, { "epoch": 7.6562104224713705, "grad_norm": 10.47827434539795, "learning_rate": 3.071014428757508e-05, "loss": 0.5244, "step": 574300 }, { "epoch": 7.657543560277826, "grad_norm": 6.136248588562012, "learning_rate": 3.070146653104113e-05, "loss": 0.4768, "step": 574400 }, { "epoch": 7.658876698084281, "grad_norm": 4.916762828826904, "learning_rate": 3.06927887157826e-05, "loss": 0.4567, "step": 574500 }, { "epoch": 7.660209835890736, "grad_norm": 12.258155822753906, "learning_rate": 3.068411084252599e-05, "loss": 0.5146, "step": 574600 }, { "epoch": 7.661542973697191, "grad_norm": 2.9152371883392334, "learning_rate": 3.067543291199778e-05, "loss": 0.4263, "step": 574700 }, { "epoch": 7.662876111503646, "grad_norm": 0.9222007989883423, "learning_rate": 3.066684170507269e-05, "loss": 0.4812, "step": 574800 }, { "epoch": 7.664209249310101, "grad_norm": 5.4136152267456055, "learning_rate": 3.0658163662735325e-05, "loss": 0.4982, "step": 574900 }, { "epoch": 7.665542387116556, "grad_norm": 18.394521713256836, "learning_rate": 3.0649485565298585e-05, "loss": 0.5268, "step": 575000 }, { "epoch": 7.666875524923011, "grad_norm": 33.94581985473633, "learning_rate": 3.064080741348895e-05, "loss": 0.4585, "step": 575100 }, 
{ "epoch": 7.668208662729466, "grad_norm": 6.351477146148682, "learning_rate": 3.063212920803295e-05, "loss": 0.4805, "step": 575200 }, { "epoch": 7.669541800535922, "grad_norm": 3.5928051471710205, "learning_rate": 3.062345094965707e-05, "loss": 0.4853, "step": 575300 }, { "epoch": 7.670874938342377, "grad_norm": 2.1460001468658447, "learning_rate": 3.061477263908786e-05, "loss": 0.5156, "step": 575400 }, { "epoch": 7.672208076148832, "grad_norm": 3.9954025745391846, "learning_rate": 3.0606094277051814e-05, "loss": 0.5666, "step": 575500 }, { "epoch": 7.6735412139552865, "grad_norm": 58.01121139526367, "learning_rate": 3.059741586427545e-05, "loss": 0.4413, "step": 575600 }, { "epoch": 7.6748743517617415, "grad_norm": 43.73873519897461, "learning_rate": 3.058873740148533e-05, "loss": 0.5012, "step": 575700 }, { "epoch": 7.676207489568196, "grad_norm": 6.041365146636963, "learning_rate": 3.0580058889407967e-05, "loss": 0.4426, "step": 575800 }, { "epoch": 7.677540627374651, "grad_norm": 1.9126763343811035, "learning_rate": 3.057138032876991e-05, "loss": 0.4447, "step": 575900 }, { "epoch": 7.678873765181107, "grad_norm": 16.21956443786621, "learning_rate": 3.0562701720297676e-05, "loss": 0.4894, "step": 576000 }, { "epoch": 7.680206902987562, "grad_norm": 3.4523138999938965, "learning_rate": 3.055402306471784e-05, "loss": 0.5175, "step": 576100 }, { "epoch": 7.681540040794017, "grad_norm": 16.86785125732422, "learning_rate": 3.054534436275694e-05, "loss": 0.4947, "step": 576200 }, { "epoch": 7.682873178600472, "grad_norm": 8.060080528259277, "learning_rate": 3.0536665615141516e-05, "loss": 0.5133, "step": 576300 }, { "epoch": 7.684206316406927, "grad_norm": 9.07410717010498, "learning_rate": 3.052798682259816e-05, "loss": 0.4289, "step": 576400 }, { "epoch": 7.685539454213382, "grad_norm": 8.703191757202148, "learning_rate": 3.05193079858534e-05, "loss": 0.4658, "step": 576500 }, { "epoch": 7.686872592019837, "grad_norm": 5.970273494720459, "learning_rate": 
3.0510629105633806e-05, "loss": 0.4932, "step": 576600 }, { "epoch": 7.688205729826292, "grad_norm": 1.5641499757766724, "learning_rate": 3.0501950182665955e-05, "loss": 0.4516, "step": 576700 }, { "epoch": 7.689538867632747, "grad_norm": 5.1286420822143555, "learning_rate": 3.0493271217676424e-05, "loss": 0.5051, "step": 576800 }, { "epoch": 7.690872005439203, "grad_norm": 10.997450828552246, "learning_rate": 3.048459221139178e-05, "loss": 0.5073, "step": 576900 }, { "epoch": 7.6922051432456575, "grad_norm": 1.7398574352264404, "learning_rate": 3.04759131645386e-05, "loss": 0.4801, "step": 577000 }, { "epoch": 7.6935382810521125, "grad_norm": 3.579380989074707, "learning_rate": 3.0467234077843478e-05, "loss": 0.4982, "step": 577100 }, { "epoch": 7.694871418858567, "grad_norm": 3.043131113052368, "learning_rate": 3.0458554952032985e-05, "loss": 0.4205, "step": 577200 }, { "epoch": 7.696204556665022, "grad_norm": 1.9618897438049316, "learning_rate": 3.0449875787833712e-05, "loss": 0.5011, "step": 577300 }, { "epoch": 7.697537694471477, "grad_norm": 2.283190965652466, "learning_rate": 3.0441196585972258e-05, "loss": 0.5268, "step": 577400 }, { "epoch": 7.698870832277932, "grad_norm": 7.549167156219482, "learning_rate": 3.0432517347175234e-05, "loss": 0.432, "step": 577500 }, { "epoch": 7.700203970084388, "grad_norm": 0.9561374187469482, "learning_rate": 3.04238380721692e-05, "loss": 0.4119, "step": 577600 }, { "epoch": 7.701537107890843, "grad_norm": 6.7019524574279785, "learning_rate": 3.041515876168079e-05, "loss": 0.4944, "step": 577700 }, { "epoch": 7.702870245697298, "grad_norm": 3.840529203414917, "learning_rate": 3.0406479416436586e-05, "loss": 0.4282, "step": 577800 }, { "epoch": 7.704203383503753, "grad_norm": 2.1438655853271484, "learning_rate": 3.039780003716322e-05, "loss": 0.4469, "step": 577900 }, { "epoch": 7.705536521310208, "grad_norm": 2.994344472885132, "learning_rate": 3.0389120624587284e-05, "loss": 0.4718, "step": 578000 }, { "epoch": 
7.706869659116663, "grad_norm": 4.573742389678955, "learning_rate": 3.038044117943539e-05, "loss": 0.4726, "step": 578100 }, { "epoch": 7.708202796923118, "grad_norm": 78.42147827148438, "learning_rate": 3.0371761702434174e-05, "loss": 0.452, "step": 578200 }, { "epoch": 7.709535934729573, "grad_norm": 3.1554746627807617, "learning_rate": 3.0363082194310227e-05, "loss": 0.5244, "step": 578300 }, { "epoch": 7.710869072536028, "grad_norm": 3.355010509490967, "learning_rate": 3.0354402655790185e-05, "loss": 0.5105, "step": 578400 }, { "epoch": 7.712202210342483, "grad_norm": 2.070815324783325, "learning_rate": 3.0345723087600672e-05, "loss": 0.4707, "step": 578500 }, { "epoch": 7.713535348148938, "grad_norm": 16.87848472595215, "learning_rate": 3.0337043490468318e-05, "loss": 0.4713, "step": 578600 }, { "epoch": 7.714868485955393, "grad_norm": 4.499565601348877, "learning_rate": 3.032836386511973e-05, "loss": 0.481, "step": 578700 }, { "epoch": 7.716201623761848, "grad_norm": 5.852302551269531, "learning_rate": 3.0319684212281564e-05, "loss": 0.5057, "step": 578800 }, { "epoch": 7.717534761568303, "grad_norm": 11.772604942321777, "learning_rate": 3.031100453268045e-05, "loss": 0.4321, "step": 578900 }, { "epoch": 7.718867899374758, "grad_norm": 2.066009044647217, "learning_rate": 3.0302324827043012e-05, "loss": 0.5758, "step": 579000 }, { "epoch": 7.720201037181213, "grad_norm": 1.4346550703048706, "learning_rate": 3.029364509609588e-05, "loss": 0.4197, "step": 579100 }, { "epoch": 7.721534174987669, "grad_norm": 17.52436637878418, "learning_rate": 3.028496534056572e-05, "loss": 0.4604, "step": 579200 }, { "epoch": 7.722867312794124, "grad_norm": 8.14867877960205, "learning_rate": 3.027628556117916e-05, "loss": 0.4764, "step": 579300 }, { "epoch": 7.724200450600579, "grad_norm": 3.083763599395752, "learning_rate": 3.0267605758662842e-05, "loss": 0.4551, "step": 579400 }, { "epoch": 7.725533588407034, "grad_norm": 17.59881019592285, "learning_rate": 
3.025892593374341e-05, "loss": 0.3969, "step": 579500 }, { "epoch": 7.726866726213489, "grad_norm": 2.2478885650634766, "learning_rate": 3.025024608714752e-05, "loss": 0.4651, "step": 579600 }, { "epoch": 7.728199864019944, "grad_norm": 4.220457553863525, "learning_rate": 3.0241566219601805e-05, "loss": 0.4815, "step": 579700 }, { "epoch": 7.729533001826399, "grad_norm": 1.9636605978012085, "learning_rate": 3.0232886331832945e-05, "loss": 0.4654, "step": 579800 }, { "epoch": 7.7308661396328535, "grad_norm": 9.871723175048828, "learning_rate": 3.022420642456756e-05, "loss": 0.4509, "step": 579900 }, { "epoch": 7.7321992774393085, "grad_norm": 2.1222047805786133, "learning_rate": 3.0215526498532334e-05, "loss": 0.4694, "step": 580000 }, { "epoch": 7.733532415245764, "grad_norm": 8.728272438049316, "learning_rate": 3.020684655445389e-05, "loss": 0.5493, "step": 580100 }, { "epoch": 7.734865553052219, "grad_norm": 4.097177505493164, "learning_rate": 3.019816659305892e-05, "loss": 0.4126, "step": 580200 }, { "epoch": 7.736198690858674, "grad_norm": 2.293060064315796, "learning_rate": 3.0189486615074064e-05, "loss": 0.4735, "step": 580300 }, { "epoch": 7.737531828665129, "grad_norm": 9.42333984375, "learning_rate": 3.0180806621225982e-05, "loss": 0.471, "step": 580400 }, { "epoch": 7.738864966471584, "grad_norm": 4.734903335571289, "learning_rate": 3.0172126612241347e-05, "loss": 0.4955, "step": 580500 }, { "epoch": 7.740198104278039, "grad_norm": 14.026568412780762, "learning_rate": 3.0163533389149698e-05, "loss": 0.4736, "step": 580600 }, { "epoch": 7.741531242084494, "grad_norm": 4.684177398681641, "learning_rate": 3.0154853352205162e-05, "loss": 0.4475, "step": 580700 }, { "epoch": 7.74286437989095, "grad_norm": 20.473735809326172, "learning_rate": 3.0146173302296798e-05, "loss": 0.4655, "step": 580800 }, { "epoch": 7.744197517697405, "grad_norm": 5.2148756980896, "learning_rate": 3.0137493240151254e-05, "loss": 0.4779, "step": 580900 }, { "epoch": 7.74553065550386, 
"grad_norm": 13.093814849853516, "learning_rate": 3.012881316649522e-05, "loss": 0.5254, "step": 581000 }, { "epoch": 7.746863793310315, "grad_norm": 17.85177230834961, "learning_rate": 3.012013308205535e-05, "loss": 0.4574, "step": 581100 }, { "epoch": 7.7481969311167695, "grad_norm": 4.909456729888916, "learning_rate": 3.0111452987558312e-05, "loss": 0.4378, "step": 581200 }, { "epoch": 7.7495300689232245, "grad_norm": 4.406807899475098, "learning_rate": 3.0102772883730784e-05, "loss": 0.3665, "step": 581300 }, { "epoch": 7.750863206729679, "grad_norm": 14.315512657165527, "learning_rate": 3.0094092771299445e-05, "loss": 0.4899, "step": 581400 }, { "epoch": 7.752196344536134, "grad_norm": 18.724220275878906, "learning_rate": 3.0085412650990944e-05, "loss": 0.4704, "step": 581500 }, { "epoch": 7.753529482342589, "grad_norm": 2.095304012298584, "learning_rate": 3.0076732523531966e-05, "loss": 0.505, "step": 581600 }, { "epoch": 7.754862620149045, "grad_norm": 1.259588360786438, "learning_rate": 3.0068052389649192e-05, "loss": 0.5343, "step": 581700 }, { "epoch": 7.7561957579555, "grad_norm": 7.5228400230407715, "learning_rate": 3.005937225006928e-05, "loss": 0.5111, "step": 581800 }, { "epoch": 7.757528895761955, "grad_norm": 3.0367581844329834, "learning_rate": 3.0050692105518904e-05, "loss": 0.5406, "step": 581900 }, { "epoch": 7.75886203356841, "grad_norm": 6.438803672790527, "learning_rate": 3.0042011956724752e-05, "loss": 0.5256, "step": 582000 }, { "epoch": 7.760195171374865, "grad_norm": 12.45401668548584, "learning_rate": 3.0033331804413496e-05, "loss": 0.5087, "step": 582100 }, { "epoch": 7.76152830918132, "grad_norm": 4.0324296951293945, "learning_rate": 3.0024651649311795e-05, "loss": 0.4236, "step": 582200 }, { "epoch": 7.762861446987775, "grad_norm": 4.934926986694336, "learning_rate": 3.001597149214634e-05, "loss": 0.5064, "step": 582300 }, { "epoch": 7.764194584794231, "grad_norm": 3.148266315460205, "learning_rate": 3.000729133364381e-05, "loss": 
0.4588, "step": 582400 }, { "epoch": 7.765527722600686, "grad_norm": 8.176344871520996, "learning_rate": 2.9998611174530862e-05, "loss": 0.4841, "step": 582500 }, { "epoch": 7.7668608604071405, "grad_norm": 6.632533550262451, "learning_rate": 2.9989931015534185e-05, "loss": 0.5127, "step": 582600 }, { "epoch": 7.7681939982135955, "grad_norm": 2.49768328666687, "learning_rate": 2.9981250857380458e-05, "loss": 0.5077, "step": 582700 }, { "epoch": 7.76952713602005, "grad_norm": 4.265908718109131, "learning_rate": 2.9972570700796344e-05, "loss": 0.5024, "step": 582800 }, { "epoch": 7.770860273826505, "grad_norm": 2.66544246673584, "learning_rate": 2.996389054650852e-05, "loss": 0.4683, "step": 582900 }, { "epoch": 7.77219341163296, "grad_norm": 7.311587333679199, "learning_rate": 2.9955297196738963e-05, "loss": 0.4675, "step": 583000 }, { "epoch": 7.773526549439415, "grad_norm": 7.651158809661865, "learning_rate": 2.994661704918268e-05, "loss": 0.5173, "step": 583100 }, { "epoch": 7.77485968724587, "grad_norm": 1.5321362018585205, "learning_rate": 2.9937936906095425e-05, "loss": 0.5659, "step": 583200 }, { "epoch": 7.776192825052326, "grad_norm": 11.571553230285645, "learning_rate": 2.9929256768203905e-05, "loss": 0.5125, "step": 583300 }, { "epoch": 7.777525962858781, "grad_norm": 6.312839984893799, "learning_rate": 2.9920576636234792e-05, "loss": 0.4994, "step": 583400 }, { "epoch": 7.778859100665236, "grad_norm": 8.891522407531738, "learning_rate": 2.9911896510914742e-05, "loss": 0.4492, "step": 583500 }, { "epoch": 7.780192238471691, "grad_norm": 4.53195333480835, "learning_rate": 2.9903216392970445e-05, "loss": 0.4967, "step": 583600 }, { "epoch": 7.781525376278146, "grad_norm": 2.6542842388153076, "learning_rate": 2.9894536283128567e-05, "loss": 0.4915, "step": 583700 }, { "epoch": 7.782858514084601, "grad_norm": 4.410309791564941, "learning_rate": 2.9885856182115774e-05, "loss": 0.483, "step": 583800 }, { "epoch": 7.784191651891056, "grad_norm": 
6.123071670532227, "learning_rate": 2.987717609065875e-05, "loss": 0.5009, "step": 583900 }, { "epoch": 7.7855247896975115, "grad_norm": 1.8337290287017822, "learning_rate": 2.9868496009484146e-05, "loss": 0.3886, "step": 584000 }, { "epoch": 7.786857927503966, "grad_norm": 8.183956146240234, "learning_rate": 2.9859815939318646e-05, "loss": 0.4828, "step": 584100 }, { "epoch": 7.788191065310421, "grad_norm": 4.6446733474731445, "learning_rate": 2.985113588088892e-05, "loss": 0.4047, "step": 584200 }, { "epoch": 7.789524203116876, "grad_norm": 35.91421890258789, "learning_rate": 2.9842455834921614e-05, "loss": 0.4612, "step": 584300 }, { "epoch": 7.790857340923331, "grad_norm": 4.659276485443115, "learning_rate": 2.9833775802143417e-05, "loss": 0.5287, "step": 584400 }, { "epoch": 7.792190478729786, "grad_norm": 2.3186593055725098, "learning_rate": 2.982509578328099e-05, "loss": 0.429, "step": 584500 }, { "epoch": 7.793523616536241, "grad_norm": 2.4537248611450195, "learning_rate": 2.981641577906099e-05, "loss": 0.4045, "step": 584600 }, { "epoch": 7.794856754342696, "grad_norm": 6.851655006408691, "learning_rate": 2.980773579021008e-05, "loss": 0.5301, "step": 584700 }, { "epoch": 7.796189892149151, "grad_norm": 13.48827075958252, "learning_rate": 2.9799055817454927e-05, "loss": 0.5069, "step": 584800 }, { "epoch": 7.797523029955607, "grad_norm": 1.1795498132705688, "learning_rate": 2.9790375861522185e-05, "loss": 0.5456, "step": 584900 }, { "epoch": 7.798856167762062, "grad_norm": 5.1263909339904785, "learning_rate": 2.9781695923138515e-05, "loss": 0.4837, "step": 585000 }, { "epoch": 7.800189305568517, "grad_norm": 2.833465337753296, "learning_rate": 2.9773016003030577e-05, "loss": 0.4807, "step": 585100 }, { "epoch": 7.801522443374972, "grad_norm": 2.7149007320404053, "learning_rate": 2.976433610192502e-05, "loss": 0.5126, "step": 585200 }, { "epoch": 7.802855581181427, "grad_norm": 1.3987224102020264, "learning_rate": 2.9755656220548494e-05, "loss": 0.4356, 
"step": 585300 }, { "epoch": 7.804188718987882, "grad_norm": 5.8463215827941895, "learning_rate": 2.974697635962768e-05, "loss": 0.449, "step": 585400 }, { "epoch": 7.8055218567943365, "grad_norm": 3.7068421840667725, "learning_rate": 2.9738296519889187e-05, "loss": 0.466, "step": 585500 }, { "epoch": 7.806854994600792, "grad_norm": 2.195404291152954, "learning_rate": 2.9729616702059705e-05, "loss": 0.4731, "step": 585600 }, { "epoch": 7.808188132407247, "grad_norm": 8.911552429199219, "learning_rate": 2.972093690686584e-05, "loss": 0.4796, "step": 585700 }, { "epoch": 7.809521270213702, "grad_norm": 8.958198547363281, "learning_rate": 2.9712257135034263e-05, "loss": 0.4862, "step": 585800 }, { "epoch": 7.810854408020157, "grad_norm": 8.601954460144043, "learning_rate": 2.970357738729162e-05, "loss": 0.4528, "step": 585900 }, { "epoch": 7.812187545826612, "grad_norm": 3.1199779510498047, "learning_rate": 2.969489766436454e-05, "loss": 0.5184, "step": 586000 }, { "epoch": 7.813520683633067, "grad_norm": 2.81821346282959, "learning_rate": 2.968621796697966e-05, "loss": 0.4378, "step": 586100 }, { "epoch": 7.814853821439522, "grad_norm": 8.749211311340332, "learning_rate": 2.9677538295863634e-05, "loss": 0.4742, "step": 586200 }, { "epoch": 7.816186959245977, "grad_norm": 4.476765155792236, "learning_rate": 2.9668858651743076e-05, "loss": 0.4531, "step": 586300 }, { "epoch": 7.817520097052432, "grad_norm": 9.650407791137695, "learning_rate": 2.9660265831369003e-05, "loss": 0.4671, "step": 586400 }, { "epoch": 7.818853234858888, "grad_norm": 4.886266231536865, "learning_rate": 2.9651586243131203e-05, "loss": 0.5086, "step": 586500 }, { "epoch": 7.820186372665343, "grad_norm": 7.790103435516357, "learning_rate": 2.9642906684061528e-05, "loss": 0.5775, "step": 586600 }, { "epoch": 7.821519510471798, "grad_norm": 2.211639404296875, "learning_rate": 2.9634227154886556e-05, "loss": 0.467, "step": 586700 }, { "epoch": 7.8228526482782526, "grad_norm": 9.709601402282715, 
"learning_rate": 2.962554765633294e-05, "loss": 0.5309, "step": 586800 }, { "epoch": 7.8241857860847075, "grad_norm": 3.2430503368377686, "learning_rate": 2.9616868189127302e-05, "loss": 0.4651, "step": 586900 }, { "epoch": 7.825518923891162, "grad_norm": 9.93731689453125, "learning_rate": 2.9608188753996247e-05, "loss": 0.5038, "step": 587000 }, { "epoch": 7.826852061697617, "grad_norm": 5.415825366973877, "learning_rate": 2.95995093516664e-05, "loss": 0.471, "step": 587100 }, { "epoch": 7.828185199504073, "grad_norm": 5.93042516708374, "learning_rate": 2.9590829982864358e-05, "loss": 0.4303, "step": 587200 }, { "epoch": 7.829518337310528, "grad_norm": 5.236921787261963, "learning_rate": 2.958215064831674e-05, "loss": 0.4967, "step": 587300 }, { "epoch": 7.830851475116983, "grad_norm": 2.859698534011841, "learning_rate": 2.9573471348750164e-05, "loss": 0.4577, "step": 587400 }, { "epoch": 7.832184612923438, "grad_norm": 6.9502363204956055, "learning_rate": 2.956479208489121e-05, "loss": 0.4791, "step": 587500 }, { "epoch": 7.833517750729893, "grad_norm": 8.028416633605957, "learning_rate": 2.955611285746649e-05, "loss": 0.5212, "step": 587600 }, { "epoch": 7.834850888536348, "grad_norm": 8.125819206237793, "learning_rate": 2.954743366720262e-05, "loss": 0.4761, "step": 587700 }, { "epoch": 7.836184026342803, "grad_norm": 3.1540300846099854, "learning_rate": 2.9538754514826158e-05, "loss": 0.4644, "step": 587800 }, { "epoch": 7.837517164149258, "grad_norm": 1.937567949295044, "learning_rate": 2.9530075401063716e-05, "loss": 0.483, "step": 587900 }, { "epoch": 7.838850301955713, "grad_norm": 4.031713962554932, "learning_rate": 2.952139632664189e-05, "loss": 0.4392, "step": 588000 }, { "epoch": 7.840183439762169, "grad_norm": 1.5084829330444336, "learning_rate": 2.9512717292287247e-05, "loss": 0.4895, "step": 588100 }, { "epoch": 7.8415165775686235, "grad_norm": 3.597160816192627, "learning_rate": 2.9504038298726377e-05, "loss": 0.4225, "step": 588200 }, { "epoch": 
7.8428497153750785, "grad_norm": 3.5678088665008545, "learning_rate": 2.9495359346685862e-05, "loss": 0.4357, "step": 588300 }, { "epoch": 7.844182853181533, "grad_norm": 11.911551475524902, "learning_rate": 2.9486680436892268e-05, "loss": 0.5322, "step": 588400 }, { "epoch": 7.845515990987988, "grad_norm": 0.9827804565429688, "learning_rate": 2.9478001570072175e-05, "loss": 0.4516, "step": 588500 }, { "epoch": 7.846849128794443, "grad_norm": 4.832343578338623, "learning_rate": 2.946932274695214e-05, "loss": 0.4792, "step": 588600 }, { "epoch": 7.848182266600898, "grad_norm": 4.485294342041016, "learning_rate": 2.946064396825873e-05, "loss": 0.4557, "step": 588700 }, { "epoch": 7.849515404407354, "grad_norm": 9.732768058776855, "learning_rate": 2.9451965234718514e-05, "loss": 0.5259, "step": 588800 }, { "epoch": 7.850848542213809, "grad_norm": 2.7522799968719482, "learning_rate": 2.9443286547058033e-05, "loss": 0.483, "step": 588900 }, { "epoch": 7.852181680020264, "grad_norm": 1.6051558256149292, "learning_rate": 2.9434607906003842e-05, "loss": 0.4584, "step": 589000 }, { "epoch": 7.853514817826719, "grad_norm": 13.308854103088379, "learning_rate": 2.9425929312282516e-05, "loss": 0.4639, "step": 589100 }, { "epoch": 7.854847955633174, "grad_norm": 3.6657421588897705, "learning_rate": 2.9417250766620552e-05, "loss": 0.3829, "step": 589200 }, { "epoch": 7.856181093439629, "grad_norm": 3.312549591064453, "learning_rate": 2.9408572269744532e-05, "loss": 0.4082, "step": 589300 }, { "epoch": 7.857514231246084, "grad_norm": 7.334097862243652, "learning_rate": 2.939989382238098e-05, "loss": 0.4448, "step": 589400 }, { "epoch": 7.858847369052539, "grad_norm": 3.0003459453582764, "learning_rate": 2.9391302208976597e-05, "loss": 0.519, "step": 589500 }, { "epoch": 7.860180506858994, "grad_norm": 5.650839805603027, "learning_rate": 2.938271064551641e-05, "loss": 0.557, "step": 589600 }, { "epoch": 7.861513644665449, "grad_norm": 3.7329859733581543, "learning_rate": 
2.9374032350008462e-05, "loss": 0.518, "step": 589700 }, { "epoch": 7.862846782471904, "grad_norm": 18.75484848022461, "learning_rate": 2.936535410690456e-05, "loss": 0.3852, "step": 589800 }, { "epoch": 7.864179920278359, "grad_norm": 3.8135812282562256, "learning_rate": 2.9356675916931205e-05, "loss": 0.4644, "step": 589900 }, { "epoch": 7.865513058084814, "grad_norm": 1.799813985824585, "learning_rate": 2.9347997780814913e-05, "loss": 0.5152, "step": 590000 }, { "epoch": 7.866846195891269, "grad_norm": 5.567330360412598, "learning_rate": 2.9339319699282203e-05, "loss": 0.4855, "step": 590100 }, { "epoch": 7.868179333697724, "grad_norm": 8.268217086791992, "learning_rate": 2.933064167305955e-05, "loss": 0.455, "step": 590200 }, { "epoch": 7.869512471504179, "grad_norm": 3.133887767791748, "learning_rate": 2.932196370287347e-05, "loss": 0.4632, "step": 590300 }, { "epoch": 7.870845609310635, "grad_norm": 14.686548233032227, "learning_rate": 2.9313285789450457e-05, "loss": 0.5388, "step": 590400 }, { "epoch": 7.87217874711709, "grad_norm": 2.850125312805176, "learning_rate": 2.930460793351699e-05, "loss": 0.4568, "step": 590500 }, { "epoch": 7.873511884923545, "grad_norm": 5.301990509033203, "learning_rate": 2.9295930135799555e-05, "loss": 0.4677, "step": 590600 }, { "epoch": 7.87484502273, "grad_norm": 2.8432939052581787, "learning_rate": 2.9287252397024633e-05, "loss": 0.5071, "step": 590700 }, { "epoch": 7.876178160536455, "grad_norm": 1.9475107192993164, "learning_rate": 2.9278574717918685e-05, "loss": 0.449, "step": 590800 }, { "epoch": 7.87751129834291, "grad_norm": 9.708836555480957, "learning_rate": 2.9269897099208193e-05, "loss": 0.4041, "step": 590900 }, { "epoch": 7.878844436149365, "grad_norm": 3.786428451538086, "learning_rate": 2.9261219541619618e-05, "loss": 0.5251, "step": 591000 }, { "epoch": 7.8801775739558195, "grad_norm": 3.6608071327209473, "learning_rate": 2.925254204587941e-05, "loss": 0.4787, "step": 591100 }, { "epoch": 7.8815107117622745, 
"grad_norm": 3.207094430923462, "learning_rate": 2.924386461271402e-05, "loss": 0.4726, "step": 591200 }, { "epoch": 7.88284384956873, "grad_norm": 3.514838933944702, "learning_rate": 2.9235187242849918e-05, "loss": 0.4814, "step": 591300 }, { "epoch": 7.884176987375185, "grad_norm": 6.8471174240112305, "learning_rate": 2.9226509937013508e-05, "loss": 0.4132, "step": 591400 }, { "epoch": 7.88551012518164, "grad_norm": 6.717045783996582, "learning_rate": 2.9217832695931273e-05, "loss": 0.6003, "step": 591500 }, { "epoch": 7.886843262988095, "grad_norm": 6.1869378089904785, "learning_rate": 2.9209155520329593e-05, "loss": 0.4669, "step": 591600 }, { "epoch": 7.88817640079455, "grad_norm": 3.262815237045288, "learning_rate": 2.920047841093493e-05, "loss": 0.4807, "step": 591700 }, { "epoch": 7.889509538601005, "grad_norm": 3.476327657699585, "learning_rate": 2.91918013684737e-05, "loss": 0.506, "step": 591800 }, { "epoch": 7.89084267640746, "grad_norm": 2.7143659591674805, "learning_rate": 2.9183124393672302e-05, "loss": 0.4958, "step": 591900 }, { "epoch": 7.892175814213916, "grad_norm": 1.7666059732437134, "learning_rate": 2.917444748725716e-05, "loss": 0.4404, "step": 592000 }, { "epoch": 7.893508952020371, "grad_norm": 2.905275344848633, "learning_rate": 2.9165770649954678e-05, "loss": 0.4781, "step": 592100 }, { "epoch": 7.894842089826826, "grad_norm": 4.466587543487549, "learning_rate": 2.9157093882491237e-05, "loss": 0.4571, "step": 592200 }, { "epoch": 7.896175227633281, "grad_norm": 6.025383949279785, "learning_rate": 2.9148417185593237e-05, "loss": 0.4589, "step": 592300 }, { "epoch": 7.897508365439736, "grad_norm": 13.208641052246094, "learning_rate": 2.9139740559987082e-05, "loss": 0.4693, "step": 592400 }, { "epoch": 7.8988415032461905, "grad_norm": 212.8010711669922, "learning_rate": 2.9131064006399118e-05, "loss": 0.4773, "step": 592500 }, { "epoch": 7.900174641052645, "grad_norm": 8.53069019317627, "learning_rate": 2.9122387525555743e-05, "loss": 
0.5086, "step": 592600 }, { "epoch": 7.9015077788591, "grad_norm": 8.232853889465332, "learning_rate": 2.9113711118183325e-05, "loss": 0.4085, "step": 592700 }, { "epoch": 7.902840916665555, "grad_norm": 9.610072135925293, "learning_rate": 2.910503478500821e-05, "loss": 0.4888, "step": 592800 }, { "epoch": 7.904174054472011, "grad_norm": 58.153499603271484, "learning_rate": 2.9096358526756766e-05, "loss": 0.5232, "step": 592900 }, { "epoch": 7.905507192278466, "grad_norm": 3.3624656200408936, "learning_rate": 2.9087682344155338e-05, "loss": 0.455, "step": 593000 }, { "epoch": 7.906840330084921, "grad_norm": 0.6471236944198608, "learning_rate": 2.907900623793026e-05, "loss": 0.4947, "step": 593100 }, { "epoch": 7.908173467891376, "grad_norm": 39.37928771972656, "learning_rate": 2.907033020880789e-05, "loss": 0.4781, "step": 593200 }, { "epoch": 7.909506605697831, "grad_norm": 1.9740984439849854, "learning_rate": 2.9061654257514534e-05, "loss": 0.4827, "step": 593300 }, { "epoch": 7.910839743504286, "grad_norm": 5.805507659912109, "learning_rate": 2.905297838477653e-05, "loss": 0.561, "step": 593400 }, { "epoch": 7.912172881310741, "grad_norm": 5.796795845031738, "learning_rate": 2.904430259132019e-05, "loss": 0.548, "step": 593500 }, { "epoch": 7.913506019117197, "grad_norm": 11.450343132019043, "learning_rate": 2.903562687787182e-05, "loss": 0.5176, "step": 593600 }, { "epoch": 7.914839156923652, "grad_norm": 4.0875043869018555, "learning_rate": 2.9026951245157726e-05, "loss": 0.4986, "step": 593700 }, { "epoch": 7.9161722947301065, "grad_norm": 11.397767066955566, "learning_rate": 2.901827569390422e-05, "loss": 0.4427, "step": 593800 }, { "epoch": 7.9175054325365615, "grad_norm": 3.3568949699401855, "learning_rate": 2.900960022483756e-05, "loss": 0.4801, "step": 593900 }, { "epoch": 7.918838570343016, "grad_norm": 19.253742218017578, "learning_rate": 2.9000924838684056e-05, "loss": 0.4586, "step": 594000 }, { "epoch": 7.920171708149471, "grad_norm": 
3.8880791664123535, "learning_rate": 2.8992249536169983e-05, "loss": 0.469, "step": 594100 }, { "epoch": 7.921504845955926, "grad_norm": 12.259921073913574, "learning_rate": 2.8983574318021593e-05, "loss": 0.5112, "step": 594200 }, { "epoch": 7.922837983762381, "grad_norm": 1.7943437099456787, "learning_rate": 2.8974899184965158e-05, "loss": 0.4881, "step": 594300 }, { "epoch": 7.924171121568836, "grad_norm": 19.449052810668945, "learning_rate": 2.8966224137726937e-05, "loss": 0.4601, "step": 594400 }, { "epoch": 7.925504259375291, "grad_norm": 4.863277912139893, "learning_rate": 2.8957549177033165e-05, "loss": 0.4879, "step": 594500 }, { "epoch": 7.926837397181747, "grad_norm": 4.046449184417725, "learning_rate": 2.8948874303610095e-05, "loss": 0.4659, "step": 594600 }, { "epoch": 7.928170534988202, "grad_norm": 2.0264456272125244, "learning_rate": 2.8940199518183954e-05, "loss": 0.465, "step": 594700 }, { "epoch": 7.929503672794657, "grad_norm": 9.783272743225098, "learning_rate": 2.893152482148096e-05, "loss": 0.47, "step": 594800 }, { "epoch": 7.930836810601112, "grad_norm": 2.1837565898895264, "learning_rate": 2.892285021422736e-05, "loss": 0.5061, "step": 594900 }, { "epoch": 7.932169948407567, "grad_norm": 5.821410655975342, "learning_rate": 2.8914175697149322e-05, "loss": 0.4401, "step": 595000 }, { "epoch": 7.933503086214022, "grad_norm": 53.9454231262207, "learning_rate": 2.890550127097308e-05, "loss": 0.4598, "step": 595100 }, { "epoch": 7.934836224020477, "grad_norm": 2.5383923053741455, "learning_rate": 2.889682693642483e-05, "loss": 0.4389, "step": 595200 }, { "epoch": 7.9361693618269324, "grad_norm": 3.5508525371551514, "learning_rate": 2.888815269423074e-05, "loss": 0.4994, "step": 595300 }, { "epoch": 7.937502499633387, "grad_norm": 33.138267517089844, "learning_rate": 2.8879478545117005e-05, "loss": 0.5378, "step": 595400 }, { "epoch": 7.938835637439842, "grad_norm": 5.12501335144043, "learning_rate": 2.887089122989614e-05, "loss": 0.4176, "step": 
595500 }, { "epoch": 7.940168775246297, "grad_norm": 3.3422648906707764, "learning_rate": 2.8862217268172704e-05, "loss": 0.4959, "step": 595600 }, { "epoch": 7.941501913052752, "grad_norm": 8.798276901245117, "learning_rate": 2.8853543401700844e-05, "loss": 0.4624, "step": 595700 }, { "epoch": 7.942835050859207, "grad_norm": 3.088038921356201, "learning_rate": 2.8844869631206707e-05, "loss": 0.4302, "step": 595800 }, { "epoch": 7.944168188665662, "grad_norm": 2.3284904956817627, "learning_rate": 2.8836195957416465e-05, "loss": 0.4393, "step": 595900 }, { "epoch": 7.945501326472117, "grad_norm": 61.23097229003906, "learning_rate": 2.882752238105621e-05, "loss": 0.4726, "step": 596000 }, { "epoch": 7.946834464278572, "grad_norm": 33.95167541503906, "learning_rate": 2.8818848902852104e-05, "loss": 0.4995, "step": 596100 }, { "epoch": 7.948167602085028, "grad_norm": 5.028281211853027, "learning_rate": 2.8810175523530227e-05, "loss": 0.429, "step": 596200 }, { "epoch": 7.949500739891483, "grad_norm": 4.437539577484131, "learning_rate": 2.880150224381671e-05, "loss": 0.4427, "step": 596300 }, { "epoch": 7.950833877697938, "grad_norm": 5.528796672821045, "learning_rate": 2.879282906443765e-05, "loss": 0.4767, "step": 596400 }, { "epoch": 7.952167015504393, "grad_norm": 3.638840675354004, "learning_rate": 2.8784155986119133e-05, "loss": 0.4476, "step": 596500 }, { "epoch": 7.953500153310848, "grad_norm": 9.16433334350586, "learning_rate": 2.8775483009587243e-05, "loss": 0.4727, "step": 596600 }, { "epoch": 7.9548332911173025, "grad_norm": 3.448869466781616, "learning_rate": 2.876681013556806e-05, "loss": 0.4555, "step": 596700 }, { "epoch": 7.9561664289237575, "grad_norm": 3.2251107692718506, "learning_rate": 2.8758137364787644e-05, "loss": 0.4301, "step": 596800 }, { "epoch": 7.957499566730213, "grad_norm": 6.332494735717773, "learning_rate": 2.8749464697972046e-05, "loss": 0.5504, "step": 596900 }, { "epoch": 7.958832704536668, "grad_norm": 5.631305694580078, 
"learning_rate": 2.8740792135847333e-05, "loss": 0.4116, "step": 597000 }, { "epoch": 7.960165842343123, "grad_norm": 6.764893054962158, "learning_rate": 2.8732119679139525e-05, "loss": 0.4388, "step": 597100 }, { "epoch": 7.961498980149578, "grad_norm": 2.9075746536254883, "learning_rate": 2.8723447328574653e-05, "loss": 0.4238, "step": 597200 }, { "epoch": 7.962832117956033, "grad_norm": 9.903363227844238, "learning_rate": 2.871477508487877e-05, "loss": 0.456, "step": 597300 }, { "epoch": 7.964165255762488, "grad_norm": 3.558929920196533, "learning_rate": 2.8706102948777842e-05, "loss": 0.4711, "step": 597400 }, { "epoch": 7.965498393568943, "grad_norm": 8.960393905639648, "learning_rate": 2.869743092099792e-05, "loss": 0.4965, "step": 597500 }, { "epoch": 7.966831531375398, "grad_norm": 8.822463035583496, "learning_rate": 2.868875900226495e-05, "loss": 0.4439, "step": 597600 }, { "epoch": 7.968164669181853, "grad_norm": 7.6325249671936035, "learning_rate": 2.868008719330495e-05, "loss": 0.4733, "step": 597700 }, { "epoch": 7.969497806988309, "grad_norm": 33.029788970947266, "learning_rate": 2.8671502211279155e-05, "loss": 0.4658, "step": 597800 }, { "epoch": 7.970830944794764, "grad_norm": 3.0968825817108154, "learning_rate": 2.8662830622927167e-05, "loss": 0.4257, "step": 597900 }, { "epoch": 7.972164082601219, "grad_norm": 1.761178970336914, "learning_rate": 2.865415914651878e-05, "loss": 0.4487, "step": 598000 }, { "epoch": 7.9734972204076735, "grad_norm": 5.872411727905273, "learning_rate": 2.8645487782779945e-05, "loss": 0.4656, "step": 598100 }, { "epoch": 7.9748303582141284, "grad_norm": 3.7261040210723877, "learning_rate": 2.863681653243661e-05, "loss": 0.4596, "step": 598200 }, { "epoch": 7.976163496020583, "grad_norm": 3.103875160217285, "learning_rate": 2.862814539621469e-05, "loss": 0.4362, "step": 598300 }, { "epoch": 7.977496633827038, "grad_norm": 7.882070541381836, "learning_rate": 2.8619474374840104e-05, "loss": 0.5111, "step": 598400 }, { 
"epoch": 7.978829771633494, "grad_norm": 7.163458824157715, "learning_rate": 2.8610803469038798e-05, "loss": 0.4657, "step": 598500 }, { "epoch": 7.980162909439949, "grad_norm": 10.512417793273926, "learning_rate": 2.8602132679536623e-05, "loss": 0.4386, "step": 598600 }, { "epoch": 7.981496047246404, "grad_norm": 5.044588088989258, "learning_rate": 2.8593462007059497e-05, "loss": 0.4258, "step": 598700 }, { "epoch": 7.982829185052859, "grad_norm": 110.1544189453125, "learning_rate": 2.858479145233331e-05, "loss": 0.4555, "step": 598800 }, { "epoch": 7.984162322859314, "grad_norm": 5.822049140930176, "learning_rate": 2.8576121016083915e-05, "loss": 0.4782, "step": 598900 }, { "epoch": 7.985495460665769, "grad_norm": 2.2079639434814453, "learning_rate": 2.8567450699037183e-05, "loss": 0.4763, "step": 599000 }, { "epoch": 7.986828598472224, "grad_norm": 3.075523614883423, "learning_rate": 2.8558780501918958e-05, "loss": 0.4466, "step": 599100 }, { "epoch": 7.988161736278679, "grad_norm": 6.2319254875183105, "learning_rate": 2.855011042545509e-05, "loss": 0.5127, "step": 599200 }, { "epoch": 7.989494874085134, "grad_norm": 5.551966190338135, "learning_rate": 2.8541440470371406e-05, "loss": 0.4757, "step": 599300 }, { "epoch": 7.9908280118915895, "grad_norm": 5.848865509033203, "learning_rate": 2.8532770637393726e-05, "loss": 0.4091, "step": 599400 }, { "epoch": 7.9921611496980445, "grad_norm": 10.451080322265625, "learning_rate": 2.8524100927247852e-05, "loss": 0.4641, "step": 599500 }, { "epoch": 7.993494287504499, "grad_norm": 5.988492012023926, "learning_rate": 2.8515431340659622e-05, "loss": 0.4959, "step": 599600 }, { "epoch": 7.994827425310954, "grad_norm": 0.6315526962280273, "learning_rate": 2.850676187835478e-05, "loss": 0.4959, "step": 599700 }, { "epoch": 7.996160563117409, "grad_norm": 1.5342737436294556, "learning_rate": 2.8498092541059126e-05, "loss": 0.4682, "step": 599800 }, { "epoch": 7.997493700923864, "grad_norm": 12.730924606323242, 
"learning_rate": 2.8489423329498444e-05, "loss": 0.4907, "step": 599900 }, { "epoch": 7.998826838730319, "grad_norm": 6.031864643096924, "learning_rate": 2.8480754244398466e-05, "loss": 0.4679, "step": 600000 }, { "epoch": 7.998826838730319, "eval_accuracy": 0.9059597101137301, "eval_cer": 0.08426723736005515, "eval_loss": 0.4962616562843323, "eval_runtime": 9762.3331, "eval_samples_per_second": 5.71, "eval_steps_per_second": 0.357, "eval_wer": 0.1708597002408855, "step": 600000 }, { "epoch": 8.000159976536775, "grad_norm": 6.635284900665283, "learning_rate": 2.847208528648496e-05, "loss": 0.4296, "step": 600100 }, { "epoch": 8.00149311434323, "grad_norm": 3.2974696159362793, "learning_rate": 2.8463416456483663e-05, "loss": 0.369, "step": 600200 }, { "epoch": 8.002826252149685, "grad_norm": 4.781069278717041, "learning_rate": 2.8454747755120286e-05, "loss": 0.4552, "step": 600300 }, { "epoch": 8.00415938995614, "grad_norm": 3.047403573989868, "learning_rate": 2.844607918312056e-05, "loss": 0.412, "step": 600400 }, { "epoch": 8.005492527762595, "grad_norm": 7.315854072570801, "learning_rate": 2.843749742498297e-05, "loss": 0.4079, "step": 600500 }, { "epoch": 8.00682566556905, "grad_norm": 1.6829696893692017, "learning_rate": 2.84288291125759e-05, "loss": 0.4285, "step": 600600 }, { "epoch": 8.008158803375505, "grad_norm": 4.1900739669799805, "learning_rate": 2.8420160931702296e-05, "loss": 0.4468, "step": 600700 }, { "epoch": 8.00949194118196, "grad_norm": 4.706144332885742, "learning_rate": 2.841149288308786e-05, "loss": 0.4613, "step": 600800 }, { "epoch": 8.010825078988415, "grad_norm": 5.494304656982422, "learning_rate": 2.840282496745821e-05, "loss": 0.435, "step": 600900 }, { "epoch": 8.01215821679487, "grad_norm": 3.7563092708587646, "learning_rate": 2.8394157185539025e-05, "loss": 0.4386, "step": 601000 }, { "epoch": 8.013491354601324, "grad_norm": 1.5042479038238525, "learning_rate": 2.8385489538055946e-05, "loss": 0.4417, "step": 601100 }, { "epoch": 
8.01482449240778, "grad_norm": 6.378519535064697, "learning_rate": 2.837682202573459e-05, "loss": 0.4331, "step": 601200 }, { "epoch": 8.016157630214236, "grad_norm": 2.9681475162506104, "learning_rate": 2.8368154649300575e-05, "loss": 0.4046, "step": 601300 }, { "epoch": 8.017490768020691, "grad_norm": 1.2700837850570679, "learning_rate": 2.8359487409479514e-05, "loss": 0.4046, "step": 601400 }, { "epoch": 8.018823905827146, "grad_norm": 2.9970579147338867, "learning_rate": 2.8350820306996993e-05, "loss": 0.4473, "step": 601500 }, { "epoch": 8.020157043633601, "grad_norm": 3.460447311401367, "learning_rate": 2.8342153342578595e-05, "loss": 0.4631, "step": 601600 }, { "epoch": 8.021490181440056, "grad_norm": 3.864112615585327, "learning_rate": 2.83334865169499e-05, "loss": 0.383, "step": 601700 }, { "epoch": 8.02282331924651, "grad_norm": 3.9474124908447266, "learning_rate": 2.8324819830836453e-05, "loss": 0.3996, "step": 601800 }, { "epoch": 8.024156457052966, "grad_norm": 14.643585205078125, "learning_rate": 2.8316153284963804e-05, "loss": 0.3994, "step": 601900 }, { "epoch": 8.02548959485942, "grad_norm": 3.2006571292877197, "learning_rate": 2.8307486880057513e-05, "loss": 0.4645, "step": 602000 }, { "epoch": 8.026822732665876, "grad_norm": 4.8997344970703125, "learning_rate": 2.8298820616843066e-05, "loss": 0.3523, "step": 602100 }, { "epoch": 8.02815587047233, "grad_norm": 1.2752453088760376, "learning_rate": 2.829015449604601e-05, "loss": 0.4006, "step": 602200 }, { "epoch": 8.029489008278786, "grad_norm": 3.977823257446289, "learning_rate": 2.8281488518391815e-05, "loss": 0.4506, "step": 602300 }, { "epoch": 8.03082214608524, "grad_norm": 0.9112719893455505, "learning_rate": 2.827282268460599e-05, "loss": 0.4124, "step": 602400 }, { "epoch": 8.032155283891695, "grad_norm": 1.3787462711334229, "learning_rate": 2.826415699541401e-05, "loss": 0.4613, "step": 602500 }, { "epoch": 8.03348842169815, "grad_norm": 3.1246957778930664, "learning_rate": 
2.8255491451541327e-05, "loss": 0.4219, "step": 602600 }, { "epoch": 8.034821559504605, "grad_norm": 7.700145244598389, "learning_rate": 2.8246826053713405e-05, "loss": 0.4599, "step": 602700 }, { "epoch": 8.03615469731106, "grad_norm": 2.22575044631958, "learning_rate": 2.8238160802655682e-05, "loss": 0.3829, "step": 602800 }, { "epoch": 8.037487835117517, "grad_norm": 5.740710258483887, "learning_rate": 2.822949569909358e-05, "loss": 0.4362, "step": 602900 }, { "epoch": 8.038820972923972, "grad_norm": 3.97401762008667, "learning_rate": 2.8220830743752507e-05, "loss": 0.4936, "step": 603000 }, { "epoch": 8.040154110730427, "grad_norm": 2.2383742332458496, "learning_rate": 2.8212165937357896e-05, "loss": 0.472, "step": 603100 }, { "epoch": 8.041487248536882, "grad_norm": 4.166161060333252, "learning_rate": 2.82035012806351e-05, "loss": 0.4172, "step": 603200 }, { "epoch": 8.042820386343337, "grad_norm": 5.391569137573242, "learning_rate": 2.8194836774309516e-05, "loss": 0.4068, "step": 603300 }, { "epoch": 8.044153524149792, "grad_norm": 22.291988372802734, "learning_rate": 2.8186172419106517e-05, "loss": 0.4471, "step": 603400 }, { "epoch": 8.045486661956247, "grad_norm": 2.298757791519165, "learning_rate": 2.8177508215751432e-05, "loss": 0.4267, "step": 603500 }, { "epoch": 8.046819799762702, "grad_norm": 0.6069477796554565, "learning_rate": 2.816884416496963e-05, "loss": 0.4077, "step": 603600 }, { "epoch": 8.048152937569157, "grad_norm": 3.294616222381592, "learning_rate": 2.8160180267486412e-05, "loss": 0.401, "step": 603700 }, { "epoch": 8.049486075375611, "grad_norm": 16.78533363342285, "learning_rate": 2.81515165240271e-05, "loss": 0.4544, "step": 603800 }, { "epoch": 8.050819213182066, "grad_norm": 4.104328155517578, "learning_rate": 2.8142852935317006e-05, "loss": 0.3878, "step": 603900 }, { "epoch": 8.052152350988521, "grad_norm": 1.0761442184448242, "learning_rate": 2.8134189502081408e-05, "loss": 0.3651, "step": 604000 }, { "epoch": 8.053485488794976, 
"grad_norm": 13.234503746032715, "learning_rate": 2.8125526225045576e-05, "loss": 0.428, "step": 604100 }, { "epoch": 8.054818626601431, "grad_norm": 1.9998966455459595, "learning_rate": 2.811686310493479e-05, "loss": 0.4702, "step": 604200 }, { "epoch": 8.056151764407886, "grad_norm": 5.845706462860107, "learning_rate": 2.810820014247428e-05, "loss": 0.4757, "step": 604300 }, { "epoch": 8.057484902214341, "grad_norm": 6.169501304626465, "learning_rate": 2.809953733838929e-05, "loss": 0.5021, "step": 604400 }, { "epoch": 8.058818040020798, "grad_norm": 4.5676140785217285, "learning_rate": 2.8090874693405055e-05, "loss": 0.4732, "step": 604500 }, { "epoch": 8.060151177827253, "grad_norm": 5.996246337890625, "learning_rate": 2.8082212208246753e-05, "loss": 0.4527, "step": 604600 }, { "epoch": 8.061484315633708, "grad_norm": 1.9172961711883545, "learning_rate": 2.8073549883639613e-05, "loss": 0.4409, "step": 604700 }, { "epoch": 8.062817453440163, "grad_norm": 15.370955467224121, "learning_rate": 2.8064887720308806e-05, "loss": 0.4059, "step": 604800 }, { "epoch": 8.064150591246618, "grad_norm": 5.219126224517822, "learning_rate": 2.805622571897949e-05, "loss": 0.4764, "step": 604900 }, { "epoch": 8.065483729053073, "grad_norm": 8.640523910522461, "learning_rate": 2.8047563880376834e-05, "loss": 0.3821, "step": 605000 }, { "epoch": 8.066816866859527, "grad_norm": 3.764019012451172, "learning_rate": 2.8038902205225982e-05, "loss": 0.4122, "step": 605100 }, { "epoch": 8.068150004665982, "grad_norm": 5.765705585479736, "learning_rate": 2.8030240694252047e-05, "loss": 0.412, "step": 605200 }, { "epoch": 8.069483142472437, "grad_norm": 2.1095268726348877, "learning_rate": 2.8021579348180163e-05, "loss": 0.4482, "step": 605300 }, { "epoch": 8.070816280278892, "grad_norm": 1.1395649909973145, "learning_rate": 2.8012918167735404e-05, "loss": 0.4068, "step": 605400 }, { "epoch": 8.072149418085347, "grad_norm": 13.071459770202637, "learning_rate": 2.8004257153642874e-05, 
"loss": 0.4455, "step": 605500 }, { "epoch": 8.073482555891802, "grad_norm": 3.732647657394409, "learning_rate": 2.7995682914268387e-05, "loss": 0.4902, "step": 605600 }, { "epoch": 8.074815693698257, "grad_norm": 6.855227470397949, "learning_rate": 2.7987022233373916e-05, "loss": 0.427, "step": 605700 }, { "epoch": 8.076148831504712, "grad_norm": 4.492496013641357, "learning_rate": 2.7978361720999584e-05, "loss": 0.3974, "step": 605800 }, { "epoch": 8.077481969311167, "grad_norm": 1.561570405960083, "learning_rate": 2.796970137787044e-05, "loss": 0.4445, "step": 605900 }, { "epoch": 8.078815107117622, "grad_norm": 4.65047025680542, "learning_rate": 2.7961041204711506e-05, "loss": 0.4282, "step": 606000 }, { "epoch": 8.080148244924079, "grad_norm": 1.5158259868621826, "learning_rate": 2.7952381202247763e-05, "loss": 0.4571, "step": 606100 }, { "epoch": 8.081481382730534, "grad_norm": 11.23193359375, "learning_rate": 2.794372137120422e-05, "loss": 0.3695, "step": 606200 }, { "epoch": 8.082814520536989, "grad_norm": 2.589097499847412, "learning_rate": 2.793506171230584e-05, "loss": 0.4044, "step": 606300 }, { "epoch": 8.084147658343444, "grad_norm": 7.900779724121094, "learning_rate": 2.792640222627758e-05, "loss": 0.3981, "step": 606400 }, { "epoch": 8.085480796149898, "grad_norm": 36.008949279785156, "learning_rate": 2.7917742913844396e-05, "loss": 0.4012, "step": 606500 }, { "epoch": 8.086813933956353, "grad_norm": 17.990859985351562, "learning_rate": 2.7909083775731205e-05, "loss": 0.464, "step": 606600 }, { "epoch": 8.088147071762808, "grad_norm": 4.874861240386963, "learning_rate": 2.7900424812662922e-05, "loss": 0.4663, "step": 606700 }, { "epoch": 8.089480209569263, "grad_norm": 1.0830636024475098, "learning_rate": 2.7891766025364473e-05, "loss": 0.4446, "step": 606800 }, { "epoch": 8.090813347375718, "grad_norm": 2.43906831741333, "learning_rate": 2.788310741456071e-05, "loss": 0.4195, "step": 606900 }, { "epoch": 8.092146485182173, "grad_norm": 
2.6908202171325684, "learning_rate": 2.7874448980976527e-05, "loss": 0.4511, "step": 607000 }, { "epoch": 8.093479622988628, "grad_norm": 3.0162017345428467, "learning_rate": 2.7865790725336782e-05, "loss": 0.4004, "step": 607100 }, { "epoch": 8.094812760795083, "grad_norm": 5.481684684753418, "learning_rate": 2.78571326483663e-05, "loss": 0.4512, "step": 607200 }, { "epoch": 8.096145898601538, "grad_norm": 3.2865729331970215, "learning_rate": 2.7848474750789925e-05, "loss": 0.4156, "step": 607300 }, { "epoch": 8.097479036407993, "grad_norm": 3.4952189922332764, "learning_rate": 2.783981703333246e-05, "loss": 0.4692, "step": 607400 }, { "epoch": 8.098812174214448, "grad_norm": 3.7131576538085938, "learning_rate": 2.78311594967187e-05, "loss": 0.4306, "step": 607500 }, { "epoch": 8.100145312020903, "grad_norm": 3.6274421215057373, "learning_rate": 2.7822502141673425e-05, "loss": 0.4552, "step": 607600 }, { "epoch": 8.101478449827358, "grad_norm": 3.168419122695923, "learning_rate": 2.7813844968921427e-05, "loss": 0.4128, "step": 607700 }, { "epoch": 8.102811587633814, "grad_norm": 3.8871707916259766, "learning_rate": 2.7805187979187418e-05, "loss": 0.4278, "step": 607800 }, { "epoch": 8.10414472544027, "grad_norm": 2.5747222900390625, "learning_rate": 2.7796531173196156e-05, "loss": 0.426, "step": 607900 }, { "epoch": 8.105477863246724, "grad_norm": 3.3122975826263428, "learning_rate": 2.7787874551672366e-05, "loss": 0.4763, "step": 608000 }, { "epoch": 8.10681100105318, "grad_norm": 1.0537798404693604, "learning_rate": 2.7779218115340736e-05, "loss": 0.4079, "step": 608100 }, { "epoch": 8.108144138859634, "grad_norm": 0.12858480215072632, "learning_rate": 2.7770561864925978e-05, "loss": 0.4425, "step": 608200 }, { "epoch": 8.10947727666609, "grad_norm": 2.880673408508301, "learning_rate": 2.7761905801152743e-05, "loss": 0.4292, "step": 608300 }, { "epoch": 8.110810414472544, "grad_norm": 4.81473970413208, "learning_rate": 2.7753249924745697e-05, "loss": 0.3639, 
"step": 608400 }, { "epoch": 8.112143552278999, "grad_norm": 12.364450454711914, "learning_rate": 2.7744594236429493e-05, "loss": 0.4552, "step": 608500 }, { "epoch": 8.113476690085454, "grad_norm": 5.0534892082214355, "learning_rate": 2.7735938736928744e-05, "loss": 0.3911, "step": 608600 }, { "epoch": 8.114809827891909, "grad_norm": 4.181804656982422, "learning_rate": 2.7727283426968067e-05, "loss": 0.4569, "step": 608700 }, { "epoch": 8.116142965698364, "grad_norm": 5.032891273498535, "learning_rate": 2.7718628307272063e-05, "loss": 0.4471, "step": 608800 }, { "epoch": 8.117476103504819, "grad_norm": 11.357526779174805, "learning_rate": 2.77099733785653e-05, "loss": 0.4184, "step": 608900 }, { "epoch": 8.118809241311274, "grad_norm": 8.723687171936035, "learning_rate": 2.7701318641572337e-05, "loss": 0.3967, "step": 609000 }, { "epoch": 8.120142379117729, "grad_norm": 2.6373298168182373, "learning_rate": 2.7692664097017756e-05, "loss": 0.4236, "step": 609100 }, { "epoch": 8.121475516924184, "grad_norm": 1.8439723253250122, "learning_rate": 2.7684009745626043e-05, "loss": 0.4284, "step": 609200 }, { "epoch": 8.12280865473064, "grad_norm": 8.987915992736816, "learning_rate": 2.7675355588121738e-05, "loss": 0.4501, "step": 609300 }, { "epoch": 8.124141792537095, "grad_norm": 2.1933698654174805, "learning_rate": 2.766670162522934e-05, "loss": 0.4603, "step": 609400 }, { "epoch": 8.12547493034355, "grad_norm": 9.99882698059082, "learning_rate": 2.7658047857673325e-05, "loss": 0.4378, "step": 609500 }, { "epoch": 8.126808068150005, "grad_norm": 7.880140781402588, "learning_rate": 2.7649394286178155e-05, "loss": 0.4289, "step": 609600 }, { "epoch": 8.12814120595646, "grad_norm": 1.6850535869598389, "learning_rate": 2.764091397702931e-05, "loss": 0.4588, "step": 609700 }, { "epoch": 8.129474343762915, "grad_norm": 14.109061241149902, "learning_rate": 2.763234732670348e-05, "loss": 0.4648, "step": 609800 }, { "epoch": 8.13080748156937, "grad_norm": 3.3715908527374268, 
"learning_rate": 2.762369434177994e-05, "loss": 0.3526, "step": 609900 }, { "epoch": 8.132140619375825, "grad_norm": 13.534637451171875, "learning_rate": 2.7615041555793212e-05, "loss": 0.484, "step": 610000 }, { "epoch": 8.13347375718228, "grad_norm": 1.7013144493103027, "learning_rate": 2.760638896946768e-05, "loss": 0.464, "step": 610100 }, { "epoch": 8.134806894988735, "grad_norm": 5.252017974853516, "learning_rate": 2.759773658352774e-05, "loss": 0.4105, "step": 610200 }, { "epoch": 8.13614003279519, "grad_norm": 5.3326921463012695, "learning_rate": 2.75890843986977e-05, "loss": 0.4191, "step": 610300 }, { "epoch": 8.137473170601645, "grad_norm": 4.316520690917969, "learning_rate": 2.7580432415701922e-05, "loss": 0.3906, "step": 610400 }, { "epoch": 8.1388063084081, "grad_norm": 4.808220386505127, "learning_rate": 2.7571780635264723e-05, "loss": 0.3604, "step": 610500 }, { "epoch": 8.140139446214555, "grad_norm": 4.793806076049805, "learning_rate": 2.756312905811038e-05, "loss": 0.4135, "step": 610600 }, { "epoch": 8.14147258402101, "grad_norm": 1.4621015787124634, "learning_rate": 2.7554477684963197e-05, "loss": 0.465, "step": 610700 }, { "epoch": 8.142805721827465, "grad_norm": 3.0156519412994385, "learning_rate": 2.754582651654744e-05, "loss": 0.3863, "step": 610800 }, { "epoch": 8.14413885963392, "grad_norm": 2.633178472518921, "learning_rate": 2.7537175553587347e-05, "loss": 0.3934, "step": 610900 }, { "epoch": 8.145471997440376, "grad_norm": 2.522381067276001, "learning_rate": 2.7528524796807154e-05, "loss": 0.4154, "step": 611000 }, { "epoch": 8.146805135246831, "grad_norm": 2.471951484680176, "learning_rate": 2.751987424693107e-05, "loss": 0.417, "step": 611100 }, { "epoch": 8.148138273053286, "grad_norm": 0.11362064629793167, "learning_rate": 2.7511223904683295e-05, "loss": 0.4155, "step": 611200 }, { "epoch": 8.149471410859741, "grad_norm": 1.23954439163208, "learning_rate": 2.7502573770788018e-05, "loss": 0.3984, "step": 611300 }, { "epoch": 
8.150804548666196, "grad_norm": 5.860588550567627, "learning_rate": 2.7494010344180263e-05, "loss": 0.3976, "step": 611400 }, { "epoch": 8.152137686472651, "grad_norm": 3.697157144546509, "learning_rate": 2.748536062706083e-05, "loss": 0.3697, "step": 611500 }, { "epoch": 8.153470824279106, "grad_norm": 3.470151424407959, "learning_rate": 2.7476711120459077e-05, "loss": 0.4481, "step": 611600 }, { "epoch": 8.15480396208556, "grad_norm": 7.456782341003418, "learning_rate": 2.7468061825099126e-05, "loss": 0.4395, "step": 611700 }, { "epoch": 8.156137099892016, "grad_norm": 1.3400404453277588, "learning_rate": 2.745941274170505e-05, "loss": 0.4363, "step": 611800 }, { "epoch": 8.15747023769847, "grad_norm": 4.3452630043029785, "learning_rate": 2.7450763871000934e-05, "loss": 0.4059, "step": 611900 }, { "epoch": 8.158803375504926, "grad_norm": 0.10836423933506012, "learning_rate": 2.7442115213710835e-05, "loss": 0.4396, "step": 612000 }, { "epoch": 8.16013651331138, "grad_norm": 5.920718193054199, "learning_rate": 2.7433466770558782e-05, "loss": 0.4038, "step": 612100 }, { "epoch": 8.161469651117836, "grad_norm": 2.9143450260162354, "learning_rate": 2.7424818542268797e-05, "loss": 0.4182, "step": 612200 }, { "epoch": 8.16280278892429, "grad_norm": 2.3744008541107178, "learning_rate": 2.7416170529564895e-05, "loss": 0.4675, "step": 612300 }, { "epoch": 8.164135926730745, "grad_norm": 1.7567836046218872, "learning_rate": 2.7407522733171043e-05, "loss": 0.385, "step": 612400 }, { "epoch": 8.1654690645372, "grad_norm": 7.534013748168945, "learning_rate": 2.7398875153811223e-05, "loss": 0.4016, "step": 612500 }, { "epoch": 8.166802202343657, "grad_norm": 5.5202317237854, "learning_rate": 2.739022779220937e-05, "loss": 0.4745, "step": 612600 }, { "epoch": 8.168135340150112, "grad_norm": 17.378198623657227, "learning_rate": 2.7381580649089412e-05, "loss": 0.3681, "step": 612700 }, { "epoch": 8.169468477956567, "grad_norm": 4.402022361755371, "learning_rate": 
2.7372933725175288e-05, "loss": 0.4036, "step": 612800 }, { "epoch": 8.170801615763022, "grad_norm": 1.0864570140838623, "learning_rate": 2.7364287021190853e-05, "loss": 0.4009, "step": 612900 }, { "epoch": 8.172134753569477, "grad_norm": 3.485239028930664, "learning_rate": 2.7355640537860005e-05, "loss": 0.4306, "step": 613000 }, { "epoch": 8.173467891375932, "grad_norm": 1.8247028589248657, "learning_rate": 2.7346994275906605e-05, "loss": 0.5046, "step": 613100 }, { "epoch": 8.174801029182387, "grad_norm": 4.929708480834961, "learning_rate": 2.733834823605447e-05, "loss": 0.3729, "step": 613200 }, { "epoch": 8.176134166988842, "grad_norm": 3.7898991107940674, "learning_rate": 2.7329702419027435e-05, "loss": 0.4133, "step": 613300 }, { "epoch": 8.177467304795297, "grad_norm": 2.818772077560425, "learning_rate": 2.73210568255493e-05, "loss": 0.3955, "step": 613400 }, { "epoch": 8.178800442601752, "grad_norm": 22.13646125793457, "learning_rate": 2.7312411456343835e-05, "loss": 0.4601, "step": 613500 }, { "epoch": 8.180133580408206, "grad_norm": 2.302079439163208, "learning_rate": 2.7303766312134807e-05, "loss": 0.4008, "step": 613600 }, { "epoch": 8.181466718214661, "grad_norm": 0.4821556508541107, "learning_rate": 2.7295121393645982e-05, "loss": 0.4142, "step": 613700 }, { "epoch": 8.182799856021116, "grad_norm": 6.2649359703063965, "learning_rate": 2.7286476701601047e-05, "loss": 0.4241, "step": 613800 }, { "epoch": 8.184132993827571, "grad_norm": 4.039252758026123, "learning_rate": 2.727783223672375e-05, "loss": 0.4003, "step": 613900 }, { "epoch": 8.185466131634026, "grad_norm": 3.9995217323303223, "learning_rate": 2.726918799973774e-05, "loss": 0.4912, "step": 614000 }, { "epoch": 8.186799269440481, "grad_norm": 1.9511204957962036, "learning_rate": 2.7260543991366703e-05, "loss": 0.4613, "step": 614100 }, { "epoch": 8.188132407246938, "grad_norm": 3.887852668762207, "learning_rate": 2.7251900212334296e-05, "loss": 0.4572, "step": 614200 }, { "epoch": 
8.189465545053393, "grad_norm": 8.95012378692627, "learning_rate": 2.7243256663364136e-05, "loss": 0.4776, "step": 614300 }, { "epoch": 8.190798682859848, "grad_norm": 2.423240900039673, "learning_rate": 2.723461334517984e-05, "loss": 0.4129, "step": 614400 }, { "epoch": 8.192131820666303, "grad_norm": 1.785844087600708, "learning_rate": 2.7225970258505e-05, "loss": 0.4083, "step": 614500 }, { "epoch": 8.193464958472758, "grad_norm": 6.428056716918945, "learning_rate": 2.721732740406318e-05, "loss": 0.436, "step": 614600 }, { "epoch": 8.194798096279213, "grad_norm": 1.8648744821548462, "learning_rate": 2.7208684782577937e-05, "loss": 0.4416, "step": 614700 }, { "epoch": 8.196131234085668, "grad_norm": 3.7871885299682617, "learning_rate": 2.7200042394772816e-05, "loss": 0.403, "step": 614800 }, { "epoch": 8.197464371892123, "grad_norm": 3.1239240169525146, "learning_rate": 2.7191400241371307e-05, "loss": 0.4572, "step": 614900 }, { "epoch": 8.198797509698577, "grad_norm": 2.368175506591797, "learning_rate": 2.718275832309691e-05, "loss": 0.419, "step": 615000 }, { "epoch": 8.200130647505032, "grad_norm": 4.913453578948975, "learning_rate": 2.7174116640673128e-05, "loss": 0.4586, "step": 615100 }, { "epoch": 8.201463785311487, "grad_norm": 4.327546119689941, "learning_rate": 2.7165475194823373e-05, "loss": 0.4246, "step": 615200 }, { "epoch": 8.202796923117942, "grad_norm": 2.036126136779785, "learning_rate": 2.7156833986271107e-05, "loss": 0.433, "step": 615300 }, { "epoch": 8.204130060924397, "grad_norm": 8.912394523620605, "learning_rate": 2.7148193015739742e-05, "loss": 0.4569, "step": 615400 }, { "epoch": 8.205463198730852, "grad_norm": 4.6833295822143555, "learning_rate": 2.7139552283952663e-05, "loss": 0.3685, "step": 615500 }, { "epoch": 8.206796336537307, "grad_norm": 5.297565937042236, "learning_rate": 2.7130911791633257e-05, "loss": 0.421, "step": 615600 }, { "epoch": 8.208129474343762, "grad_norm": 5.414465427398682, "learning_rate": 
2.7122271539504863e-05, "loss": 0.468, "step": 615700 }, { "epoch": 8.209462612150219, "grad_norm": 1.468476414680481, "learning_rate": 2.7113631528290823e-05, "loss": 0.5017, "step": 615800 }, { "epoch": 8.210795749956674, "grad_norm": 2.4667270183563232, "learning_rate": 2.7104991758714463e-05, "loss": 0.4329, "step": 615900 }, { "epoch": 8.212128887763129, "grad_norm": 4.8360772132873535, "learning_rate": 2.7096352231499057e-05, "loss": 0.4275, "step": 616000 }, { "epoch": 8.213462025569584, "grad_norm": 2.5546388626098633, "learning_rate": 2.7087712947367887e-05, "loss": 0.3867, "step": 616100 }, { "epoch": 8.214795163376039, "grad_norm": 3.048527479171753, "learning_rate": 2.7079073907044228e-05, "loss": 0.4216, "step": 616200 }, { "epoch": 8.216128301182493, "grad_norm": 0.9018763303756714, "learning_rate": 2.7070521497996408e-05, "loss": 0.3972, "step": 616300 }, { "epoch": 8.217461438988948, "grad_norm": 0.8207379579544067, "learning_rate": 2.706188294500127e-05, "loss": 0.4172, "step": 616400 }, { "epoch": 8.218794576795403, "grad_norm": 3.8662614822387695, "learning_rate": 2.7053244637976035e-05, "loss": 0.3841, "step": 616500 }, { "epoch": 8.220127714601858, "grad_norm": 1.1351512670516968, "learning_rate": 2.704460657764388e-05, "loss": 0.4145, "step": 616600 }, { "epoch": 8.221460852408313, "grad_norm": 7.750576496124268, "learning_rate": 2.7035968764727937e-05, "loss": 0.393, "step": 616700 }, { "epoch": 8.222793990214768, "grad_norm": 6.839167594909668, "learning_rate": 2.702733119995135e-05, "loss": 0.3796, "step": 616800 }, { "epoch": 8.224127128021223, "grad_norm": 2.535181999206543, "learning_rate": 2.7018693884037225e-05, "loss": 0.4947, "step": 616900 }, { "epoch": 8.225460265827678, "grad_norm": 2.9619741439819336, "learning_rate": 2.7010056817708647e-05, "loss": 0.4734, "step": 617000 }, { "epoch": 8.226793403634133, "grad_norm": 2.6793148517608643, "learning_rate": 2.7001420001688692e-05, "loss": 0.41, "step": 617100 }, { "epoch": 
8.228126541440588, "grad_norm": 23.490137100219727, "learning_rate": 2.69927834367004e-05, "loss": 0.4753, "step": 617200 }, { "epoch": 8.229459679247043, "grad_norm": 1.4509962797164917, "learning_rate": 2.698414712346679e-05, "loss": 0.4429, "step": 617300 }, { "epoch": 8.2307928170535, "grad_norm": 0.6821339130401611, "learning_rate": 2.697551106271089e-05, "loss": 0.3768, "step": 617400 }, { "epoch": 8.232125954859955, "grad_norm": 4.16099214553833, "learning_rate": 2.6966875255155657e-05, "loss": 0.4258, "step": 617500 }, { "epoch": 8.23345909266641, "grad_norm": 12.110384941101074, "learning_rate": 2.6958239701524073e-05, "loss": 0.4538, "step": 617600 }, { "epoch": 8.234792230472864, "grad_norm": 4.348330497741699, "learning_rate": 2.6949604402539074e-05, "loss": 0.3851, "step": 617700 }, { "epoch": 8.23612536827932, "grad_norm": 2.971926689147949, "learning_rate": 2.6940969358923578e-05, "loss": 0.4066, "step": 617800 }, { "epoch": 8.237458506085774, "grad_norm": 3.6666252613067627, "learning_rate": 2.693233457140048e-05, "loss": 0.392, "step": 617900 }, { "epoch": 8.23879164389223, "grad_norm": 12.185742378234863, "learning_rate": 2.692370004069267e-05, "loss": 0.4525, "step": 618000 }, { "epoch": 8.240124781698684, "grad_norm": 0.9765805006027222, "learning_rate": 2.6915065767522988e-05, "loss": 0.4184, "step": 618100 }, { "epoch": 8.24145791950514, "grad_norm": 2.267960548400879, "learning_rate": 2.6906431752614268e-05, "loss": 0.445, "step": 618200 }, { "epoch": 8.242791057311594, "grad_norm": 1.3451803922653198, "learning_rate": 2.6897797996689355e-05, "loss": 0.4635, "step": 618300 }, { "epoch": 8.244124195118049, "grad_norm": 2.9269943237304688, "learning_rate": 2.6889164500470987e-05, "loss": 0.3992, "step": 618400 }, { "epoch": 8.245457332924504, "grad_norm": 3.8012583255767822, "learning_rate": 2.6880531264681986e-05, "loss": 0.3897, "step": 618500 }, { "epoch": 8.246790470730959, "grad_norm": 2.751598358154297, "learning_rate": 
2.6871898290045054e-05, "loss": 0.403, "step": 618600 }, { "epoch": 8.248123608537414, "grad_norm": 1.2528883218765259, "learning_rate": 2.686326557728294e-05, "loss": 0.4039, "step": 618700 }, { "epoch": 8.249456746343869, "grad_norm": 1.8297845125198364, "learning_rate": 2.6854633127118357e-05, "loss": 0.4015, "step": 618800 }, { "epoch": 8.250789884150324, "grad_norm": 3.3884503841400146, "learning_rate": 2.684600094027396e-05, "loss": 0.4533, "step": 618900 }, { "epoch": 8.25212302195678, "grad_norm": 3.286609649658203, "learning_rate": 2.6837369017472426e-05, "loss": 0.5102, "step": 619000 }, { "epoch": 8.253456159763235, "grad_norm": 5.064011573791504, "learning_rate": 2.6828737359436392e-05, "loss": 0.4357, "step": 619100 }, { "epoch": 8.25478929756969, "grad_norm": 6.821256160736084, "learning_rate": 2.6820105966888466e-05, "loss": 0.3568, "step": 619200 }, { "epoch": 8.256122435376145, "grad_norm": 2.143423318862915, "learning_rate": 2.6811474840551246e-05, "loss": 0.3762, "step": 619300 }, { "epoch": 8.2574555731826, "grad_norm": 1.0846328735351562, "learning_rate": 2.6802843981147304e-05, "loss": 0.454, "step": 619400 }, { "epoch": 8.258788710989055, "grad_norm": 6.24471378326416, "learning_rate": 2.679421338939918e-05, "loss": 0.4267, "step": 619500 }, { "epoch": 8.26012184879551, "grad_norm": 7.549164772033691, "learning_rate": 2.6785583066029396e-05, "loss": 0.3507, "step": 619600 }, { "epoch": 8.261454986601965, "grad_norm": 1.1367627382278442, "learning_rate": 2.677695301176049e-05, "loss": 0.4669, "step": 619700 }, { "epoch": 8.26278812440842, "grad_norm": 2.0954856872558594, "learning_rate": 2.6768323227314897e-05, "loss": 0.4496, "step": 619800 }, { "epoch": 8.264121262214875, "grad_norm": 6.164377689361572, "learning_rate": 2.67596937134151e-05, "loss": 0.3954, "step": 619900 }, { "epoch": 8.26545440002133, "grad_norm": 2.5690693855285645, "learning_rate": 2.675106447078354e-05, "loss": 0.434, "step": 620000 }, { "epoch": 8.266787537827785, 
"grad_norm": 11.087382316589355, "learning_rate": 2.6742435500142614e-05, "loss": 0.4064, "step": 620100 }, { "epoch": 8.26812067563424, "grad_norm": 2.2926862239837646, "learning_rate": 2.6733806802214724e-05, "loss": 0.4255, "step": 620200 }, { "epoch": 8.269453813440695, "grad_norm": 12.749166488647461, "learning_rate": 2.672517837772223e-05, "loss": 0.4146, "step": 620300 }, { "epoch": 8.27078695124715, "grad_norm": 6.078705310821533, "learning_rate": 2.6716550227387476e-05, "loss": 0.4033, "step": 620400 }, { "epoch": 8.272120089053605, "grad_norm": 2.789581060409546, "learning_rate": 2.6707922351932793e-05, "loss": 0.4317, "step": 620500 }, { "epoch": 8.273453226860061, "grad_norm": 4.890499591827393, "learning_rate": 2.669938102671239e-05, "loss": 0.4269, "step": 620600 }, { "epoch": 8.274786364666516, "grad_norm": 3.707977533340454, "learning_rate": 2.6690753700417883e-05, "loss": 0.3854, "step": 620700 }, { "epoch": 8.276119502472971, "grad_norm": 4.724511623382568, "learning_rate": 2.6682126651163034e-05, "loss": 0.4378, "step": 620800 }, { "epoch": 8.277452640279426, "grad_norm": 3.988452434539795, "learning_rate": 2.6673499879670104e-05, "loss": 0.4606, "step": 620900 }, { "epoch": 8.278785778085881, "grad_norm": 20.959251403808594, "learning_rate": 2.6664873386661263e-05, "loss": 0.3954, "step": 621000 }, { "epoch": 8.280118915892336, "grad_norm": 2.9564290046691895, "learning_rate": 2.665624717285871e-05, "loss": 0.5082, "step": 621100 }, { "epoch": 8.281452053698791, "grad_norm": 3.3454389572143555, "learning_rate": 2.6647621238984613e-05, "loss": 0.382, "step": 621200 }, { "epoch": 8.282785191505246, "grad_norm": 7.737370491027832, "learning_rate": 2.663899558576109e-05, "loss": 0.4671, "step": 621300 }, { "epoch": 8.284118329311701, "grad_norm": 15.127071380615234, "learning_rate": 2.6630542718585158e-05, "loss": 0.4072, "step": 621400 }, { "epoch": 8.285451467118156, "grad_norm": 5.068517208099365, "learning_rate": 2.6621917623180144e-05, "loss": 
0.3875, "step": 621500 }, { "epoch": 8.28678460492461, "grad_norm": 3.450463056564331, "learning_rate": 2.6613292810577543e-05, "loss": 0.4311, "step": 621600 }, { "epoch": 8.288117742731066, "grad_norm": 32.56620788574219, "learning_rate": 2.6604668281499384e-05, "loss": 0.4153, "step": 621700 }, { "epoch": 8.28945088053752, "grad_norm": 9.467123985290527, "learning_rate": 2.6596044036667695e-05, "loss": 0.4679, "step": 621800 }, { "epoch": 8.290784018343976, "grad_norm": 2.7587759494781494, "learning_rate": 2.6587420076804476e-05, "loss": 0.4016, "step": 621900 }, { "epoch": 8.29211715615043, "grad_norm": 3.2259137630462646, "learning_rate": 2.657879640263168e-05, "loss": 0.4396, "step": 622000 }, { "epoch": 8.293450293956885, "grad_norm": 3.8530869483947754, "learning_rate": 2.657017301487126e-05, "loss": 0.4127, "step": 622100 }, { "epoch": 8.294783431763342, "grad_norm": 9.383904457092285, "learning_rate": 2.6561549914245168e-05, "loss": 0.4693, "step": 622200 }, { "epoch": 8.296116569569797, "grad_norm": 3.161144971847534, "learning_rate": 2.6552927101475254e-05, "loss": 0.4379, "step": 622300 }, { "epoch": 8.297449707376252, "grad_norm": 3.520691394805908, "learning_rate": 2.6544304577283427e-05, "loss": 0.4281, "step": 622400 }, { "epoch": 8.298782845182707, "grad_norm": 7.744431972503662, "learning_rate": 2.653568234239153e-05, "loss": 0.4179, "step": 622500 }, { "epoch": 8.300115982989162, "grad_norm": 10.688519477844238, "learning_rate": 2.652706039752138e-05, "loss": 0.4385, "step": 622600 }, { "epoch": 8.301449120795617, "grad_norm": 4.4836626052856445, "learning_rate": 2.651843874339479e-05, "loss": 0.4564, "step": 622700 }, { "epoch": 8.302782258602072, "grad_norm": 3.600659132003784, "learning_rate": 2.6509817380733535e-05, "loss": 0.4173, "step": 622800 }, { "epoch": 8.304115396408527, "grad_norm": 3.244840145111084, "learning_rate": 2.6501196310259365e-05, "loss": 0.4076, "step": 622900 }, { "epoch": 8.305448534214982, "grad_norm": 
2.986072063446045, "learning_rate": 2.6492575532694016e-05, "loss": 0.358, "step": 623000 }, { "epoch": 8.306781672021437, "grad_norm": 4.235045433044434, "learning_rate": 2.648395504875918e-05, "loss": 0.4029, "step": 623100 }, { "epoch": 8.308114809827892, "grad_norm": 2.5336763858795166, "learning_rate": 2.6475334859176537e-05, "loss": 0.4298, "step": 623200 }, { "epoch": 8.309447947634347, "grad_norm": 4.329512119293213, "learning_rate": 2.6466714964667765e-05, "loss": 0.4078, "step": 623300 }, { "epoch": 8.310781085440802, "grad_norm": 8.926721572875977, "learning_rate": 2.6458095365954466e-05, "loss": 0.4153, "step": 623400 }, { "epoch": 8.312114223247256, "grad_norm": 1.9916718006134033, "learning_rate": 2.644947606375826e-05, "loss": 0.4049, "step": 623500 }, { "epoch": 8.313447361053711, "grad_norm": 12.051835060119629, "learning_rate": 2.644085705880073e-05, "loss": 0.4405, "step": 623600 }, { "epoch": 8.314780498860166, "grad_norm": 3.7199630737304688, "learning_rate": 2.6432238351803427e-05, "loss": 0.4119, "step": 623700 }, { "epoch": 8.316113636666623, "grad_norm": 3.45485258102417, "learning_rate": 2.642361994348788e-05, "loss": 0.4384, "step": 623800 }, { "epoch": 8.317446774473078, "grad_norm": 23.205808639526367, "learning_rate": 2.6415001834575603e-05, "loss": 0.49, "step": 623900 }, { "epoch": 8.318779912279533, "grad_norm": 3.022475242614746, "learning_rate": 2.6406384025788065e-05, "loss": 0.4636, "step": 624000 }, { "epoch": 8.320113050085988, "grad_norm": 4.9207763671875, "learning_rate": 2.6397766517846725e-05, "loss": 0.43, "step": 624100 }, { "epoch": 8.321446187892443, "grad_norm": 0.9540917873382568, "learning_rate": 2.6389149311473034e-05, "loss": 0.4354, "step": 624200 }, { "epoch": 8.322779325698898, "grad_norm": 6.482949256896973, "learning_rate": 2.6380618574930517e-05, "loss": 0.3778, "step": 624300 }, { "epoch": 8.324112463505353, "grad_norm": 7.481374263763428, "learning_rate": 2.6372001970822584e-05, "loss": 0.4047, "step": 
624400 }, { "epoch": 8.325445601311808, "grad_norm": 11.295862197875977, "learning_rate": 2.6363385670439234e-05, "loss": 0.3697, "step": 624500 }, { "epoch": 8.326778739118263, "grad_norm": 2.5699517726898193, "learning_rate": 2.6354769674501747e-05, "loss": 0.4173, "step": 624600 }, { "epoch": 8.328111876924718, "grad_norm": 1.7396363019943237, "learning_rate": 2.634615398373146e-05, "loss": 0.4519, "step": 624700 }, { "epoch": 8.329445014731172, "grad_norm": 4.561883449554443, "learning_rate": 2.6337538598849655e-05, "loss": 0.4012, "step": 624800 }, { "epoch": 8.330778152537627, "grad_norm": 2.257610321044922, "learning_rate": 2.6328923520577563e-05, "loss": 0.4924, "step": 624900 }, { "epoch": 8.332111290344082, "grad_norm": 7.804183006286621, "learning_rate": 2.6320308749636425e-05, "loss": 0.4606, "step": 625000 }, { "epoch": 8.333444428150537, "grad_norm": 2.6814095973968506, "learning_rate": 2.631169428674744e-05, "loss": 0.3961, "step": 625100 }, { "epoch": 8.334777565956992, "grad_norm": 1.8705077171325684, "learning_rate": 2.6303080132631777e-05, "loss": 0.3527, "step": 625200 }, { "epoch": 8.336110703763447, "grad_norm": 3.458979368209839, "learning_rate": 2.6294466288010596e-05, "loss": 0.4286, "step": 625300 }, { "epoch": 8.337443841569904, "grad_norm": 5.0154290199279785, "learning_rate": 2.628585275360502e-05, "loss": 0.3828, "step": 625400 }, { "epoch": 8.338776979376359, "grad_norm": 12.004517555236816, "learning_rate": 2.6277239530136132e-05, "loss": 0.4608, "step": 625500 }, { "epoch": 8.340110117182814, "grad_norm": 6.656261444091797, "learning_rate": 2.626862661832502e-05, "loss": 0.4268, "step": 625600 }, { "epoch": 8.341443254989269, "grad_norm": 4.005584239959717, "learning_rate": 2.6260014018892732e-05, "loss": 0.4663, "step": 625700 }, { "epoch": 8.342776392795724, "grad_norm": 303.90057373046875, "learning_rate": 2.625140173256028e-05, "loss": 0.3848, "step": 625800 }, { "epoch": 8.344109530602179, "grad_norm": 2.3202764987945557, 
"learning_rate": 2.6242789760048656e-05, "loss": 0.3812, "step": 625900 }, { "epoch": 8.345442668408634, "grad_norm": 2.791015148162842, "learning_rate": 2.6234178102078827e-05, "loss": 0.3875, "step": 626000 }, { "epoch": 8.346775806215089, "grad_norm": 4.971004009246826, "learning_rate": 2.6225566759371738e-05, "loss": 0.4048, "step": 626100 }, { "epoch": 8.348108944021543, "grad_norm": 6.752958297729492, "learning_rate": 2.6216955732648307e-05, "loss": 0.4947, "step": 626200 }, { "epoch": 8.349442081827998, "grad_norm": 2.381556749343872, "learning_rate": 2.6208345022629408e-05, "loss": 0.4067, "step": 626300 }, { "epoch": 8.350775219634453, "grad_norm": 4.265078544616699, "learning_rate": 2.6199734630035916e-05, "loss": 0.4891, "step": 626400 }, { "epoch": 8.352108357440908, "grad_norm": 2.8088576793670654, "learning_rate": 2.6191124555588663e-05, "loss": 0.4141, "step": 626500 }, { "epoch": 8.353441495247363, "grad_norm": 2.3057072162628174, "learning_rate": 2.6182514800008453e-05, "loss": 0.4354, "step": 626600 }, { "epoch": 8.354774633053818, "grad_norm": 3.9243268966674805, "learning_rate": 2.617390536401606e-05, "loss": 0.4062, "step": 626700 }, { "epoch": 8.356107770860273, "grad_norm": 5.36610746383667, "learning_rate": 2.6165296248332273e-05, "loss": 0.4004, "step": 626800 }, { "epoch": 8.357440908666728, "grad_norm": 5.234353065490723, "learning_rate": 2.6156687453677773e-05, "loss": 0.3792, "step": 626900 }, { "epoch": 8.358774046473185, "grad_norm": 7.845030307769775, "learning_rate": 2.6148078980773294e-05, "loss": 0.4118, "step": 627000 }, { "epoch": 8.36010718427964, "grad_norm": 2.7761659622192383, "learning_rate": 2.6139470830339504e-05, "loss": 0.3884, "step": 627100 }, { "epoch": 8.361440322086095, "grad_norm": 2.7073793411254883, "learning_rate": 2.613086300309704e-05, "loss": 0.4676, "step": 627200 }, { "epoch": 8.36277345989255, "grad_norm": 9.755728721618652, "learning_rate": 2.6122255499766528e-05, "loss": 0.4671, "step": 627300 }, { 
"epoch": 8.364106597699005, "grad_norm": 1.5469779968261719, "learning_rate": 2.6113648321068572e-05, "loss": 0.4312, "step": 627400 }, { "epoch": 8.36543973550546, "grad_norm": 2.9742326736450195, "learning_rate": 2.6105041467723716e-05, "loss": 0.3857, "step": 627500 }, { "epoch": 8.366772873311914, "grad_norm": 1.5654010772705078, "learning_rate": 2.6096434940452525e-05, "loss": 0.3819, "step": 627600 }, { "epoch": 8.36810601111837, "grad_norm": 5.693063259124756, "learning_rate": 2.6087828739975482e-05, "loss": 0.3761, "step": 627700 }, { "epoch": 8.369439148924824, "grad_norm": 4.791553020477295, "learning_rate": 2.6079222867013082e-05, "loss": 0.4511, "step": 627800 }, { "epoch": 8.37077228673128, "grad_norm": 4.307739734649658, "learning_rate": 2.6070617322285805e-05, "loss": 0.3504, "step": 627900 }, { "epoch": 8.372105424537734, "grad_norm": 2.7126245498657227, "learning_rate": 2.6062012106514032e-05, "loss": 0.3823, "step": 628000 }, { "epoch": 8.37343856234419, "grad_norm": 10.26627254486084, "learning_rate": 2.6053407220418206e-05, "loss": 0.4173, "step": 628100 }, { "epoch": 8.374771700150644, "grad_norm": 3.993340253829956, "learning_rate": 2.6044802664718688e-05, "loss": 0.4213, "step": 628200 }, { "epoch": 8.376104837957099, "grad_norm": 1.8628699779510498, "learning_rate": 2.603619844013582e-05, "loss": 0.3752, "step": 628300 }, { "epoch": 8.377437975763554, "grad_norm": 1.8289283514022827, "learning_rate": 2.6027594547389924e-05, "loss": 0.3757, "step": 628400 }, { "epoch": 8.378771113570009, "grad_norm": 5.017428398132324, "learning_rate": 2.6018990987201296e-05, "loss": 0.4283, "step": 628500 }, { "epoch": 8.380104251376466, "grad_norm": 2.991223096847534, "learning_rate": 2.6010387760290186e-05, "loss": 0.3611, "step": 628600 }, { "epoch": 8.38143738918292, "grad_norm": 6.342236042022705, "learning_rate": 2.6001784867376837e-05, "loss": 0.4594, "step": 628700 }, { "epoch": 8.382770526989376, "grad_norm": 3.808173179626465, "learning_rate": 
2.599318230918146e-05, "loss": 0.4546, "step": 628800 }, { "epoch": 8.38410366479583, "grad_norm": 5.059627056121826, "learning_rate": 2.5984580086424226e-05, "loss": 0.4664, "step": 628900 }, { "epoch": 8.385436802602285, "grad_norm": 2.86466121673584, "learning_rate": 2.5975978199825294e-05, "loss": 0.4232, "step": 629000 }, { "epoch": 8.38676994040874, "grad_norm": 4.712153434753418, "learning_rate": 2.5967376650104776e-05, "loss": 0.4416, "step": 629100 }, { "epoch": 8.388103078215195, "grad_norm": 3.1418211460113525, "learning_rate": 2.5958775437982768e-05, "loss": 0.4381, "step": 629200 }, { "epoch": 8.38943621602165, "grad_norm": 6.105987548828125, "learning_rate": 2.5950174564179362e-05, "loss": 0.4339, "step": 629300 }, { "epoch": 8.390769353828105, "grad_norm": 2.1340222358703613, "learning_rate": 2.5941574029414552e-05, "loss": 0.4321, "step": 629400 }, { "epoch": 8.39210249163456, "grad_norm": 1.0525054931640625, "learning_rate": 2.593297383440838e-05, "loss": 0.3942, "step": 629500 }, { "epoch": 8.393435629441015, "grad_norm": 2.1980748176574707, "learning_rate": 2.5924373979880824e-05, "loss": 0.4746, "step": 629600 }, { "epoch": 8.39476876724747, "grad_norm": 2.994011878967285, "learning_rate": 2.5915774466551825e-05, "loss": 0.3651, "step": 629700 }, { "epoch": 8.396101905053925, "grad_norm": 2.269597053527832, "learning_rate": 2.5907175295141312e-05, "loss": 0.4728, "step": 629800 }, { "epoch": 8.39743504286038, "grad_norm": 13.517525672912598, "learning_rate": 2.5898576466369186e-05, "loss": 0.4361, "step": 629900 }, { "epoch": 8.398768180666835, "grad_norm": 7.2307305335998535, "learning_rate": 2.5889977980955305e-05, "loss": 0.4053, "step": 630000 }, { "epoch": 8.40010131847329, "grad_norm": 1.1815239191055298, "learning_rate": 2.5881379839619507e-05, "loss": 0.444, "step": 630100 }, { "epoch": 8.401434456279745, "grad_norm": 4.868844032287598, "learning_rate": 2.5872782043081625e-05, "loss": 0.4158, "step": 630200 }, { "epoch": 
8.402767594086201, "grad_norm": 16.60516357421875, "learning_rate": 2.58641845920614e-05, "loss": 0.3895, "step": 630300 }, { "epoch": 8.404100731892656, "grad_norm": 4.3582282066345215, "learning_rate": 2.5855587487278608e-05, "loss": 0.4319, "step": 630400 }, { "epoch": 8.405433869699111, "grad_norm": 4.286393165588379, "learning_rate": 2.5846990729452974e-05, "loss": 0.4293, "step": 630500 }, { "epoch": 8.406767007505566, "grad_norm": 3.49959659576416, "learning_rate": 2.583839431930418e-05, "loss": 0.4124, "step": 630600 }, { "epoch": 8.408100145312021, "grad_norm": 2.928600311279297, "learning_rate": 2.5829798257551903e-05, "loss": 0.3966, "step": 630700 }, { "epoch": 8.409433283118476, "grad_norm": 10.649654388427734, "learning_rate": 2.582120254491576e-05, "loss": 0.3962, "step": 630800 }, { "epoch": 8.410766420924931, "grad_norm": 1.2747983932495117, "learning_rate": 2.581269313400933e-05, "loss": 0.4239, "step": 630900 }, { "epoch": 8.412099558731386, "grad_norm": 4.603875637054443, "learning_rate": 2.580409811825515e-05, "loss": 0.4549, "step": 631000 }, { "epoch": 8.413432696537841, "grad_norm": 8.130369186401367, "learning_rate": 2.5795503453768654e-05, "loss": 0.4088, "step": 631100 }, { "epoch": 8.414765834344296, "grad_norm": 0.8816012740135193, "learning_rate": 2.578690914126935e-05, "loss": 0.3591, "step": 631200 }, { "epoch": 8.41609897215075, "grad_norm": 7.155123233795166, "learning_rate": 2.577831518147673e-05, "loss": 0.41, "step": 631300 }, { "epoch": 8.417432109957206, "grad_norm": 4.3404221534729, "learning_rate": 2.5769807509422102e-05, "loss": 0.4236, "step": 631400 }, { "epoch": 8.41876524776366, "grad_norm": 16.178810119628906, "learning_rate": 2.57612142536562e-05, "loss": 0.4168, "step": 631500 }, { "epoch": 8.420098385570116, "grad_norm": 1.6719590425491333, "learning_rate": 2.5752621352748062e-05, "loss": 0.3866, "step": 631600 }, { "epoch": 8.42143152337657, "grad_norm": 2.8542587757110596, "learning_rate": 2.574402880741708e-05, 
"loss": 0.4909, "step": 631700 }, { "epoch": 8.422764661183027, "grad_norm": 1.4396040439605713, "learning_rate": 2.5735436618382596e-05, "loss": 0.4284, "step": 631800 }, { "epoch": 8.424097798989482, "grad_norm": 2.1673977375030518, "learning_rate": 2.572684478636391e-05, "loss": 0.4627, "step": 631900 }, { "epoch": 8.425430936795937, "grad_norm": 1.8539800643920898, "learning_rate": 2.5718253312080316e-05, "loss": 0.4512, "step": 632000 }, { "epoch": 8.426764074602392, "grad_norm": 8.798223495483398, "learning_rate": 2.570966219625105e-05, "loss": 0.4434, "step": 632100 }, { "epoch": 8.428097212408847, "grad_norm": 8.303171157836914, "learning_rate": 2.5701071439595342e-05, "loss": 0.4202, "step": 632200 }, { "epoch": 8.429430350215302, "grad_norm": 4.259398937225342, "learning_rate": 2.5692481042832382e-05, "loss": 0.4199, "step": 632300 }, { "epoch": 8.430763488021757, "grad_norm": 0.6829582452774048, "learning_rate": 2.5683891006681333e-05, "loss": 0.4034, "step": 632400 }, { "epoch": 8.432096625828212, "grad_norm": 3.587023973464966, "learning_rate": 2.567530133186131e-05, "loss": 0.412, "step": 632500 }, { "epoch": 8.433429763634667, "grad_norm": 4.30176305770874, "learning_rate": 2.5666712019091447e-05, "loss": 0.3938, "step": 632600 }, { "epoch": 8.434762901441122, "grad_norm": 50.27660369873047, "learning_rate": 2.5658123069090777e-05, "loss": 0.3885, "step": 632700 }, { "epoch": 8.436096039247577, "grad_norm": 1.6303200721740723, "learning_rate": 2.5649534482578362e-05, "loss": 0.3965, "step": 632800 }, { "epoch": 8.437429177054032, "grad_norm": 1.1516741514205933, "learning_rate": 2.5640946260273217e-05, "loss": 0.4067, "step": 632900 }, { "epoch": 8.438762314860487, "grad_norm": 4.211666584014893, "learning_rate": 2.56323584028943e-05, "loss": 0.4595, "step": 633000 }, { "epoch": 8.440095452666942, "grad_norm": 20.211641311645508, "learning_rate": 2.562377091116057e-05, "loss": 0.4209, "step": 633100 }, { "epoch": 8.441428590473397, "grad_norm": 
16.828493118286133, "learning_rate": 2.561518378579095e-05, "loss": 0.3946, "step": 633200 }, { "epoch": 8.442761728279851, "grad_norm": 4.610256195068359, "learning_rate": 2.5606597027504317e-05, "loss": 0.444, "step": 633300 }, { "epoch": 8.444094866086306, "grad_norm": 6.851181983947754, "learning_rate": 2.5598010637019536e-05, "loss": 0.3868, "step": 633400 }, { "epoch": 8.445428003892763, "grad_norm": 2.515794038772583, "learning_rate": 2.558942461505542e-05, "loss": 0.4385, "step": 633500 }, { "epoch": 8.446761141699218, "grad_norm": 0.6964602470397949, "learning_rate": 2.5580838962330775e-05, "loss": 0.4865, "step": 633600 }, { "epoch": 8.448094279505673, "grad_norm": 1.3006601333618164, "learning_rate": 2.5572253679564366e-05, "loss": 0.355, "step": 633700 }, { "epoch": 8.449427417312128, "grad_norm": 4.650136470794678, "learning_rate": 2.5563668767474917e-05, "loss": 0.3956, "step": 633800 }, { "epoch": 8.450760555118583, "grad_norm": 3.044780969619751, "learning_rate": 2.5555084226781127e-05, "loss": 0.421, "step": 633900 }, { "epoch": 8.452093692925038, "grad_norm": 3.3738789558410645, "learning_rate": 2.5546500058201694e-05, "loss": 0.3807, "step": 634000 }, { "epoch": 8.453426830731493, "grad_norm": 1.2704161405563354, "learning_rate": 2.5537916262455217e-05, "loss": 0.4058, "step": 634100 }, { "epoch": 8.454759968537948, "grad_norm": 2.0113887786865234, "learning_rate": 2.5529332840260333e-05, "loss": 0.4408, "step": 634200 }, { "epoch": 8.456093106344403, "grad_norm": 4.122312545776367, "learning_rate": 2.5520749792335614e-05, "loss": 0.4615, "step": 634300 }, { "epoch": 8.457426244150858, "grad_norm": 14.158418655395508, "learning_rate": 2.55121671193996e-05, "loss": 0.4665, "step": 634400 }, { "epoch": 8.458759381957313, "grad_norm": 1.8205244541168213, "learning_rate": 2.5503670643280987e-05, "loss": 0.3944, "step": 634500 }, { "epoch": 8.460092519763768, "grad_norm": 16.582048416137695, "learning_rate": 2.549508871871008e-05, "loss": 0.4308, 
"step": 634600 }, { "epoch": 8.461425657570222, "grad_norm": 2.3675930500030518, "learning_rate": 2.5486507171276157e-05, "loss": 0.4084, "step": 634700 }, { "epoch": 8.462758795376677, "grad_norm": 2.058462619781494, "learning_rate": 2.547792600169762e-05, "loss": 0.4323, "step": 634800 }, { "epoch": 8.464091933183132, "grad_norm": 7.829526901245117, "learning_rate": 2.546934521069287e-05, "loss": 0.4027, "step": 634900 }, { "epoch": 8.465425070989589, "grad_norm": 8.897164344787598, "learning_rate": 2.5460764798980246e-05, "loss": 0.3461, "step": 635000 }, { "epoch": 8.466758208796044, "grad_norm": 2.454590320587158, "learning_rate": 2.545218476727808e-05, "loss": 0.3674, "step": 635100 }, { "epoch": 8.468091346602499, "grad_norm": 23.986347198486328, "learning_rate": 2.544360511630469e-05, "loss": 0.3892, "step": 635200 }, { "epoch": 8.469424484408954, "grad_norm": 2.7005927562713623, "learning_rate": 2.5435025846778293e-05, "loss": 0.4264, "step": 635300 }, { "epoch": 8.470757622215409, "grad_norm": 19.300168991088867, "learning_rate": 2.5426446959417157e-05, "loss": 0.4652, "step": 635400 }, { "epoch": 8.472090760021864, "grad_norm": 17.19140625, "learning_rate": 2.541786845493946e-05, "loss": 0.3677, "step": 635500 }, { "epoch": 8.473423897828319, "grad_norm": 3.054070234298706, "learning_rate": 2.5409290334063374e-05, "loss": 0.3687, "step": 635600 }, { "epoch": 8.474757035634774, "grad_norm": 7.958230495452881, "learning_rate": 2.5400712597507028e-05, "loss": 0.3556, "step": 635700 }, { "epoch": 8.476090173441229, "grad_norm": 8.746050834655762, "learning_rate": 2.5392135245988532e-05, "loss": 0.4079, "step": 635800 }, { "epoch": 8.477423311247684, "grad_norm": 14.502352714538574, "learning_rate": 2.5383558280225937e-05, "loss": 0.4771, "step": 635900 }, { "epoch": 8.478756449054138, "grad_norm": 1.2857160568237305, "learning_rate": 2.5374981700937285e-05, "loss": 0.4344, "step": 636000 }, { "epoch": 8.480089586860593, "grad_norm": 4.041748046875, 
"learning_rate": 2.536640550884061e-05, "loss": 0.4664, "step": 636100 }, { "epoch": 8.481422724667048, "grad_norm": 4.433659553527832, "learning_rate": 2.5357829704653836e-05, "loss": 0.4405, "step": 636200 }, { "epoch": 8.482755862473503, "grad_norm": 14.709857940673828, "learning_rate": 2.5349254289094936e-05, "loss": 0.374, "step": 636300 }, { "epoch": 8.484089000279958, "grad_norm": 49.98051834106445, "learning_rate": 2.534067926288181e-05, "loss": 0.4482, "step": 636400 }, { "epoch": 8.485422138086413, "grad_norm": 5.740966796875, "learning_rate": 2.533210462673232e-05, "loss": 0.3561, "step": 636500 }, { "epoch": 8.486755275892868, "grad_norm": 8.673155784606934, "learning_rate": 2.5323530381364327e-05, "loss": 0.4517, "step": 636600 }, { "epoch": 8.488088413699325, "grad_norm": 1.442804217338562, "learning_rate": 2.531495652749562e-05, "loss": 0.4198, "step": 636700 }, { "epoch": 8.48942155150578, "grad_norm": 5.195811748504639, "learning_rate": 2.5306383065843987e-05, "loss": 0.4326, "step": 636800 }, { "epoch": 8.490754689312235, "grad_norm": 2.899010181427002, "learning_rate": 2.529780999712717e-05, "loss": 0.3993, "step": 636900 }, { "epoch": 8.49208782711869, "grad_norm": 9.115338325500488, "learning_rate": 2.5289237322062875e-05, "loss": 0.423, "step": 637000 }, { "epoch": 8.493420964925145, "grad_norm": 3.3695085048675537, "learning_rate": 2.5280750762221232e-05, "loss": 0.4387, "step": 637100 }, { "epoch": 8.4947541027316, "grad_norm": 2.5177416801452637, "learning_rate": 2.5272178872660547e-05, "loss": 0.4106, "step": 637200 }, { "epoch": 8.496087240538055, "grad_norm": 0.6135843396186829, "learning_rate": 2.526360737889816e-05, "loss": 0.4182, "step": 637300 }, { "epoch": 8.49742037834451, "grad_norm": 2.6592626571655273, "learning_rate": 2.525503628165162e-05, "loss": 0.3652, "step": 637400 }, { "epoch": 8.498753516150964, "grad_norm": 4.497987747192383, "learning_rate": 2.5246465581638488e-05, "loss": 0.5046, "step": 637500 }, { "epoch": 
8.50008665395742, "grad_norm": 2.0769383907318115, "learning_rate": 2.5237895279576276e-05, "loss": 0.4418, "step": 637600 }, { "epoch": 8.501419791763874, "grad_norm": 9.50698184967041, "learning_rate": 2.5229325376182457e-05, "loss": 0.4729, "step": 637700 }, { "epoch": 8.50275292957033, "grad_norm": 1.2771267890930176, "learning_rate": 2.5220755872174482e-05, "loss": 0.3768, "step": 637800 }, { "epoch": 8.504086067376784, "grad_norm": 5.518548488616943, "learning_rate": 2.521218676826976e-05, "loss": 0.4014, "step": 637900 }, { "epoch": 8.50541920518324, "grad_norm": 1.241821527481079, "learning_rate": 2.520361806518566e-05, "loss": 0.3891, "step": 638000 }, { "epoch": 8.506752342989694, "grad_norm": 4.332045555114746, "learning_rate": 2.5195135444665038e-05, "loss": 0.4455, "step": 638100 }, { "epoch": 8.50808548079615, "grad_norm": 1.6752817630767822, "learning_rate": 2.5186567541348108e-05, "loss": 0.4411, "step": 638200 }, { "epoch": 8.509418618602606, "grad_norm": 3.3045663833618164, "learning_rate": 2.5178000040996566e-05, "loss": 0.4024, "step": 638300 }, { "epoch": 8.51075175640906, "grad_norm": 2.0221104621887207, "learning_rate": 2.5169432944327652e-05, "loss": 0.3894, "step": 638400 }, { "epoch": 8.512084894215516, "grad_norm": 1.8536226749420166, "learning_rate": 2.5160866252058606e-05, "loss": 0.3791, "step": 638500 }, { "epoch": 8.51341803202197, "grad_norm": 3.614849805831909, "learning_rate": 2.515229996490656e-05, "loss": 0.432, "step": 638600 }, { "epoch": 8.514751169828426, "grad_norm": 1.67757248878479, "learning_rate": 2.5143734083588692e-05, "loss": 0.4608, "step": 638700 }, { "epoch": 8.51608430763488, "grad_norm": 4.321796417236328, "learning_rate": 2.51351686088221e-05, "loss": 0.3992, "step": 638800 }, { "epoch": 8.517417445441335, "grad_norm": 1.051107406616211, "learning_rate": 2.512660354132385e-05, "loss": 0.4159, "step": 638900 }, { "epoch": 8.51875058324779, "grad_norm": 2.686805486679077, "learning_rate": 2.511803888181099e-05, 
"loss": 0.4706, "step": 639000 }, { "epoch": 8.520083721054245, "grad_norm": 5.772696018218994, "learning_rate": 2.5109474631000534e-05, "loss": 0.4428, "step": 639100 }, { "epoch": 8.5214168588607, "grad_norm": 2.1660351753234863, "learning_rate": 2.5100910789609442e-05, "loss": 0.4103, "step": 639200 }, { "epoch": 8.522749996667155, "grad_norm": 11.755346298217773, "learning_rate": 2.5092347358354658e-05, "loss": 0.3881, "step": 639300 }, { "epoch": 8.52408313447361, "grad_norm": 4.172033786773682, "learning_rate": 2.5083784337953076e-05, "loss": 0.4143, "step": 639400 }, { "epoch": 8.525416272280065, "grad_norm": 5.741822719573975, "learning_rate": 2.5075221729121574e-05, "loss": 0.39, "step": 639500 }, { "epoch": 8.52674941008652, "grad_norm": 1.7207380533218384, "learning_rate": 2.506665953257699e-05, "loss": 0.4395, "step": 639600 }, { "epoch": 8.528082547892975, "grad_norm": 12.767962455749512, "learning_rate": 2.505809774903611e-05, "loss": 0.3886, "step": 639700 }, { "epoch": 8.52941568569943, "grad_norm": 1.795064926147461, "learning_rate": 2.50495363792157e-05, "loss": 0.4359, "step": 639800 }, { "epoch": 8.530748823505887, "grad_norm": 0.5495204925537109, "learning_rate": 2.5040975423832527e-05, "loss": 0.4285, "step": 639900 }, { "epoch": 8.532081961312342, "grad_norm": 3.2458930015563965, "learning_rate": 2.5032414883603235e-05, "loss": 0.4481, "step": 640000 }, { "epoch": 8.533415099118796, "grad_norm": 1.4589756727218628, "learning_rate": 2.5023854759244518e-05, "loss": 0.4032, "step": 640100 }, { "epoch": 8.534748236925251, "grad_norm": 4.9729695320129395, "learning_rate": 2.5015295051472997e-05, "loss": 0.4113, "step": 640200 }, { "epoch": 8.536081374731706, "grad_norm": 12.271809577941895, "learning_rate": 2.5006735761005257e-05, "loss": 0.4181, "step": 640300 }, { "epoch": 8.537414512538161, "grad_norm": 74.51883697509766, "learning_rate": 2.4998176888557858e-05, "loss": 0.403, "step": 640400 }, { "epoch": 8.538747650344616, "grad_norm": 
6.973826885223389, "learning_rate": 2.4989618434847333e-05, "loss": 0.4132, "step": 640500 }, { "epoch": 8.540080788151071, "grad_norm": 5.139581203460693, "learning_rate": 2.4981060400590146e-05, "loss": 0.3997, "step": 640600 }, { "epoch": 8.541413925957526, "grad_norm": 0.36329326033592224, "learning_rate": 2.497250278650276e-05, "loss": 0.4026, "step": 640700 }, { "epoch": 8.542747063763981, "grad_norm": 3.555025815963745, "learning_rate": 2.496394559330161e-05, "loss": 0.405, "step": 640800 }, { "epoch": 8.544080201570436, "grad_norm": 38.590858459472656, "learning_rate": 2.495538882170304e-05, "loss": 0.4442, "step": 640900 }, { "epoch": 8.545413339376891, "grad_norm": 3.100019693374634, "learning_rate": 2.4946832472423435e-05, "loss": 0.3993, "step": 641000 }, { "epoch": 8.546746477183346, "grad_norm": 4.1956257820129395, "learning_rate": 2.4938276546179068e-05, "loss": 0.469, "step": 641100 }, { "epoch": 8.5480796149898, "grad_norm": 3.419843912124634, "learning_rate": 2.4929721043686236e-05, "loss": 0.3939, "step": 641200 }, { "epoch": 8.549412752796256, "grad_norm": 2.9797849655151367, "learning_rate": 2.4921165965661188e-05, "loss": 0.4002, "step": 641300 }, { "epoch": 8.550745890602713, "grad_norm": 2.543818712234497, "learning_rate": 2.4912611312820105e-05, "loss": 0.4331, "step": 641400 }, { "epoch": 8.552079028409167, "grad_norm": 2.1329495906829834, "learning_rate": 2.4904057085879166e-05, "loss": 0.3608, "step": 641500 }, { "epoch": 8.553412166215622, "grad_norm": 16.121463775634766, "learning_rate": 2.4895503285554513e-05, "loss": 0.4011, "step": 641600 }, { "epoch": 8.554745304022077, "grad_norm": 5.231592655181885, "learning_rate": 2.4886949912562226e-05, "loss": 0.4068, "step": 641700 }, { "epoch": 8.556078441828532, "grad_norm": 8.256763458251953, "learning_rate": 2.4878396967618375e-05, "loss": 0.3279, "step": 641800 }, { "epoch": 8.557411579634987, "grad_norm": 3.9485268592834473, "learning_rate": 2.486984445143899e-05, "loss": 0.3982, 
"step": 641900 }, { "epoch": 8.558744717441442, "grad_norm": 2.948268175125122, "learning_rate": 2.486129236474005e-05, "loss": 0.4742, "step": 642000 }, { "epoch": 8.560077855247897, "grad_norm": 2.600238800048828, "learning_rate": 2.485274070823751e-05, "loss": 0.4368, "step": 642100 }, { "epoch": 8.561410993054352, "grad_norm": 3.708225965499878, "learning_rate": 2.4844189482647307e-05, "loss": 0.4112, "step": 642200 }, { "epoch": 8.562744130860807, "grad_norm": 3.2067723274230957, "learning_rate": 2.4835638688685295e-05, "loss": 0.4133, "step": 642300 }, { "epoch": 8.564077268667262, "grad_norm": 10.695347785949707, "learning_rate": 2.482708832706735e-05, "loss": 0.4127, "step": 642400 }, { "epoch": 8.565410406473717, "grad_norm": 22.001262664794922, "learning_rate": 2.4818538398509242e-05, "loss": 0.4558, "step": 642500 }, { "epoch": 8.566743544280172, "grad_norm": 1.824432134628296, "learning_rate": 2.4809988903726772e-05, "loss": 0.3991, "step": 642600 }, { "epoch": 8.568076682086627, "grad_norm": 4.096462249755859, "learning_rate": 2.480143984343568e-05, "loss": 0.4246, "step": 642700 }, { "epoch": 8.569409819893082, "grad_norm": 9.923975944519043, "learning_rate": 2.4792891218351653e-05, "loss": 0.3997, "step": 642800 }, { "epoch": 8.570742957699537, "grad_norm": 8.749378204345703, "learning_rate": 2.4784343029190355e-05, "loss": 0.4284, "step": 642900 }, { "epoch": 8.572076095505992, "grad_norm": 3.9736952781677246, "learning_rate": 2.4775795276667422e-05, "loss": 0.3842, "step": 643000 }, { "epoch": 8.573409233312448, "grad_norm": 4.488655090332031, "learning_rate": 2.476724796149844e-05, "loss": 0.3984, "step": 643100 }, { "epoch": 8.574742371118903, "grad_norm": 5.074779033660889, "learning_rate": 2.475870108439895e-05, "loss": 0.4907, "step": 643200 }, { "epoch": 8.576075508925358, "grad_norm": 4.420143127441406, "learning_rate": 2.4750154646084507e-05, "loss": 0.4084, "step": 643300 }, { "epoch": 8.577408646731813, "grad_norm": 2.3995018005371094, 
"learning_rate": 2.4741608647270546e-05, "loss": 0.3861, "step": 643400 }, { "epoch": 8.578741784538268, "grad_norm": 2.0843794345855713, "learning_rate": 2.473306308867254e-05, "loss": 0.4299, "step": 643500 }, { "epoch": 8.580074922344723, "grad_norm": 3.0388314723968506, "learning_rate": 2.47245179710059e-05, "loss": 0.4115, "step": 643600 }, { "epoch": 8.581408060151178, "grad_norm": 2.2185423374176025, "learning_rate": 2.4715973294985978e-05, "loss": 0.4032, "step": 643700 }, { "epoch": 8.582741197957633, "grad_norm": 3.4384961128234863, "learning_rate": 2.4707429061328112e-05, "loss": 0.4703, "step": 643800 }, { "epoch": 8.584074335764088, "grad_norm": 2.8585617542266846, "learning_rate": 2.4698885270747607e-05, "loss": 0.3779, "step": 643900 }, { "epoch": 8.585407473570543, "grad_norm": 2.094590425491333, "learning_rate": 2.4690341923959716e-05, "loss": 0.3799, "step": 644000 }, { "epoch": 8.586740611376998, "grad_norm": 4.018677234649658, "learning_rate": 2.468179902167966e-05, "loss": 0.3674, "step": 644100 }, { "epoch": 8.588073749183453, "grad_norm": 0.9751749634742737, "learning_rate": 2.4673256564622624e-05, "loss": 0.4038, "step": 644200 }, { "epoch": 8.589406886989908, "grad_norm": 6.902432918548584, "learning_rate": 2.466471455350376e-05, "loss": 0.4446, "step": 644300 }, { "epoch": 8.590740024796363, "grad_norm": 0.5239677429199219, "learning_rate": 2.4656172989038176e-05, "loss": 0.4573, "step": 644400 }, { "epoch": 8.592073162602818, "grad_norm": 2.0677387714385986, "learning_rate": 2.4647631871940933e-05, "loss": 0.3969, "step": 644500 }, { "epoch": 8.593406300409274, "grad_norm": 2.186229705810547, "learning_rate": 2.4639091202927086e-05, "loss": 0.4573, "step": 644600 }, { "epoch": 8.59473943821573, "grad_norm": 0.609247088432312, "learning_rate": 2.4630550982711628e-05, "loss": 0.4557, "step": 644700 }, { "epoch": 8.596072576022184, "grad_norm": 2.1754658222198486, "learning_rate": 2.4622011212009513e-05, "loss": 0.4654, "step": 644800 }, { 
"epoch": 8.597405713828639, "grad_norm": 7.616021156311035, "learning_rate": 2.461347189153566e-05, "loss": 0.4277, "step": 644900 }, { "epoch": 8.598738851635094, "grad_norm": 0.9764040112495422, "learning_rate": 2.4604933022004977e-05, "loss": 0.4604, "step": 645000 }, { "epoch": 8.600071989441549, "grad_norm": 2.2710952758789062, "learning_rate": 2.459639460413228e-05, "loss": 0.39, "step": 645100 }, { "epoch": 8.601405127248004, "grad_norm": 6.061894416809082, "learning_rate": 2.45878566386324e-05, "loss": 0.4518, "step": 645200 }, { "epoch": 8.602738265054459, "grad_norm": 4.7429118156433105, "learning_rate": 2.4579319126220102e-05, "loss": 0.3859, "step": 645300 }, { "epoch": 8.604071402860914, "grad_norm": 2.106248378753662, "learning_rate": 2.457078206761011e-05, "loss": 0.4337, "step": 645400 }, { "epoch": 8.605404540667369, "grad_norm": 2.004446506500244, "learning_rate": 2.456224546351713e-05, "loss": 0.4146, "step": 645500 }, { "epoch": 8.606737678473824, "grad_norm": 1.165575623512268, "learning_rate": 2.4553709314655833e-05, "loss": 0.4414, "step": 645600 }, { "epoch": 8.608070816280279, "grad_norm": 3.9438016414642334, "learning_rate": 2.4545173621740807e-05, "loss": 0.3565, "step": 645700 }, { "epoch": 8.609403954086734, "grad_norm": 2.611035108566284, "learning_rate": 2.453663838548667e-05, "loss": 0.3931, "step": 645800 }, { "epoch": 8.610737091893188, "grad_norm": 3.511343479156494, "learning_rate": 2.4528103606607922e-05, "loss": 0.4824, "step": 645900 }, { "epoch": 8.612070229699643, "grad_norm": 4.109745025634766, "learning_rate": 2.4519569285819105e-05, "loss": 0.3975, "step": 646000 }, { "epoch": 8.613403367506098, "grad_norm": 5.171520709991455, "learning_rate": 2.4511035423834672e-05, "loss": 0.4405, "step": 646100 }, { "epoch": 8.614736505312553, "grad_norm": 3.4948432445526123, "learning_rate": 2.4502502021369046e-05, "loss": 0.3992, "step": 646200 }, { "epoch": 8.61606964311901, "grad_norm": 4.465986251831055, "learning_rate": 
2.4494054406278446e-05, "loss": 0.375, "step": 646300 }, { "epoch": 8.617402780925465, "grad_norm": 13.07432746887207, "learning_rate": 2.448552192038056e-05, "loss": 0.3681, "step": 646400 }, { "epoch": 8.61873591873192, "grad_norm": 2.3998334407806396, "learning_rate": 2.4476989896137405e-05, "loss": 0.4059, "step": 646500 }, { "epoch": 8.620069056538375, "grad_norm": 13.667027473449707, "learning_rate": 2.446845833426324e-05, "loss": 0.3861, "step": 646600 }, { "epoch": 8.62140219434483, "grad_norm": 10.998230934143066, "learning_rate": 2.44599272354723e-05, "loss": 0.4564, "step": 646700 }, { "epoch": 8.622735332151285, "grad_norm": 3.8341615200042725, "learning_rate": 2.4451396600478804e-05, "loss": 0.4167, "step": 646800 }, { "epoch": 8.62406846995774, "grad_norm": 1.317133903503418, "learning_rate": 2.4442866429996875e-05, "loss": 0.4205, "step": 646900 }, { "epoch": 8.625401607764195, "grad_norm": 4.395927906036377, "learning_rate": 2.443442201948801e-05, "loss": 0.4155, "step": 647000 }, { "epoch": 8.62673474557065, "grad_norm": 2.7031548023223877, "learning_rate": 2.4425892775508663e-05, "loss": 0.413, "step": 647100 }, { "epoch": 8.628067883377105, "grad_norm": 3.689227342605591, "learning_rate": 2.4417363998175978e-05, "loss": 0.3784, "step": 647200 }, { "epoch": 8.62940102118356, "grad_norm": 9.01891803741455, "learning_rate": 2.4408835688203988e-05, "loss": 0.4434, "step": 647300 }, { "epoch": 8.630734158990014, "grad_norm": 1.8955227136611938, "learning_rate": 2.4400307846306654e-05, "loss": 0.4212, "step": 647400 }, { "epoch": 8.63206729679647, "grad_norm": 3.2813949584960938, "learning_rate": 2.4391780473197888e-05, "loss": 0.3821, "step": 647500 }, { "epoch": 8.633400434602924, "grad_norm": 13.622870445251465, "learning_rate": 2.4383253569591587e-05, "loss": 0.3799, "step": 647600 }, { "epoch": 8.63473357240938, "grad_norm": 5.485611438751221, "learning_rate": 2.4374727136201593e-05, "loss": 0.3905, "step": 647700 }, { "epoch": 8.636066710215836, 
"grad_norm": 2.30792498588562, "learning_rate": 2.436620117374171e-05, "loss": 0.4479, "step": 647800 }, { "epoch": 8.63739984802229, "grad_norm": 2.6896440982818604, "learning_rate": 2.4357675682925698e-05, "loss": 0.376, "step": 647900 }, { "epoch": 8.638732985828746, "grad_norm": 2.4707446098327637, "learning_rate": 2.4349150664467305e-05, "loss": 0.3965, "step": 648000 }, { "epoch": 8.6400661236352, "grad_norm": 0.7734869718551636, "learning_rate": 2.434062611908019e-05, "loss": 0.4187, "step": 648100 }, { "epoch": 8.641399261441656, "grad_norm": 1.2834445238113403, "learning_rate": 2.4332102047478025e-05, "loss": 0.3584, "step": 648200 }, { "epoch": 8.64273239924811, "grad_norm": 6.805966377258301, "learning_rate": 2.4323578450374413e-05, "loss": 0.3707, "step": 648300 }, { "epoch": 8.644065537054566, "grad_norm": 1.651754379272461, "learning_rate": 2.4315055328482915e-05, "loss": 0.4049, "step": 648400 }, { "epoch": 8.64539867486102, "grad_norm": 22.033769607543945, "learning_rate": 2.430653268251707e-05, "loss": 0.4352, "step": 648500 }, { "epoch": 8.646731812667475, "grad_norm": 7.076706886291504, "learning_rate": 2.4298010513190355e-05, "loss": 0.4237, "step": 648600 }, { "epoch": 8.64806495047393, "grad_norm": 30.51984977722168, "learning_rate": 2.428948882121623e-05, "loss": 0.4562, "step": 648700 }, { "epoch": 8.649398088280385, "grad_norm": 0.9852753281593323, "learning_rate": 2.4280967607308098e-05, "loss": 0.3472, "step": 648800 }, { "epoch": 8.65073122608684, "grad_norm": 5.155050754547119, "learning_rate": 2.427244687217933e-05, "loss": 0.386, "step": 648900 }, { "epoch": 8.652064363893295, "grad_norm": 4.268121242523193, "learning_rate": 2.4263926616543245e-05, "loss": 0.394, "step": 649000 }, { "epoch": 8.65339750169975, "grad_norm": 2.8599183559417725, "learning_rate": 2.4255406841113164e-05, "loss": 0.3875, "step": 649100 }, { "epoch": 8.654730639506205, "grad_norm": 4.658400058746338, "learning_rate": 2.424688754660229e-05, "loss": 0.4038, 
"step": 649200 }, { "epoch": 8.65606377731266, "grad_norm": 4.602883338928223, "learning_rate": 2.4238368733723863e-05, "loss": 0.4237, "step": 649300 }, { "epoch": 8.657396915119115, "grad_norm": 26.77126121520996, "learning_rate": 2.4229850403191054e-05, "loss": 0.3911, "step": 649400 }, { "epoch": 8.658730052925572, "grad_norm": 1.487999439239502, "learning_rate": 2.422133255571697e-05, "loss": 0.3953, "step": 649500 }, { "epoch": 8.660063190732027, "grad_norm": 2.6201186180114746, "learning_rate": 2.421281519201471e-05, "loss": 0.4208, "step": 649600 }, { "epoch": 8.661396328538482, "grad_norm": 12.407637596130371, "learning_rate": 2.4204298312797327e-05, "loss": 0.4007, "step": 649700 }, { "epoch": 8.662729466344937, "grad_norm": 7.023801326751709, "learning_rate": 2.419578191877781e-05, "loss": 0.4143, "step": 649800 }, { "epoch": 8.664062604151392, "grad_norm": 4.685179710388184, "learning_rate": 2.4187266010669137e-05, "loss": 0.4544, "step": 649900 }, { "epoch": 8.665395741957846, "grad_norm": 2.7542459964752197, "learning_rate": 2.4178750589184235e-05, "loss": 0.4489, "step": 650000 }, { "epoch": 8.666728879764301, "grad_norm": 3.779141902923584, "learning_rate": 2.417023565503598e-05, "loss": 0.4134, "step": 650100 }, { "epoch": 8.668062017570756, "grad_norm": 2.373152732849121, "learning_rate": 2.416172120893722e-05, "loss": 0.4248, "step": 650200 }, { "epoch": 8.669395155377211, "grad_norm": 4.890758991241455, "learning_rate": 2.4153207251600753e-05, "loss": 0.393, "step": 650300 }, { "epoch": 8.670728293183666, "grad_norm": 6.604227542877197, "learning_rate": 2.414469378373934e-05, "loss": 0.4336, "step": 650400 }, { "epoch": 8.672061430990121, "grad_norm": 1.7630958557128906, "learning_rate": 2.4136180806065724e-05, "loss": 0.4176, "step": 650500 }, { "epoch": 8.673394568796576, "grad_norm": 3.888669490814209, "learning_rate": 2.4127668319292544e-05, "loss": 0.403, "step": 650600 }, { "epoch": 8.674727706603031, "grad_norm": 3.464843511581421, 
"learning_rate": 2.4119156324132472e-05, "loss": 0.416, "step": 650700 }, { "epoch": 8.676060844409486, "grad_norm": 2.6252572536468506, "learning_rate": 2.4110644821298097e-05, "loss": 0.3555, "step": 650800 }, { "epoch": 8.677393982215941, "grad_norm": 3.3568413257598877, "learning_rate": 2.410213381150197e-05, "loss": 0.3308, "step": 650900 }, { "epoch": 8.678727120022398, "grad_norm": 10.807781219482422, "learning_rate": 2.4093623295456607e-05, "loss": 0.4335, "step": 651000 }, { "epoch": 8.680060257828853, "grad_norm": 3.295496702194214, "learning_rate": 2.408511327387449e-05, "loss": 0.4416, "step": 651100 }, { "epoch": 8.681393395635308, "grad_norm": 6.340712070465088, "learning_rate": 2.4076603747468037e-05, "loss": 0.4262, "step": 651200 }, { "epoch": 8.682726533441762, "grad_norm": 2.576784372329712, "learning_rate": 2.4068094716949636e-05, "loss": 0.3874, "step": 651300 }, { "epoch": 8.684059671248217, "grad_norm": 2.8261849880218506, "learning_rate": 2.4059586183031674e-05, "loss": 0.4358, "step": 651400 }, { "epoch": 8.685392809054672, "grad_norm": 9.262933731079102, "learning_rate": 2.405116322432843e-05, "loss": 0.4512, "step": 651500 }, { "epoch": 8.686725946861127, "grad_norm": 3.181638240814209, "learning_rate": 2.4042655680764374e-05, "loss": 0.4259, "step": 651600 }, { "epoch": 8.688059084667582, "grad_norm": 3.569941282272339, "learning_rate": 2.403414863593042e-05, "loss": 0.4, "step": 651700 }, { "epoch": 8.689392222474037, "grad_norm": 5.8882575035095215, "learning_rate": 2.4025642090538726e-05, "loss": 0.4256, "step": 651800 }, { "epoch": 8.690725360280492, "grad_norm": 0.12516507506370544, "learning_rate": 2.4017136045301442e-05, "loss": 0.4073, "step": 651900 }, { "epoch": 8.692058498086947, "grad_norm": 3.6712586879730225, "learning_rate": 2.4008630500930675e-05, "loss": 0.4307, "step": 652000 }, { "epoch": 8.693391635893402, "grad_norm": 1.6126044988632202, "learning_rate": 2.4000125458138474e-05, "loss": 0.383, "step": 652100 }, { 
"epoch": 8.694724773699857, "grad_norm": 4.105743408203125, "learning_rate": 2.3991620917636854e-05, "loss": 0.4366, "step": 652200 }, { "epoch": 8.696057911506312, "grad_norm": 2.056652307510376, "learning_rate": 2.3983116880137798e-05, "loss": 0.4075, "step": 652300 }, { "epoch": 8.697391049312767, "grad_norm": 2.8283531665802, "learning_rate": 2.397461334635322e-05, "loss": 0.3904, "step": 652400 }, { "epoch": 8.698724187119222, "grad_norm": 3.880777359008789, "learning_rate": 2.3966110316995018e-05, "loss": 0.3882, "step": 652500 }, { "epoch": 8.700057324925677, "grad_norm": 73.72489929199219, "learning_rate": 2.3957607792775057e-05, "loss": 0.4504, "step": 652600 }, { "epoch": 8.701390462732132, "grad_norm": 4.076862812042236, "learning_rate": 2.39491057744051e-05, "loss": 0.4889, "step": 652700 }, { "epoch": 8.702723600538588, "grad_norm": 0.477245032787323, "learning_rate": 2.3940604262596948e-05, "loss": 0.3724, "step": 652800 }, { "epoch": 8.704056738345043, "grad_norm": 5.664085865020752, "learning_rate": 2.393210325806231e-05, "loss": 0.4664, "step": 652900 }, { "epoch": 8.705389876151498, "grad_norm": 3.641997814178467, "learning_rate": 2.3923602761512853e-05, "loss": 0.4134, "step": 653000 }, { "epoch": 8.706723013957953, "grad_norm": 2.1253368854522705, "learning_rate": 2.3915102773660222e-05, "loss": 0.4241, "step": 653100 }, { "epoch": 8.708056151764408, "grad_norm": 2.3717846870422363, "learning_rate": 2.3906688287476538e-05, "loss": 0.4111, "step": 653200 }, { "epoch": 8.709389289570863, "grad_norm": 4.872218608856201, "learning_rate": 2.3898189314047563e-05, "loss": 0.356, "step": 653300 }, { "epoch": 8.710722427377318, "grad_norm": 3.453996419906616, "learning_rate": 2.388969085144294e-05, "loss": 0.4254, "step": 653400 }, { "epoch": 8.712055565183773, "grad_norm": 6.817221641540527, "learning_rate": 2.388119290037414e-05, "loss": 0.3761, "step": 653500 }, { "epoch": 8.713388702990228, "grad_norm": 4.386962413787842, "learning_rate": 
2.3872695461552582e-05, "loss": 0.3566, "step": 653600 }, { "epoch": 8.714721840796683, "grad_norm": 2.0966413021087646, "learning_rate": 2.386419853568964e-05, "loss": 0.3929, "step": 653700 }, { "epoch": 8.716054978603138, "grad_norm": 4.579959392547607, "learning_rate": 2.385570212349667e-05, "loss": 0.39, "step": 653800 }, { "epoch": 8.717388116409593, "grad_norm": 1.9261335134506226, "learning_rate": 2.384720622568493e-05, "loss": 0.3946, "step": 653900 }, { "epoch": 8.718721254216048, "grad_norm": 5.758813381195068, "learning_rate": 2.38387108429657e-05, "loss": 0.4004, "step": 654000 }, { "epoch": 8.720054392022503, "grad_norm": 5.152990341186523, "learning_rate": 2.383021597605018e-05, "loss": 0.4222, "step": 654100 }, { "epoch": 8.721387529828958, "grad_norm": 3.4846205711364746, "learning_rate": 2.3821721625649523e-05, "loss": 0.4608, "step": 654200 }, { "epoch": 8.722720667635414, "grad_norm": 4.140558242797852, "learning_rate": 2.381322779247486e-05, "loss": 0.4349, "step": 654300 }, { "epoch": 8.72405380544187, "grad_norm": 7.242987632751465, "learning_rate": 2.3804734477237264e-05, "loss": 0.4088, "step": 654400 }, { "epoch": 8.725386943248324, "grad_norm": 15.577600479125977, "learning_rate": 2.3796241680647765e-05, "loss": 0.4144, "step": 654500 }, { "epoch": 8.72672008105478, "grad_norm": 3.1357812881469727, "learning_rate": 2.378774940341736e-05, "loss": 0.4617, "step": 654600 }, { "epoch": 8.728053218861234, "grad_norm": 4.39472770690918, "learning_rate": 2.3779257646256987e-05, "loss": 0.3651, "step": 654700 }, { "epoch": 8.729386356667689, "grad_norm": 3.945751190185547, "learning_rate": 2.377076640987756e-05, "loss": 0.3834, "step": 654800 }, { "epoch": 8.730719494474144, "grad_norm": 3.149172782897949, "learning_rate": 2.3762275694989936e-05, "loss": 0.4397, "step": 654900 }, { "epoch": 8.732052632280599, "grad_norm": 6.662808895111084, "learning_rate": 2.375378550230492e-05, "loss": 0.425, "step": 655000 }, { "epoch": 8.733385770087054, 
"grad_norm": 8.916525840759277, "learning_rate": 2.3745295832533286e-05, "loss": 0.3955, "step": 655100 }, { "epoch": 8.734718907893509, "grad_norm": 5.199489593505859, "learning_rate": 2.373680668638579e-05, "loss": 0.382, "step": 655200 }, { "epoch": 8.736052045699964, "grad_norm": 0.6454601287841797, "learning_rate": 2.3728318064573073e-05, "loss": 0.3982, "step": 655300 }, { "epoch": 8.737385183506419, "grad_norm": 8.247655868530273, "learning_rate": 2.37198299678058e-05, "loss": 0.421, "step": 655400 }, { "epoch": 8.738718321312874, "grad_norm": 1.9876439571380615, "learning_rate": 2.371142726989986e-05, "loss": 0.4066, "step": 655500 }, { "epoch": 8.740051459119329, "grad_norm": 2.9593334197998047, "learning_rate": 2.3702940220087046e-05, "loss": 0.433, "step": 655600 }, { "epoch": 8.741384596925784, "grad_norm": 2.8537254333496094, "learning_rate": 2.3694453697444215e-05, "loss": 0.4413, "step": 655700 }, { "epoch": 8.742717734732238, "grad_norm": 3.0069003105163574, "learning_rate": 2.368596770268185e-05, "loss": 0.4154, "step": 655800 }, { "epoch": 8.744050872538693, "grad_norm": 77.14488983154297, "learning_rate": 2.3677482236510368e-05, "loss": 0.3761, "step": 655900 }, { "epoch": 8.74538401034515, "grad_norm": 3.8261733055114746, "learning_rate": 2.3668997299640136e-05, "loss": 0.4542, "step": 656000 }, { "epoch": 8.746717148151605, "grad_norm": 8.412721633911133, "learning_rate": 2.3660512892781503e-05, "loss": 0.3311, "step": 656100 }, { "epoch": 8.74805028595806, "grad_norm": 1.4069747924804688, "learning_rate": 2.365202901664474e-05, "loss": 0.4066, "step": 656200 }, { "epoch": 8.749383423764515, "grad_norm": 5.120360374450684, "learning_rate": 2.3643545671940088e-05, "loss": 0.4375, "step": 656300 }, { "epoch": 8.75071656157097, "grad_norm": 4.088204860687256, "learning_rate": 2.363514768486695e-05, "loss": 0.4475, "step": 656400 }, { "epoch": 8.752049699377425, "grad_norm": 10.436254501342773, "learning_rate": 2.3626665399825063e-05, "loss": 
0.3913, "step": 656500 }, { "epoch": 8.75338283718388, "grad_norm": 1.4812445640563965, "learning_rate": 2.3618183648338653e-05, "loss": 0.4476, "step": 656600 }, { "epoch": 8.754715974990335, "grad_norm": 5.710411548614502, "learning_rate": 2.3609702431117792e-05, "loss": 0.3915, "step": 656700 }, { "epoch": 8.75604911279679, "grad_norm": 3.195730209350586, "learning_rate": 2.3601221748872502e-05, "loss": 0.4461, "step": 656800 }, { "epoch": 8.757382250603245, "grad_norm": 4.911877632141113, "learning_rate": 2.3592741602312756e-05, "loss": 0.4524, "step": 656900 }, { "epoch": 8.7587153884097, "grad_norm": 7.89554500579834, "learning_rate": 2.3584261992148482e-05, "loss": 0.3638, "step": 657000 }, { "epoch": 8.760048526216154, "grad_norm": 4.710727691650391, "learning_rate": 2.357578291908958e-05, "loss": 0.4444, "step": 657100 }, { "epoch": 8.76138166402261, "grad_norm": 1.0895565748214722, "learning_rate": 2.356730438384587e-05, "loss": 0.3703, "step": 657200 }, { "epoch": 8.762714801829064, "grad_norm": 3.965521812438965, "learning_rate": 2.355891116442632e-05, "loss": 0.4003, "step": 657300 }, { "epoch": 8.76404793963552, "grad_norm": 2.4455373287200928, "learning_rate": 2.3550433701546508e-05, "loss": 0.3339, "step": 657400 }, { "epoch": 8.765381077441976, "grad_norm": 25.299339294433594, "learning_rate": 2.354195677860405e-05, "loss": 0.3741, "step": 657500 }, { "epoch": 8.766714215248431, "grad_norm": 3.301482677459717, "learning_rate": 2.3533480396308603e-05, "loss": 0.4714, "step": 657600 }, { "epoch": 8.768047353054886, "grad_norm": 38.31972122192383, "learning_rate": 2.3525004555369817e-05, "loss": 0.4348, "step": 657700 }, { "epoch": 8.76938049086134, "grad_norm": 4.15824556350708, "learning_rate": 2.3516529256497216e-05, "loss": 0.3788, "step": 657800 }, { "epoch": 8.770713628667796, "grad_norm": 5.188144207000732, "learning_rate": 2.3508054500400357e-05, "loss": 0.4371, "step": 657900 }, { "epoch": 8.77204676647425, "grad_norm": 7.236889839172363, 
"learning_rate": 2.349958028778871e-05, "loss": 0.4694, "step": 658000 }, { "epoch": 8.773379904280706, "grad_norm": 4.103797912597656, "learning_rate": 2.3491106619371713e-05, "loss": 0.4351, "step": 658100 }, { "epoch": 8.77471304208716, "grad_norm": 1.8268893957138062, "learning_rate": 2.3482633495858747e-05, "loss": 0.3951, "step": 658200 }, { "epoch": 8.776046179893616, "grad_norm": 1.3988142013549805, "learning_rate": 2.3474160917959167e-05, "loss": 0.3534, "step": 658300 }, { "epoch": 8.77737931770007, "grad_norm": 2.4743032455444336, "learning_rate": 2.346568888638226e-05, "loss": 0.3948, "step": 658400 }, { "epoch": 8.778712455506525, "grad_norm": 25.25037956237793, "learning_rate": 2.3457217401837275e-05, "loss": 0.4151, "step": 658500 }, { "epoch": 8.78004559331298, "grad_norm": 3.8186721801757812, "learning_rate": 2.344874646503344e-05, "loss": 0.368, "step": 658600 }, { "epoch": 8.781378731119435, "grad_norm": 4.492400169372559, "learning_rate": 2.3440276076679884e-05, "loss": 0.3863, "step": 658700 }, { "epoch": 8.78271186892589, "grad_norm": 9.959489822387695, "learning_rate": 2.3431806237485756e-05, "loss": 0.4161, "step": 658800 }, { "epoch": 8.784045006732345, "grad_norm": 3.1790897846221924, "learning_rate": 2.3423336948160084e-05, "loss": 0.3887, "step": 658900 }, { "epoch": 8.7853781445388, "grad_norm": 2.853238582611084, "learning_rate": 2.3414868209411916e-05, "loss": 0.4324, "step": 659000 }, { "epoch": 8.786711282345255, "grad_norm": 4.15254020690918, "learning_rate": 2.3406400021950232e-05, "loss": 0.4138, "step": 659100 }, { "epoch": 8.788044420151712, "grad_norm": 1.6544560194015503, "learning_rate": 2.3397932386483945e-05, "loss": 0.3892, "step": 659200 }, { "epoch": 8.789377557958167, "grad_norm": 6.306262493133545, "learning_rate": 2.338946530372195e-05, "loss": 0.3444, "step": 659300 }, { "epoch": 8.790710695764622, "grad_norm": 7.279769420623779, "learning_rate": 2.3380998774373085e-05, "loss": 0.4403, "step": 659400 }, { "epoch": 
8.792043833571077, "grad_norm": 4.0506672859191895, "learning_rate": 2.3372532799146137e-05, "loss": 0.4172, "step": 659500 }, { "epoch": 8.793376971377532, "grad_norm": 6.502220153808594, "learning_rate": 2.336406737874984e-05, "loss": 0.4184, "step": 659600 }, { "epoch": 8.794710109183987, "grad_norm": 4.708191871643066, "learning_rate": 2.3355602513892923e-05, "loss": 0.4122, "step": 659700 }, { "epoch": 8.796043246990441, "grad_norm": 1.7013784646987915, "learning_rate": 2.3347138205284e-05, "loss": 0.3672, "step": 659800 }, { "epoch": 8.797376384796896, "grad_norm": 2.896003007888794, "learning_rate": 2.3338674453631706e-05, "loss": 0.4612, "step": 659900 }, { "epoch": 8.798709522603351, "grad_norm": 3.019578218460083, "learning_rate": 2.3330211259644586e-05, "loss": 0.4319, "step": 660000 }, { "epoch": 8.800042660409806, "grad_norm": 0.19150377810001373, "learning_rate": 2.3321748624031156e-05, "loss": 0.4243, "step": 660100 }, { "epoch": 8.801375798216261, "grad_norm": 3.33025860786438, "learning_rate": 2.3313286547499875e-05, "loss": 0.4147, "step": 660200 }, { "epoch": 8.802708936022716, "grad_norm": 4.4438982009887695, "learning_rate": 2.3304825030759174e-05, "loss": 0.4513, "step": 660300 }, { "epoch": 8.804042073829171, "grad_norm": 5.38702392578125, "learning_rate": 2.329636407451741e-05, "loss": 0.4153, "step": 660400 }, { "epoch": 8.805375211635626, "grad_norm": 1.872101902961731, "learning_rate": 2.3287903679482918e-05, "loss": 0.4178, "step": 660500 }, { "epoch": 8.806708349442081, "grad_norm": 1.942386269569397, "learning_rate": 2.327944384636397e-05, "loss": 0.3726, "step": 660600 }, { "epoch": 8.808041487248538, "grad_norm": 3.748807907104492, "learning_rate": 2.3270984575868794e-05, "loss": 0.3611, "step": 660700 }, { "epoch": 8.809374625054993, "grad_norm": 2.4372639656066895, "learning_rate": 2.3262525868705584e-05, "loss": 0.4204, "step": 660800 }, { "epoch": 8.810707762861448, "grad_norm": 3.035362720489502, "learning_rate": 
2.3254067725582468e-05, "loss": 0.4066, "step": 660900 }, { "epoch": 8.812040900667903, "grad_norm": 5.321990966796875, "learning_rate": 2.3245610147207523e-05, "loss": 0.3735, "step": 661000 }, { "epoch": 8.813374038474358, "grad_norm": 3.5088438987731934, "learning_rate": 2.323715313428883e-05, "loss": 0.3802, "step": 661100 }, { "epoch": 8.814707176280812, "grad_norm": 3.355329751968384, "learning_rate": 2.3228696687534335e-05, "loss": 0.3735, "step": 661200 }, { "epoch": 8.816040314087267, "grad_norm": 3.9945666790008545, "learning_rate": 2.322024080765202e-05, "loss": 0.3591, "step": 661300 }, { "epoch": 8.817373451893722, "grad_norm": 8.007088661193848, "learning_rate": 2.3211785495349776e-05, "loss": 0.4157, "step": 661400 }, { "epoch": 8.818706589700177, "grad_norm": 2.714353322982788, "learning_rate": 2.3203330751335447e-05, "loss": 0.4354, "step": 661500 }, { "epoch": 8.820039727506632, "grad_norm": 11.689910888671875, "learning_rate": 2.3194876576316845e-05, "loss": 0.4277, "step": 661600 }, { "epoch": 8.821372865313087, "grad_norm": 13.372685432434082, "learning_rate": 2.3186422971001724e-05, "loss": 0.4027, "step": 661700 }, { "epoch": 8.822706003119542, "grad_norm": 10.264671325683594, "learning_rate": 2.317796993609779e-05, "loss": 0.3689, "step": 661800 }, { "epoch": 8.824039140925997, "grad_norm": 3.763463258743286, "learning_rate": 2.3169517472312716e-05, "loss": 0.3953, "step": 661900 }, { "epoch": 8.825372278732452, "grad_norm": 15.053168296813965, "learning_rate": 2.3161065580354102e-05, "loss": 0.4153, "step": 662000 }, { "epoch": 8.826705416538907, "grad_norm": 4.17020845413208, "learning_rate": 2.315261426092951e-05, "loss": 0.4268, "step": 662100 }, { "epoch": 8.828038554345362, "grad_norm": 2.0289855003356934, "learning_rate": 2.314416351474649e-05, "loss": 0.3735, "step": 662200 }, { "epoch": 8.829371692151817, "grad_norm": 14.070477485656738, "learning_rate": 2.3135713342512468e-05, "loss": 0.3837, "step": 662300 }, { "epoch": 
8.830704829958274, "grad_norm": 4.093085289001465, "learning_rate": 2.3127348238063796e-05, "loss": 0.3934, "step": 662400 }, { "epoch": 8.832037967764728, "grad_norm": 6.999688148498535, "learning_rate": 2.31188992100929e-05, "loss": 0.4265, "step": 662500 }, { "epoch": 8.833371105571183, "grad_norm": 4.108164310455322, "learning_rate": 2.311045075818607e-05, "loss": 0.4338, "step": 662600 }, { "epoch": 8.834704243377638, "grad_norm": 1.7331708669662476, "learning_rate": 2.3102002883050583e-05, "loss": 0.4152, "step": 662700 }, { "epoch": 8.836037381184093, "grad_norm": 3.808750867843628, "learning_rate": 2.309355558539367e-05, "loss": 0.4149, "step": 662800 }, { "epoch": 8.837370518990548, "grad_norm": 6.820799827575684, "learning_rate": 2.3085108865922516e-05, "loss": 0.425, "step": 662900 }, { "epoch": 8.838703656797003, "grad_norm": 12.373029708862305, "learning_rate": 2.3076662725344245e-05, "loss": 0.4331, "step": 663000 }, { "epoch": 8.840036794603458, "grad_norm": 0.9481847286224365, "learning_rate": 2.306821716436594e-05, "loss": 0.4443, "step": 663100 }, { "epoch": 8.841369932409913, "grad_norm": 27.427074432373047, "learning_rate": 2.3059772183694666e-05, "loss": 0.4295, "step": 663200 }, { "epoch": 8.842703070216368, "grad_norm": 2.1898088455200195, "learning_rate": 2.3051327784037367e-05, "loss": 0.4144, "step": 663300 }, { "epoch": 8.844036208022823, "grad_norm": 2.9744784832000732, "learning_rate": 2.304288396610102e-05, "loss": 0.3644, "step": 663400 }, { "epoch": 8.845369345829278, "grad_norm": 1.3864506483078003, "learning_rate": 2.3034440730592474e-05, "loss": 0.4249, "step": 663500 }, { "epoch": 8.846702483635733, "grad_norm": 6.374649524688721, "learning_rate": 2.30259980782186e-05, "loss": 0.445, "step": 663600 }, { "epoch": 8.848035621442188, "grad_norm": 3.65010404586792, "learning_rate": 2.3017556009686192e-05, "loss": 0.3562, "step": 663700 }, { "epoch": 8.849368759248643, "grad_norm": 2.63903546333313, "learning_rate": 
2.300911452570198e-05, "loss": 0.3925, "step": 663800 }, { "epoch": 8.8507018970551, "grad_norm": 1.1499814987182617, "learning_rate": 2.3000673626972657e-05, "loss": 0.4244, "step": 663900 }, { "epoch": 8.852035034861554, "grad_norm": 7.831442356109619, "learning_rate": 2.2992233314204883e-05, "loss": 0.3957, "step": 664000 }, { "epoch": 8.85336817266801, "grad_norm": 5.09313440322876, "learning_rate": 2.298379358810524e-05, "loss": 0.4413, "step": 664100 }, { "epoch": 8.854701310474464, "grad_norm": 3.6135714054107666, "learning_rate": 2.297535444938028e-05, "loss": 0.3852, "step": 664200 }, { "epoch": 8.85603444828092, "grad_norm": 4.247795581817627, "learning_rate": 2.296691589873651e-05, "loss": 0.3586, "step": 664300 }, { "epoch": 8.857367586087374, "grad_norm": 3.097728729248047, "learning_rate": 2.2958477936880362e-05, "loss": 0.4079, "step": 664400 }, { "epoch": 8.85870072389383, "grad_norm": 3.6170153617858887, "learning_rate": 2.295004056451824e-05, "loss": 0.4003, "step": 664500 }, { "epoch": 8.860033861700284, "grad_norm": 5.5880208015441895, "learning_rate": 2.2941603782356516e-05, "loss": 0.3862, "step": 664600 }, { "epoch": 8.861366999506739, "grad_norm": 5.414849281311035, "learning_rate": 2.2933167591101457e-05, "loss": 0.3809, "step": 664700 }, { "epoch": 8.862700137313194, "grad_norm": 3.014390230178833, "learning_rate": 2.2924731991459335e-05, "loss": 0.3905, "step": 664800 }, { "epoch": 8.864033275119649, "grad_norm": 1.1448534727096558, "learning_rate": 2.291629698413635e-05, "loss": 0.4203, "step": 664900 }, { "epoch": 8.865366412926104, "grad_norm": 2.1329314708709717, "learning_rate": 2.290786256983865e-05, "loss": 0.4432, "step": 665000 }, { "epoch": 8.866699550732559, "grad_norm": 2.4224750995635986, "learning_rate": 2.2899428749272344e-05, "loss": 0.4146, "step": 665100 }, { "epoch": 8.868032688539014, "grad_norm": 8.36918830871582, "learning_rate": 2.2890995523143474e-05, "loss": 0.3756, "step": 665200 }, { "epoch": 8.869365826345469, 
"grad_norm": 6.749317646026611, "learning_rate": 2.288256289215805e-05, "loss": 0.3631, "step": 665300 }, { "epoch": 8.870698964151924, "grad_norm": 3.024859666824341, "learning_rate": 2.287413085702203e-05, "loss": 0.3309, "step": 665400 }, { "epoch": 8.872032101958379, "grad_norm": 2.6598474979400635, "learning_rate": 2.286569941844131e-05, "loss": 0.4373, "step": 665500 }, { "epoch": 8.873365239764835, "grad_norm": 3.191885232925415, "learning_rate": 2.2857268577121733e-05, "loss": 0.3793, "step": 665600 }, { "epoch": 8.87469837757129, "grad_norm": 3.5327484607696533, "learning_rate": 2.284883833376914e-05, "loss": 0.3642, "step": 665700 }, { "epoch": 8.876031515377745, "grad_norm": 1.5472910404205322, "learning_rate": 2.2840408689089235e-05, "loss": 0.3963, "step": 665800 }, { "epoch": 8.8773646531842, "grad_norm": 5.297419548034668, "learning_rate": 2.2831979643787754e-05, "loss": 0.4351, "step": 665900 }, { "epoch": 8.878697790990655, "grad_norm": 3.242870807647705, "learning_rate": 2.2823551198570346e-05, "loss": 0.3775, "step": 666000 }, { "epoch": 8.88003092879711, "grad_norm": 0.7712294459342957, "learning_rate": 2.2815123354142606e-05, "loss": 0.3946, "step": 666100 }, { "epoch": 8.881364066603565, "grad_norm": 3.0603208541870117, "learning_rate": 2.280669611121009e-05, "loss": 0.4218, "step": 666200 }, { "epoch": 8.88269720441002, "grad_norm": 2.5978779792785645, "learning_rate": 2.2798269470478305e-05, "loss": 0.4256, "step": 666300 }, { "epoch": 8.884030342216475, "grad_norm": 5.476433753967285, "learning_rate": 2.278984343265269e-05, "loss": 0.4509, "step": 666400 }, { "epoch": 8.88536348002293, "grad_norm": 3.1187148094177246, "learning_rate": 2.2781417998438665e-05, "loss": 0.409, "step": 666500 }, { "epoch": 8.886696617829385, "grad_norm": 3.3385441303253174, "learning_rate": 2.277299316854156e-05, "loss": 0.3767, "step": 666600 }, { "epoch": 8.88802975563584, "grad_norm": 4.516843795776367, "learning_rate": 2.276456894366669e-05, "loss": 0.4167, 
"step": 666700 }, { "epoch": 8.889362893442295, "grad_norm": 2.0622551441192627, "learning_rate": 2.2756145324519305e-05, "loss": 0.4558, "step": 666800 }, { "epoch": 8.89069603124875, "grad_norm": 7.035465717315674, "learning_rate": 2.2747722311804587e-05, "loss": 0.4023, "step": 666900 }, { "epoch": 8.892029169055204, "grad_norm": 4.302188396453857, "learning_rate": 2.2739299906227697e-05, "loss": 0.4027, "step": 667000 }, { "epoch": 8.893362306861661, "grad_norm": 6.26791524887085, "learning_rate": 2.2730878108493746e-05, "loss": 0.3649, "step": 667100 }, { "epoch": 8.894695444668116, "grad_norm": 5.8927788734436035, "learning_rate": 2.272245691930775e-05, "loss": 0.3835, "step": 667200 }, { "epoch": 8.896028582474571, "grad_norm": 13.373100280761719, "learning_rate": 2.2714036339374723e-05, "loss": 0.4323, "step": 667300 }, { "epoch": 8.897361720281026, "grad_norm": 21.131269454956055, "learning_rate": 2.2705616369399612e-05, "loss": 0.4515, "step": 667400 }, { "epoch": 8.898694858087481, "grad_norm": 5.404270172119141, "learning_rate": 2.26971970100873e-05, "loss": 0.404, "step": 667500 }, { "epoch": 8.900027995893936, "grad_norm": 4.733029842376709, "learning_rate": 2.2688778262142635e-05, "loss": 0.4132, "step": 667600 }, { "epoch": 8.90136113370039, "grad_norm": 92.83067321777344, "learning_rate": 2.268036012627041e-05, "loss": 0.3635, "step": 667700 }, { "epoch": 8.902694271506846, "grad_norm": 2.2889068126678467, "learning_rate": 2.2671942603175357e-05, "loss": 0.4473, "step": 667800 }, { "epoch": 8.9040274093133, "grad_norm": 19.0352840423584, "learning_rate": 2.2663525693562164e-05, "loss": 0.4142, "step": 667900 }, { "epoch": 8.905360547119756, "grad_norm": 1.400231122970581, "learning_rate": 2.2655193558047224e-05, "loss": 0.4386, "step": 668000 }, { "epoch": 8.90669368492621, "grad_norm": 3.0618419647216797, "learning_rate": 2.264677787135923e-05, "loss": 0.432, "step": 668100 }, { "epoch": 8.908026822732666, "grad_norm": 4.037939071655273, 
"learning_rate": 2.2638362800259808e-05, "loss": 0.3564, "step": 668200 }, { "epoch": 8.90935996053912, "grad_norm": 5.174807071685791, "learning_rate": 2.262994834545346e-05, "loss": 0.4548, "step": 668300 }, { "epoch": 8.910693098345575, "grad_norm": 0.8646267652511597, "learning_rate": 2.2621534507644585e-05, "loss": 0.3724, "step": 668400 }, { "epoch": 8.91202623615203, "grad_norm": 1.938132643699646, "learning_rate": 2.2613121287537595e-05, "loss": 0.4208, "step": 668500 }, { "epoch": 8.913359373958485, "grad_norm": 5.049327850341797, "learning_rate": 2.2604708685836807e-05, "loss": 0.4093, "step": 668600 }, { "epoch": 8.91469251176494, "grad_norm": 3.7981185913085938, "learning_rate": 2.259629670324649e-05, "loss": 0.4112, "step": 668700 }, { "epoch": 8.916025649571397, "grad_norm": 2.6679201126098633, "learning_rate": 2.2587885340470877e-05, "loss": 0.3771, "step": 668800 }, { "epoch": 8.917358787377852, "grad_norm": 3.4589388370513916, "learning_rate": 2.2579474598214144e-05, "loss": 0.4169, "step": 668900 }, { "epoch": 8.918691925184307, "grad_norm": 3.7518694400787354, "learning_rate": 2.2571064477180398e-05, "loss": 0.4127, "step": 669000 }, { "epoch": 8.920025062990762, "grad_norm": 3.2513136863708496, "learning_rate": 2.2562654978073712e-05, "loss": 0.4021, "step": 669100 }, { "epoch": 8.921358200797217, "grad_norm": 5.054916858673096, "learning_rate": 2.2554246101598123e-05, "loss": 0.5596, "step": 669200 }, { "epoch": 8.922691338603672, "grad_norm": 3.145190954208374, "learning_rate": 2.254583784845756e-05, "loss": 0.4125, "step": 669300 }, { "epoch": 8.924024476410127, "grad_norm": 2.959585666656494, "learning_rate": 2.2537430219355964e-05, "loss": 0.3576, "step": 669400 }, { "epoch": 8.925357614216582, "grad_norm": 0.507377028465271, "learning_rate": 2.252902321499719e-05, "loss": 0.4153, "step": 669500 }, { "epoch": 8.926690752023037, "grad_norm": 39.112979888916016, "learning_rate": 2.2520616836085036e-05, "loss": 0.3938, "step": 669600 }, { 
"epoch": 8.928023889829491, "grad_norm": 6.377727031707764, "learning_rate": 2.251221108332327e-05, "loss": 0.3644, "step": 669700 }, { "epoch": 8.929357027635946, "grad_norm": 4.40878438949585, "learning_rate": 2.2503805957415584e-05, "loss": 0.3631, "step": 669800 }, { "epoch": 8.930690165442401, "grad_norm": 3.2641758918762207, "learning_rate": 2.249540145906563e-05, "loss": 0.3899, "step": 669900 }, { "epoch": 8.932023303248856, "grad_norm": 2.786327838897705, "learning_rate": 2.2486997588977016e-05, "loss": 0.404, "step": 670000 }, { "epoch": 8.933356441055311, "grad_norm": 11.235705375671387, "learning_rate": 2.2478594347853276e-05, "loss": 0.4165, "step": 670100 }, { "epoch": 8.934689578861766, "grad_norm": 5.368438243865967, "learning_rate": 2.247019173639791e-05, "loss": 0.4389, "step": 670200 }, { "epoch": 8.936022716668223, "grad_norm": 4.529044151306152, "learning_rate": 2.246178975531436e-05, "loss": 0.4421, "step": 670300 }, { "epoch": 8.937355854474678, "grad_norm": 3.1466591358184814, "learning_rate": 2.2453388405306e-05, "loss": 0.4533, "step": 670400 }, { "epoch": 8.938688992281133, "grad_norm": 0.362505167722702, "learning_rate": 2.244498768707617e-05, "loss": 0.4635, "step": 670500 }, { "epoch": 8.940022130087588, "grad_norm": 5.293878078460693, "learning_rate": 2.2436587601328177e-05, "loss": 0.3921, "step": 670600 }, { "epoch": 8.941355267894043, "grad_norm": 0.7003200054168701, "learning_rate": 2.2428188148765203e-05, "loss": 0.4081, "step": 670700 }, { "epoch": 8.942688405700498, "grad_norm": 3.231741428375244, "learning_rate": 2.241978933009046e-05, "loss": 0.3596, "step": 670800 }, { "epoch": 8.944021543506953, "grad_norm": 3.638915538787842, "learning_rate": 2.241139114600706e-05, "loss": 0.3935, "step": 670900 }, { "epoch": 8.945354681313407, "grad_norm": 2.4366133213043213, "learning_rate": 2.2402993597218072e-05, "loss": 0.4041, "step": 671000 }, { "epoch": 8.946687819119862, "grad_norm": 6.238260269165039, "learning_rate": 
2.2394596684426518e-05, "loss": 0.412, "step": 671100 }, { "epoch": 8.948020956926317, "grad_norm": 32.201786041259766, "learning_rate": 2.2386200408335345e-05, "loss": 0.3587, "step": 671200 }, { "epoch": 8.949354094732772, "grad_norm": 2.979917526245117, "learning_rate": 2.2377804769647477e-05, "loss": 0.4015, "step": 671300 }, { "epoch": 8.950687232539227, "grad_norm": 10.365193367004395, "learning_rate": 2.236940976906577e-05, "loss": 0.4, "step": 671400 }, { "epoch": 8.952020370345682, "grad_norm": 2.292255163192749, "learning_rate": 2.2361099347746336e-05, "loss": 0.4363, "step": 671500 }, { "epoch": 8.953353508152137, "grad_norm": 7.0077900886535645, "learning_rate": 2.23527056190867e-05, "loss": 0.3615, "step": 671600 }, { "epoch": 8.954686645958592, "grad_norm": 3.4755940437316895, "learning_rate": 2.2344312530634434e-05, "loss": 0.4577, "step": 671700 }, { "epoch": 8.956019783765047, "grad_norm": 2.7259607315063477, "learning_rate": 2.233592008309221e-05, "loss": 0.4138, "step": 671800 }, { "epoch": 8.957352921571502, "grad_norm": 1.7907359600067139, "learning_rate": 2.232752827716258e-05, "loss": 0.3832, "step": 671900 }, { "epoch": 8.958686059377959, "grad_norm": 8.970582962036133, "learning_rate": 2.2319137113548103e-05, "loss": 0.3939, "step": 672000 }, { "epoch": 8.960019197184414, "grad_norm": 11.034483909606934, "learning_rate": 2.2310746592951258e-05, "loss": 0.4152, "step": 672100 }, { "epoch": 8.961352334990869, "grad_norm": 3.2302298545837402, "learning_rate": 2.230235671607447e-05, "loss": 0.4003, "step": 672200 }, { "epoch": 8.962685472797324, "grad_norm": 8.516895294189453, "learning_rate": 2.2293967483620105e-05, "loss": 0.4145, "step": 672300 }, { "epoch": 8.964018610603778, "grad_norm": 2.444551944732666, "learning_rate": 2.22855788962905e-05, "loss": 0.3937, "step": 672400 }, { "epoch": 8.965351748410233, "grad_norm": 2.248262882232666, "learning_rate": 2.2277190954787908e-05, "loss": 0.4657, "step": 672500 }, { "epoch": 
8.966684886216688, "grad_norm": 2.556236982345581, "learning_rate": 2.2268803659814547e-05, "loss": 0.4851, "step": 672600 }, { "epoch": 8.968018024023143, "grad_norm": 2.816887140274048, "learning_rate": 2.226041701207257e-05, "loss": 0.3956, "step": 672700 }, { "epoch": 8.969351161829598, "grad_norm": 16.305843353271484, "learning_rate": 2.2252031012264073e-05, "loss": 0.4004, "step": 672800 }, { "epoch": 8.970684299636053, "grad_norm": 4.2054314613342285, "learning_rate": 2.2243645661091138e-05, "loss": 0.3936, "step": 672900 }, { "epoch": 8.972017437442508, "grad_norm": 3.54886531829834, "learning_rate": 2.2235260959255717e-05, "loss": 0.4316, "step": 673000 }, { "epoch": 8.973350575248963, "grad_norm": 2.733499765396118, "learning_rate": 2.222687690745978e-05, "loss": 0.4126, "step": 673100 }, { "epoch": 8.974683713055418, "grad_norm": 2.1224379539489746, "learning_rate": 2.2218577337192278e-05, "loss": 0.3615, "step": 673200 }, { "epoch": 8.976016850861873, "grad_norm": 1.995148777961731, "learning_rate": 2.2210194581063002e-05, "loss": 0.3718, "step": 673300 }, { "epoch": 8.977349988668328, "grad_norm": 3.066067934036255, "learning_rate": 2.2201812477071667e-05, "loss": 0.3914, "step": 673400 }, { "epoch": 8.978683126474785, "grad_norm": 2.4311206340789795, "learning_rate": 2.219343102592001e-05, "loss": 0.3949, "step": 673500 }, { "epoch": 8.98001626428124, "grad_norm": 4.36743688583374, "learning_rate": 2.218505022830971e-05, "loss": 0.3854, "step": 673600 }, { "epoch": 8.981349402087695, "grad_norm": 3.2278153896331787, "learning_rate": 2.217667008494236e-05, "loss": 0.4732, "step": 673700 }, { "epoch": 8.98268253989415, "grad_norm": 2.5295112133026123, "learning_rate": 2.2168290596519526e-05, "loss": 0.3708, "step": 673800 }, { "epoch": 8.984015677700604, "grad_norm": 9.726244926452637, "learning_rate": 2.215991176374273e-05, "loss": 0.4235, "step": 673900 }, { "epoch": 8.98534881550706, "grad_norm": 10.03494930267334, "learning_rate": 
2.2151533587313394e-05, "loss": 0.4485, "step": 674000 }, { "epoch": 8.986681953313514, "grad_norm": 2.1909940242767334, "learning_rate": 2.2143156067932938e-05, "loss": 0.3604, "step": 674100 }, { "epoch": 8.98801509111997, "grad_norm": 9.54818058013916, "learning_rate": 2.2134779206302677e-05, "loss": 0.3698, "step": 674200 }, { "epoch": 8.989348228926424, "grad_norm": 2.038267135620117, "learning_rate": 2.212640300312391e-05, "loss": 0.4068, "step": 674300 }, { "epoch": 8.990681366732879, "grad_norm": 4.463588714599609, "learning_rate": 2.211802745909787e-05, "loss": 0.3962, "step": 674400 }, { "epoch": 8.992014504539334, "grad_norm": 2.1821038722991943, "learning_rate": 2.210965257492572e-05, "loss": 0.4199, "step": 674500 }, { "epoch": 8.993347642345789, "grad_norm": 5.032412528991699, "learning_rate": 2.210127835130858e-05, "loss": 0.4457, "step": 674600 }, { "epoch": 8.994680780152244, "grad_norm": 2.314882278442383, "learning_rate": 2.2092904788947528e-05, "loss": 0.3587, "step": 674700 }, { "epoch": 8.996013917958699, "grad_norm": 13.031248092651367, "learning_rate": 2.2084531888543552e-05, "loss": 0.4022, "step": 674800 }, { "epoch": 8.997347055765154, "grad_norm": 6.77982759475708, "learning_rate": 2.2076159650797617e-05, "loss": 0.4293, "step": 674900 }, { "epoch": 8.998680193571609, "grad_norm": 5.706037521362305, "learning_rate": 2.2067788076410625e-05, "loss": 0.4284, "step": 675000 }, { "epoch": 9.000013331378064, "grad_norm": 5.103981971740723, "learning_rate": 2.2059417166083402e-05, "loss": 0.4109, "step": 675100 }, { "epoch": 9.00134646918452, "grad_norm": 0.5335285067558289, "learning_rate": 2.205104692051674e-05, "loss": 0.3734, "step": 675200 }, { "epoch": 9.002679606990975, "grad_norm": 1.844555139541626, "learning_rate": 2.204267734041139e-05, "loss": 0.3766, "step": 675300 }, { "epoch": 9.00401274479743, "grad_norm": 5.731840133666992, "learning_rate": 2.203430842646799e-05, "loss": 0.3515, "step": 675400 }, { "epoch": 9.005345882603885, 
"grad_norm": 16.77870750427246, "learning_rate": 2.2025940179387183e-05, "loss": 0.3381, "step": 675500 }, { "epoch": 9.00667902041034, "grad_norm": 1.361257791519165, "learning_rate": 2.2017572599869537e-05, "loss": 0.3916, "step": 675600 }, { "epoch": 9.008012158216795, "grad_norm": 5.305366516113281, "learning_rate": 2.2009205688615543e-05, "loss": 0.3148, "step": 675700 }, { "epoch": 9.00934529602325, "grad_norm": 5.9340386390686035, "learning_rate": 2.2000839446325666e-05, "loss": 0.3329, "step": 675800 }, { "epoch": 9.010678433829705, "grad_norm": 6.570971488952637, "learning_rate": 2.1992473873700288e-05, "loss": 0.388, "step": 675900 }, { "epoch": 9.01201157163616, "grad_norm": 1.1933377981185913, "learning_rate": 2.1984108971439756e-05, "loss": 0.3286, "step": 676000 }, { "epoch": 9.013344709442615, "grad_norm": 4.947225093841553, "learning_rate": 2.197574474024436e-05, "loss": 0.381, "step": 676100 }, { "epoch": 9.01467784724907, "grad_norm": 1.5239200592041016, "learning_rate": 2.196738118081431e-05, "loss": 0.3926, "step": 676200 }, { "epoch": 9.016010985055525, "grad_norm": 2.7290658950805664, "learning_rate": 2.195901829384978e-05, "loss": 0.4039, "step": 676300 }, { "epoch": 9.01734412286198, "grad_norm": 1.6549805402755737, "learning_rate": 2.1950656080050915e-05, "loss": 0.3454, "step": 676400 }, { "epoch": 9.018677260668435, "grad_norm": 4.352859973907471, "learning_rate": 2.194229454011773e-05, "loss": 0.3828, "step": 676500 }, { "epoch": 9.02001039847489, "grad_norm": 4.222822189331055, "learning_rate": 2.193393367475025e-05, "loss": 0.3663, "step": 676600 }, { "epoch": 9.021343536281345, "grad_norm": 3.1560680866241455, "learning_rate": 2.192557348464842e-05, "loss": 0.3799, "step": 676700 }, { "epoch": 9.022676674087801, "grad_norm": 6.001035690307617, "learning_rate": 2.1917213970512123e-05, "loss": 0.3824, "step": 676800 }, { "epoch": 9.024009811894256, "grad_norm": 4.260810375213623, "learning_rate": 2.190885513304119e-05, "loss": 0.3627, 
"step": 676900 }, { "epoch": 9.025342949700711, "grad_norm": 5.736567974090576, "learning_rate": 2.190049697293541e-05, "loss": 0.3795, "step": 677000 }, { "epoch": 9.026676087507166, "grad_norm": 2.3488059043884277, "learning_rate": 2.1892139490894482e-05, "loss": 0.3749, "step": 677100 }, { "epoch": 9.028009225313621, "grad_norm": 10.331917762756348, "learning_rate": 2.1883782687618087e-05, "loss": 0.3614, "step": 677200 }, { "epoch": 9.029342363120076, "grad_norm": 9.371152877807617, "learning_rate": 2.1875426563805813e-05, "loss": 0.4072, "step": 677300 }, { "epoch": 9.030675500926531, "grad_norm": 1.4010608196258545, "learning_rate": 2.1867071120157207e-05, "loss": 0.3873, "step": 677400 }, { "epoch": 9.032008638732986, "grad_norm": 4.763891696929932, "learning_rate": 2.1858716357371794e-05, "loss": 0.3329, "step": 677500 }, { "epoch": 9.03334177653944, "grad_norm": 10.9829683303833, "learning_rate": 2.185036227614896e-05, "loss": 0.3488, "step": 677600 }, { "epoch": 9.034674914345896, "grad_norm": 2.2118980884552, "learning_rate": 2.1842008877188117e-05, "loss": 0.3422, "step": 677700 }, { "epoch": 9.03600805215235, "grad_norm": 2.979210138320923, "learning_rate": 2.1833656161188578e-05, "loss": 0.3199, "step": 677800 }, { "epoch": 9.037341189958806, "grad_norm": 2.4392473697662354, "learning_rate": 2.18253041288496e-05, "loss": 0.363, "step": 677900 }, { "epoch": 9.03867432776526, "grad_norm": 5.647691249847412, "learning_rate": 2.1816952780870384e-05, "loss": 0.4083, "step": 678000 }, { "epoch": 9.040007465571716, "grad_norm": 4.0304484367370605, "learning_rate": 2.1808602117950096e-05, "loss": 0.3313, "step": 678100 }, { "epoch": 9.04134060337817, "grad_norm": 5.107643127441406, "learning_rate": 2.1800252140787813e-05, "loss": 0.3462, "step": 678200 }, { "epoch": 9.042673741184625, "grad_norm": 9.39930248260498, "learning_rate": 2.1791902850082573e-05, "loss": 0.3421, "step": 678300 }, { "epoch": 9.044006878991082, "grad_norm": 3.2764430046081543, 
"learning_rate": 2.1783554246533354e-05, "loss": 0.3684, "step": 678400 }, { "epoch": 9.045340016797537, "grad_norm": 7.5226216316223145, "learning_rate": 2.1775206330839076e-05, "loss": 0.3892, "step": 678500 }, { "epoch": 9.046673154603992, "grad_norm": 4.707427024841309, "learning_rate": 2.1766859103698582e-05, "loss": 0.3938, "step": 678600 }, { "epoch": 9.048006292410447, "grad_norm": 2.234403133392334, "learning_rate": 2.1758512565810712e-05, "loss": 0.3269, "step": 678700 }, { "epoch": 9.049339430216902, "grad_norm": 13.415184020996094, "learning_rate": 2.1750166717874173e-05, "loss": 0.3777, "step": 678800 }, { "epoch": 9.050672568023357, "grad_norm": 5.147421360015869, "learning_rate": 2.174182156058769e-05, "loss": 0.3465, "step": 678900 }, { "epoch": 9.052005705829812, "grad_norm": 4.341352939605713, "learning_rate": 2.1733477094649852e-05, "loss": 0.359, "step": 679000 }, { "epoch": 9.053338843636267, "grad_norm": 1.7036263942718506, "learning_rate": 2.172513332075926e-05, "loss": 0.3908, "step": 679100 }, { "epoch": 9.054671981442722, "grad_norm": 2.4894609451293945, "learning_rate": 2.171679023961443e-05, "loss": 0.3761, "step": 679200 }, { "epoch": 9.056005119249177, "grad_norm": 4.068241596221924, "learning_rate": 2.17084478519138e-05, "loss": 0.3417, "step": 679300 }, { "epoch": 9.057338257055632, "grad_norm": 5.90222692489624, "learning_rate": 2.1700106158355783e-05, "loss": 0.3655, "step": 679400 }, { "epoch": 9.058671394862087, "grad_norm": 4.471129417419434, "learning_rate": 2.1691848566184133e-05, "loss": 0.3729, "step": 679500 }, { "epoch": 9.060004532668541, "grad_norm": 5.270854473114014, "learning_rate": 2.1683508256047454e-05, "loss": 0.3733, "step": 679600 }, { "epoch": 9.061337670474996, "grad_norm": 3.037018060684204, "learning_rate": 2.167516864214124e-05, "loss": 0.3775, "step": 679700 }, { "epoch": 9.062670808281451, "grad_norm": 2.830204486846924, "learning_rate": 2.166682972516366e-05, "loss": 0.3063, "step": 679800 }, { "epoch": 
9.064003946087906, "grad_norm": 2.7837603092193604, "learning_rate": 2.1658491505812844e-05, "loss": 0.3847, "step": 679900 }, { "epoch": 9.065337083894363, "grad_norm": 3.743624448776245, "learning_rate": 2.1650153984786806e-05, "loss": 0.3748, "step": 680000 }, { "epoch": 9.066670221700818, "grad_norm": 4.511292457580566, "learning_rate": 2.1641817162783574e-05, "loss": 0.3617, "step": 680100 }, { "epoch": 9.068003359507273, "grad_norm": 6.963269233703613, "learning_rate": 2.1633481040501043e-05, "loss": 0.3366, "step": 680200 }, { "epoch": 9.069336497313728, "grad_norm": 2.5900702476501465, "learning_rate": 2.1625145618637114e-05, "loss": 0.3715, "step": 680300 }, { "epoch": 9.070669635120183, "grad_norm": 12.714282989501953, "learning_rate": 2.1616810897889602e-05, "loss": 0.3814, "step": 680400 }, { "epoch": 9.072002772926638, "grad_norm": 3.509276866912842, "learning_rate": 2.1608476878956243e-05, "loss": 0.33, "step": 680500 }, { "epoch": 9.073335910733093, "grad_norm": 2.373464584350586, "learning_rate": 2.160014356253475e-05, "loss": 0.3564, "step": 680600 }, { "epoch": 9.074669048539548, "grad_norm": 1.1471309661865234, "learning_rate": 2.1591810949322766e-05, "loss": 0.3681, "step": 680700 }, { "epoch": 9.076002186346003, "grad_norm": 4.270313262939453, "learning_rate": 2.1583479040017862e-05, "loss": 0.3395, "step": 680800 }, { "epoch": 9.077335324152457, "grad_norm": 1.4222990274429321, "learning_rate": 2.157514783531755e-05, "loss": 0.3301, "step": 680900 }, { "epoch": 9.078668461958912, "grad_norm": 6.61674165725708, "learning_rate": 2.1566817335919323e-05, "loss": 0.3303, "step": 681000 }, { "epoch": 9.080001599765367, "grad_norm": 1.467603087425232, "learning_rate": 2.1558487542520545e-05, "loss": 0.3426, "step": 681100 }, { "epoch": 9.081334737571822, "grad_norm": 3.3554317951202393, "learning_rate": 2.155015845581859e-05, "loss": 0.33, "step": 681200 }, { "epoch": 9.082667875378277, "grad_norm": 2.8304615020751953, "learning_rate": 
2.154183007651074e-05, "loss": 0.4219, "step": 681300 }, { "epoch": 9.084001013184732, "grad_norm": 3.8671889305114746, "learning_rate": 2.1533502405294205e-05, "loss": 0.4008, "step": 681400 }, { "epoch": 9.085334150991187, "grad_norm": 2.4322593212127686, "learning_rate": 2.152517544286616e-05, "loss": 0.3948, "step": 681500 }, { "epoch": 9.086667288797644, "grad_norm": 4.9669060707092285, "learning_rate": 2.1516849189923717e-05, "loss": 0.3201, "step": 681600 }, { "epoch": 9.088000426604099, "grad_norm": 3.2804343700408936, "learning_rate": 2.150860689907382e-05, "loss": 0.3894, "step": 681700 }, { "epoch": 9.089333564410554, "grad_norm": 7.069194316864014, "learning_rate": 2.1500282060081404e-05, "loss": 0.3741, "step": 681800 }, { "epoch": 9.090666702217009, "grad_norm": 3.690378189086914, "learning_rate": 2.149195793265859e-05, "loss": 0.39, "step": 681900 }, { "epoch": 9.091999840023464, "grad_norm": 6.973857879638672, "learning_rate": 2.1483634517502228e-05, "loss": 0.3696, "step": 682000 }, { "epoch": 9.093332977829919, "grad_norm": 0.4404642581939697, "learning_rate": 2.1475311815309125e-05, "loss": 0.2918, "step": 682100 }, { "epoch": 9.094666115636374, "grad_norm": 0.11926861107349396, "learning_rate": 2.1466989826776076e-05, "loss": 0.3897, "step": 682200 }, { "epoch": 9.095999253442828, "grad_norm": 3.1579036712646484, "learning_rate": 2.1458668552599715e-05, "loss": 0.3776, "step": 682300 }, { "epoch": 9.097332391249283, "grad_norm": 2.491457939147949, "learning_rate": 2.1450347993476703e-05, "loss": 0.4349, "step": 682400 }, { "epoch": 9.098665529055738, "grad_norm": 3.8888938426971436, "learning_rate": 2.144202815010362e-05, "loss": 0.4117, "step": 682500 }, { "epoch": 9.099998666862193, "grad_norm": 4.85810661315918, "learning_rate": 2.1433709023176948e-05, "loss": 0.451, "step": 682600 }, { "epoch": 9.101331804668648, "grad_norm": 3.3918349742889404, "learning_rate": 2.1425390613393155e-05, "loss": 0.3855, "step": 682700 }, { "epoch": 
9.102664942475103, "grad_norm": 14.010659217834473, "learning_rate": 2.141707292144864e-05, "loss": 0.3772, "step": 682800 }, { "epoch": 9.103998080281558, "grad_norm": 3.025904893875122, "learning_rate": 2.1408839114214774e-05, "loss": 0.445, "step": 682900 }, { "epoch": 9.105331218088013, "grad_norm": 6.052210807800293, "learning_rate": 2.1400522852841953e-05, "loss": 0.3251, "step": 683000 }, { "epoch": 9.106664355894468, "grad_norm": 3.2209465503692627, "learning_rate": 2.139220731139026e-05, "loss": 0.3649, "step": 683100 }, { "epoch": 9.107997493700925, "grad_norm": 6.807058334350586, "learning_rate": 2.138389249055583e-05, "loss": 0.3986, "step": 683200 }, { "epoch": 9.10933063150738, "grad_norm": 4.079283714294434, "learning_rate": 2.1375578391034758e-05, "loss": 0.3731, "step": 683300 }, { "epoch": 9.110663769313835, "grad_norm": 4.259159564971924, "learning_rate": 2.1367265013523093e-05, "loss": 0.3114, "step": 683400 }, { "epoch": 9.11199690712029, "grad_norm": 0.22912593185901642, "learning_rate": 2.1358952358716765e-05, "loss": 0.4022, "step": 683500 }, { "epoch": 9.113330044926744, "grad_norm": 50.56855392456055, "learning_rate": 2.1350640427311718e-05, "loss": 0.325, "step": 683600 }, { "epoch": 9.1146631827332, "grad_norm": 3.138005495071411, "learning_rate": 2.1342329220003794e-05, "loss": 0.4134, "step": 683700 }, { "epoch": 9.115996320539654, "grad_norm": 6.120549201965332, "learning_rate": 2.133401873748877e-05, "loss": 0.3829, "step": 683800 }, { "epoch": 9.11732945834611, "grad_norm": 4.362492561340332, "learning_rate": 2.1325708980462378e-05, "loss": 0.4008, "step": 683900 }, { "epoch": 9.118662596152564, "grad_norm": 2.667461395263672, "learning_rate": 2.1317399949620293e-05, "loss": 0.3161, "step": 684000 }, { "epoch": 9.11999573395902, "grad_norm": 3.0387322902679443, "learning_rate": 2.1309091645658112e-05, "loss": 0.3221, "step": 684100 }, { "epoch": 9.121328871765474, "grad_norm": 3.0137503147125244, "learning_rate": 
2.130078406927138e-05, "loss": 0.3713, "step": 684200 }, { "epoch": 9.122662009571929, "grad_norm": 1.3231230974197388, "learning_rate": 2.129247722115559e-05, "loss": 0.3967, "step": 684300 }, { "epoch": 9.123995147378384, "grad_norm": 4.30649471282959, "learning_rate": 2.128417110200615e-05, "loss": 0.3767, "step": 684400 }, { "epoch": 9.125328285184839, "grad_norm": 4.367331504821777, "learning_rate": 2.1275865712518428e-05, "loss": 0.3418, "step": 684500 }, { "epoch": 9.126661422991294, "grad_norm": 1.9180195331573486, "learning_rate": 2.1267561053387744e-05, "loss": 0.3653, "step": 684600 }, { "epoch": 9.127994560797749, "grad_norm": 1.5409947633743286, "learning_rate": 2.1259257125309303e-05, "loss": 0.3898, "step": 684700 }, { "epoch": 9.129327698604206, "grad_norm": 2.5448224544525146, "learning_rate": 2.125095392897832e-05, "loss": 0.3809, "step": 684800 }, { "epoch": 9.13066083641066, "grad_norm": 3.5337815284729004, "learning_rate": 2.124265146508988e-05, "loss": 0.3526, "step": 684900 }, { "epoch": 9.131993974217115, "grad_norm": 29.33833122253418, "learning_rate": 2.1234349734339057e-05, "loss": 0.3771, "step": 685000 }, { "epoch": 9.13332711202357, "grad_norm": 2.91447114944458, "learning_rate": 2.1226214750163158e-05, "loss": 0.3882, "step": 685100 }, { "epoch": 9.134660249830025, "grad_norm": 56.403221130371094, "learning_rate": 2.1217914473075122e-05, "loss": 0.3665, "step": 685200 }, { "epoch": 9.13599338763648, "grad_norm": 28.31877326965332, "learning_rate": 2.1209614931195614e-05, "loss": 0.3516, "step": 685300 }, { "epoch": 9.137326525442935, "grad_norm": 7.997844219207764, "learning_rate": 2.1201316125219425e-05, "loss": 0.3862, "step": 685400 }, { "epoch": 9.13865966324939, "grad_norm": 1.5996493101119995, "learning_rate": 2.1193018055841316e-05, "loss": 0.3605, "step": 685500 }, { "epoch": 9.139992801055845, "grad_norm": 1.9378092288970947, "learning_rate": 2.118472072375598e-05, "loss": 0.3791, "step": 685600 }, { "epoch": 9.1413259388623, 
"grad_norm": 8.075027465820312, "learning_rate": 2.1176424129658022e-05, "loss": 0.37, "step": 685700 }, { "epoch": 9.142659076668755, "grad_norm": 3.115708827972412, "learning_rate": 2.116812827424203e-05, "loss": 0.3777, "step": 685800 }, { "epoch": 9.14399221447521, "grad_norm": 1.057037353515625, "learning_rate": 2.1159833158202502e-05, "loss": 0.3788, "step": 685900 }, { "epoch": 9.145325352281665, "grad_norm": 5.827889919281006, "learning_rate": 2.115153878223387e-05, "loss": 0.3481, "step": 686000 }, { "epoch": 9.14665849008812, "grad_norm": 10.540380477905273, "learning_rate": 2.1143245147030528e-05, "loss": 0.3667, "step": 686100 }, { "epoch": 9.147991627894575, "grad_norm": 4.3630475997924805, "learning_rate": 2.1134952253286774e-05, "loss": 0.404, "step": 686200 }, { "epoch": 9.14932476570103, "grad_norm": 2.1063780784606934, "learning_rate": 2.1126660101696876e-05, "loss": 0.3575, "step": 686300 }, { "epoch": 9.150657903507486, "grad_norm": 4.641506671905518, "learning_rate": 2.1118368692955025e-05, "loss": 0.3697, "step": 686400 }, { "epoch": 9.151991041313941, "grad_norm": 3.7126080989837646, "learning_rate": 2.1110078027755345e-05, "loss": 0.3384, "step": 686500 }, { "epoch": 9.153324179120396, "grad_norm": 6.445128440856934, "learning_rate": 2.1101788106791907e-05, "loss": 0.3759, "step": 686600 }, { "epoch": 9.154657316926851, "grad_norm": 4.442340850830078, "learning_rate": 2.109349893075873e-05, "loss": 0.3734, "step": 686700 }, { "epoch": 9.155990454733306, "grad_norm": 3.8588082790374756, "learning_rate": 2.1085210500349735e-05, "loss": 0.3562, "step": 686800 }, { "epoch": 9.157323592539761, "grad_norm": 1.399511694908142, "learning_rate": 2.1076922816258812e-05, "loss": 0.3468, "step": 686900 }, { "epoch": 9.158656730346216, "grad_norm": 0.9780900478363037, "learning_rate": 2.1068635879179802e-05, "loss": 0.3723, "step": 687000 }, { "epoch": 9.159989868152671, "grad_norm": 8.014349937438965, "learning_rate": 2.1060349689806424e-05, "loss": 
0.3947, "step": 687100 }, { "epoch": 9.161323005959126, "grad_norm": 11.313102722167969, "learning_rate": 2.1052064248832394e-05, "loss": 0.3506, "step": 687200 }, { "epoch": 9.162656143765581, "grad_norm": 1.726171612739563, "learning_rate": 2.1043779556951345e-05, "loss": 0.3548, "step": 687300 }, { "epoch": 9.163989281572036, "grad_norm": 2.723379135131836, "learning_rate": 2.1035495614856834e-05, "loss": 0.3917, "step": 687400 }, { "epoch": 9.16532241937849, "grad_norm": 0.01379234716296196, "learning_rate": 2.1027212423242374e-05, "loss": 0.3388, "step": 687500 }, { "epoch": 9.166655557184946, "grad_norm": 13.101237297058105, "learning_rate": 2.1018929982801404e-05, "loss": 0.4555, "step": 687600 }, { "epoch": 9.1679886949914, "grad_norm": 1.9266440868377686, "learning_rate": 2.1010648294227306e-05, "loss": 0.3142, "step": 687700 }, { "epoch": 9.169321832797856, "grad_norm": 3.5107221603393555, "learning_rate": 2.10023673582134e-05, "loss": 0.3203, "step": 687800 }, { "epoch": 9.17065497060431, "grad_norm": 5.333093166351318, "learning_rate": 2.0994087175452925e-05, "loss": 0.3596, "step": 687900 }, { "epoch": 9.171988108410767, "grad_norm": 9.876811027526855, "learning_rate": 2.0985807746639083e-05, "loss": 0.3628, "step": 688000 }, { "epoch": 9.173321246217222, "grad_norm": 8.497897148132324, "learning_rate": 2.0977529072465013e-05, "loss": 0.4145, "step": 688100 }, { "epoch": 9.174654384023677, "grad_norm": 29.799474716186523, "learning_rate": 2.0969251153623755e-05, "loss": 0.3695, "step": 688200 }, { "epoch": 9.175987521830132, "grad_norm": 4.466526508331299, "learning_rate": 2.096097399080833e-05, "loss": 0.3817, "step": 688300 }, { "epoch": 9.177320659636587, "grad_norm": 15.102163314819336, "learning_rate": 2.0952697584711674e-05, "loss": 0.3789, "step": 688400 }, { "epoch": 9.178653797443042, "grad_norm": 6.877872943878174, "learning_rate": 2.094442193602665e-05, "loss": 0.3206, "step": 688500 }, { "epoch": 9.179986935249497, "grad_norm": 
6.110257625579834, "learning_rate": 2.0936147045446072e-05, "loss": 0.367, "step": 688600 }, { "epoch": 9.181320073055952, "grad_norm": 4.896378993988037, "learning_rate": 2.09278729136627e-05, "loss": 0.3673, "step": 688700 }, { "epoch": 9.182653210862407, "grad_norm": 5.001102924346924, "learning_rate": 2.0919599541369205e-05, "loss": 0.3676, "step": 688800 }, { "epoch": 9.183986348668862, "grad_norm": 2.427250623703003, "learning_rate": 2.0911326929258215e-05, "loss": 0.3471, "step": 688900 }, { "epoch": 9.185319486475317, "grad_norm": 9.410653114318848, "learning_rate": 2.0903055078022285e-05, "loss": 0.365, "step": 689000 }, { "epoch": 9.186652624281772, "grad_norm": 2.93935489654541, "learning_rate": 2.089478398835391e-05, "loss": 0.3, "step": 689100 }, { "epoch": 9.187985762088227, "grad_norm": 8.765149116516113, "learning_rate": 2.0886513660945517e-05, "loss": 0.3741, "step": 689200 }, { "epoch": 9.189318899894682, "grad_norm": 2.2920267581939697, "learning_rate": 2.0878244096489463e-05, "loss": 0.3705, "step": 689300 }, { "epoch": 9.190652037701136, "grad_norm": 3.874507188796997, "learning_rate": 2.086997529567806e-05, "loss": 0.3594, "step": 689400 }, { "epoch": 9.191985175507591, "grad_norm": 5.600805282592773, "learning_rate": 2.0861707259203557e-05, "loss": 0.3389, "step": 689500 }, { "epoch": 9.193318313314048, "grad_norm": 1.7480393648147583, "learning_rate": 2.0853439987758103e-05, "loss": 0.3685, "step": 689600 }, { "epoch": 9.194651451120503, "grad_norm": 2.7593889236450195, "learning_rate": 2.084517348203382e-05, "loss": 0.3514, "step": 689700 }, { "epoch": 9.195984588926958, "grad_norm": 1.7001078128814697, "learning_rate": 2.083690774272277e-05, "loss": 0.3645, "step": 689800 }, { "epoch": 9.197317726733413, "grad_norm": 3.0736639499664307, "learning_rate": 2.0828642770516908e-05, "loss": 0.3631, "step": 689900 }, { "epoch": 9.198650864539868, "grad_norm": 4.0902485847473145, "learning_rate": 2.082037856610816e-05, "loss": 0.351, "step": 
690000 }, { "epoch": 9.199984002346323, "grad_norm": 0.9811041951179504, "learning_rate": 2.0812197760741295e-05, "loss": 0.3997, "step": 690100 }, { "epoch": 9.201317140152778, "grad_norm": 2.555699110031128, "learning_rate": 2.080393508630705e-05, "loss": 0.3591, "step": 690200 }, { "epoch": 9.202650277959233, "grad_norm": 2.9426984786987305, "learning_rate": 2.079567318173837e-05, "loss": 0.3879, "step": 690300 }, { "epoch": 9.203983415765688, "grad_norm": 6.405473232269287, "learning_rate": 2.0787412047726908e-05, "loss": 0.3637, "step": 690400 }, { "epoch": 9.205316553572143, "grad_norm": 2.9845659732818604, "learning_rate": 2.077915168496429e-05, "loss": 0.3623, "step": 690500 }, { "epoch": 9.206649691378598, "grad_norm": 3.562354564666748, "learning_rate": 2.0770892094142002e-05, "loss": 0.3528, "step": 690600 }, { "epoch": 9.207982829185053, "grad_norm": 3.5880112648010254, "learning_rate": 2.0762633275951557e-05, "loss": 0.3556, "step": 690700 }, { "epoch": 9.209315966991507, "grad_norm": 3.258507490158081, "learning_rate": 2.0754375231084317e-05, "loss": 0.3848, "step": 690800 }, { "epoch": 9.210649104797962, "grad_norm": 8.172873497009277, "learning_rate": 2.0746117960231648e-05, "loss": 0.3681, "step": 690900 }, { "epoch": 9.211982242604417, "grad_norm": 2.622802495956421, "learning_rate": 2.0737861464084815e-05, "loss": 0.3642, "step": 691000 }, { "epoch": 9.213315380410872, "grad_norm": 7.58245325088501, "learning_rate": 2.0729605743335012e-05, "loss": 0.3674, "step": 691100 }, { "epoch": 9.214648518217329, "grad_norm": 1.5580602884292603, "learning_rate": 2.07213507986734e-05, "loss": 0.3629, "step": 691200 }, { "epoch": 9.215981656023784, "grad_norm": 3.540623188018799, "learning_rate": 2.071309663079106e-05, "loss": 0.3418, "step": 691300 }, { "epoch": 9.217314793830239, "grad_norm": 3.3190672397613525, "learning_rate": 2.0704843240378982e-05, "loss": 0.3441, "step": 691400 }, { "epoch": 9.218647931636694, "grad_norm": 1.1000090837478638, 
"learning_rate": 2.069659062812812e-05, "loss": 0.395, "step": 691500 }, { "epoch": 9.219981069443149, "grad_norm": 0.8545821905136108, "learning_rate": 2.0688338794729388e-05, "loss": 0.3526, "step": 691600 }, { "epoch": 9.221314207249604, "grad_norm": 4.413358688354492, "learning_rate": 2.068008774087356e-05, "loss": 0.3273, "step": 691700 }, { "epoch": 9.222647345056059, "grad_norm": 5.603926658630371, "learning_rate": 2.067183746725141e-05, "loss": 0.3482, "step": 691800 }, { "epoch": 9.223980482862514, "grad_norm": 5.952563762664795, "learning_rate": 2.066358797455363e-05, "loss": 0.3728, "step": 691900 }, { "epoch": 9.225313620668969, "grad_norm": 5.452901363372803, "learning_rate": 2.0655339263470827e-05, "loss": 0.3293, "step": 692000 }, { "epoch": 9.226646758475423, "grad_norm": 7.0439910888671875, "learning_rate": 2.0647091334693573e-05, "loss": 0.3482, "step": 692100 }, { "epoch": 9.227979896281878, "grad_norm": 3.893639087677002, "learning_rate": 2.063884418891234e-05, "loss": 0.3632, "step": 692200 }, { "epoch": 9.229313034088333, "grad_norm": 3.671520948410034, "learning_rate": 2.0630597826817565e-05, "loss": 0.3776, "step": 692300 }, { "epoch": 9.230646171894788, "grad_norm": 2.0095903873443604, "learning_rate": 2.062235224909961e-05, "loss": 0.3702, "step": 692400 }, { "epoch": 9.231979309701243, "grad_norm": 4.732251167297363, "learning_rate": 2.0614107456448754e-05, "loss": 0.286, "step": 692500 }, { "epoch": 9.233312447507698, "grad_norm": 3.455205202102661, "learning_rate": 2.0605863449555236e-05, "loss": 0.3293, "step": 692600 }, { "epoch": 9.234645585314153, "grad_norm": 1.9781677722930908, "learning_rate": 2.0597620229109228e-05, "loss": 0.3507, "step": 692700 }, { "epoch": 9.23597872312061, "grad_norm": 0.2695735991001129, "learning_rate": 2.0589377795800807e-05, "loss": 0.3532, "step": 692800 }, { "epoch": 9.237311860927065, "grad_norm": 0.8478692173957825, "learning_rate": 2.058113615032001e-05, "loss": 0.3408, "step": 692900 }, { "epoch": 
9.23864499873352, "grad_norm": 1.7711228132247925, "learning_rate": 2.0572895293356817e-05, "loss": 0.3079, "step": 693000 }, { "epoch": 9.239978136539975, "grad_norm": 3.566897392272949, "learning_rate": 2.0564655225601096e-05, "loss": 0.3586, "step": 693100 }, { "epoch": 9.24131127434643, "grad_norm": 2.894045352935791, "learning_rate": 2.0556415947742704e-05, "loss": 0.3111, "step": 693200 }, { "epoch": 9.242644412152885, "grad_norm": 9.77861213684082, "learning_rate": 2.0548177460471404e-05, "loss": 0.3613, "step": 693300 }, { "epoch": 9.24397754995934, "grad_norm": 1.9094946384429932, "learning_rate": 2.0539939764476887e-05, "loss": 0.35, "step": 693400 }, { "epoch": 9.245310687765794, "grad_norm": 1.5511294603347778, "learning_rate": 2.0531702860448795e-05, "loss": 0.3004, "step": 693500 }, { "epoch": 9.24664382557225, "grad_norm": 1.9077417850494385, "learning_rate": 2.0523466749076695e-05, "loss": 0.3128, "step": 693600 }, { "epoch": 9.247976963378704, "grad_norm": 4.5071210861206055, "learning_rate": 2.0515231431050084e-05, "loss": 0.3176, "step": 693700 }, { "epoch": 9.24931010118516, "grad_norm": 2.4308202266693115, "learning_rate": 2.0506996907058402e-05, "loss": 0.3106, "step": 693800 }, { "epoch": 9.250643238991614, "grad_norm": 14.505195617675781, "learning_rate": 2.049876317779101e-05, "loss": 0.3529, "step": 693900 }, { "epoch": 9.25197637679807, "grad_norm": 1.5347449779510498, "learning_rate": 2.049061256933619e-05, "loss": 0.3697, "step": 694000 }, { "epoch": 9.253309514604524, "grad_norm": 8.053715705871582, "learning_rate": 2.048238042362078e-05, "loss": 0.3503, "step": 694100 }, { "epoch": 9.254642652410979, "grad_norm": 2.3925321102142334, "learning_rate": 2.0474149074690496e-05, "loss": 0.4142, "step": 694200 }, { "epoch": 9.255975790217434, "grad_norm": 4.574872970581055, "learning_rate": 2.0465918523234402e-05, "loss": 0.3284, "step": 694300 }, { "epoch": 9.257308928023889, "grad_norm": 4.227188587188721, "learning_rate": 
2.0457688769941568e-05, "loss": 0.3847, "step": 694400 }, { "epoch": 9.258642065830346, "grad_norm": 18.361125946044922, "learning_rate": 2.0449459815500952e-05, "loss": 0.3369, "step": 694500 }, { "epoch": 9.2599752036368, "grad_norm": 4.319841384887695, "learning_rate": 2.0441231660601448e-05, "loss": 0.3717, "step": 694600 }, { "epoch": 9.261308341443256, "grad_norm": 2.4924604892730713, "learning_rate": 2.04330043059319e-05, "loss": 0.3177, "step": 694700 }, { "epoch": 9.26264147924971, "grad_norm": 4.860583782196045, "learning_rate": 2.0424777752181078e-05, "loss": 0.3507, "step": 694800 }, { "epoch": 9.263974617056165, "grad_norm": 4.319258689880371, "learning_rate": 2.041655200003767e-05, "loss": 0.3554, "step": 694900 }, { "epoch": 9.26530775486262, "grad_norm": 3.576378345489502, "learning_rate": 2.040832705019032e-05, "loss": 0.3496, "step": 695000 }, { "epoch": 9.266640892669075, "grad_norm": 4.440478324890137, "learning_rate": 2.0400102903327608e-05, "loss": 0.3747, "step": 695100 }, { "epoch": 9.26797403047553, "grad_norm": 4.539759159088135, "learning_rate": 2.0391879560138002e-05, "loss": 0.3725, "step": 695200 }, { "epoch": 9.269307168281985, "grad_norm": 4.983639717102051, "learning_rate": 2.038365702130997e-05, "loss": 0.3657, "step": 695300 }, { "epoch": 9.27064030608844, "grad_norm": 9.85300064086914, "learning_rate": 2.0375435287531836e-05, "loss": 0.3971, "step": 695400 }, { "epoch": 9.271973443894895, "grad_norm": 3.0907347202301025, "learning_rate": 2.0367214359491932e-05, "loss": 0.4, "step": 695500 }, { "epoch": 9.27330658170135, "grad_norm": 3.218557119369507, "learning_rate": 2.0358994237878483e-05, "loss": 0.3674, "step": 695600 }, { "epoch": 9.274639719507805, "grad_norm": 5.054907321929932, "learning_rate": 2.0350774923379636e-05, "loss": 0.385, "step": 695700 }, { "epoch": 9.27597285731426, "grad_norm": 2.6631531715393066, "learning_rate": 2.03425564166835e-05, "loss": 0.3403, "step": 695800 }, { "epoch": 9.277305995120715, 
"grad_norm": 0.9196220636367798, "learning_rate": 2.03343387184781e-05, "loss": 0.351, "step": 695900 }, { "epoch": 9.278639132927172, "grad_norm": 2.188589096069336, "learning_rate": 2.032612182945139e-05, "loss": 0.3309, "step": 696000 }, { "epoch": 9.279972270733627, "grad_norm": 1.7453643083572388, "learning_rate": 2.0317905750291262e-05, "loss": 0.3908, "step": 696100 }, { "epoch": 9.281305408540081, "grad_norm": 4.046865940093994, "learning_rate": 2.030969048168555e-05, "loss": 0.3464, "step": 696200 }, { "epoch": 9.282638546346536, "grad_norm": 1.117696762084961, "learning_rate": 2.0301476024322002e-05, "loss": 0.3331, "step": 696300 }, { "epoch": 9.283971684152991, "grad_norm": 3.849154233932495, "learning_rate": 2.02932623788883e-05, "loss": 0.3673, "step": 696400 }, { "epoch": 9.285304821959446, "grad_norm": 13.091184616088867, "learning_rate": 2.0285049546072096e-05, "loss": 0.368, "step": 696500 }, { "epoch": 9.286637959765901, "grad_norm": 4.901585102081299, "learning_rate": 2.0276837526560898e-05, "loss": 0.3785, "step": 696600 }, { "epoch": 9.287971097572356, "grad_norm": 7.892422199249268, "learning_rate": 2.026862632104223e-05, "loss": 0.3495, "step": 696700 }, { "epoch": 9.289304235378811, "grad_norm": 6.249841690063477, "learning_rate": 2.0260415930203473e-05, "loss": 0.3186, "step": 696800 }, { "epoch": 9.290637373185266, "grad_norm": 4.716853618621826, "learning_rate": 2.0252206354732e-05, "loss": 0.3481, "step": 696900 }, { "epoch": 9.291970510991721, "grad_norm": 16.192001342773438, "learning_rate": 2.0243997595315087e-05, "loss": 0.338, "step": 697000 }, { "epoch": 9.293303648798176, "grad_norm": 5.508439064025879, "learning_rate": 2.023578965263994e-05, "loss": 0.4075, "step": 697100 }, { "epoch": 9.29463678660463, "grad_norm": 23.798751831054688, "learning_rate": 2.02275825273937e-05, "loss": 0.3243, "step": 697200 }, { "epoch": 9.295969924411086, "grad_norm": 20.53017234802246, "learning_rate": 2.021937622026345e-05, "loss": 0.3217, 
"step": 697300 }, { "epoch": 9.29730306221754, "grad_norm": 4.314487934112549, "learning_rate": 2.0211170731936188e-05, "loss": 0.3991, "step": 697400 }, { "epoch": 9.298636200023996, "grad_norm": 1.552829384803772, "learning_rate": 2.0202966063098852e-05, "loss": 0.3249, "step": 697500 }, { "epoch": 9.29996933783045, "grad_norm": 0.7984216213226318, "learning_rate": 2.0194762214438328e-05, "loss": 0.3808, "step": 697600 }, { "epoch": 9.301302475636907, "grad_norm": 0.802685022354126, "learning_rate": 2.0186559186641388e-05, "loss": 0.3576, "step": 697700 }, { "epoch": 9.302635613443362, "grad_norm": 1.769505500793457, "learning_rate": 2.017835698039478e-05, "loss": 0.3704, "step": 697800 }, { "epoch": 9.303968751249817, "grad_norm": 4.727851390838623, "learning_rate": 2.0170155596385174e-05, "loss": 0.4373, "step": 697900 }, { "epoch": 9.305301889056272, "grad_norm": 10.709331512451172, "learning_rate": 2.0162037036834284e-05, "loss": 0.3784, "step": 698000 }, { "epoch": 9.306635026862727, "grad_norm": 3.949573278427124, "learning_rate": 2.0153837291118873e-05, "loss": 0.3923, "step": 698100 }, { "epoch": 9.307968164669182, "grad_norm": 0.6248486042022705, "learning_rate": 2.0145638369693174e-05, "loss": 0.3533, "step": 698200 }, { "epoch": 9.309301302475637, "grad_norm": 2.088249444961548, "learning_rate": 2.0137440273243556e-05, "loss": 0.2896, "step": 698300 }, { "epoch": 9.310634440282092, "grad_norm": 4.425291538238525, "learning_rate": 2.0129243002456353e-05, "loss": 0.4434, "step": 698400 }, { "epoch": 9.311967578088547, "grad_norm": 2.4682791233062744, "learning_rate": 2.012104655801781e-05, "loss": 0.3771, "step": 698500 }, { "epoch": 9.313300715895002, "grad_norm": 6.044100284576416, "learning_rate": 2.0112850940614108e-05, "loss": 0.3629, "step": 698600 }, { "epoch": 9.314633853701457, "grad_norm": 2.517859697341919, "learning_rate": 2.010465615093135e-05, "loss": 0.3971, "step": 698700 }, { "epoch": 9.315966991507912, "grad_norm": 1.3277193307876587, 
"learning_rate": 2.0096462189655606e-05, "loss": 0.3572, "step": 698800 }, { "epoch": 9.317300129314367, "grad_norm": 3.3791661262512207, "learning_rate": 2.008826905747281e-05, "loss": 0.3156, "step": 698900 }, { "epoch": 9.318633267120822, "grad_norm": 3.237718343734741, "learning_rate": 2.0080076755068888e-05, "loss": 0.3656, "step": 699000 }, { "epoch": 9.319966404927277, "grad_norm": 1.327547550201416, "learning_rate": 2.007188528312968e-05, "loss": 0.359, "step": 699100 }, { "epoch": 9.321299542733733, "grad_norm": 15.133641242980957, "learning_rate": 2.006369464234093e-05, "loss": 0.3533, "step": 699200 }, { "epoch": 9.322632680540188, "grad_norm": 3.1311416625976562, "learning_rate": 2.0055504833388343e-05, "loss": 0.3691, "step": 699300 }, { "epoch": 9.323965818346643, "grad_norm": 3.296125650405884, "learning_rate": 2.0047315856957552e-05, "loss": 0.3684, "step": 699400 }, { "epoch": 9.325298956153098, "grad_norm": 3.5122509002685547, "learning_rate": 2.0039127713734098e-05, "loss": 0.3362, "step": 699500 }, { "epoch": 9.326632093959553, "grad_norm": 63.244895935058594, "learning_rate": 2.0030940404403468e-05, "loss": 0.4017, "step": 699600 }, { "epoch": 9.327965231766008, "grad_norm": 2.1907405853271484, "learning_rate": 2.0022835790265202e-05, "loss": 0.3558, "step": 699700 }, { "epoch": 9.329298369572463, "grad_norm": 5.5371623039245605, "learning_rate": 2.0014731994762064e-05, "loss": 0.3783, "step": 699800 }, { "epoch": 9.330631507378918, "grad_norm": 5.823005199432373, "learning_rate": 2.0006547174496445e-05, "loss": 0.3721, "step": 699900 }, { "epoch": 9.331964645185373, "grad_norm": 4.241772651672363, "learning_rate": 1.9998363190851187e-05, "loss": 0.3489, "step": 700000 }, { "epoch": 9.333297782991828, "grad_norm": 4.341956615447998, "learning_rate": 1.9990180044511455e-05, "loss": 0.3527, "step": 700100 }, { "epoch": 9.334630920798283, "grad_norm": 2.5926668643951416, "learning_rate": 1.9981997736162275e-05, "loss": 0.3832, "step": 700200 }, { 
"epoch": 9.335964058604738, "grad_norm": 4.159965991973877, "learning_rate": 1.9973816266488673e-05, "loss": 0.3227, "step": 700300 }, { "epoch": 9.337297196411193, "grad_norm": 4.175119876861572, "learning_rate": 1.9965635636175574e-05, "loss": 0.3919, "step": 700400 }, { "epoch": 9.338630334217648, "grad_norm": 4.198005199432373, "learning_rate": 1.995745584590782e-05, "loss": 0.3817, "step": 700500 }, { "epoch": 9.339963472024102, "grad_norm": 3.2334864139556885, "learning_rate": 1.994927689637021e-05, "loss": 0.3724, "step": 700600 }, { "epoch": 9.341296609830557, "grad_norm": 0.684106707572937, "learning_rate": 1.9941098788247457e-05, "loss": 0.3534, "step": 700700 }, { "epoch": 9.342629747637012, "grad_norm": 3.7732837200164795, "learning_rate": 1.9932921522224206e-05, "loss": 0.3485, "step": 700800 }, { "epoch": 9.343962885443469, "grad_norm": 10.807160377502441, "learning_rate": 1.9924745098985018e-05, "loss": 0.4117, "step": 700900 }, { "epoch": 9.345296023249924, "grad_norm": 3.8670127391815186, "learning_rate": 1.9916569519214432e-05, "loss": 0.4049, "step": 701000 }, { "epoch": 9.346629161056379, "grad_norm": 4.386839389801025, "learning_rate": 1.9908394783596838e-05, "loss": 0.4066, "step": 701100 }, { "epoch": 9.347962298862834, "grad_norm": 8.51851749420166, "learning_rate": 1.9900220892816644e-05, "loss": 0.3455, "step": 701200 }, { "epoch": 9.349295436669289, "grad_norm": 1.034294843673706, "learning_rate": 1.9892047847558105e-05, "loss": 0.3577, "step": 701300 }, { "epoch": 9.350628574475744, "grad_norm": 76.3376235961914, "learning_rate": 1.9883875648505453e-05, "loss": 0.3326, "step": 701400 }, { "epoch": 9.351961712282199, "grad_norm": 5.247332572937012, "learning_rate": 1.9875704296342857e-05, "loss": 0.3672, "step": 701500 }, { "epoch": 9.353294850088654, "grad_norm": 3.6861636638641357, "learning_rate": 1.9867533791754374e-05, "loss": 0.3481, "step": 701600 }, { "epoch": 9.354627987895109, "grad_norm": 1.7772605419158936, "learning_rate": 
1.985936413542402e-05, "loss": 0.3582, "step": 701700 }, { "epoch": 9.355961125701564, "grad_norm": 20.946260452270508, "learning_rate": 1.9851195328035745e-05, "loss": 0.3823, "step": 701800 }, { "epoch": 9.357294263508019, "grad_norm": 2.7993998527526855, "learning_rate": 1.9843027370273396e-05, "loss": 0.3474, "step": 701900 }, { "epoch": 9.358627401314473, "grad_norm": 2.032299041748047, "learning_rate": 1.983486026282078e-05, "loss": 0.3286, "step": 702000 }, { "epoch": 9.359960539120928, "grad_norm": 2.2897889614105225, "learning_rate": 1.9826694006361622e-05, "loss": 0.4173, "step": 702100 }, { "epoch": 9.361293676927383, "grad_norm": 6.8472700119018555, "learning_rate": 1.981852860157957e-05, "loss": 0.4124, "step": 702200 }, { "epoch": 9.362626814733838, "grad_norm": 6.291987419128418, "learning_rate": 1.9810364049158198e-05, "loss": 0.3249, "step": 702300 }, { "epoch": 9.363959952540293, "grad_norm": 10.376267433166504, "learning_rate": 1.980220034978105e-05, "loss": 0.3986, "step": 702400 }, { "epoch": 9.36529309034675, "grad_norm": 3.317560911178589, "learning_rate": 1.9794037504131525e-05, "loss": 0.3529, "step": 702500 }, { "epoch": 9.366626228153205, "grad_norm": 4.800870895385742, "learning_rate": 1.9785875512893023e-05, "loss": 0.3241, "step": 702600 }, { "epoch": 9.36795936595966, "grad_norm": 1.640984296798706, "learning_rate": 1.977771437674881e-05, "loss": 0.3409, "step": 702700 }, { "epoch": 9.369292503766115, "grad_norm": 6.4745330810546875, "learning_rate": 1.9769554096382133e-05, "loss": 0.3659, "step": 702800 }, { "epoch": 9.37062564157257, "grad_norm": 2.9620304107666016, "learning_rate": 1.9761394672476137e-05, "loss": 0.361, "step": 702900 }, { "epoch": 9.371958779379025, "grad_norm": 1.7758361101150513, "learning_rate": 1.9753236105713904e-05, "loss": 0.3104, "step": 703000 }, { "epoch": 9.37329191718548, "grad_norm": 0.31045156717300415, "learning_rate": 1.9745078396778444e-05, "loss": 0.3662, "step": 703100 }, { "epoch": 
9.374625054991935, "grad_norm": 4.300981044769287, "learning_rate": 1.97369215463527e-05, "loss": 0.3746, "step": 703200 }, { "epoch": 9.37595819279839, "grad_norm": 2.9701945781707764, "learning_rate": 1.9728765555119523e-05, "loss": 0.3741, "step": 703300 }, { "epoch": 9.377291330604844, "grad_norm": 4.011102676391602, "learning_rate": 1.9720610423761713e-05, "loss": 0.3761, "step": 703400 }, { "epoch": 9.3786244684113, "grad_norm": 4.333749294281006, "learning_rate": 1.971245615296201e-05, "loss": 0.3849, "step": 703500 }, { "epoch": 9.379957606217754, "grad_norm": 13.405869483947754, "learning_rate": 1.9704302743403033e-05, "loss": 0.3524, "step": 703600 }, { "epoch": 9.38129074402421, "grad_norm": 2.5245752334594727, "learning_rate": 1.969615019576738e-05, "loss": 0.349, "step": 703700 }, { "epoch": 9.382623881830664, "grad_norm": 3.8273749351501465, "learning_rate": 1.968799851073756e-05, "loss": 0.3651, "step": 703800 }, { "epoch": 9.38395701963712, "grad_norm": 7.408385276794434, "learning_rate": 1.967984768899599e-05, "loss": 0.3922, "step": 703900 }, { "epoch": 9.385290157443574, "grad_norm": 18.01251792907715, "learning_rate": 1.9671697731225043e-05, "loss": 0.3171, "step": 704000 }, { "epoch": 9.38662329525003, "grad_norm": 9.383051872253418, "learning_rate": 1.966363012475592e-05, "loss": 0.3342, "step": 704100 }, { "epoch": 9.387956433056486, "grad_norm": 8.392804145812988, "learning_rate": 1.9655481888316276e-05, "loss": 0.3306, "step": 704200 }, { "epoch": 9.38928957086294, "grad_norm": 3.8087456226348877, "learning_rate": 1.9647334517887078e-05, "loss": 0.4009, "step": 704300 }, { "epoch": 9.390622708669396, "grad_norm": 12.162156105041504, "learning_rate": 1.9639188014150408e-05, "loss": 0.3989, "step": 704400 }, { "epoch": 9.39195584647585, "grad_norm": 2.3686375617980957, "learning_rate": 1.963104237778825e-05, "loss": 0.3923, "step": 704500 }, { "epoch": 9.393288984282306, "grad_norm": 3.59387469291687, "learning_rate": 1.9622897609482533e-05, 
"loss": 0.3652, "step": 704600 }, { "epoch": 9.39462212208876, "grad_norm": 1.68874192237854, "learning_rate": 1.9614753709915134e-05, "loss": 0.3164, "step": 704700 }, { "epoch": 9.395955259895215, "grad_norm": 12.03089714050293, "learning_rate": 1.96066106797678e-05, "loss": 0.374, "step": 704800 }, { "epoch": 9.39728839770167, "grad_norm": 1.774086594581604, "learning_rate": 1.9598468519722267e-05, "loss": 0.3237, "step": 704900 }, { "epoch": 9.398621535508125, "grad_norm": 3.2711424827575684, "learning_rate": 1.9590327230460164e-05, "loss": 0.3309, "step": 705000 }, { "epoch": 9.39995467331458, "grad_norm": 26.105693817138672, "learning_rate": 1.9582186812663042e-05, "loss": 0.3029, "step": 705100 }, { "epoch": 9.401287811121035, "grad_norm": 3.25936222076416, "learning_rate": 1.95740472670124e-05, "loss": 0.3253, "step": 705200 }, { "epoch": 9.40262094892749, "grad_norm": 3.196516752243042, "learning_rate": 1.9565908594189666e-05, "loss": 0.3127, "step": 705300 }, { "epoch": 9.403954086733945, "grad_norm": 1.5447815656661987, "learning_rate": 1.9557770794876164e-05, "loss": 0.3594, "step": 705400 }, { "epoch": 9.4052872245404, "grad_norm": 8.78348159790039, "learning_rate": 1.9549633869753187e-05, "loss": 0.3394, "step": 705500 }, { "epoch": 9.406620362346855, "grad_norm": 2.2990028858184814, "learning_rate": 1.9541497819501908e-05, "loss": 0.2918, "step": 705600 }, { "epoch": 9.407953500153312, "grad_norm": 3.164003610610962, "learning_rate": 1.953336264480346e-05, "loss": 0.408, "step": 705700 }, { "epoch": 9.409286637959767, "grad_norm": 4.533453464508057, "learning_rate": 1.9525228346338908e-05, "loss": 0.4134, "step": 705800 }, { "epoch": 9.410619775766222, "grad_norm": 2.5157113075256348, "learning_rate": 1.9517094924789213e-05, "loss": 0.305, "step": 705900 }, { "epoch": 9.411952913572676, "grad_norm": 3.3328170776367188, "learning_rate": 1.950896238083528e-05, "loss": 0.4539, "step": 706000 }, { "epoch": 9.413286051379131, "grad_norm": 
2.090214490890503, "learning_rate": 1.9500830715157968e-05, "loss": 0.3625, "step": 706100 }, { "epoch": 9.414619189185586, "grad_norm": 2.6664013862609863, "learning_rate": 1.9492699928437992e-05, "loss": 0.3443, "step": 706200 }, { "epoch": 9.415952326992041, "grad_norm": 28.107988357543945, "learning_rate": 1.9484651316070435e-05, "loss": 0.3695, "step": 706300 }, { "epoch": 9.417285464798496, "grad_norm": 9.115344047546387, "learning_rate": 1.94765222805006e-05, "loss": 0.3852, "step": 706400 }, { "epoch": 9.418618602604951, "grad_norm": 1.5028423070907593, "learning_rate": 1.9468394125923152e-05, "loss": 0.3864, "step": 706500 }, { "epoch": 9.419951740411406, "grad_norm": 2.7477641105651855, "learning_rate": 1.9460266853018544e-05, "loss": 0.3834, "step": 706600 }, { "epoch": 9.421284878217861, "grad_norm": 0.9544646143913269, "learning_rate": 1.9452140462467168e-05, "loss": 0.3318, "step": 706700 }, { "epoch": 9.422618016024316, "grad_norm": 1.2250877618789673, "learning_rate": 1.944401495494935e-05, "loss": 0.3619, "step": 706800 }, { "epoch": 9.423951153830771, "grad_norm": 3.4986917972564697, "learning_rate": 1.943589033114532e-05, "loss": 0.3482, "step": 706900 }, { "epoch": 9.425284291637226, "grad_norm": 3.110976219177246, "learning_rate": 1.9427766591735256e-05, "loss": 0.4211, "step": 707000 }, { "epoch": 9.42661742944368, "grad_norm": 5.204037189483643, "learning_rate": 1.941964373739924e-05, "loss": 0.3393, "step": 707100 }, { "epoch": 9.427950567250136, "grad_norm": 2.7756595611572266, "learning_rate": 1.9411521768817294e-05, "loss": 0.3961, "step": 707200 }, { "epoch": 9.429283705056593, "grad_norm": 21.942350387573242, "learning_rate": 1.940340068666938e-05, "loss": 0.3661, "step": 707300 }, { "epoch": 9.430616842863047, "grad_norm": 3.41867733001709, "learning_rate": 1.9395280491635347e-05, "loss": 0.3883, "step": 707400 }, { "epoch": 9.431949980669502, "grad_norm": 3.5231785774230957, "learning_rate": 1.9387161184395003e-05, "loss": 0.3917, 
"step": 707500 }, { "epoch": 9.433283118475957, "grad_norm": 5.74641227722168, "learning_rate": 1.9379042765628077e-05, "loss": 0.3524, "step": 707600 }, { "epoch": 9.434616256282412, "grad_norm": 2.9452381134033203, "learning_rate": 1.9370925236014207e-05, "loss": 0.3389, "step": 707700 }, { "epoch": 9.435949394088867, "grad_norm": 4.937977313995361, "learning_rate": 1.936280859623297e-05, "loss": 0.4099, "step": 707800 }, { "epoch": 9.437282531895322, "grad_norm": 4.375119686126709, "learning_rate": 1.9354692846963868e-05, "loss": 0.345, "step": 707900 }, { "epoch": 9.438615669701777, "grad_norm": 2.0751867294311523, "learning_rate": 1.934657798888632e-05, "loss": 0.3965, "step": 708000 }, { "epoch": 9.439948807508232, "grad_norm": 4.252424240112305, "learning_rate": 1.9338464022679676e-05, "loss": 0.3569, "step": 708100 }, { "epoch": 9.441281945314687, "grad_norm": 6.156371593475342, "learning_rate": 1.9330350949023233e-05, "loss": 0.3694, "step": 708200 }, { "epoch": 9.442615083121142, "grad_norm": 2.28315806388855, "learning_rate": 1.9322238768596155e-05, "loss": 0.3556, "step": 708300 }, { "epoch": 9.443948220927597, "grad_norm": 3.1876823902130127, "learning_rate": 1.9314127482077597e-05, "loss": 0.3049, "step": 708400 }, { "epoch": 9.445281358734052, "grad_norm": 1.2281177043914795, "learning_rate": 1.9306017090146604e-05, "loss": 0.3143, "step": 708500 }, { "epoch": 9.446614496540507, "grad_norm": 4.393065929412842, "learning_rate": 1.929790759348215e-05, "loss": 0.3615, "step": 708600 }, { "epoch": 9.447947634346962, "grad_norm": 5.073090553283691, "learning_rate": 1.928979899276314e-05, "loss": 0.3657, "step": 708700 }, { "epoch": 9.449280772153417, "grad_norm": 1.4725494384765625, "learning_rate": 1.9281691288668385e-05, "loss": 0.3444, "step": 708800 }, { "epoch": 9.450613909959873, "grad_norm": 1.6871353387832642, "learning_rate": 1.9273584481876655e-05, "loss": 0.3963, "step": 708900 }, { "epoch": 9.451947047766328, "grad_norm": 2.9562408924102783, 
"learning_rate": 1.926547857306662e-05, "loss": 0.3281, "step": 709000 }, { "epoch": 9.453280185572783, "grad_norm": 3.958765745162964, "learning_rate": 1.925737356291688e-05, "loss": 0.3895, "step": 709100 }, { "epoch": 9.454613323379238, "grad_norm": 4.709973335266113, "learning_rate": 1.9249269452105954e-05, "loss": 0.3269, "step": 709200 }, { "epoch": 9.455946461185693, "grad_norm": 4.2079901695251465, "learning_rate": 1.9241166241312316e-05, "loss": 0.3273, "step": 709300 }, { "epoch": 9.457279598992148, "grad_norm": 2.0906553268432617, "learning_rate": 1.923306393121431e-05, "loss": 0.336, "step": 709400 }, { "epoch": 9.458612736798603, "grad_norm": 4.694326400756836, "learning_rate": 1.922496252249026e-05, "loss": 0.3591, "step": 709500 }, { "epoch": 9.459945874605058, "grad_norm": 0.41080284118652344, "learning_rate": 1.9216862015818383e-05, "loss": 0.3287, "step": 709600 }, { "epoch": 9.461279012411513, "grad_norm": 3.4920971393585205, "learning_rate": 1.9208762411876822e-05, "loss": 0.4148, "step": 709700 }, { "epoch": 9.462612150217968, "grad_norm": 10.141521453857422, "learning_rate": 1.9200663711343656e-05, "loss": 0.3767, "step": 709800 }, { "epoch": 9.463945288024423, "grad_norm": 8.054055213928223, "learning_rate": 1.9192565914896882e-05, "loss": 0.3224, "step": 709900 }, { "epoch": 9.465278425830878, "grad_norm": 6.548328399658203, "learning_rate": 1.9184469023214423e-05, "loss": 0.3195, "step": 710000 }, { "epoch": 9.466611563637333, "grad_norm": 1.807350516319275, "learning_rate": 1.9176373036974128e-05, "loss": 0.3744, "step": 710100 }, { "epoch": 9.467944701443788, "grad_norm": 2.1887571811676025, "learning_rate": 1.9168277956853755e-05, "loss": 0.3932, "step": 710200 }, { "epoch": 9.469277839250243, "grad_norm": 6.329771041870117, "learning_rate": 1.916018378353101e-05, "loss": 0.3541, "step": 710300 }, { "epoch": 9.470610977056698, "grad_norm": 10.839329719543457, "learning_rate": 1.915209051768351e-05, "loss": 0.3583, "step": 710400 }, { 
"epoch": 9.471944114863154, "grad_norm": 1.9094144105911255, "learning_rate": 1.9143998159988792e-05, "loss": 0.3608, "step": 710500 }, { "epoch": 9.47327725266961, "grad_norm": 1.2649993896484375, "learning_rate": 1.9135906711124326e-05, "loss": 0.342, "step": 710600 }, { "epoch": 9.474610390476064, "grad_norm": 3.9470953941345215, "learning_rate": 1.9127816171767515e-05, "loss": 0.35, "step": 710700 }, { "epoch": 9.475943528282519, "grad_norm": 4.746440887451172, "learning_rate": 1.911972654259565e-05, "loss": 0.3804, "step": 710800 }, { "epoch": 9.477276666088974, "grad_norm": 0.7803106904029846, "learning_rate": 1.9111637824285982e-05, "loss": 0.3724, "step": 710900 }, { "epoch": 9.478609803895429, "grad_norm": 18.855091094970703, "learning_rate": 1.910355001751568e-05, "loss": 0.3065, "step": 711000 }, { "epoch": 9.479942941701884, "grad_norm": 27.135168075561523, "learning_rate": 1.9095463122961817e-05, "loss": 0.3652, "step": 711100 }, { "epoch": 9.481276079508339, "grad_norm": 3.171736478805542, "learning_rate": 1.908737714130141e-05, "loss": 0.3803, "step": 711200 }, { "epoch": 9.482609217314794, "grad_norm": 7.267231464385986, "learning_rate": 1.9079292073211392e-05, "loss": 0.3979, "step": 711300 }, { "epoch": 9.483942355121249, "grad_norm": 3.6144254207611084, "learning_rate": 1.9071207919368606e-05, "loss": 0.3319, "step": 711400 }, { "epoch": 9.485275492927704, "grad_norm": 4.792123794555664, "learning_rate": 1.9063124680449843e-05, "loss": 0.3865, "step": 711500 }, { "epoch": 9.486608630734159, "grad_norm": 2.9706695079803467, "learning_rate": 1.9055042357131823e-05, "loss": 0.3539, "step": 711600 }, { "epoch": 9.487941768540614, "grad_norm": 0.8777964115142822, "learning_rate": 1.9046960950091134e-05, "loss": 0.3601, "step": 711700 }, { "epoch": 9.489274906347068, "grad_norm": 2.7870357036590576, "learning_rate": 1.9038880460004366e-05, "loss": 0.3618, "step": 711800 }, { "epoch": 9.490608044153523, "grad_norm": 2.768587350845337, "learning_rate": 
1.9030800887547957e-05, "loss": 0.3196, "step": 711900 }, { "epoch": 9.491941181959978, "grad_norm": 2.301213502883911, "learning_rate": 1.9022722233398326e-05, "loss": 0.3061, "step": 712000 }, { "epoch": 9.493274319766435, "grad_norm": 3.3297648429870605, "learning_rate": 1.9014644498231796e-05, "loss": 0.3922, "step": 712100 }, { "epoch": 9.49460745757289, "grad_norm": 3.0723490715026855, "learning_rate": 1.9006567682724586e-05, "loss": 0.3401, "step": 712200 }, { "epoch": 9.495940595379345, "grad_norm": 0.7055426836013794, "learning_rate": 1.899849178755288e-05, "loss": 0.3448, "step": 712300 }, { "epoch": 9.4972737331858, "grad_norm": 2.6510536670684814, "learning_rate": 1.8990416813392767e-05, "loss": 0.3737, "step": 712400 }, { "epoch": 9.498606870992255, "grad_norm": 2.940584897994995, "learning_rate": 1.898234276092025e-05, "loss": 0.3908, "step": 712500 }, { "epoch": 9.49994000879871, "grad_norm": 1.961579442024231, "learning_rate": 1.897426963081126e-05, "loss": 0.3417, "step": 712600 }, { "epoch": 9.501273146605165, "grad_norm": 2.5666444301605225, "learning_rate": 1.8966197423741672e-05, "loss": 0.344, "step": 712700 }, { "epoch": 9.50260628441162, "grad_norm": 5.36723518371582, "learning_rate": 1.8958126140387242e-05, "loss": 0.398, "step": 712800 }, { "epoch": 9.503939422218075, "grad_norm": 5.175093650817871, "learning_rate": 1.8950055781423686e-05, "loss": 0.3249, "step": 712900 }, { "epoch": 9.50527256002453, "grad_norm": 1.4944709539413452, "learning_rate": 1.894198634752664e-05, "loss": 0.3356, "step": 713000 }, { "epoch": 9.506605697830985, "grad_norm": 3.047492265701294, "learning_rate": 1.8933917839371624e-05, "loss": 0.3429, "step": 713100 }, { "epoch": 9.50793883563744, "grad_norm": 3.5646145343780518, "learning_rate": 1.892585025763413e-05, "loss": 0.3485, "step": 713200 }, { "epoch": 9.509271973443894, "grad_norm": 2.7114899158477783, "learning_rate": 1.8917864264944668e-05, "loss": 0.2937, "step": 713300 }, { "epoch": 9.51060511125035, 
"grad_norm": 3.8395471572875977, "learning_rate": 1.8909879181554213e-05, "loss": 0.3043, "step": 713400 }, { "epoch": 9.511938249056804, "grad_norm": 4.306413650512695, "learning_rate": 1.890181436454582e-05, "loss": 0.3613, "step": 713500 }, { "epoch": 9.51327138686326, "grad_norm": 3.0434069633483887, "learning_rate": 1.8893750476642548e-05, "loss": 0.3342, "step": 713600 }, { "epoch": 9.514604524669714, "grad_norm": 8.13221549987793, "learning_rate": 1.8885768143496076e-05, "loss": 0.3919, "step": 713700 }, { "epoch": 9.515937662476171, "grad_norm": 3.8133604526519775, "learning_rate": 1.887770610652032e-05, "loss": 0.4235, "step": 713800 }, { "epoch": 9.517270800282626, "grad_norm": 2.923520088195801, "learning_rate": 1.8869645000667943e-05, "loss": 0.2764, "step": 713900 }, { "epoch": 9.51860393808908, "grad_norm": 3.956144094467163, "learning_rate": 1.8861584826613794e-05, "loss": 0.3879, "step": 714000 }, { "epoch": 9.519937075895536, "grad_norm": 3.4343857765197754, "learning_rate": 1.885352558503267e-05, "loss": 0.4029, "step": 714100 }, { "epoch": 9.52127021370199, "grad_norm": 1.0625752210617065, "learning_rate": 1.884546727659923e-05, "loss": 0.3601, "step": 714200 }, { "epoch": 9.522603351508446, "grad_norm": 7.074456691741943, "learning_rate": 1.8837409901988114e-05, "loss": 0.4078, "step": 714300 }, { "epoch": 9.5239364893149, "grad_norm": 8.447257041931152, "learning_rate": 1.8829434021647034e-05, "loss": 0.3725, "step": 714400 }, { "epoch": 9.525269627121355, "grad_norm": 3.9895520210266113, "learning_rate": 1.882137850734903e-05, "loss": 0.3108, "step": 714500 }, { "epoch": 9.52660276492781, "grad_norm": 2.1060266494750977, "learning_rate": 1.8813323928889985e-05, "loss": 0.4174, "step": 714600 }, { "epoch": 9.527935902734265, "grad_norm": 1.7834234237670898, "learning_rate": 1.88052702869442e-05, "loss": 0.3098, "step": 714700 }, { "epoch": 9.52926904054072, "grad_norm": 1.9824368953704834, "learning_rate": 1.8797217582185897e-05, "loss": 
0.3958, "step": 714800 }, { "epoch": 9.530602178347175, "grad_norm": 2.8999571800231934, "learning_rate": 1.8789165815289223e-05, "loss": 0.3393, "step": 714900 }, { "epoch": 9.53193531615363, "grad_norm": 2.18916916847229, "learning_rate": 1.8781114986928252e-05, "loss": 0.376, "step": 715000 }, { "epoch": 9.533268453960085, "grad_norm": 1.6410778760910034, "learning_rate": 1.8773065097776963e-05, "loss": 0.3909, "step": 715100 }, { "epoch": 9.53460159176654, "grad_norm": 1.226635217666626, "learning_rate": 1.8765016148509268e-05, "loss": 0.3965, "step": 715200 }, { "epoch": 9.535934729572997, "grad_norm": 39.9113655090332, "learning_rate": 1.8756968139799026e-05, "loss": 0.318, "step": 715300 }, { "epoch": 9.537267867379452, "grad_norm": 1.8364613056182861, "learning_rate": 1.8748921072319948e-05, "loss": 0.3496, "step": 715400 }, { "epoch": 9.538601005185907, "grad_norm": 2.372851610183716, "learning_rate": 1.874087494674574e-05, "loss": 0.2995, "step": 715500 }, { "epoch": 9.539934142992362, "grad_norm": 0.9293150305747986, "learning_rate": 1.8732829763749995e-05, "loss": 0.3906, "step": 715600 }, { "epoch": 9.541267280798817, "grad_norm": 3.295428991317749, "learning_rate": 1.872478552400622e-05, "loss": 0.3731, "step": 715700 }, { "epoch": 9.542600418605272, "grad_norm": 3.22947359085083, "learning_rate": 1.8716742228187855e-05, "loss": 0.3451, "step": 715800 }, { "epoch": 9.543933556411726, "grad_norm": 3.6715903282165527, "learning_rate": 1.870869987696827e-05, "loss": 0.3833, "step": 715900 }, { "epoch": 9.545266694218181, "grad_norm": 4.899306297302246, "learning_rate": 1.8700658471020735e-05, "loss": 0.3604, "step": 716000 }, { "epoch": 9.546599832024636, "grad_norm": 2.6586098670959473, "learning_rate": 1.869261801101846e-05, "loss": 0.3621, "step": 716100 }, { "epoch": 9.547932969831091, "grad_norm": 3.1109180450439453, "learning_rate": 1.868457849763455e-05, "loss": 0.3972, "step": 716200 }, { "epoch": 9.549266107637546, "grad_norm": 
2.284562110900879, "learning_rate": 1.8676539931542065e-05, "loss": 0.3242, "step": 716300 }, { "epoch": 9.550599245444001, "grad_norm": 3.0484774112701416, "learning_rate": 1.8668502313413966e-05, "loss": 0.3634, "step": 716400 }, { "epoch": 9.551932383250456, "grad_norm": 2.415921688079834, "learning_rate": 1.8660465643923126e-05, "loss": 0.3169, "step": 716500 }, { "epoch": 9.553265521056911, "grad_norm": 71.53580474853516, "learning_rate": 1.8652429923742352e-05, "loss": 0.3552, "step": 716600 }, { "epoch": 9.554598658863366, "grad_norm": 7.017696857452393, "learning_rate": 1.8644395153544383e-05, "loss": 0.3308, "step": 716700 }, { "epoch": 9.555931796669821, "grad_norm": 5.371305465698242, "learning_rate": 1.863636133400186e-05, "loss": 0.3808, "step": 716800 }, { "epoch": 9.557264934476276, "grad_norm": 2.7080276012420654, "learning_rate": 1.8628328465787337e-05, "loss": 0.3869, "step": 716900 }, { "epoch": 9.558598072282733, "grad_norm": 4.556452751159668, "learning_rate": 1.8620296549573314e-05, "loss": 0.3586, "step": 717000 }, { "epoch": 9.559931210089188, "grad_norm": 1.4005354642868042, "learning_rate": 1.8612265586032185e-05, "loss": 0.362, "step": 717100 }, { "epoch": 9.561264347895643, "grad_norm": 3.107563018798828, "learning_rate": 1.8604235575836287e-05, "loss": 0.3773, "step": 717200 }, { "epoch": 9.562597485702097, "grad_norm": 7.741209030151367, "learning_rate": 1.859620651965787e-05, "loss": 0.3612, "step": 717300 }, { "epoch": 9.563930623508552, "grad_norm": 6.999935626983643, "learning_rate": 1.8588178418169086e-05, "loss": 0.3377, "step": 717400 }, { "epoch": 9.565263761315007, "grad_norm": 1.9646857976913452, "learning_rate": 1.8580151272042037e-05, "loss": 0.3765, "step": 717500 }, { "epoch": 9.566596899121462, "grad_norm": 3.684581756591797, "learning_rate": 1.8572125081948726e-05, "loss": 0.31, "step": 717600 }, { "epoch": 9.567930036927917, "grad_norm": 3.160032033920288, "learning_rate": 1.8564099848561067e-05, "loss": 0.3737, 
"step": 717700 }, { "epoch": 9.569263174734372, "grad_norm": 2.918118476867676, "learning_rate": 1.8556075572550943e-05, "loss": 0.3423, "step": 717800 }, { "epoch": 9.570596312540827, "grad_norm": 17.974185943603516, "learning_rate": 1.854805225459008e-05, "loss": 0.3208, "step": 717900 }, { "epoch": 9.571929450347282, "grad_norm": 3.240806818008423, "learning_rate": 1.8540029895350178e-05, "loss": 0.3947, "step": 718000 }, { "epoch": 9.573262588153737, "grad_norm": 4.327065467834473, "learning_rate": 1.8532008495502866e-05, "loss": 0.3676, "step": 718100 }, { "epoch": 9.574595725960192, "grad_norm": 26.453800201416016, "learning_rate": 1.852398805571964e-05, "loss": 0.3697, "step": 718200 }, { "epoch": 9.575928863766647, "grad_norm": 5.286755084991455, "learning_rate": 1.851596857667196e-05, "loss": 0.375, "step": 718300 }, { "epoch": 9.577262001573102, "grad_norm": 2.622407913208008, "learning_rate": 1.8507950059031196e-05, "loss": 0.3706, "step": 718400 }, { "epoch": 9.578595139379559, "grad_norm": 6.565674304962158, "learning_rate": 1.8499932503468625e-05, "loss": 0.3692, "step": 718500 }, { "epoch": 9.579928277186013, "grad_norm": 8.435124397277832, "learning_rate": 1.849191591065544e-05, "loss": 0.3975, "step": 718600 }, { "epoch": 9.581261414992468, "grad_norm": 2.95816969871521, "learning_rate": 1.8483900281262804e-05, "loss": 0.3221, "step": 718700 }, { "epoch": 9.582594552798923, "grad_norm": 3.8176066875457764, "learning_rate": 1.8475885615961708e-05, "loss": 0.3988, "step": 718800 }, { "epoch": 9.583927690605378, "grad_norm": 2.4893341064453125, "learning_rate": 1.8467871915423154e-05, "loss": 0.404, "step": 718900 }, { "epoch": 9.585260828411833, "grad_norm": 0.9882601499557495, "learning_rate": 1.8459859180318017e-05, "loss": 0.4024, "step": 719000 }, { "epoch": 9.586593966218288, "grad_norm": 2.351858139038086, "learning_rate": 1.8451847411317082e-05, "loss": 0.3531, "step": 719100 }, { "epoch": 9.587927104024743, "grad_norm": 2.6753697395324707, 
"learning_rate": 1.8443836609091085e-05, "loss": 0.3026, "step": 719200 }, { "epoch": 9.589260241831198, "grad_norm": 4.335195541381836, "learning_rate": 1.8435826774310654e-05, "loss": 0.3378, "step": 719300 }, { "epoch": 9.590593379637653, "grad_norm": 2.903844118118286, "learning_rate": 1.8427817907646354e-05, "loss": 0.3656, "step": 719400 }, { "epoch": 9.591926517444108, "grad_norm": 7.099498271942139, "learning_rate": 1.841981000976866e-05, "loss": 0.3501, "step": 719500 }, { "epoch": 9.593259655250563, "grad_norm": 6.866002082824707, "learning_rate": 1.841180308134797e-05, "loss": 0.3739, "step": 719600 }, { "epoch": 9.594592793057018, "grad_norm": 3.9782402515411377, "learning_rate": 1.8403797123054595e-05, "loss": 0.3841, "step": 719700 }, { "epoch": 9.595925930863473, "grad_norm": 1.8575600385665894, "learning_rate": 1.8395792135558772e-05, "loss": 0.3735, "step": 719800 }, { "epoch": 9.597259068669928, "grad_norm": 17.314380645751953, "learning_rate": 1.838778811953065e-05, "loss": 0.3891, "step": 719900 }, { "epoch": 9.598592206476383, "grad_norm": 3.055427074432373, "learning_rate": 1.837978507564029e-05, "loss": 0.3466, "step": 720000 }, { "epoch": 9.599925344282838, "grad_norm": 3.790748357772827, "learning_rate": 1.837178300455772e-05, "loss": 0.3675, "step": 720100 }, { "epoch": 9.601258482089294, "grad_norm": 4.587248802185059, "learning_rate": 1.83637819069528e-05, "loss": 0.357, "step": 720200 }, { "epoch": 9.60259161989575, "grad_norm": 1.0242388248443604, "learning_rate": 1.8355781783495386e-05, "loss": 0.341, "step": 720300 }, { "epoch": 9.603924757702204, "grad_norm": 4.332290172576904, "learning_rate": 1.8347782634855214e-05, "loss": 0.318, "step": 720400 }, { "epoch": 9.60525789550866, "grad_norm": 5.611512660980225, "learning_rate": 1.833978446170195e-05, "loss": 0.3523, "step": 720500 }, { "epoch": 9.606591033315114, "grad_norm": 2.210395574569702, "learning_rate": 1.833178726470518e-05, "loss": 0.3854, "step": 720600 }, { "epoch": 
9.607924171121569, "grad_norm": 2.7079010009765625, "learning_rate": 1.832379104453439e-05, "loss": 0.3593, "step": 720700 }, { "epoch": 9.609257308928024, "grad_norm": 2.7234535217285156, "learning_rate": 1.8315795801859004e-05, "loss": 0.4001, "step": 720800 }, { "epoch": 9.610590446734479, "grad_norm": 1.7910076379776, "learning_rate": 1.8307801537348378e-05, "loss": 0.3093, "step": 720900 }, { "epoch": 9.611923584540934, "grad_norm": 9.579779624938965, "learning_rate": 1.8299808251671736e-05, "loss": 0.3569, "step": 721000 }, { "epoch": 9.613256722347389, "grad_norm": 6.177175998687744, "learning_rate": 1.829181594549826e-05, "loss": 0.386, "step": 721100 }, { "epoch": 9.614589860153844, "grad_norm": 4.6484880447387695, "learning_rate": 1.8283824619497063e-05, "loss": 0.3524, "step": 721200 }, { "epoch": 9.615922997960299, "grad_norm": 5.54844331741333, "learning_rate": 1.8275834274337124e-05, "loss": 0.3571, "step": 721300 }, { "epoch": 9.617256135766754, "grad_norm": 1.7647584676742554, "learning_rate": 1.826784491068738e-05, "loss": 0.3877, "step": 721400 }, { "epoch": 9.618589273573209, "grad_norm": 3.100149154663086, "learning_rate": 1.825985652921669e-05, "loss": 0.3472, "step": 721500 }, { "epoch": 9.619922411379664, "grad_norm": 0.5389582514762878, "learning_rate": 1.8251869130593797e-05, "loss": 0.3537, "step": 721600 }, { "epoch": 9.62125554918612, "grad_norm": 4.067758560180664, "learning_rate": 1.824388271548738e-05, "loss": 0.3643, "step": 721700 }, { "epoch": 9.622588686992575, "grad_norm": 1.9246050119400024, "learning_rate": 1.823589728456606e-05, "loss": 0.3478, "step": 721800 }, { "epoch": 9.62392182479903, "grad_norm": 5.659984588623047, "learning_rate": 1.8227912838498323e-05, "loss": 0.3394, "step": 721900 }, { "epoch": 9.625254962605485, "grad_norm": 8.561624526977539, "learning_rate": 1.821992937795261e-05, "loss": 0.3713, "step": 722000 }, { "epoch": 9.62658810041194, "grad_norm": 2.529939651489258, "learning_rate": 
1.821194690359729e-05, "loss": 0.3651, "step": 722100 }, { "epoch": 9.627921238218395, "grad_norm": 2.4189372062683105, "learning_rate": 1.8203965416100607e-05, "loss": 0.355, "step": 722200 }, { "epoch": 9.62925437602485, "grad_norm": 7.108790397644043, "learning_rate": 1.819598491613076e-05, "loss": 0.328, "step": 722300 }, { "epoch": 9.630587513831305, "grad_norm": 2.410386562347412, "learning_rate": 1.818800540435584e-05, "loss": 0.3837, "step": 722400 }, { "epoch": 9.63192065163776, "grad_norm": 1.4134117364883423, "learning_rate": 1.8180026881443867e-05, "loss": 0.3458, "step": 722500 }, { "epoch": 9.633253789444215, "grad_norm": 0.7256863713264465, "learning_rate": 1.8172049348062808e-05, "loss": 0.3, "step": 722600 }, { "epoch": 9.63458692725067, "grad_norm": 2.835331439971924, "learning_rate": 1.8164072804880466e-05, "loss": 0.3329, "step": 722700 }, { "epoch": 9.635920065057125, "grad_norm": 17.183134078979492, "learning_rate": 1.8156097252564648e-05, "loss": 0.3566, "step": 722800 }, { "epoch": 9.63725320286358, "grad_norm": 3.8677828311920166, "learning_rate": 1.814812269178304e-05, "loss": 0.355, "step": 722900 }, { "epoch": 9.638586340670035, "grad_norm": 16.091751098632812, "learning_rate": 1.8140149123203233e-05, "loss": 0.3583, "step": 723000 }, { "epoch": 9.63991947847649, "grad_norm": 5.355976104736328, "learning_rate": 1.8132176547492752e-05, "loss": 0.3843, "step": 723100 }, { "epoch": 9.641252616282944, "grad_norm": 3.573296308517456, "learning_rate": 1.8124204965319055e-05, "loss": 0.3831, "step": 723200 }, { "epoch": 9.6425857540894, "grad_norm": 0.09259039163589478, "learning_rate": 1.811623437734947e-05, "loss": 0.2801, "step": 723300 }, { "epoch": 9.643918891895856, "grad_norm": 4.559417247772217, "learning_rate": 1.8108264784251277e-05, "loss": 0.3946, "step": 723400 }, { "epoch": 9.645252029702311, "grad_norm": 1.6700669527053833, "learning_rate": 1.810029618669169e-05, "loss": 0.3537, "step": 723500 }, { "epoch": 9.646585167508766, 
"grad_norm": 8.204283714294434, "learning_rate": 1.809232858533777e-05, "loss": 0.4058, "step": 723600 }, { "epoch": 9.64791830531522, "grad_norm": 3.406721353530884, "learning_rate": 1.8084441641964683e-05, "loss": 0.3347, "step": 723700 }, { "epoch": 9.649251443121676, "grad_norm": 1.798346757888794, "learning_rate": 1.8076476025044457e-05, "loss": 0.3127, "step": 723800 }, { "epoch": 9.65058458092813, "grad_norm": 1.0274944305419922, "learning_rate": 1.8068511406324058e-05, "loss": 0.3581, "step": 723900 }, { "epoch": 9.651917718734586, "grad_norm": 0.8240087032318115, "learning_rate": 1.8060547786470274e-05, "loss": 0.351, "step": 724000 }, { "epoch": 9.65325085654104, "grad_norm": 1.5138497352600098, "learning_rate": 1.8052585166149798e-05, "loss": 0.3796, "step": 724100 }, { "epoch": 9.654583994347496, "grad_norm": 5.333702087402344, "learning_rate": 1.8044623546029228e-05, "loss": 0.357, "step": 724200 }, { "epoch": 9.65591713215395, "grad_norm": 4.2556376457214355, "learning_rate": 1.8036662926775088e-05, "loss": 0.3737, "step": 724300 }, { "epoch": 9.657250269960405, "grad_norm": 1.1114068031311035, "learning_rate": 1.8028703309053815e-05, "loss": 0.3768, "step": 724400 }, { "epoch": 9.65858340776686, "grad_norm": 2.195525884628296, "learning_rate": 1.802074469353177e-05, "loss": 0.392, "step": 724500 }, { "epoch": 9.659916545573315, "grad_norm": 3.5978128910064697, "learning_rate": 1.8012787080875203e-05, "loss": 0.44, "step": 724600 }, { "epoch": 9.66124968337977, "grad_norm": 0.8396918773651123, "learning_rate": 1.8004830471750334e-05, "loss": 0.3562, "step": 724700 }, { "epoch": 9.662582821186225, "grad_norm": 1.2074366807937622, "learning_rate": 1.799687486682323e-05, "loss": 0.3704, "step": 724800 }, { "epoch": 9.663915958992682, "grad_norm": 0.43981313705444336, "learning_rate": 1.7988920266759926e-05, "loss": 0.3451, "step": 724900 }, { "epoch": 9.665249096799137, "grad_norm": 2.195298671722412, "learning_rate": 1.7980966672226363e-05, "loss": 
0.3664, "step": 725000 }, { "epoch": 9.666582234605592, "grad_norm": 1.4905869960784912, "learning_rate": 1.7973014083888382e-05, "loss": 0.3156, "step": 725100 }, { "epoch": 9.667915372412047, "grad_norm": 6.707123756408691, "learning_rate": 1.7965062502411753e-05, "loss": 0.4785, "step": 725200 }, { "epoch": 9.669248510218502, "grad_norm": 2.9453892707824707, "learning_rate": 1.7957111928462146e-05, "loss": 0.3312, "step": 725300 }, { "epoch": 9.670581648024957, "grad_norm": 4.2032575607299805, "learning_rate": 1.794916236270517e-05, "loss": 0.3768, "step": 725400 }, { "epoch": 9.671914785831412, "grad_norm": 3.816558599472046, "learning_rate": 1.7941213805806335e-05, "loss": 0.32, "step": 725500 }, { "epoch": 9.673247923637867, "grad_norm": 2.266390562057495, "learning_rate": 1.793326625843106e-05, "loss": 0.3127, "step": 725600 }, { "epoch": 9.674581061444322, "grad_norm": 1.3359041213989258, "learning_rate": 1.7925319721244704e-05, "loss": 0.4006, "step": 725700 }, { "epoch": 9.675914199250776, "grad_norm": 4.089098930358887, "learning_rate": 1.7917374194912518e-05, "loss": 0.3301, "step": 725800 }, { "epoch": 9.677247337057231, "grad_norm": 5.893824100494385, "learning_rate": 1.7909429680099675e-05, "loss": 0.3283, "step": 725900 }, { "epoch": 9.678580474863686, "grad_norm": 2.5032432079315186, "learning_rate": 1.7901486177471254e-05, "loss": 0.3266, "step": 726000 }, { "epoch": 9.679913612670141, "grad_norm": 1.5637019872665405, "learning_rate": 1.7893543687692296e-05, "loss": 0.3579, "step": 726100 }, { "epoch": 9.681246750476596, "grad_norm": 1.0089056491851807, "learning_rate": 1.7885602211427676e-05, "loss": 0.3995, "step": 726200 }, { "epoch": 9.682579888283051, "grad_norm": 3.6696362495422363, "learning_rate": 1.787766174934226e-05, "loss": 0.3263, "step": 726300 }, { "epoch": 9.683913026089506, "grad_norm": 2.959662675857544, "learning_rate": 1.7869722302100795e-05, "loss": 0.3469, "step": 726400 }, { "epoch": 9.685246163895961, "grad_norm": 
4.821427345275879, "learning_rate": 1.7861783870367935e-05, "loss": 0.3248, "step": 726500 }, { "epoch": 9.686579301702418, "grad_norm": 3.16131329536438, "learning_rate": 1.7853846454808264e-05, "loss": 0.3761, "step": 726600 }, { "epoch": 9.687912439508873, "grad_norm": 0.9839168787002563, "learning_rate": 1.7845910056086285e-05, "loss": 0.3542, "step": 726700 }, { "epoch": 9.689245577315328, "grad_norm": 1.5291550159454346, "learning_rate": 1.7837974674866402e-05, "loss": 0.3254, "step": 726800 }, { "epoch": 9.690578715121783, "grad_norm": 5.674479007720947, "learning_rate": 1.7830040311812943e-05, "loss": 0.3716, "step": 726900 }, { "epoch": 9.691911852928238, "grad_norm": 4.621519088745117, "learning_rate": 1.782210696759014e-05, "loss": 0.37, "step": 727000 }, { "epoch": 9.693244990734692, "grad_norm": 9.80557632446289, "learning_rate": 1.7814174642862152e-05, "loss": 0.3522, "step": 727100 }, { "epoch": 9.694578128541147, "grad_norm": 6.486968994140625, "learning_rate": 1.7806243338293067e-05, "loss": 0.3439, "step": 727200 }, { "epoch": 9.695911266347602, "grad_norm": 4.3957438468933105, "learning_rate": 1.7798313054546832e-05, "loss": 0.3643, "step": 727300 }, { "epoch": 9.697244404154057, "grad_norm": 4.831750392913818, "learning_rate": 1.779038379228737e-05, "loss": 0.376, "step": 727400 }, { "epoch": 9.698577541960512, "grad_norm": 1.8272327184677124, "learning_rate": 1.7782455552178493e-05, "loss": 0.3173, "step": 727500 }, { "epoch": 9.699910679766967, "grad_norm": 3.3631184101104736, "learning_rate": 1.7774528334883922e-05, "loss": 0.4196, "step": 727600 }, { "epoch": 9.701243817573422, "grad_norm": 2.6011571884155273, "learning_rate": 1.7766681397937073e-05, "loss": 0.3528, "step": 727700 }, { "epoch": 9.702576955379877, "grad_norm": 4.100765228271484, "learning_rate": 1.7758756218017254e-05, "loss": 0.3898, "step": 727800 }, { "epoch": 9.703910093186332, "grad_norm": 30.936742782592773, "learning_rate": 1.775083206289579e-05, "loss": 0.3526, 
"step": 727900 }, { "epoch": 9.705243230992787, "grad_norm": 0.5757327675819397, "learning_rate": 1.774290893323604e-05, "loss": 0.394, "step": 728000 }, { "epoch": 9.706576368799244, "grad_norm": 1.0886120796203613, "learning_rate": 1.7734986829701315e-05, "loss": 0.2975, "step": 728100 }, { "epoch": 9.707909506605699, "grad_norm": 3.955735921859741, "learning_rate": 1.7727065752954847e-05, "loss": 0.304, "step": 728200 }, { "epoch": 9.709242644412154, "grad_norm": 1.7313307523727417, "learning_rate": 1.7719145703659727e-05, "loss": 0.3326, "step": 728300 }, { "epoch": 9.710575782218609, "grad_norm": 1.140540599822998, "learning_rate": 1.7711226682479035e-05, "loss": 0.3071, "step": 728400 }, { "epoch": 9.711908920025063, "grad_norm": 7.1457839012146, "learning_rate": 1.770330869007569e-05, "loss": 0.3634, "step": 728500 }, { "epoch": 9.713242057831518, "grad_norm": 4.261502742767334, "learning_rate": 1.7695391727112583e-05, "loss": 0.3453, "step": 728600 }, { "epoch": 9.714575195637973, "grad_norm": 5.966524600982666, "learning_rate": 1.7687475794252503e-05, "loss": 0.4022, "step": 728700 }, { "epoch": 9.715908333444428, "grad_norm": 1.723605990409851, "learning_rate": 1.7679560892158126e-05, "loss": 0.3114, "step": 728800 }, { "epoch": 9.717241471250883, "grad_norm": 1.9615421295166016, "learning_rate": 1.7671647021492076e-05, "loss": 0.2678, "step": 728900 }, { "epoch": 9.718574609057338, "grad_norm": 3.4129412174224854, "learning_rate": 1.766373418291688e-05, "loss": 0.4307, "step": 729000 }, { "epoch": 9.719907746863793, "grad_norm": 2.9649744033813477, "learning_rate": 1.7655901490038885e-05, "loss": 0.3386, "step": 729100 }, { "epoch": 9.721240884670248, "grad_norm": 5.436180114746094, "learning_rate": 1.7647990707295175e-05, "loss": 0.3627, "step": 729200 }, { "epoch": 9.722574022476703, "grad_norm": 1.9245848655700684, "learning_rate": 1.764008095862274e-05, "loss": 0.3221, "step": 729300 }, { "epoch": 9.723907160283158, "grad_norm": 1.107298731803894, 
"learning_rate": 1.7632172244683787e-05, "loss": 0.3485, "step": 729400 }, { "epoch": 9.725240298089613, "grad_norm": 3.767141103744507, "learning_rate": 1.7624264566140364e-05, "loss": 0.379, "step": 729500 }, { "epoch": 9.726573435896068, "grad_norm": 2.7868218421936035, "learning_rate": 1.761635792365452e-05, "loss": 0.3795, "step": 729600 }, { "epoch": 9.727906573702523, "grad_norm": 26.950210571289062, "learning_rate": 1.7608452317888132e-05, "loss": 0.352, "step": 729700 }, { "epoch": 9.72923971150898, "grad_norm": 0.5980510711669922, "learning_rate": 1.7600547749503064e-05, "loss": 0.36, "step": 729800 }, { "epoch": 9.730572849315434, "grad_norm": 1.8129160404205322, "learning_rate": 1.759264421916106e-05, "loss": 0.3006, "step": 729900 }, { "epoch": 9.73190598712189, "grad_norm": 1.5385167598724365, "learning_rate": 1.7584741727523764e-05, "loss": 0.2822, "step": 730000 }, { "epoch": 9.733239124928344, "grad_norm": 2.3897347450256348, "learning_rate": 1.757684027525275e-05, "loss": 0.3642, "step": 730100 }, { "epoch": 9.7345722627348, "grad_norm": 3.5884132385253906, "learning_rate": 1.7568939863009517e-05, "loss": 0.372, "step": 730200 }, { "epoch": 9.735905400541254, "grad_norm": 3.514551877975464, "learning_rate": 1.7561040491455444e-05, "loss": 0.3388, "step": 730300 }, { "epoch": 9.73723853834771, "grad_norm": 1.9200334548950195, "learning_rate": 1.755314216125185e-05, "loss": 0.3564, "step": 730400 }, { "epoch": 9.738571676154164, "grad_norm": 1.8410441875457764, "learning_rate": 1.7545244873059965e-05, "loss": 0.3806, "step": 730500 }, { "epoch": 9.739904813960619, "grad_norm": 4.4716105461120605, "learning_rate": 1.7537348627540914e-05, "loss": 0.3676, "step": 730600 }, { "epoch": 9.741237951767074, "grad_norm": 0.7240990996360779, "learning_rate": 1.7529453425355745e-05, "loss": 0.3448, "step": 730700 }, { "epoch": 9.742571089573529, "grad_norm": 2.3868465423583984, "learning_rate": 1.752155926716544e-05, "loss": 0.381, "step": 730800 }, { "epoch": 
9.743904227379984, "grad_norm": 2.0826401710510254, "learning_rate": 1.7513666153630846e-05, "loss": 0.351, "step": 730900 }, { "epoch": 9.745237365186439, "grad_norm": 2.526383399963379, "learning_rate": 1.750577408541275e-05, "loss": 0.3208, "step": 731000 }, { "epoch": 9.746570502992894, "grad_norm": 6.526946544647217, "learning_rate": 1.7497883063171884e-05, "loss": 0.3829, "step": 731100 }, { "epoch": 9.747903640799349, "grad_norm": 2.884650468826294, "learning_rate": 1.7489993087568832e-05, "loss": 0.409, "step": 731200 }, { "epoch": 9.749236778605805, "grad_norm": 2.386693000793457, "learning_rate": 1.7482104159264126e-05, "loss": 0.4461, "step": 731300 }, { "epoch": 9.75056991641226, "grad_norm": 11.069429397583008, "learning_rate": 1.7474216278918185e-05, "loss": 0.3336, "step": 731400 }, { "epoch": 9.751903054218715, "grad_norm": 5.763792991638184, "learning_rate": 1.7466408310315827e-05, "loss": 0.3409, "step": 731500 }, { "epoch": 9.75323619202517, "grad_norm": 0.6550744771957397, "learning_rate": 1.7458522517372345e-05, "loss": 0.2906, "step": 731600 }, { "epoch": 9.754569329831625, "grad_norm": 12.745233535766602, "learning_rate": 1.7450637774361833e-05, "loss": 0.3871, "step": 731700 }, { "epoch": 9.75590246763808, "grad_norm": 10.587272644042969, "learning_rate": 1.7442754081944365e-05, "loss": 0.3674, "step": 731800 }, { "epoch": 9.757235605444535, "grad_norm": 1.0879101753234863, "learning_rate": 1.743487144077994e-05, "loss": 0.3808, "step": 731900 }, { "epoch": 9.75856874325099, "grad_norm": 5.195858955383301, "learning_rate": 1.742698985152848e-05, "loss": 0.4062, "step": 732000 }, { "epoch": 9.759901881057445, "grad_norm": 4.590649604797363, "learning_rate": 1.7419109314849785e-05, "loss": 0.3538, "step": 732100 }, { "epoch": 9.7612350188639, "grad_norm": 2.302734136581421, "learning_rate": 1.741122983140359e-05, "loss": 0.3593, "step": 732200 }, { "epoch": 9.762568156670355, "grad_norm": 6.090748310089111, "learning_rate": 
1.740335140184957e-05, "loss": 0.3512, "step": 732300 }, { "epoch": 9.76390129447681, "grad_norm": 0.7547401785850525, "learning_rate": 1.7395474026847253e-05, "loss": 0.333, "step": 732400 }, { "epoch": 9.765234432283265, "grad_norm": 2.11946177482605, "learning_rate": 1.738759770705612e-05, "loss": 0.4103, "step": 732500 }, { "epoch": 9.76656757008972, "grad_norm": 2.4433844089508057, "learning_rate": 1.7379722443135545e-05, "loss": 0.3838, "step": 732600 }, { "epoch": 9.767900707896175, "grad_norm": 2.927440643310547, "learning_rate": 1.737184823574483e-05, "loss": 0.348, "step": 732700 }, { "epoch": 9.76923384570263, "grad_norm": 2.430426836013794, "learning_rate": 1.736397508554318e-05, "loss": 0.3412, "step": 732800 }, { "epoch": 9.770566983509084, "grad_norm": 4.2862982749938965, "learning_rate": 1.7356102993189692e-05, "loss": 0.389, "step": 732900 }, { "epoch": 9.771900121315541, "grad_norm": 31.28017807006836, "learning_rate": 1.7348231959343406e-05, "loss": 0.3415, "step": 733000 }, { "epoch": 9.773233259121996, "grad_norm": 5.193405628204346, "learning_rate": 1.734036198466326e-05, "loss": 0.3149, "step": 733100 }, { "epoch": 9.774566396928451, "grad_norm": 2.3598878383636475, "learning_rate": 1.73324930698081e-05, "loss": 0.3395, "step": 733200 }, { "epoch": 9.775899534734906, "grad_norm": 0.7963489294052124, "learning_rate": 1.7324625215436686e-05, "loss": 0.3304, "step": 733300 }, { "epoch": 9.777232672541361, "grad_norm": 4.890589714050293, "learning_rate": 1.731675842220771e-05, "loss": 0.3697, "step": 733400 }, { "epoch": 9.778565810347816, "grad_norm": 6.253011226654053, "learning_rate": 1.7308892690779723e-05, "loss": 0.3152, "step": 733500 }, { "epoch": 9.77989894815427, "grad_norm": 8.776147842407227, "learning_rate": 1.7301028021811226e-05, "loss": 0.3573, "step": 733600 }, { "epoch": 9.781232085960726, "grad_norm": 5.383430004119873, "learning_rate": 1.7293164415960652e-05, "loss": 0.3183, "step": 733700 }, { "epoch": 9.78256522376718, 
"grad_norm": 0.2781163454055786, "learning_rate": 1.728538049403917e-05, "loss": 0.3872, "step": 733800 }, { "epoch": 9.783898361573636, "grad_norm": 3.2178683280944824, "learning_rate": 1.7277519005751652e-05, "loss": 0.3675, "step": 733900 }, { "epoch": 9.78523149938009, "grad_norm": 6.77635383605957, "learning_rate": 1.726973718150782e-05, "loss": 0.3902, "step": 734000 }, { "epoch": 9.786564637186546, "grad_norm": 0.5921081304550171, "learning_rate": 1.726187781338966e-05, "loss": 0.3107, "step": 734100 }, { "epoch": 9.787897774993, "grad_norm": 19.575191497802734, "learning_rate": 1.7254019511666938e-05, "loss": 0.3325, "step": 734200 }, { "epoch": 9.789230912799455, "grad_norm": 2.8426430225372314, "learning_rate": 1.7246162276997518e-05, "loss": 0.3972, "step": 734300 }, { "epoch": 9.79056405060591, "grad_norm": 3.0956180095672607, "learning_rate": 1.7238306110039206e-05, "loss": 0.399, "step": 734400 }, { "epoch": 9.791897188412367, "grad_norm": 13.431285858154297, "learning_rate": 1.7230451011449666e-05, "loss": 0.3627, "step": 734500 }, { "epoch": 9.793230326218822, "grad_norm": 3.452833890914917, "learning_rate": 1.72225969818865e-05, "loss": 0.3567, "step": 734600 }, { "epoch": 9.794563464025277, "grad_norm": 4.647488594055176, "learning_rate": 1.721474402200726e-05, "loss": 0.3483, "step": 734700 }, { "epoch": 9.795896601831732, "grad_norm": 1.7700010538101196, "learning_rate": 1.7206892132469337e-05, "loss": 0.3108, "step": 734800 }, { "epoch": 9.797229739638187, "grad_norm": 2.0624873638153076, "learning_rate": 1.7199041313930076e-05, "loss": 0.3313, "step": 734900 }, { "epoch": 9.798562877444642, "grad_norm": 2.2288782596588135, "learning_rate": 1.7191191567046726e-05, "loss": 0.3607, "step": 735000 }, { "epoch": 9.799896015251097, "grad_norm": 1.0653237104415894, "learning_rate": 1.7183342892476438e-05, "loss": 0.3376, "step": 735100 }, { "epoch": 9.801229153057552, "grad_norm": 1.1646387577056885, "learning_rate": 1.7175495290876286e-05, "loss": 
0.2995, "step": 735200 }, { "epoch": 9.802562290864007, "grad_norm": 28.632461547851562, "learning_rate": 1.7167648762903252e-05, "loss": 0.3691, "step": 735300 }, { "epoch": 9.803895428670462, "grad_norm": 2.926572799682617, "learning_rate": 1.71598033092142e-05, "loss": 0.3639, "step": 735400 }, { "epoch": 9.805228566476917, "grad_norm": 5.050868988037109, "learning_rate": 1.715195893046594e-05, "loss": 0.3975, "step": 735500 }, { "epoch": 9.806561704283371, "grad_norm": 3.6696977615356445, "learning_rate": 1.7144115627315178e-05, "loss": 0.3691, "step": 735600 }, { "epoch": 9.807894842089826, "grad_norm": 5.484383583068848, "learning_rate": 1.713627340041853e-05, "loss": 0.3607, "step": 735700 }, { "epoch": 9.809227979896281, "grad_norm": 6.09712028503418, "learning_rate": 1.7128432250432532e-05, "loss": 0.3027, "step": 735800 }, { "epoch": 9.810561117702736, "grad_norm": 1.5620826482772827, "learning_rate": 1.7120592178013604e-05, "loss": 0.3635, "step": 735900 }, { "epoch": 9.811894255509191, "grad_norm": 1.5752938985824585, "learning_rate": 1.7112753183818084e-05, "loss": 0.3147, "step": 736000 }, { "epoch": 9.813227393315646, "grad_norm": 2.2176146507263184, "learning_rate": 1.710491526850227e-05, "loss": 0.3406, "step": 736100 }, { "epoch": 9.814560531122103, "grad_norm": 6.814289569854736, "learning_rate": 1.7097078432722288e-05, "loss": 0.3962, "step": 736200 }, { "epoch": 9.815893668928558, "grad_norm": 2.7112436294555664, "learning_rate": 1.7089242677134225e-05, "loss": 0.3985, "step": 736300 }, { "epoch": 9.817226806735013, "grad_norm": 5.159742832183838, "learning_rate": 1.7081408002394068e-05, "loss": 0.3794, "step": 736400 }, { "epoch": 9.818559944541468, "grad_norm": 3.2098445892333984, "learning_rate": 1.707357440915771e-05, "loss": 0.3115, "step": 736500 }, { "epoch": 9.819893082347923, "grad_norm": 3.3179590702056885, "learning_rate": 1.7065741898080957e-05, "loss": 0.3889, "step": 736600 }, { "epoch": 9.821226220154378, "grad_norm": 
5.520962238311768, "learning_rate": 1.7057910469819525e-05, "loss": 0.3435, "step": 736700 }, { "epoch": 9.822559357960833, "grad_norm": 2.317967176437378, "learning_rate": 1.7050080125029026e-05, "loss": 0.35, "step": 736800 }, { "epoch": 9.823892495767288, "grad_norm": 2.754263401031494, "learning_rate": 1.7042250864364984e-05, "loss": 0.3514, "step": 736900 }, { "epoch": 9.825225633573742, "grad_norm": 5.793337345123291, "learning_rate": 1.7034422688482875e-05, "loss": 0.3974, "step": 737000 }, { "epoch": 9.826558771380197, "grad_norm": 4.246674060821533, "learning_rate": 1.702667386356741e-05, "loss": 0.3266, "step": 737100 }, { "epoch": 9.827891909186652, "grad_norm": 4.3026442527771, "learning_rate": 1.7018847848350892e-05, "loss": 0.3384, "step": 737200 }, { "epoch": 9.829225046993107, "grad_norm": 6.500972747802734, "learning_rate": 1.7011022919875535e-05, "loss": 0.4106, "step": 737300 }, { "epoch": 9.830558184799562, "grad_norm": 2.464172840118408, "learning_rate": 1.7003199078796386e-05, "loss": 0.3397, "step": 737400 }, { "epoch": 9.831891322606017, "grad_norm": 3.6245744228363037, "learning_rate": 1.6995376325768435e-05, "loss": 0.3265, "step": 737500 }, { "epoch": 9.833224460412472, "grad_norm": 6.0083513259887695, "learning_rate": 1.6987554661446586e-05, "loss": 0.3151, "step": 737600 }, { "epoch": 9.834557598218929, "grad_norm": 4.767147064208984, "learning_rate": 1.6979734086485648e-05, "loss": 0.3713, "step": 737700 }, { "epoch": 9.835890736025384, "grad_norm": 2.992201566696167, "learning_rate": 1.6971914601540323e-05, "loss": 0.3587, "step": 737800 }, { "epoch": 9.837223873831839, "grad_norm": 3.0230023860931396, "learning_rate": 1.696409620726525e-05, "loss": 0.3246, "step": 737900 }, { "epoch": 9.838557011638294, "grad_norm": 5.156334400177002, "learning_rate": 1.695627890431494e-05, "loss": 0.3669, "step": 738000 }, { "epoch": 9.839890149444749, "grad_norm": 4.51071834564209, "learning_rate": 1.6948462693343835e-05, "loss": 0.3178, "step": 
738100 }, { "epoch": 9.841223287251204, "grad_norm": 1.7955156564712524, "learning_rate": 1.694064757500631e-05, "loss": 0.4013, "step": 738200 }, { "epoch": 9.842556425057658, "grad_norm": 68.18395233154297, "learning_rate": 1.6932833549956594e-05, "loss": 0.3329, "step": 738300 }, { "epoch": 9.843889562864113, "grad_norm": 3.441025733947754, "learning_rate": 1.6925020618848864e-05, "loss": 0.2853, "step": 738400 }, { "epoch": 9.845222700670568, "grad_norm": 0.6207585334777832, "learning_rate": 1.6917208782337187e-05, "loss": 0.3281, "step": 738500 }, { "epoch": 9.846555838477023, "grad_norm": 4.434380054473877, "learning_rate": 1.690939804107555e-05, "loss": 0.3165, "step": 738600 }, { "epoch": 9.847888976283478, "grad_norm": 2.933720588684082, "learning_rate": 1.690158839571786e-05, "loss": 0.3755, "step": 738700 }, { "epoch": 9.849222114089933, "grad_norm": 2.297637462615967, "learning_rate": 1.6893779846917883e-05, "loss": 0.3203, "step": 738800 }, { "epoch": 9.850555251896388, "grad_norm": 1.163928747177124, "learning_rate": 1.6885972395329346e-05, "loss": 0.3677, "step": 738900 }, { "epoch": 9.851888389702843, "grad_norm": 0.6566413640975952, "learning_rate": 1.6878166041605863e-05, "loss": 0.3831, "step": 739000 }, { "epoch": 9.853221527509298, "grad_norm": 2.1127870082855225, "learning_rate": 1.687036078640095e-05, "loss": 0.3581, "step": 739100 }, { "epoch": 9.854554665315753, "grad_norm": 14.276124954223633, "learning_rate": 1.6862556630368048e-05, "loss": 0.3595, "step": 739200 }, { "epoch": 9.855887803122208, "grad_norm": 1.946542501449585, "learning_rate": 1.6854753574160502e-05, "loss": 0.3422, "step": 739300 }, { "epoch": 9.857220940928663, "grad_norm": 4.761856555938721, "learning_rate": 1.6846951618431537e-05, "loss": 0.387, "step": 739400 }, { "epoch": 9.85855407873512, "grad_norm": 3.090529680252075, "learning_rate": 1.6839150763834315e-05, "loss": 0.3642, "step": 739500 }, { "epoch": 9.859887216541575, "grad_norm": 0.9857513904571533, 
"learning_rate": 1.683135101102192e-05, "loss": 0.3408, "step": 739600 }, { "epoch": 9.86122035434803, "grad_norm": 1.6608755588531494, "learning_rate": 1.68235523606473e-05, "loss": 0.3621, "step": 739700 }, { "epoch": 9.862553492154484, "grad_norm": 1.2675361633300781, "learning_rate": 1.681575481336334e-05, "loss": 0.3254, "step": 739800 }, { "epoch": 9.86388662996094, "grad_norm": 3.978731632232666, "learning_rate": 1.6807958369822822e-05, "loss": 0.3483, "step": 739900 }, { "epoch": 9.865219767767394, "grad_norm": 3.6582093238830566, "learning_rate": 1.6800163030678454e-05, "loss": 0.3622, "step": 740000 }, { "epoch": 9.86655290557385, "grad_norm": 6.371555805206299, "learning_rate": 1.679236879658283e-05, "loss": 0.3341, "step": 740100 }, { "epoch": 9.867886043380304, "grad_norm": 1.2100505828857422, "learning_rate": 1.678457566818845e-05, "loss": 0.3458, "step": 740200 }, { "epoch": 9.86921918118676, "grad_norm": 2.7355265617370605, "learning_rate": 1.6776783646147732e-05, "loss": 0.4008, "step": 740300 }, { "epoch": 9.870552318993214, "grad_norm": 2.256025791168213, "learning_rate": 1.6768992731113005e-05, "loss": 0.3604, "step": 740400 }, { "epoch": 9.871885456799669, "grad_norm": 2.2763333320617676, "learning_rate": 1.67612029237365e-05, "loss": 0.3635, "step": 740500 }, { "epoch": 9.873218594606124, "grad_norm": 2.080554485321045, "learning_rate": 1.675341422467036e-05, "loss": 0.348, "step": 740600 }, { "epoch": 9.874551732412579, "grad_norm": 1.8390066623687744, "learning_rate": 1.6745626634566627e-05, "loss": 0.3326, "step": 740700 }, { "epoch": 9.875884870219034, "grad_norm": 1.7007784843444824, "learning_rate": 1.673784015407723e-05, "loss": 0.3637, "step": 740800 }, { "epoch": 9.877218008025489, "grad_norm": 3.5699286460876465, "learning_rate": 1.6730054783854063e-05, "loss": 0.3024, "step": 740900 }, { "epoch": 9.878551145831945, "grad_norm": 3.9645965099334717, "learning_rate": 1.672227052454889e-05, "loss": 0.3486, "step": 741000 }, { "epoch": 
9.8798842836384, "grad_norm": 1.9959962368011475, "learning_rate": 1.671448737681336e-05, "loss": 0.3585, "step": 741100 }, { "epoch": 9.881217421444855, "grad_norm": 1.6765730381011963, "learning_rate": 1.670678315614657e-05, "loss": 0.2823, "step": 741200 }, { "epoch": 9.88255055925131, "grad_norm": 0.6402256488800049, "learning_rate": 1.6699002222373057e-05, "loss": 0.3532, "step": 741300 }, { "epoch": 9.883883697057765, "grad_norm": 1.499420404434204, "learning_rate": 1.6691222402117156e-05, "loss": 0.3535, "step": 741400 }, { "epoch": 9.88521683486422, "grad_norm": 2.2772903442382812, "learning_rate": 1.668344369603015e-05, "loss": 0.3516, "step": 741500 }, { "epoch": 9.886549972670675, "grad_norm": 1.5296063423156738, "learning_rate": 1.6675666104763258e-05, "loss": 0.413, "step": 741600 }, { "epoch": 9.88788311047713, "grad_norm": 5.636154651641846, "learning_rate": 1.66678896289676e-05, "loss": 0.3694, "step": 741700 }, { "epoch": 9.889216248283585, "grad_norm": 3.506992816925049, "learning_rate": 1.6660192017364e-05, "loss": 0.3425, "step": 741800 }, { "epoch": 9.89054938609004, "grad_norm": 3.2207486629486084, "learning_rate": 1.665241776329281e-05, "loss": 0.3415, "step": 741900 }, { "epoch": 9.891882523896495, "grad_norm": 2.762883424758911, "learning_rate": 1.664464462663916e-05, "loss": 0.3776, "step": 742000 }, { "epoch": 9.89321566170295, "grad_norm": 9.199800491333008, "learning_rate": 1.6636950322703026e-05, "loss": 0.3433, "step": 742100 }, { "epoch": 9.894548799509405, "grad_norm": 8.346784591674805, "learning_rate": 1.66291794116461e-05, "loss": 0.3756, "step": 742200 }, { "epoch": 9.89588193731586, "grad_norm": 2.5597426891326904, "learning_rate": 1.6621409619952133e-05, "loss": 0.319, "step": 742300 }, { "epoch": 9.897215075122315, "grad_norm": 0.7261689305305481, "learning_rate": 1.6613640948271588e-05, "loss": 0.3285, "step": 742400 }, { "epoch": 9.89854821292877, "grad_norm": 1.9042085409164429, "learning_rate": 1.6605873397254816e-05, 
"loss": 0.296, "step": 742500 }, { "epoch": 9.899881350735225, "grad_norm": 1.6723331212997437, "learning_rate": 1.6598106967552098e-05, "loss": 0.3142, "step": 742600 }, { "epoch": 9.901214488541681, "grad_norm": 1.9319761991500854, "learning_rate": 1.659034165981362e-05, "loss": 0.3909, "step": 742700 }, { "epoch": 9.902547626348136, "grad_norm": 3.1194348335266113, "learning_rate": 1.6582577474689468e-05, "loss": 0.384, "step": 742800 }, { "epoch": 9.903880764154591, "grad_norm": 0.7126434445381165, "learning_rate": 1.6574814412829643e-05, "loss": 0.3577, "step": 742900 }, { "epoch": 9.905213901961046, "grad_norm": 4.200797080993652, "learning_rate": 1.6567052474884026e-05, "loss": 0.3748, "step": 743000 }, { "epoch": 9.906547039767501, "grad_norm": 2.62400484085083, "learning_rate": 1.655929166150242e-05, "loss": 0.3279, "step": 743100 }, { "epoch": 9.907880177573956, "grad_norm": 4.103294372558594, "learning_rate": 1.6551531973334573e-05, "loss": 0.3861, "step": 743200 }, { "epoch": 9.909213315380411, "grad_norm": 4.366447925567627, "learning_rate": 1.654377341103006e-05, "loss": 0.3004, "step": 743300 }, { "epoch": 9.910546453186866, "grad_norm": 6.623878002166748, "learning_rate": 1.6536015975238424e-05, "loss": 0.3642, "step": 743400 }, { "epoch": 9.91187959099332, "grad_norm": 1.757867455482483, "learning_rate": 1.6528259666609082e-05, "loss": 0.3198, "step": 743500 }, { "epoch": 9.913212728799776, "grad_norm": 2.2540700435638428, "learning_rate": 1.652050448579138e-05, "loss": 0.3294, "step": 743600 }, { "epoch": 9.91454586660623, "grad_norm": 1.9718620777130127, "learning_rate": 1.6512750433434547e-05, "loss": 0.3379, "step": 743700 }, { "epoch": 9.915879004412686, "grad_norm": 1.6810353994369507, "learning_rate": 1.6504997510187745e-05, "loss": 0.3334, "step": 743800 }, { "epoch": 9.91721214221914, "grad_norm": 1.7273449897766113, "learning_rate": 1.6497245716700003e-05, "loss": 0.3718, "step": 743900 }, { "epoch": 9.918545280025596, "grad_norm": 
1.779954195022583, "learning_rate": 1.648949505362027e-05, "loss": 0.3682, "step": 744000 }, { "epoch": 9.91987841783205, "grad_norm": 1.481253981590271, "learning_rate": 1.6481745521597446e-05, "loss": 0.3334, "step": 744100 }, { "epoch": 9.921211555638507, "grad_norm": 2.4520530700683594, "learning_rate": 1.6473997121280265e-05, "loss": 0.3425, "step": 744200 }, { "epoch": 9.922544693444962, "grad_norm": 3.4815304279327393, "learning_rate": 1.646624985331741e-05, "loss": 0.3717, "step": 744300 }, { "epoch": 9.923877831251417, "grad_norm": 1.2609919309616089, "learning_rate": 1.6458503718357445e-05, "loss": 0.3177, "step": 744400 }, { "epoch": 9.925210969057872, "grad_norm": 3.886215925216675, "learning_rate": 1.6450758717048867e-05, "loss": 0.3756, "step": 744500 }, { "epoch": 9.926544106864327, "grad_norm": 2.335726737976074, "learning_rate": 1.6443014850040073e-05, "loss": 0.3668, "step": 744600 }, { "epoch": 9.927877244670782, "grad_norm": 1.4012157917022705, "learning_rate": 1.6435272117979325e-05, "loss": 0.3268, "step": 744700 }, { "epoch": 9.929210382477237, "grad_norm": 17.890851974487305, "learning_rate": 1.642753052151484e-05, "loss": 0.3431, "step": 744800 }, { "epoch": 9.930543520283692, "grad_norm": 2.991281032562256, "learning_rate": 1.641979006129471e-05, "loss": 0.3311, "step": 744900 }, { "epoch": 9.931876658090147, "grad_norm": 1.9213874340057373, "learning_rate": 1.6412050737966954e-05, "loss": 0.3347, "step": 745000 }, { "epoch": 9.933209795896602, "grad_norm": 2.9804129600524902, "learning_rate": 1.640431255217947e-05, "loss": 0.3851, "step": 745100 }, { "epoch": 9.934542933703057, "grad_norm": 3.246122360229492, "learning_rate": 1.6396575504580098e-05, "loss": 0.3352, "step": 745200 }, { "epoch": 9.935876071509512, "grad_norm": 0.8378428816795349, "learning_rate": 1.6388839595816532e-05, "loss": 0.3528, "step": 745300 }, { "epoch": 9.937209209315967, "grad_norm": 2.0046589374542236, "learning_rate": 1.6381104826536396e-05, "loss": 0.3452, 
"step": 745400 }, { "epoch": 9.938542347122421, "grad_norm": 2.2904982566833496, "learning_rate": 1.6373371197387258e-05, "loss": 0.4044, "step": 745500 }, { "epoch": 9.939875484928876, "grad_norm": 5.183879852294922, "learning_rate": 1.6365638709016513e-05, "loss": 0.3064, "step": 745600 }, { "epoch": 9.941208622735331, "grad_norm": 1.4795827865600586, "learning_rate": 1.6357907362071522e-05, "loss": 0.3953, "step": 745700 }, { "epoch": 9.942541760541786, "grad_norm": 0.2948995530605316, "learning_rate": 1.635017715719952e-05, "loss": 0.2626, "step": 745800 }, { "epoch": 9.943874898348243, "grad_norm": 3.691040277481079, "learning_rate": 1.6342448095047657e-05, "loss": 0.3064, "step": 745900 }, { "epoch": 9.945208036154698, "grad_norm": 3.4698429107666016, "learning_rate": 1.6334720176263e-05, "loss": 0.3695, "step": 746000 }, { "epoch": 9.946541173961153, "grad_norm": 5.424574851989746, "learning_rate": 1.632699340149248e-05, "loss": 0.313, "step": 746100 }, { "epoch": 9.947874311767608, "grad_norm": 2.0685694217681885, "learning_rate": 1.6319267771382976e-05, "loss": 0.358, "step": 746200 }, { "epoch": 9.949207449574063, "grad_norm": 4.086202144622803, "learning_rate": 1.6311543286581247e-05, "loss": 0.3633, "step": 746300 }, { "epoch": 9.950540587380518, "grad_norm": 7.447272777557373, "learning_rate": 1.630381994773396e-05, "loss": 0.3803, "step": 746400 }, { "epoch": 9.951873725186973, "grad_norm": 0.7183212041854858, "learning_rate": 1.6296097755487696e-05, "loss": 0.3075, "step": 746500 }, { "epoch": 9.953206862993428, "grad_norm": 2.3665103912353516, "learning_rate": 1.628837671048894e-05, "loss": 0.3828, "step": 746600 }, { "epoch": 9.954540000799883, "grad_norm": 3.035789728164673, "learning_rate": 1.628065681338404e-05, "loss": 0.3056, "step": 746700 }, { "epoch": 9.955873138606337, "grad_norm": 2.1828713417053223, "learning_rate": 1.6272938064819317e-05, "loss": 0.3197, "step": 746800 }, { "epoch": 9.957206276412792, "grad_norm": 4.49215841293335, 
"learning_rate": 1.6265220465440962e-05, "loss": 0.3229, "step": 746900 }, { "epoch": 9.958539414219247, "grad_norm": 9.765684127807617, "learning_rate": 1.625750401589504e-05, "loss": 0.3522, "step": 747000 }, { "epoch": 9.959872552025702, "grad_norm": 1.9965888261795044, "learning_rate": 1.624978871682756e-05, "loss": 0.3203, "step": 747100 }, { "epoch": 9.961205689832157, "grad_norm": 5.920849323272705, "learning_rate": 1.6242074568884424e-05, "loss": 0.3563, "step": 747200 }, { "epoch": 9.962538827638612, "grad_norm": 19.212779998779297, "learning_rate": 1.623436157271144e-05, "loss": 0.3496, "step": 747300 }, { "epoch": 9.963871965445069, "grad_norm": 2.292334794998169, "learning_rate": 1.622664972895431e-05, "loss": 0.3715, "step": 747400 }, { "epoch": 9.965205103251524, "grad_norm": 2.1266465187072754, "learning_rate": 1.6218939038258657e-05, "loss": 0.3504, "step": 747500 }, { "epoch": 9.966538241057979, "grad_norm": 2.7385263442993164, "learning_rate": 1.6211229501269976e-05, "loss": 0.3359, "step": 747600 }, { "epoch": 9.967871378864434, "grad_norm": 8.01871109008789, "learning_rate": 1.6203521118633696e-05, "loss": 0.3318, "step": 747700 }, { "epoch": 9.969204516670889, "grad_norm": 3.163813829421997, "learning_rate": 1.6195813890995137e-05, "loss": 0.3058, "step": 747800 }, { "epoch": 9.970537654477344, "grad_norm": 8.553329467773438, "learning_rate": 1.6188107818999525e-05, "loss": 0.3994, "step": 747900 }, { "epoch": 9.971870792283799, "grad_norm": 3.7182161808013916, "learning_rate": 1.6180479946723316e-05, "loss": 0.3244, "step": 748000 }, { "epoch": 9.973203930090254, "grad_norm": 1.4558014869689941, "learning_rate": 1.617277617637637e-05, "loss": 0.3466, "step": 748100 }, { "epoch": 9.974537067896708, "grad_norm": 3.159560203552246, "learning_rate": 1.616507356360101e-05, "loss": 0.34, "step": 748200 }, { "epoch": 9.975870205703163, "grad_norm": 6.63757848739624, "learning_rate": 1.6157372109042074e-05, "loss": 0.3421, "step": 748300 }, { "epoch": 
9.977203343509618, "grad_norm": 2.7939538955688477, "learning_rate": 1.614967181334431e-05, "loss": 0.3683, "step": 748400 }, { "epoch": 9.978536481316073, "grad_norm": 5.0258331298828125, "learning_rate": 1.6141972677152364e-05, "loss": 0.3235, "step": 748500 }, { "epoch": 9.979869619122528, "grad_norm": 5.9493513107299805, "learning_rate": 1.613427470111078e-05, "loss": 0.3604, "step": 748600 }, { "epoch": 9.981202756928983, "grad_norm": 5.353736400604248, "learning_rate": 1.6126577885864017e-05, "loss": 0.3326, "step": 748700 }, { "epoch": 9.982535894735438, "grad_norm": 42.02778625488281, "learning_rate": 1.6118882232056412e-05, "loss": 0.3751, "step": 748800 }, { "epoch": 9.983869032541893, "grad_norm": 13.162296295166016, "learning_rate": 1.611118774033222e-05, "loss": 0.3182, "step": 748900 }, { "epoch": 9.985202170348348, "grad_norm": 3.9194226264953613, "learning_rate": 1.6103494411335634e-05, "loss": 0.3571, "step": 749000 }, { "epoch": 9.986535308154805, "grad_norm": 28.904848098754883, "learning_rate": 1.6095802245710676e-05, "loss": 0.3733, "step": 749100 }, { "epoch": 9.98786844596126, "grad_norm": 2.9873239994049072, "learning_rate": 1.6088111244101336e-05, "loss": 0.3426, "step": 749200 }, { "epoch": 9.989201583767715, "grad_norm": 1.9225229024887085, "learning_rate": 1.608042140715144e-05, "loss": 0.311, "step": 749300 }, { "epoch": 9.99053472157417, "grad_norm": 18.600839614868164, "learning_rate": 1.6072732735504807e-05, "loss": 0.3733, "step": 749400 }, { "epoch": 9.991867859380624, "grad_norm": 7.5866265296936035, "learning_rate": 1.606504522980509e-05, "loss": 0.3642, "step": 749500 }, { "epoch": 9.99320099718708, "grad_norm": 1.1309887170791626, "learning_rate": 1.6057358890695854e-05, "loss": 0.329, "step": 749600 }, { "epoch": 9.994534134993534, "grad_norm": 14.240324020385742, "learning_rate": 1.6049673718820578e-05, "loss": 0.3869, "step": 749700 }, { "epoch": 9.99586727279999, "grad_norm": 4.976480960845947, "learning_rate": 
1.6041989714822644e-05, "loss": 0.3056, "step": 749800 }, { "epoch": 9.997200410606444, "grad_norm": 3.055037260055542, "learning_rate": 1.603430687934533e-05, "loss": 0.3408, "step": 749900 }, { "epoch": 9.9985335484129, "grad_norm": 1.9389317035675049, "learning_rate": 1.602662521303182e-05, "loss": 0.3771, "step": 750000 }, { "epoch": 9.999866686219354, "grad_norm": 5.9384684562683105, "learning_rate": 1.6018944716525215e-05, "loss": 0.3255, "step": 750100 }, { "epoch": 10.001199824025809, "grad_norm": 1.0535697937011719, "learning_rate": 1.6011265390468458e-05, "loss": 0.272, "step": 750200 }, { "epoch": 10.002532961832264, "grad_norm": 3.1133487224578857, "learning_rate": 1.600358723550448e-05, "loss": 0.3274, "step": 750300 }, { "epoch": 10.003866099638719, "grad_norm": 4.546043395996094, "learning_rate": 1.5995910252276073e-05, "loss": 0.2976, "step": 750400 }, { "epoch": 10.005199237445174, "grad_norm": 0.4271894097328186, "learning_rate": 1.5988234441425898e-05, "loss": 0.3386, "step": 750500 }, { "epoch": 10.00653237525163, "grad_norm": 2.0767390727996826, "learning_rate": 1.598055980359657e-05, "loss": 0.3388, "step": 750600 }, { "epoch": 10.007865513058086, "grad_norm": 1.8385123014450073, "learning_rate": 1.597288633943058e-05, "loss": 0.2864, "step": 750700 }, { "epoch": 10.00919865086454, "grad_norm": 5.309218883514404, "learning_rate": 1.596521404957033e-05, "loss": 0.3429, "step": 750800 }, { "epoch": 10.010531788670995, "grad_norm": 5.191182613372803, "learning_rate": 1.5957542934658125e-05, "loss": 0.2946, "step": 750900 }, { "epoch": 10.01186492647745, "grad_norm": 5.491225242614746, "learning_rate": 1.5949872995336156e-05, "loss": 0.3196, "step": 751000 }, { "epoch": 10.013198064283905, "grad_norm": 3.314896821975708, "learning_rate": 1.5942204232246526e-05, "loss": 0.3067, "step": 751100 }, { "epoch": 10.01453120209036, "grad_norm": 3.3043019771575928, "learning_rate": 1.5934536646031243e-05, "loss": 0.281, "step": 751200 }, { "epoch": 
10.015864339896815, "grad_norm": 3.578495740890503, "learning_rate": 1.5926870237332218e-05, "loss": 0.3633, "step": 751300 }, { "epoch": 10.01719747770327, "grad_norm": 5.387236595153809, "learning_rate": 1.5919205006791248e-05, "loss": 0.3668, "step": 751400 }, { "epoch": 10.018530615509725, "grad_norm": 2.193814754486084, "learning_rate": 1.5911540955050066e-05, "loss": 0.3088, "step": 751500 }, { "epoch": 10.01986375331618, "grad_norm": 1.5676088333129883, "learning_rate": 1.590387808275024e-05, "loss": 0.3249, "step": 751600 }, { "epoch": 10.021196891122635, "grad_norm": 7.648233413696289, "learning_rate": 1.5896216390533315e-05, "loss": 0.3047, "step": 751700 }, { "epoch": 10.02253002892909, "grad_norm": 4.42642879486084, "learning_rate": 1.5888555879040705e-05, "loss": 0.3195, "step": 751800 }, { "epoch": 10.023863166735545, "grad_norm": 2.190593719482422, "learning_rate": 1.5880896548913706e-05, "loss": 0.3586, "step": 751900 }, { "epoch": 10.025196304542, "grad_norm": 3.1065707206726074, "learning_rate": 1.587323840079354e-05, "loss": 0.3025, "step": 752000 }, { "epoch": 10.026529442348455, "grad_norm": 0.3673122227191925, "learning_rate": 1.5865581435321325e-05, "loss": 0.322, "step": 752100 }, { "epoch": 10.027862580154911, "grad_norm": 2.567157030105591, "learning_rate": 1.585800220510053e-05, "loss": 0.3027, "step": 752200 }, { "epoch": 10.029195717961366, "grad_norm": 2.0403480529785156, "learning_rate": 1.5850347595004692e-05, "loss": 0.2978, "step": 752300 }, { "epoch": 10.030528855767821, "grad_norm": 4.139500141143799, "learning_rate": 1.584269416947315e-05, "loss": 0.2598, "step": 752400 }, { "epoch": 10.031861993574276, "grad_norm": 3.729006767272949, "learning_rate": 1.583504192914663e-05, "loss": 0.3097, "step": 752500 }, { "epoch": 10.033195131380731, "grad_norm": 2.6076419353485107, "learning_rate": 1.5827390874665743e-05, "loss": 0.2848, "step": 752600 }, { "epoch": 10.034528269187186, "grad_norm": 0.2459980845451355, "learning_rate": 
1.5819741006671027e-05, "loss": 0.3178, "step": 752700 }, { "epoch": 10.035861406993641, "grad_norm": 5.880007743835449, "learning_rate": 1.5812092325802874e-05, "loss": 0.2648, "step": 752800 }, { "epoch": 10.037194544800096, "grad_norm": 4.417851448059082, "learning_rate": 1.5804444832701635e-05, "loss": 0.2998, "step": 752900 }, { "epoch": 10.038527682606551, "grad_norm": 8.036844253540039, "learning_rate": 1.5796798528007544e-05, "loss": 0.4019, "step": 753000 }, { "epoch": 10.039860820413006, "grad_norm": 5.066902160644531, "learning_rate": 1.578915341236069e-05, "loss": 0.3529, "step": 753100 }, { "epoch": 10.041193958219461, "grad_norm": 1.0529907941818237, "learning_rate": 1.5781509486401117e-05, "loss": 0.3784, "step": 753200 }, { "epoch": 10.042527096025916, "grad_norm": 1.584439754486084, "learning_rate": 1.5773866750768752e-05, "loss": 0.3148, "step": 753300 }, { "epoch": 10.04386023383237, "grad_norm": 4.864838123321533, "learning_rate": 1.576630161565269e-05, "loss": 0.3051, "step": 753400 }, { "epoch": 10.045193371638826, "grad_norm": 2.6898205280303955, "learning_rate": 1.5758661250674876e-05, "loss": 0.3912, "step": 753500 }, { "epoch": 10.04652650944528, "grad_norm": 1.4661909341812134, "learning_rate": 1.575102207793705e-05, "loss": 0.3187, "step": 753600 }, { "epoch": 10.047859647251736, "grad_norm": 2.0278220176696777, "learning_rate": 1.574338409807874e-05, "loss": 0.3335, "step": 753700 }, { "epoch": 10.049192785058192, "grad_norm": 7.619815349578857, "learning_rate": 1.5735747311739374e-05, "loss": 0.3087, "step": 753800 }, { "epoch": 10.050525922864647, "grad_norm": 2.984185218811035, "learning_rate": 1.572811171955829e-05, "loss": 0.297, "step": 753900 }, { "epoch": 10.051859060671102, "grad_norm": 3.527442455291748, "learning_rate": 1.5720477322174677e-05, "loss": 0.3313, "step": 754000 }, { "epoch": 10.053192198477557, "grad_norm": 0.9326549768447876, "learning_rate": 1.5712844120227704e-05, "loss": 0.2903, "step": 754100 }, { "epoch": 
10.054525336284012, "grad_norm": 2.724797010421753, "learning_rate": 1.5705212114356396e-05, "loss": 0.3317, "step": 754200 }, { "epoch": 10.055858474090467, "grad_norm": 7.636340618133545, "learning_rate": 1.5697581305199654e-05, "loss": 0.3118, "step": 754300 }, { "epoch": 10.057191611896922, "grad_norm": 1.4299007654190063, "learning_rate": 1.5689951693396322e-05, "loss": 0.3833, "step": 754400 }, { "epoch": 10.058524749703377, "grad_norm": 2.176684617996216, "learning_rate": 1.5682323279585125e-05, "loss": 0.319, "step": 754500 }, { "epoch": 10.059857887509832, "grad_norm": 2.2465028762817383, "learning_rate": 1.5674696064404696e-05, "loss": 0.3516, "step": 754600 }, { "epoch": 10.061191025316287, "grad_norm": 1.8339626789093018, "learning_rate": 1.5667070048493547e-05, "loss": 0.3654, "step": 754700 }, { "epoch": 10.062524163122742, "grad_norm": 4.179193496704102, "learning_rate": 1.5659445232490125e-05, "loss": 0.3723, "step": 754800 }, { "epoch": 10.063857300929197, "grad_norm": 0.5857337117195129, "learning_rate": 1.565182161703272e-05, "loss": 0.3122, "step": 754900 }, { "epoch": 10.065190438735652, "grad_norm": 3.045914888381958, "learning_rate": 1.5644199202759605e-05, "loss": 0.2957, "step": 755000 }, { "epoch": 10.066523576542107, "grad_norm": 3.5275235176086426, "learning_rate": 1.5636577990308868e-05, "loss": 0.2899, "step": 755100 }, { "epoch": 10.067856714348562, "grad_norm": 2.443899393081665, "learning_rate": 1.5628957980318543e-05, "loss": 0.2809, "step": 755200 }, { "epoch": 10.069189852155016, "grad_norm": 1.4381219148635864, "learning_rate": 1.5621339173426556e-05, "loss": 0.2921, "step": 755300 }, { "epoch": 10.070522989961471, "grad_norm": 2.13586163520813, "learning_rate": 1.5613721570270726e-05, "loss": 0.2773, "step": 755400 }, { "epoch": 10.071856127767928, "grad_norm": 1.75205397605896, "learning_rate": 1.560610517148878e-05, "loss": 0.2859, "step": 755500 }, { "epoch": 10.073189265574383, "grad_norm": 5.416157245635986, 
"learning_rate": 1.559848997771834e-05, "loss": 0.3517, "step": 755600 }, { "epoch": 10.074522403380838, "grad_norm": 2.3725690841674805, "learning_rate": 1.5590875989596915e-05, "loss": 0.3056, "step": 755700 }, { "epoch": 10.075855541187293, "grad_norm": 1.8350465297698975, "learning_rate": 1.5583263207761928e-05, "loss": 0.3394, "step": 755800 }, { "epoch": 10.077188678993748, "grad_norm": 2.0570316314697266, "learning_rate": 1.5575651632850702e-05, "loss": 0.3038, "step": 755900 }, { "epoch": 10.078521816800203, "grad_norm": 2.55316162109375, "learning_rate": 1.5568117363194444e-05, "loss": 0.3234, "step": 756000 }, { "epoch": 10.079854954606658, "grad_norm": 0.3704454004764557, "learning_rate": 1.556050819195714e-05, "loss": 0.3463, "step": 756100 }, { "epoch": 10.081188092413113, "grad_norm": 1.9749748706817627, "learning_rate": 1.5552900229548586e-05, "loss": 0.3471, "step": 756200 }, { "epoch": 10.082521230219568, "grad_norm": 1.768202304840088, "learning_rate": 1.554529347660569e-05, "loss": 0.3283, "step": 756300 }, { "epoch": 10.083854368026023, "grad_norm": 1.1370036602020264, "learning_rate": 1.553768793376525e-05, "loss": 0.3349, "step": 756400 }, { "epoch": 10.085187505832478, "grad_norm": 1.9253699779510498, "learning_rate": 1.5530083601663988e-05, "loss": 0.2379, "step": 756500 }, { "epoch": 10.086520643638933, "grad_norm": 2.411194324493408, "learning_rate": 1.5522480480938512e-05, "loss": 0.2896, "step": 756600 }, { "epoch": 10.087853781445387, "grad_norm": 2.383711576461792, "learning_rate": 1.5514878572225335e-05, "loss": 0.3157, "step": 756700 }, { "epoch": 10.089186919251842, "grad_norm": 1.8839054107666016, "learning_rate": 1.5507277876160876e-05, "loss": 0.3124, "step": 756800 }, { "epoch": 10.090520057058297, "grad_norm": 1.8470605611801147, "learning_rate": 1.5499678393381416e-05, "loss": 0.3378, "step": 756900 }, { "epoch": 10.091853194864752, "grad_norm": 3.3246941566467285, "learning_rate": 1.549208012452317e-05, "loss": 0.3175, 
"step": 757000 }, { "epoch": 10.093186332671209, "grad_norm": 5.410637855529785, "learning_rate": 1.5484483070222244e-05, "loss": 0.2557, "step": 757100 }, { "epoch": 10.094519470477664, "grad_norm": 1.1101069450378418, "learning_rate": 1.547688723111464e-05, "loss": 0.2928, "step": 757200 }, { "epoch": 10.095852608284119, "grad_norm": 2.685796022415161, "learning_rate": 1.546929260783625e-05, "loss": 0.305, "step": 757300 }, { "epoch": 10.097185746090574, "grad_norm": 3.6976099014282227, "learning_rate": 1.5461699201022896e-05, "loss": 0.3123, "step": 757400 }, { "epoch": 10.098518883897029, "grad_norm": 1.0461134910583496, "learning_rate": 1.545410701131023e-05, "loss": 0.2873, "step": 757500 }, { "epoch": 10.099852021703484, "grad_norm": 25.70598602294922, "learning_rate": 1.5446516039333885e-05, "loss": 0.3777, "step": 757600 }, { "epoch": 10.101185159509939, "grad_norm": 1.6136335134506226, "learning_rate": 1.543892628572935e-05, "loss": 0.3282, "step": 757700 }, { "epoch": 10.102518297316394, "grad_norm": 1.994789481163025, "learning_rate": 1.5431337751132e-05, "loss": 0.2955, "step": 757800 }, { "epoch": 10.103851435122849, "grad_norm": 2.4118449687957764, "learning_rate": 1.5423750436177125e-05, "loss": 0.2945, "step": 757900 }, { "epoch": 10.105184572929303, "grad_norm": 2.0049822330474854, "learning_rate": 1.5416164341499918e-05, "loss": 0.2787, "step": 758000 }, { "epoch": 10.106517710735758, "grad_norm": 1.345470666885376, "learning_rate": 1.540857946773546e-05, "loss": 0.2859, "step": 758100 }, { "epoch": 10.107850848542213, "grad_norm": 0.9076038002967834, "learning_rate": 1.5400995815518738e-05, "loss": 0.2858, "step": 758200 }, { "epoch": 10.109183986348668, "grad_norm": 8.32955265045166, "learning_rate": 1.539341338548462e-05, "loss": 0.3437, "step": 758300 }, { "epoch": 10.110517124155123, "grad_norm": 4.734578609466553, "learning_rate": 1.5385832178267874e-05, "loss": 0.3088, "step": 758400 }, { "epoch": 10.111850261961578, "grad_norm": 
2.8538143634796143, "learning_rate": 1.537825219450322e-05, "loss": 0.3042, "step": 758500 }, { "epoch": 10.113183399768033, "grad_norm": 2.5789384841918945, "learning_rate": 1.5370673434825185e-05, "loss": 0.3654, "step": 758600 }, { "epoch": 10.11451653757449, "grad_norm": 4.786193370819092, "learning_rate": 1.5363095899868252e-05, "loss": 0.2679, "step": 758700 }, { "epoch": 10.115849675380945, "grad_norm": 6.979422092437744, "learning_rate": 1.535559534729521e-05, "loss": 0.3372, "step": 758800 }, { "epoch": 10.1171828131874, "grad_norm": 0.019174303859472275, "learning_rate": 1.5348020251420467e-05, "loss": 0.29, "step": 758900 }, { "epoch": 10.118515950993855, "grad_norm": 9.609233856201172, "learning_rate": 1.5340446382163264e-05, "loss": 0.3333, "step": 759000 }, { "epoch": 10.11984908880031, "grad_norm": 16.074052810668945, "learning_rate": 1.5332873740157684e-05, "loss": 0.3448, "step": 759100 }, { "epoch": 10.121182226606765, "grad_norm": 6.688719272613525, "learning_rate": 1.5325302326037675e-05, "loss": 0.3227, "step": 759200 }, { "epoch": 10.12251536441322, "grad_norm": 7.166050910949707, "learning_rate": 1.5317732140437095e-05, "loss": 0.3182, "step": 759300 }, { "epoch": 10.123848502219674, "grad_norm": 7.578860759735107, "learning_rate": 1.53101631839897e-05, "loss": 0.3065, "step": 759400 }, { "epoch": 10.12518164002613, "grad_norm": 2.522813558578491, "learning_rate": 1.5302595457329153e-05, "loss": 0.37, "step": 759500 }, { "epoch": 10.126514777832584, "grad_norm": 3.5340945720672607, "learning_rate": 1.5295028961088954e-05, "loss": 0.3279, "step": 759600 }, { "epoch": 10.12784791563904, "grad_norm": 2.8397152423858643, "learning_rate": 1.5287463695902602e-05, "loss": 0.2986, "step": 759700 }, { "epoch": 10.129181053445494, "grad_norm": 7.104159832000732, "learning_rate": 1.5279899662403404e-05, "loss": 0.3267, "step": 759800 }, { "epoch": 10.13051419125195, "grad_norm": 80.13990783691406, "learning_rate": 1.52723368612246e-05, "loss": 0.3602, 
"step": 759900 }, { "epoch": 10.131847329058404, "grad_norm": 1.5864659547805786, "learning_rate": 1.526477529299933e-05, "loss": 0.3212, "step": 760000 }, { "epoch": 10.133180466864859, "grad_norm": 2.5356154441833496, "learning_rate": 1.5257214958360624e-05, "loss": 0.2944, "step": 760100 }, { "epoch": 10.134513604671314, "grad_norm": 3.1233677864074707, "learning_rate": 1.5249655857941406e-05, "loss": 0.3314, "step": 760200 }, { "epoch": 10.13584674247777, "grad_norm": 4.622262954711914, "learning_rate": 1.5242097992374515e-05, "loss": 0.3355, "step": 760300 }, { "epoch": 10.137179880284226, "grad_norm": 2.1854281425476074, "learning_rate": 1.523454136229265e-05, "loss": 0.298, "step": 760400 }, { "epoch": 10.13851301809068, "grad_norm": 2.9762678146362305, "learning_rate": 1.522698596832842e-05, "loss": 0.2984, "step": 760500 }, { "epoch": 10.139846155897136, "grad_norm": 1.4966399669647217, "learning_rate": 1.5219431811114381e-05, "loss": 0.3104, "step": 760600 }, { "epoch": 10.14117929370359, "grad_norm": 12.673999786376953, "learning_rate": 1.5211878891282908e-05, "loss": 0.2916, "step": 760700 }, { "epoch": 10.142512431510045, "grad_norm": 13.660246849060059, "learning_rate": 1.520432720946632e-05, "loss": 0.3201, "step": 760800 }, { "epoch": 10.1438455693165, "grad_norm": 2.838900327682495, "learning_rate": 1.5196776766296815e-05, "loss": 0.3327, "step": 760900 }, { "epoch": 10.145178707122955, "grad_norm": 3.727123260498047, "learning_rate": 1.5189227562406498e-05, "loss": 0.3372, "step": 761000 }, { "epoch": 10.14651184492941, "grad_norm": 3.893094301223755, "learning_rate": 1.5181679598427374e-05, "loss": 0.3026, "step": 761100 }, { "epoch": 10.147844982735865, "grad_norm": 2.5247390270233154, "learning_rate": 1.5174132874991311e-05, "loss": 0.2877, "step": 761200 }, { "epoch": 10.14917812054232, "grad_norm": 2.6674692630767822, "learning_rate": 1.5166587392730105e-05, "loss": 0.305, "step": 761300 }, { "epoch": 10.150511258348775, "grad_norm": 
1.7306371927261353, "learning_rate": 1.5159043152275446e-05, "loss": 0.2757, "step": 761400 }, { "epoch": 10.15184439615523, "grad_norm": 0.7351294755935669, "learning_rate": 1.5151500154258913e-05, "loss": 0.3037, "step": 761500 }, { "epoch": 10.153177533961685, "grad_norm": 2.350590705871582, "learning_rate": 1.5143958399311976e-05, "loss": 0.312, "step": 761600 }, { "epoch": 10.15451067176814, "grad_norm": 2.4633255004882812, "learning_rate": 1.5136417888066026e-05, "loss": 0.3193, "step": 761700 }, { "epoch": 10.155843809574595, "grad_norm": 0.48738357424736023, "learning_rate": 1.5128878621152306e-05, "loss": 0.3305, "step": 761800 }, { "epoch": 10.157176947381052, "grad_norm": 1.4072041511535645, "learning_rate": 1.5121340599201976e-05, "loss": 0.3194, "step": 761900 }, { "epoch": 10.158510085187507, "grad_norm": 0.5292414426803589, "learning_rate": 1.5113803822846132e-05, "loss": 0.2726, "step": 762000 }, { "epoch": 10.159843222993961, "grad_norm": 1.3972580432891846, "learning_rate": 1.51062682927157e-05, "loss": 0.3905, "step": 762100 }, { "epoch": 10.161176360800416, "grad_norm": 4.7280049324035645, "learning_rate": 1.5098734009441534e-05, "loss": 0.3487, "step": 762200 }, { "epoch": 10.162509498606871, "grad_norm": 3.4945993423461914, "learning_rate": 1.5091200973654382e-05, "loss": 0.3146, "step": 762300 }, { "epoch": 10.163842636413326, "grad_norm": 1.4491618871688843, "learning_rate": 1.5083669185984889e-05, "loss": 0.2992, "step": 762400 }, { "epoch": 10.165175774219781, "grad_norm": 2.6992716789245605, "learning_rate": 1.5076138647063595e-05, "loss": 0.3549, "step": 762500 }, { "epoch": 10.166508912026236, "grad_norm": 2.660885810852051, "learning_rate": 1.5068609357520938e-05, "loss": 0.3213, "step": 762600 }, { "epoch": 10.167842049832691, "grad_norm": 0.7733260989189148, "learning_rate": 1.5061081317987224e-05, "loss": 0.2508, "step": 762700 }, { "epoch": 10.169175187639146, "grad_norm": 0.015133580192923546, "learning_rate": 
1.5053554529092695e-05, "loss": 0.2575, "step": 762800 }, { "epoch": 10.170508325445601, "grad_norm": 1.9408584833145142, "learning_rate": 1.5046028991467462e-05, "loss": 0.3094, "step": 762900 }, { "epoch": 10.171841463252056, "grad_norm": 3.0833194255828857, "learning_rate": 1.5038504705741546e-05, "loss": 0.3279, "step": 763000 }, { "epoch": 10.173174601058511, "grad_norm": 3.336042642593384, "learning_rate": 1.5031056896674728e-05, "loss": 0.3176, "step": 763100 }, { "epoch": 10.174507738864966, "grad_norm": 19.684709548950195, "learning_rate": 1.5023535104102365e-05, "loss": 0.3421, "step": 763200 }, { "epoch": 10.17584087667142, "grad_norm": 2.9920473098754883, "learning_rate": 1.5016014565312421e-05, "loss": 0.3667, "step": 763300 }, { "epoch": 10.177174014477876, "grad_norm": 4.428981781005859, "learning_rate": 1.5008495280934505e-05, "loss": 0.3355, "step": 763400 }, { "epoch": 10.178507152284332, "grad_norm": 0.834500789642334, "learning_rate": 1.5000977251598108e-05, "loss": 0.2793, "step": 763500 }, { "epoch": 10.179840290090787, "grad_norm": 2.683842897415161, "learning_rate": 1.4993460477932615e-05, "loss": 0.3287, "step": 763600 }, { "epoch": 10.181173427897242, "grad_norm": 3.3014256954193115, "learning_rate": 1.4985944960567304e-05, "loss": 0.3507, "step": 763700 }, { "epoch": 10.182506565703697, "grad_norm": 2.3822216987609863, "learning_rate": 1.4978430700131365e-05, "loss": 0.2711, "step": 763800 }, { "epoch": 10.183839703510152, "grad_norm": 26.397855758666992, "learning_rate": 1.4970917697253845e-05, "loss": 0.2907, "step": 763900 }, { "epoch": 10.185172841316607, "grad_norm": 2.125856876373291, "learning_rate": 1.4963405952563707e-05, "loss": 0.3012, "step": 764000 }, { "epoch": 10.186505979123062, "grad_norm": 7.313235282897949, "learning_rate": 1.4955895466689848e-05, "loss": 0.2922, "step": 764100 }, { "epoch": 10.187839116929517, "grad_norm": 7.025469779968262, "learning_rate": 1.4948386240260981e-05, "loss": 0.3352, "step": 764200 }, { 
"epoch": 10.189172254735972, "grad_norm": 1.5559231042861938, "learning_rate": 1.4940878273905786e-05, "loss": 0.3397, "step": 764300 }, { "epoch": 10.190505392542427, "grad_norm": 2.115717649459839, "learning_rate": 1.493337156825277e-05, "loss": 0.2885, "step": 764400 }, { "epoch": 10.191838530348882, "grad_norm": 5.698530673980713, "learning_rate": 1.4925866123930401e-05, "loss": 0.3427, "step": 764500 }, { "epoch": 10.193171668155337, "grad_norm": 2.804896354675293, "learning_rate": 1.4918361941567017e-05, "loss": 0.3436, "step": 764600 }, { "epoch": 10.194504805961792, "grad_norm": 6.46351432800293, "learning_rate": 1.4910859021790813e-05, "loss": 0.2995, "step": 764700 }, { "epoch": 10.195837943768247, "grad_norm": 1.8194690942764282, "learning_rate": 1.4903357365229932e-05, "loss": 0.3229, "step": 764800 }, { "epoch": 10.197171081574702, "grad_norm": 3.0195834636688232, "learning_rate": 1.4895856972512381e-05, "loss": 0.3038, "step": 764900 }, { "epoch": 10.198504219381157, "grad_norm": 3.2207493782043457, "learning_rate": 1.4888357844266072e-05, "loss": 0.372, "step": 765000 }, { "epoch": 10.199837357187613, "grad_norm": 1.4637647867202759, "learning_rate": 1.4880859981118811e-05, "loss": 0.2787, "step": 765100 }, { "epoch": 10.201170494994068, "grad_norm": 1.213226079940796, "learning_rate": 1.48733633836983e-05, "loss": 0.2892, "step": 765200 }, { "epoch": 10.202503632800523, "grad_norm": 3.1762795448303223, "learning_rate": 1.4865868052632114e-05, "loss": 0.3181, "step": 765300 }, { "epoch": 10.203836770606978, "grad_norm": 3.6346025466918945, "learning_rate": 1.4858373988547737e-05, "loss": 0.347, "step": 765400 }, { "epoch": 10.205169908413433, "grad_norm": 2.182187080383301, "learning_rate": 1.4850881192072581e-05, "loss": 0.3533, "step": 765500 }, { "epoch": 10.206503046219888, "grad_norm": 0.5723947286605835, "learning_rate": 1.4843389663833887e-05, "loss": 0.2719, "step": 765600 }, { "epoch": 10.207836184026343, "grad_norm": 1.4264967441558838, 
"learning_rate": 1.4835974300769642e-05, "loss": 0.3001, "step": 765700 }, { "epoch": 10.209169321832798, "grad_norm": 0.8276669383049011, "learning_rate": 1.482848529818729e-05, "loss": 0.3742, "step": 765800 }, { "epoch": 10.210502459639253, "grad_norm": 0.8740332722663879, "learning_rate": 1.4820997565716308e-05, "loss": 0.3213, "step": 765900 }, { "epoch": 10.211835597445708, "grad_norm": 4.249547958374023, "learning_rate": 1.4813511103983567e-05, "loss": 0.2925, "step": 766000 }, { "epoch": 10.213168735252163, "grad_norm": 1.7725199460983276, "learning_rate": 1.4806025913615802e-05, "loss": 0.3593, "step": 766100 }, { "epoch": 10.214501873058618, "grad_norm": 4.016871929168701, "learning_rate": 1.4798616828125003e-05, "loss": 0.3292, "step": 766200 }, { "epoch": 10.215835010865073, "grad_norm": 2.4291820526123047, "learning_rate": 1.4791134169637714e-05, "loss": 0.2914, "step": 766300 }, { "epoch": 10.217168148671528, "grad_norm": 2.3064634799957275, "learning_rate": 1.4783652784388722e-05, "loss": 0.3235, "step": 766400 }, { "epoch": 10.218501286477983, "grad_norm": 2.237088203430176, "learning_rate": 1.477617267300437e-05, "loss": 0.2923, "step": 766500 }, { "epoch": 10.219834424284437, "grad_norm": 0.8671818375587463, "learning_rate": 1.4768693836110844e-05, "loss": 0.3309, "step": 766600 }, { "epoch": 10.221167562090894, "grad_norm": 2.610992908477783, "learning_rate": 1.4761216274334258e-05, "loss": 0.253, "step": 766700 }, { "epoch": 10.222500699897349, "grad_norm": 4.497406959533691, "learning_rate": 1.4753739988300607e-05, "loss": 0.3101, "step": 766800 }, { "epoch": 10.223833837703804, "grad_norm": 1.5238174200057983, "learning_rate": 1.4746264978635785e-05, "loss": 0.3479, "step": 766900 }, { "epoch": 10.225166975510259, "grad_norm": 1.1579301357269287, "learning_rate": 1.4738791245965587e-05, "loss": 0.2584, "step": 767000 }, { "epoch": 10.226500113316714, "grad_norm": 4.698675155639648, "learning_rate": 1.4731318790915666e-05, "loss": 0.3543, 
"step": 767100 }, { "epoch": 10.227833251123169, "grad_norm": 4.3303375244140625, "learning_rate": 1.4723847614111602e-05, "loss": 0.3256, "step": 767200 }, { "epoch": 10.229166388929624, "grad_norm": 4.641096115112305, "learning_rate": 1.4716377716178867e-05, "loss": 0.3508, "step": 767300 }, { "epoch": 10.230499526736079, "grad_norm": 2.4533259868621826, "learning_rate": 1.4708909097742808e-05, "loss": 0.2988, "step": 767400 }, { "epoch": 10.231832664542534, "grad_norm": 1.877213716506958, "learning_rate": 1.4701441759428678e-05, "loss": 0.3101, "step": 767500 }, { "epoch": 10.233165802348989, "grad_norm": 1.9897594451904297, "learning_rate": 1.4693975701861629e-05, "loss": 0.2947, "step": 767600 }, { "epoch": 10.234498940155444, "grad_norm": 1.333959698677063, "learning_rate": 1.4686510925666673e-05, "loss": 0.3226, "step": 767700 }, { "epoch": 10.235832077961899, "grad_norm": 4.3749589920043945, "learning_rate": 1.4679047431468742e-05, "loss": 0.3635, "step": 767800 }, { "epoch": 10.237165215768353, "grad_norm": 3.2350375652313232, "learning_rate": 1.4671585219892686e-05, "loss": 0.3553, "step": 767900 }, { "epoch": 10.238498353574808, "grad_norm": 2.9650766849517822, "learning_rate": 1.4664124291563182e-05, "loss": 0.3008, "step": 768000 }, { "epoch": 10.239831491381263, "grad_norm": 2.633399248123169, "learning_rate": 1.4656664647104855e-05, "loss": 0.2896, "step": 768100 }, { "epoch": 10.241164629187718, "grad_norm": 3.7166695594787598, "learning_rate": 1.4649206287142194e-05, "loss": 0.3639, "step": 768200 }, { "epoch": 10.242497766994175, "grad_norm": 2.3778204917907715, "learning_rate": 1.464174921229959e-05, "loss": 0.3341, "step": 768300 }, { "epoch": 10.24383090480063, "grad_norm": 2.0362818241119385, "learning_rate": 1.4634293423201334e-05, "loss": 0.3088, "step": 768400 }, { "epoch": 10.245164042607085, "grad_norm": 3.6944496631622314, "learning_rate": 1.4626913459129324e-05, "loss": 0.3293, "step": 768500 }, { "epoch": 10.24649718041354, 
"grad_norm": 3.4868485927581787, "learning_rate": 1.4619460230519154e-05, "loss": 0.2838, "step": 768600 }, { "epoch": 10.247830318219995, "grad_norm": 1.309875726699829, "learning_rate": 1.461200828951929e-05, "loss": 0.3029, "step": 768700 }, { "epoch": 10.24916345602645, "grad_norm": 0.30841144919395447, "learning_rate": 1.4604557636753594e-05, "loss": 0.2959, "step": 768800 }, { "epoch": 10.250496593832905, "grad_norm": 0.5369181036949158, "learning_rate": 1.4597108272845794e-05, "loss": 0.2996, "step": 768900 }, { "epoch": 10.25182973163936, "grad_norm": 3.2110683917999268, "learning_rate": 1.4589660198419522e-05, "loss": 0.2993, "step": 769000 }, { "epoch": 10.253162869445815, "grad_norm": 3.8153209686279297, "learning_rate": 1.4582213414098343e-05, "loss": 0.3044, "step": 769100 }, { "epoch": 10.25449600725227, "grad_norm": 8.267338752746582, "learning_rate": 1.4574767920505645e-05, "loss": 0.2877, "step": 769200 }, { "epoch": 10.255829145058724, "grad_norm": 2.890918493270874, "learning_rate": 1.4567323718264752e-05, "loss": 0.2919, "step": 769300 }, { "epoch": 10.25716228286518, "grad_norm": 1.5988810062408447, "learning_rate": 1.455988080799887e-05, "loss": 0.3012, "step": 769400 }, { "epoch": 10.258495420671634, "grad_norm": 3.889702320098877, "learning_rate": 1.4552439190331097e-05, "loss": 0.3159, "step": 769500 }, { "epoch": 10.25982855847809, "grad_norm": 2.647775173187256, "learning_rate": 1.454499886588442e-05, "loss": 0.3101, "step": 769600 }, { "epoch": 10.261161696284544, "grad_norm": 2.039412498474121, "learning_rate": 1.4537559835281728e-05, "loss": 0.308, "step": 769700 }, { "epoch": 10.262494834091, "grad_norm": 3.347623586654663, "learning_rate": 1.4530122099145772e-05, "loss": 0.3106, "step": 769800 }, { "epoch": 10.263827971897456, "grad_norm": 6.218544006347656, "learning_rate": 1.452268565809922e-05, "loss": 0.3166, "step": 769900 }, { "epoch": 10.26516110970391, "grad_norm": 1.34297513961792, "learning_rate": 1.4515250512764659e-05, 
"loss": 0.3049, "step": 770000 }, { "epoch": 10.266494247510366, "grad_norm": 3.844195604324341, "learning_rate": 1.4507816663764502e-05, "loss": 0.2802, "step": 770100 }, { "epoch": 10.26782738531682, "grad_norm": 4.940938472747803, "learning_rate": 1.4500384111721109e-05, "loss": 0.3037, "step": 770200 }, { "epoch": 10.269160523123276, "grad_norm": 1.5086042881011963, "learning_rate": 1.4492952857256678e-05, "loss": 0.352, "step": 770300 }, { "epoch": 10.27049366092973, "grad_norm": 0.42457786202430725, "learning_rate": 1.4485522900993358e-05, "loss": 0.3223, "step": 770400 }, { "epoch": 10.271826798736186, "grad_norm": 3.8200595378875732, "learning_rate": 1.447809424355317e-05, "loss": 0.3197, "step": 770500 }, { "epoch": 10.27315993654264, "grad_norm": 1.7961255311965942, "learning_rate": 1.4470666885557992e-05, "loss": 0.3084, "step": 770600 }, { "epoch": 10.274493074349095, "grad_norm": 3.671712875366211, "learning_rate": 1.4463240827629628e-05, "loss": 0.3624, "step": 770700 }, { "epoch": 10.27582621215555, "grad_norm": 0.8762817978858948, "learning_rate": 1.4455816070389765e-05, "loss": 0.3158, "step": 770800 }, { "epoch": 10.277159349962005, "grad_norm": 1.6156182289123535, "learning_rate": 1.444839261445998e-05, "loss": 0.3123, "step": 770900 }, { "epoch": 10.27849248776846, "grad_norm": 3.824550151824951, "learning_rate": 1.4440970460461746e-05, "loss": 0.3126, "step": 771000 }, { "epoch": 10.279825625574915, "grad_norm": 12.424742698669434, "learning_rate": 1.4433549609016427e-05, "loss": 0.3189, "step": 771100 }, { "epoch": 10.28115876338137, "grad_norm": 6.825306415557861, "learning_rate": 1.4426130060745257e-05, "loss": 0.2924, "step": 771200 }, { "epoch": 10.282491901187825, "grad_norm": 4.698740482330322, "learning_rate": 1.4418711816269373e-05, "loss": 0.2852, "step": 771300 }, { "epoch": 10.28382503899428, "grad_norm": 4.776031017303467, "learning_rate": 1.4411294876209841e-05, "loss": 0.2619, "step": 771400 }, { "epoch": 10.285158176800737, 
"grad_norm": 13.314108848571777, "learning_rate": 1.440387924118755e-05, "loss": 0.298, "step": 771500 }, { "epoch": 10.286491314607192, "grad_norm": 15.49630355834961, "learning_rate": 1.439646491182334e-05, "loss": 0.3408, "step": 771600 }, { "epoch": 10.287824452413647, "grad_norm": 30.650976181030273, "learning_rate": 1.4389051888737875e-05, "loss": 0.332, "step": 771700 }, { "epoch": 10.289157590220102, "grad_norm": 3.4128458499908447, "learning_rate": 1.4381640172551788e-05, "loss": 0.2777, "step": 771800 }, { "epoch": 10.290490728026557, "grad_norm": 1.7491786479949951, "learning_rate": 1.4374229763885564e-05, "loss": 0.252, "step": 771900 }, { "epoch": 10.291823865833011, "grad_norm": 2.5969693660736084, "learning_rate": 1.4366820663359556e-05, "loss": 0.3106, "step": 772000 }, { "epoch": 10.293157003639466, "grad_norm": 4.170999050140381, "learning_rate": 1.4359412871594044e-05, "loss": 0.3385, "step": 772100 }, { "epoch": 10.294490141445921, "grad_norm": 2.1251847743988037, "learning_rate": 1.435200638920918e-05, "loss": 0.3103, "step": 772200 }, { "epoch": 10.295823279252376, "grad_norm": 1.6678904294967651, "learning_rate": 1.4344601216825018e-05, "loss": 0.3072, "step": 772300 }, { "epoch": 10.297156417058831, "grad_norm": 2.7791502475738525, "learning_rate": 1.4337197355061489e-05, "loss": 0.2756, "step": 772400 }, { "epoch": 10.298489554865286, "grad_norm": 4.403077125549316, "learning_rate": 1.4329794804538436e-05, "loss": 0.3309, "step": 772500 }, { "epoch": 10.299822692671741, "grad_norm": 1.0843902826309204, "learning_rate": 1.4322467571766437e-05, "loss": 0.3406, "step": 772600 }, { "epoch": 10.301155830478196, "grad_norm": 1.685717225074768, "learning_rate": 1.4315067632455483e-05, "loss": 0.3074, "step": 772700 }, { "epoch": 10.302488968284651, "grad_norm": 4.483470916748047, "learning_rate": 1.4307669006237619e-05, "loss": 0.3393, "step": 772800 }, { "epoch": 10.303822106091106, "grad_norm": 4.985830783843994, "learning_rate": 
1.4300271693732244e-05, "loss": 0.2982, "step": 772900 }, { "epoch": 10.30515524389756, "grad_norm": 2.497006893157959, "learning_rate": 1.4292875695558628e-05, "loss": 0.3353, "step": 773000 }, { "epoch": 10.306488381704018, "grad_norm": 2.8163936138153076, "learning_rate": 1.4285481012335961e-05, "loss": 0.3658, "step": 773100 }, { "epoch": 10.307821519510473, "grad_norm": 4.393527030944824, "learning_rate": 1.4278087644683274e-05, "loss": 0.3497, "step": 773200 }, { "epoch": 10.309154657316927, "grad_norm": 3.333085536956787, "learning_rate": 1.4270695593219531e-05, "loss": 0.326, "step": 773300 }, { "epoch": 10.310487795123382, "grad_norm": 25.117162704467773, "learning_rate": 1.4263304858563572e-05, "loss": 0.2919, "step": 773400 }, { "epoch": 10.311820932929837, "grad_norm": 3.4703562259674072, "learning_rate": 1.425591544133412e-05, "loss": 0.3219, "step": 773500 }, { "epoch": 10.313154070736292, "grad_norm": 3.407264232635498, "learning_rate": 1.4248527342149797e-05, "loss": 0.2678, "step": 773600 }, { "epoch": 10.314487208542747, "grad_norm": 2.378135919570923, "learning_rate": 1.4241140561629127e-05, "loss": 0.3787, "step": 773700 }, { "epoch": 10.315820346349202, "grad_norm": 7.455240249633789, "learning_rate": 1.423375510039047e-05, "loss": 0.3383, "step": 773800 }, { "epoch": 10.317153484155657, "grad_norm": 1.6347278356552124, "learning_rate": 1.4226370959052146e-05, "loss": 0.318, "step": 773900 }, { "epoch": 10.318486621962112, "grad_norm": 3.591444969177246, "learning_rate": 1.4218988138232332e-05, "loss": 0.3486, "step": 774000 }, { "epoch": 10.319819759768567, "grad_norm": 3.9761147499084473, "learning_rate": 1.4211606638549078e-05, "loss": 0.3103, "step": 774100 }, { "epoch": 10.321152897575022, "grad_norm": 2.805393695831299, "learning_rate": 1.4204226460620345e-05, "loss": 0.3483, "step": 774200 }, { "epoch": 10.322486035381477, "grad_norm": 3.6335370540618896, "learning_rate": 1.4196847605063979e-05, "loss": 0.287, "step": 774300 }, { 
"epoch": 10.323819173187932, "grad_norm": 2.5044448375701904, "learning_rate": 1.4189470072497716e-05, "loss": 0.3308, "step": 774400 }, { "epoch": 10.325152310994387, "grad_norm": 3.6358888149261475, "learning_rate": 1.4182093863539182e-05, "loss": 0.3441, "step": 774500 }, { "epoch": 10.326485448800842, "grad_norm": 1.341336727142334, "learning_rate": 1.4174792721096283e-05, "loss": 0.2987, "step": 774600 }, { "epoch": 10.327818586607298, "grad_norm": 0.8202757239341736, "learning_rate": 1.4167419147954146e-05, "loss": 0.3171, "step": 774700 }, { "epoch": 10.329151724413753, "grad_norm": 2.515454053878784, "learning_rate": 1.4160046900265767e-05, "loss": 0.3371, "step": 774800 }, { "epoch": 10.330484862220208, "grad_norm": 1.7552673816680908, "learning_rate": 1.4152675978648344e-05, "loss": 0.2865, "step": 774900 }, { "epoch": 10.331818000026663, "grad_norm": 7.114172458648682, "learning_rate": 1.414530638371891e-05, "loss": 0.278, "step": 775000 }, { "epoch": 10.333151137833118, "grad_norm": 1.6447935104370117, "learning_rate": 1.4137938116094459e-05, "loss": 0.3094, "step": 775100 }, { "epoch": 10.334484275639573, "grad_norm": 7.3930864334106445, "learning_rate": 1.4130571176391837e-05, "loss": 0.2881, "step": 775200 }, { "epoch": 10.335817413446028, "grad_norm": 2.332552194595337, "learning_rate": 1.412320556522776e-05, "loss": 0.3251, "step": 775300 }, { "epoch": 10.337150551252483, "grad_norm": 3.3745176792144775, "learning_rate": 1.4115841283218866e-05, "loss": 0.3402, "step": 775400 }, { "epoch": 10.338483689058938, "grad_norm": 1.5156556367874146, "learning_rate": 1.4108478330981668e-05, "loss": 0.3726, "step": 775500 }, { "epoch": 10.339816826865393, "grad_norm": 2.8101000785827637, "learning_rate": 1.4101116709132572e-05, "loss": 0.3187, "step": 775600 }, { "epoch": 10.341149964671848, "grad_norm": 8.875125885009766, "learning_rate": 1.4093756418287866e-05, "loss": 0.3393, "step": 775700 }, { "epoch": 10.342483102478303, "grad_norm": 0.7521530389785767, 
"learning_rate": 1.4086397459063742e-05, "loss": 0.2687, "step": 775800 }, { "epoch": 10.343816240284758, "grad_norm": 2.2586517333984375, "learning_rate": 1.4079113401749525e-05, "loss": 0.3256, "step": 775900 }, { "epoch": 10.345149378091213, "grad_norm": 3.920685052871704, "learning_rate": 1.4071757094283055e-05, "loss": 0.3251, "step": 776000 }, { "epoch": 10.346482515897668, "grad_norm": 2.082818031311035, "learning_rate": 1.4064402120278883e-05, "loss": 0.2865, "step": 776100 }, { "epoch": 10.347815653704123, "grad_norm": 1.0610523223876953, "learning_rate": 1.4057048480352707e-05, "loss": 0.3225, "step": 776200 }, { "epoch": 10.34914879151058, "grad_norm": 3.2213239669799805, "learning_rate": 1.404969617512019e-05, "loss": 0.2933, "step": 776300 }, { "epoch": 10.350481929317034, "grad_norm": 4.964287281036377, "learning_rate": 1.4042345205196841e-05, "loss": 0.339, "step": 776400 }, { "epoch": 10.35181506712349, "grad_norm": 2.383469581604004, "learning_rate": 1.4034995571198035e-05, "loss": 0.3526, "step": 776500 }, { "epoch": 10.353148204929944, "grad_norm": 2.910885810852051, "learning_rate": 1.4027647273739074e-05, "loss": 0.3272, "step": 776600 }, { "epoch": 10.354481342736399, "grad_norm": 1.6934916973114014, "learning_rate": 1.4020300313435135e-05, "loss": 0.3203, "step": 776700 }, { "epoch": 10.355814480542854, "grad_norm": 1.2858902215957642, "learning_rate": 1.4012954690901279e-05, "loss": 0.3714, "step": 776800 }, { "epoch": 10.357147618349309, "grad_norm": 3.0160574913024902, "learning_rate": 1.4005610406752464e-05, "loss": 0.2607, "step": 776900 }, { "epoch": 10.358480756155764, "grad_norm": 2.0338218212127686, "learning_rate": 1.3998267461603536e-05, "loss": 0.3062, "step": 777000 }, { "epoch": 10.359813893962219, "grad_norm": 18.634689331054688, "learning_rate": 1.3990925856069203e-05, "loss": 0.3304, "step": 777100 }, { "epoch": 10.361147031768674, "grad_norm": 3.7987406253814697, "learning_rate": 1.3983585590764081e-05, "loss": 0.2815, 
"step": 777200 }, { "epoch": 10.362480169575129, "grad_norm": 2.2095937728881836, "learning_rate": 1.397624666630271e-05, "loss": 0.3095, "step": 777300 }, { "epoch": 10.363813307381584, "grad_norm": 2.861090898513794, "learning_rate": 1.3968909083299447e-05, "loss": 0.3239, "step": 777400 }, { "epoch": 10.365146445188039, "grad_norm": 2.964323043823242, "learning_rate": 1.3961572842368588e-05, "loss": 0.4079, "step": 777500 }, { "epoch": 10.366479582994494, "grad_norm": 3.408877372741699, "learning_rate": 1.3954237944124272e-05, "loss": 0.2672, "step": 777600 }, { "epoch": 10.367812720800949, "grad_norm": 1.9234308004379272, "learning_rate": 1.3946904389180588e-05, "loss": 0.29, "step": 777700 }, { "epoch": 10.369145858607403, "grad_norm": 3.117196798324585, "learning_rate": 1.3939572178151472e-05, "loss": 0.3326, "step": 777800 }, { "epoch": 10.370478996413858, "grad_norm": 5.192654609680176, "learning_rate": 1.393224131165074e-05, "loss": 0.3113, "step": 777900 }, { "epoch": 10.371812134220315, "grad_norm": 1.0031086206436157, "learning_rate": 1.3924911790292114e-05, "loss": 0.2761, "step": 778000 }, { "epoch": 10.37314527202677, "grad_norm": 2.9861435890197754, "learning_rate": 1.3917583614689198e-05, "loss": 0.2963, "step": 778100 }, { "epoch": 10.374478409833225, "grad_norm": 1.1893692016601562, "learning_rate": 1.3910256785455486e-05, "loss": 0.2921, "step": 778200 }, { "epoch": 10.37581154763968, "grad_norm": 4.598252773284912, "learning_rate": 1.390293130320436e-05, "loss": 0.2955, "step": 778300 }, { "epoch": 10.377144685446135, "grad_norm": 1.921966791152954, "learning_rate": 1.3895607168549091e-05, "loss": 0.2909, "step": 778400 }, { "epoch": 10.37847782325259, "grad_norm": 5.866818904876709, "learning_rate": 1.38882843821028e-05, "loss": 0.2772, "step": 778500 }, { "epoch": 10.379810961059045, "grad_norm": 1.5277519226074219, "learning_rate": 1.3880962944478566e-05, "loss": 0.272, "step": 778600 }, { "epoch": 10.3811440988655, "grad_norm": 
3.419001817703247, "learning_rate": 1.3873642856289315e-05, "loss": 0.2889, "step": 778700 }, { "epoch": 10.382477236671955, "grad_norm": 7.91818380355835, "learning_rate": 1.3866324118147835e-05, "loss": 0.3349, "step": 778800 }, { "epoch": 10.38381037447841, "grad_norm": 1.5969423055648804, "learning_rate": 1.3859006730666846e-05, "loss": 0.3222, "step": 778900 }, { "epoch": 10.385143512284865, "grad_norm": 2.8785455226898193, "learning_rate": 1.3851690694458931e-05, "loss": 0.304, "step": 779000 }, { "epoch": 10.38647665009132, "grad_norm": 2.9050045013427734, "learning_rate": 1.3844376010136565e-05, "loss": 0.2881, "step": 779100 }, { "epoch": 10.387809787897774, "grad_norm": 3.213782548904419, "learning_rate": 1.3837062678312125e-05, "loss": 0.3034, "step": 779200 }, { "epoch": 10.38914292570423, "grad_norm": 1.921615481376648, "learning_rate": 1.3829750699597838e-05, "loss": 0.3431, "step": 779300 }, { "epoch": 10.390476063510684, "grad_norm": 2.882972240447998, "learning_rate": 1.382251317415283e-05, "loss": 0.2842, "step": 779400 }, { "epoch": 10.391809201317141, "grad_norm": 4.365665912628174, "learning_rate": 1.3815203889948789e-05, "loss": 0.2747, "step": 779500 }, { "epoch": 10.393142339123596, "grad_norm": 5.416311264038086, "learning_rate": 1.3807895960684869e-05, "loss": 0.3517, "step": 779600 }, { "epoch": 10.394475476930051, "grad_norm": 7.053952217102051, "learning_rate": 1.3800589386972836e-05, "loss": 0.3435, "step": 779700 }, { "epoch": 10.395808614736506, "grad_norm": 4.814449787139893, "learning_rate": 1.3793284169424408e-05, "loss": 0.3106, "step": 779800 }, { "epoch": 10.39714175254296, "grad_norm": 1.3398456573486328, "learning_rate": 1.3785980308651149e-05, "loss": 0.2935, "step": 779900 }, { "epoch": 10.398474890349416, "grad_norm": 2.7869961261749268, "learning_rate": 1.37786778052645e-05, "loss": 0.3439, "step": 780000 }, { "epoch": 10.39980802815587, "grad_norm": 1.2012362480163574, "learning_rate": 1.3771376659875808e-05, "loss": 
0.3421, "step": 780100 }, { "epoch": 10.401141165962326, "grad_norm": 1.4150532484054565, "learning_rate": 1.3764076873096309e-05, "loss": 0.3396, "step": 780200 }, { "epoch": 10.40247430376878, "grad_norm": 0.9356967210769653, "learning_rate": 1.3756778445537108e-05, "loss": 0.3335, "step": 780300 }, { "epoch": 10.403807441575236, "grad_norm": 1.863513469696045, "learning_rate": 1.3749481377809228e-05, "loss": 0.3126, "step": 780400 }, { "epoch": 10.40514057938169, "grad_norm": 1.1365833282470703, "learning_rate": 1.3742185670523528e-05, "loss": 0.3151, "step": 780500 }, { "epoch": 10.406473717188145, "grad_norm": 3.1330349445343018, "learning_rate": 1.3734891324290797e-05, "loss": 0.3021, "step": 780600 }, { "epoch": 10.4078068549946, "grad_norm": 3.3271429538726807, "learning_rate": 1.372759833972169e-05, "loss": 0.3212, "step": 780700 }, { "epoch": 10.409139992801055, "grad_norm": 3.0287888050079346, "learning_rate": 1.3720306717426755e-05, "loss": 0.3169, "step": 780800 }, { "epoch": 10.41047313060751, "grad_norm": 0.7022061944007874, "learning_rate": 1.3713016458016425e-05, "loss": 0.2742, "step": 780900 }, { "epoch": 10.411806268413965, "grad_norm": 1.2744028568267822, "learning_rate": 1.3705727562101025e-05, "loss": 0.3239, "step": 781000 }, { "epoch": 10.41313940622042, "grad_norm": 4.74507999420166, "learning_rate": 1.3698440030290732e-05, "loss": 0.3306, "step": 781100 }, { "epoch": 10.414472544026877, "grad_norm": 2.9473507404327393, "learning_rate": 1.3691153863195659e-05, "loss": 0.2595, "step": 781200 }, { "epoch": 10.415805681833332, "grad_norm": 1.3679935932159424, "learning_rate": 1.3683869061425789e-05, "loss": 0.2564, "step": 781300 }, { "epoch": 10.417138819639787, "grad_norm": 2.6784844398498535, "learning_rate": 1.3676585625590953e-05, "loss": 0.2433, "step": 781400 }, { "epoch": 10.418471957446242, "grad_norm": 2.578575611114502, "learning_rate": 1.3669303556300916e-05, "loss": 0.2472, "step": 781500 }, { "epoch": 10.419805095252697, 
"grad_norm": 2.497523307800293, "learning_rate": 1.3662022854165308e-05, "loss": 0.284, "step": 781600 }, { "epoch": 10.421138233059152, "grad_norm": 1.921134352684021, "learning_rate": 1.3654743519793643e-05, "loss": 0.3177, "step": 781700 }, { "epoch": 10.422471370865606, "grad_norm": 4.1497802734375, "learning_rate": 1.3647465553795323e-05, "loss": 0.332, "step": 781800 }, { "epoch": 10.423804508672061, "grad_norm": 0.9408390522003174, "learning_rate": 1.3640188956779653e-05, "loss": 0.299, "step": 781900 }, { "epoch": 10.425137646478516, "grad_norm": 2.9665884971618652, "learning_rate": 1.363291372935577e-05, "loss": 0.3126, "step": 782000 }, { "epoch": 10.426470784284971, "grad_norm": 2.5433244705200195, "learning_rate": 1.3625639872132782e-05, "loss": 0.3425, "step": 782100 }, { "epoch": 10.427803922091426, "grad_norm": 1.4666757583618164, "learning_rate": 1.3618440103796223e-05, "loss": 0.2762, "step": 782200 }, { "epoch": 10.429137059897881, "grad_norm": 1.2239664793014526, "learning_rate": 1.3611168975084464e-05, "loss": 0.324, "step": 782300 }, { "epoch": 10.430470197704336, "grad_norm": 4.071089744567871, "learning_rate": 1.360389921839399e-05, "loss": 0.3197, "step": 782400 }, { "epoch": 10.431803335510791, "grad_norm": 2.5477404594421387, "learning_rate": 1.3596630834333403e-05, "loss": 0.3194, "step": 782500 }, { "epoch": 10.433136473317246, "grad_norm": 1.3847135305404663, "learning_rate": 1.3589363823511157e-05, "loss": 0.3412, "step": 782600 }, { "epoch": 10.434469611123703, "grad_norm": 0.5927440524101257, "learning_rate": 1.3582098186535645e-05, "loss": 0.3112, "step": 782700 }, { "epoch": 10.435802748930158, "grad_norm": 1.7519937753677368, "learning_rate": 1.3574833924015116e-05, "loss": 0.385, "step": 782800 }, { "epoch": 10.437135886736613, "grad_norm": 0.5644317865371704, "learning_rate": 1.3567571036557718e-05, "loss": 0.3147, "step": 782900 }, { "epoch": 10.438469024543068, "grad_norm": 5.216589450836182, "learning_rate": 
1.3560309524771472e-05, "loss": 0.2814, "step": 783000 }, { "epoch": 10.439802162349523, "grad_norm": 1.003589153289795, "learning_rate": 1.3553049389264302e-05, "loss": 0.306, "step": 783100 }, { "epoch": 10.441135300155977, "grad_norm": 4.238986968994141, "learning_rate": 1.3545790630643969e-05, "loss": 0.3529, "step": 783200 }, { "epoch": 10.442468437962432, "grad_norm": 13.548593521118164, "learning_rate": 1.3538533249518184e-05, "loss": 0.3052, "step": 783300 }, { "epoch": 10.443801575768887, "grad_norm": 3.009427547454834, "learning_rate": 1.3531277246494518e-05, "loss": 0.3278, "step": 783400 }, { "epoch": 10.445134713575342, "grad_norm": 2.2546873092651367, "learning_rate": 1.3524022622180399e-05, "loss": 0.3143, "step": 783500 }, { "epoch": 10.446467851381797, "grad_norm": 1.355094075202942, "learning_rate": 1.3516769377183164e-05, "loss": 0.2723, "step": 783600 }, { "epoch": 10.447800989188252, "grad_norm": 1.5223771333694458, "learning_rate": 1.350951751211004e-05, "loss": 0.3154, "step": 783700 }, { "epoch": 10.449134126994707, "grad_norm": 2.141153335571289, "learning_rate": 1.350226702756813e-05, "loss": 0.3105, "step": 783800 }, { "epoch": 10.450467264801162, "grad_norm": 2.5662083625793457, "learning_rate": 1.3495017924164425e-05, "loss": 0.3082, "step": 783900 }, { "epoch": 10.451800402607617, "grad_norm": 7.936853885650635, "learning_rate": 1.3487770202505784e-05, "loss": 0.3076, "step": 784000 }, { "epoch": 10.453133540414072, "grad_norm": 2.726656675338745, "learning_rate": 1.3480523863198968e-05, "loss": 0.321, "step": 784100 }, { "epoch": 10.454466678220527, "grad_norm": 3.4432930946350098, "learning_rate": 1.347327890685062e-05, "loss": 0.3507, "step": 784200 }, { "epoch": 10.455799816026982, "grad_norm": 1.243260383605957, "learning_rate": 1.3466035334067269e-05, "loss": 0.2884, "step": 784300 }, { "epoch": 10.457132953833439, "grad_norm": 4.4488701820373535, "learning_rate": 1.3458793145455316e-05, "loss": 0.2757, "step": 784400 }, { 
"epoch": 10.458466091639893, "grad_norm": 3.439702033996582, "learning_rate": 1.3451552341621072e-05, "loss": 0.3152, "step": 784500 }, { "epoch": 10.459799229446348, "grad_norm": 3.510694980621338, "learning_rate": 1.3444312923170678e-05, "loss": 0.2839, "step": 784600 }, { "epoch": 10.461132367252803, "grad_norm": 3.7703304290771484, "learning_rate": 1.343707489071023e-05, "loss": 0.3013, "step": 784700 }, { "epoch": 10.462465505059258, "grad_norm": 3.2695415019989014, "learning_rate": 1.3429838244845668e-05, "loss": 0.3233, "step": 784800 }, { "epoch": 10.463798642865713, "grad_norm": 0.25830209255218506, "learning_rate": 1.342260298618281e-05, "loss": 0.3106, "step": 784900 }, { "epoch": 10.465131780672168, "grad_norm": 12.06386661529541, "learning_rate": 1.3415369115327368e-05, "loss": 0.304, "step": 785000 }, { "epoch": 10.466464918478623, "grad_norm": 2.2517693042755127, "learning_rate": 1.3408136632884949e-05, "loss": 0.3232, "step": 785100 }, { "epoch": 10.467798056285078, "grad_norm": 1.757250428199768, "learning_rate": 1.3400905539461026e-05, "loss": 0.2823, "step": 785200 }, { "epoch": 10.469131194091533, "grad_norm": 2.863081216812134, "learning_rate": 1.339367583566098e-05, "loss": 0.3151, "step": 785300 }, { "epoch": 10.470464331897988, "grad_norm": 2.1893577575683594, "learning_rate": 1.338644752209003e-05, "loss": 0.3222, "step": 785400 }, { "epoch": 10.471797469704443, "grad_norm": 15.882701873779297, "learning_rate": 1.3379292861694075e-05, "loss": 0.3418, "step": 785500 }, { "epoch": 10.473130607510898, "grad_norm": 2.267350912094116, "learning_rate": 1.3372067316479241e-05, "loss": 0.3147, "step": 785600 }, { "epoch": 10.474463745317353, "grad_norm": 4.68552303314209, "learning_rate": 1.3364843163302528e-05, "loss": 0.285, "step": 785700 }, { "epoch": 10.475796883123808, "grad_norm": 2.3688430786132812, "learning_rate": 1.3357620402768683e-05, "loss": 0.2982, "step": 785800 }, { "epoch": 10.477130020930264, "grad_norm": 5.255781173706055, 
"learning_rate": 1.3350399035482409e-05, "loss": 0.3122, "step": 785900 }, { "epoch": 10.47846315873672, "grad_norm": 2.442868232727051, "learning_rate": 1.3343179062048256e-05, "loss": 0.3078, "step": 786000 }, { "epoch": 10.479796296543174, "grad_norm": 20.14671516418457, "learning_rate": 1.3335960483070631e-05, "loss": 0.3034, "step": 786100 }, { "epoch": 10.48112943434963, "grad_norm": 0.8542176485061646, "learning_rate": 1.3328743299153866e-05, "loss": 0.3336, "step": 786200 }, { "epoch": 10.482462572156084, "grad_norm": 3.5506973266601562, "learning_rate": 1.3321527510902159e-05, "loss": 0.3274, "step": 786300 }, { "epoch": 10.48379570996254, "grad_norm": 3.748481273651123, "learning_rate": 1.3314313118919595e-05, "loss": 0.2928, "step": 786400 }, { "epoch": 10.485128847768994, "grad_norm": 4.625145435333252, "learning_rate": 1.330710012381015e-05, "loss": 0.3273, "step": 786500 }, { "epoch": 10.486461985575449, "grad_norm": 0.8647071719169617, "learning_rate": 1.3299888526177651e-05, "loss": 0.2814, "step": 786600 }, { "epoch": 10.487795123381904, "grad_norm": 3.40610933303833, "learning_rate": 1.3292678326625827e-05, "loss": 0.3179, "step": 786700 }, { "epoch": 10.489128261188359, "grad_norm": 2.386798620223999, "learning_rate": 1.3285469525758336e-05, "loss": 0.3702, "step": 786800 }, { "epoch": 10.490461398994814, "grad_norm": 4.765902519226074, "learning_rate": 1.327833419126598e-05, "loss": 0.2996, "step": 786900 }, { "epoch": 10.491794536801269, "grad_norm": 3.242711305618286, "learning_rate": 1.3271128175575551e-05, "loss": 0.2825, "step": 787000 }, { "epoch": 10.493127674607724, "grad_norm": 13.599834442138672, "learning_rate": 1.326392356037355e-05, "loss": 0.2927, "step": 787100 }, { "epoch": 10.494460812414179, "grad_norm": 0.9135857820510864, "learning_rate": 1.3256720346263118e-05, "loss": 0.3285, "step": 787200 }, { "epoch": 10.495793950220634, "grad_norm": 4.3100972175598145, "learning_rate": 1.3249518533847274e-05, "loss": 0.3089, "step": 
787300 }, { "epoch": 10.497127088027089, "grad_norm": 2.609311819076538, "learning_rate": 1.3242318123728933e-05, "loss": 0.3627, "step": 787400 }, { "epoch": 10.498460225833544, "grad_norm": 0.9161561727523804, "learning_rate": 1.3235119116510893e-05, "loss": 0.3017, "step": 787500 }, { "epoch": 10.49979336364, "grad_norm": 3.4536590576171875, "learning_rate": 1.3227921512795837e-05, "loss": 0.3059, "step": 787600 }, { "epoch": 10.501126501446455, "grad_norm": 2.4639012813568115, "learning_rate": 1.3220725313186318e-05, "loss": 0.3188, "step": 787700 }, { "epoch": 10.50245963925291, "grad_norm": 0.5411824584007263, "learning_rate": 1.3213530518284797e-05, "loss": 0.2986, "step": 787800 }, { "epoch": 10.503792777059365, "grad_norm": 2.0790505409240723, "learning_rate": 1.320633712869356e-05, "loss": 0.305, "step": 787900 }, { "epoch": 10.50512591486582, "grad_norm": 4.947047233581543, "learning_rate": 1.3199145145014866e-05, "loss": 0.3502, "step": 788000 }, { "epoch": 10.506459052672275, "grad_norm": 2.107940673828125, "learning_rate": 1.3191954567850767e-05, "loss": 0.3345, "step": 788100 }, { "epoch": 10.50779219047873, "grad_norm": 2.1463234424591064, "learning_rate": 1.3184765397803247e-05, "loss": 0.2618, "step": 788200 }, { "epoch": 10.509125328285185, "grad_norm": 4.09726619720459, "learning_rate": 1.3177577635474164e-05, "loss": 0.2733, "step": 788300 }, { "epoch": 10.51045846609164, "grad_norm": 8.527904510498047, "learning_rate": 1.317039128146525e-05, "loss": 0.3206, "step": 788400 }, { "epoch": 10.511791603898095, "grad_norm": 3.3704864978790283, "learning_rate": 1.316320633637813e-05, "loss": 0.3623, "step": 788500 }, { "epoch": 10.51312474170455, "grad_norm": 3.6418955326080322, "learning_rate": 1.3156022800814307e-05, "loss": 0.2903, "step": 788600 }, { "epoch": 10.514457879511005, "grad_norm": 1.788956880569458, "learning_rate": 1.314884067537515e-05, "loss": 0.3426, "step": 788700 }, { "epoch": 10.51579101731746, "grad_norm": 2.7974367141723633, 
"learning_rate": 1.3141659960661935e-05, "loss": 0.3419, "step": 788800 }, { "epoch": 10.517124155123915, "grad_norm": 2.4763760566711426, "learning_rate": 1.3134480657275803e-05, "loss": 0.3358, "step": 788900 }, { "epoch": 10.51845729293037, "grad_norm": 0.5034871101379395, "learning_rate": 1.3127302765817788e-05, "loss": 0.2793, "step": 789000 }, { "epoch": 10.519790430736826, "grad_norm": 8.05400562286377, "learning_rate": 1.3120126286888793e-05, "loss": 0.3302, "step": 789100 }, { "epoch": 10.521123568543281, "grad_norm": 15.662542343139648, "learning_rate": 1.311295122108963e-05, "loss": 0.3248, "step": 789200 }, { "epoch": 10.522456706349736, "grad_norm": 2.1075093746185303, "learning_rate": 1.3105777569020936e-05, "loss": 0.3289, "step": 789300 }, { "epoch": 10.523789844156191, "grad_norm": 2.8264997005462646, "learning_rate": 1.3098605331283299e-05, "loss": 0.3761, "step": 789400 }, { "epoch": 10.525122981962646, "grad_norm": 7.753262042999268, "learning_rate": 1.3091434508477159e-05, "loss": 0.2894, "step": 789500 }, { "epoch": 10.526456119769101, "grad_norm": 3.82539963722229, "learning_rate": 1.3084265101202807e-05, "loss": 0.3185, "step": 789600 }, { "epoch": 10.527789257575556, "grad_norm": 0.8000016808509827, "learning_rate": 1.3077097110060462e-05, "loss": 0.3054, "step": 789700 }, { "epoch": 10.52912239538201, "grad_norm": 5.382650375366211, "learning_rate": 1.3069930535650197e-05, "loss": 0.3141, "step": 789800 }, { "epoch": 10.530455533188466, "grad_norm": 1.2087708711624146, "learning_rate": 1.3062765378571981e-05, "loss": 0.3113, "step": 789900 }, { "epoch": 10.53178867099492, "grad_norm": 0.9783385992050171, "learning_rate": 1.305560163942567e-05, "loss": 0.3297, "step": 790000 }, { "epoch": 10.533121808801376, "grad_norm": 2.530966281890869, "learning_rate": 1.3048439318810965e-05, "loss": 0.3634, "step": 790100 }, { "epoch": 10.53445494660783, "grad_norm": 3.7524681091308594, "learning_rate": 1.304127841732747e-05, "loss": 0.3395, "step": 
790200 }, { "epoch": 10.535788084414285, "grad_norm": 5.716198444366455, "learning_rate": 1.3034118935574713e-05, "loss": 0.3312, "step": 790300 }, { "epoch": 10.53712122222074, "grad_norm": 2.0443074703216553, "learning_rate": 1.3026960874152023e-05, "loss": 0.28, "step": 790400 }, { "epoch": 10.538454360027195, "grad_norm": 10.68433666229248, "learning_rate": 1.3019804233658668e-05, "loss": 0.2811, "step": 790500 }, { "epoch": 10.53978749783365, "grad_norm": 7.1027679443359375, "learning_rate": 1.3012720559844888e-05, "loss": 0.2894, "step": 790600 }, { "epoch": 10.541120635640105, "grad_norm": 8.618742942810059, "learning_rate": 1.3005566748783241e-05, "loss": 0.3431, "step": 790700 }, { "epoch": 10.542453773446562, "grad_norm": 9.606592178344727, "learning_rate": 1.2998414360441966e-05, "loss": 0.3543, "step": 790800 }, { "epoch": 10.543786911253017, "grad_norm": 3.25053071975708, "learning_rate": 1.2991263395419846e-05, "loss": 0.2972, "step": 790900 }, { "epoch": 10.545120049059472, "grad_norm": 4.948827743530273, "learning_rate": 1.2984113854315539e-05, "loss": 0.3053, "step": 791000 }, { "epoch": 10.546453186865927, "grad_norm": 3.770141363143921, "learning_rate": 1.2976965737727581e-05, "loss": 0.3016, "step": 791100 }, { "epoch": 10.547786324672382, "grad_norm": 3.6521241664886475, "learning_rate": 1.2969819046254404e-05, "loss": 0.3089, "step": 791200 }, { "epoch": 10.549119462478837, "grad_norm": 3.4083621501922607, "learning_rate": 1.2962673780494276e-05, "loss": 0.3, "step": 791300 }, { "epoch": 10.550452600285292, "grad_norm": 3.263057231903076, "learning_rate": 1.2955529941045386e-05, "loss": 0.3098, "step": 791400 }, { "epoch": 10.551785738091747, "grad_norm": 3.4675650596618652, "learning_rate": 1.2948387528505822e-05, "loss": 0.3666, "step": 791500 }, { "epoch": 10.553118875898202, "grad_norm": 6.255260944366455, "learning_rate": 1.2941246543473491e-05, "loss": 0.312, "step": 791600 }, { "epoch": 10.554452013704656, "grad_norm": 
2.2182862758636475, "learning_rate": 1.2934106986546223e-05, "loss": 0.3051, "step": 791700 }, { "epoch": 10.555785151511111, "grad_norm": 2.3369712829589844, "learning_rate": 1.2926968858321726e-05, "loss": 0.3216, "step": 791800 }, { "epoch": 10.557118289317566, "grad_norm": 2.7957394123077393, "learning_rate": 1.2919832159397576e-05, "loss": 0.3576, "step": 791900 }, { "epoch": 10.558451427124021, "grad_norm": 2.5580854415893555, "learning_rate": 1.2912696890371237e-05, "loss": 0.2881, "step": 792000 }, { "epoch": 10.559784564930476, "grad_norm": 2.4693241119384766, "learning_rate": 1.290556305184006e-05, "loss": 0.2644, "step": 792100 }, { "epoch": 10.561117702736931, "grad_norm": 2.6299479007720947, "learning_rate": 1.2898430644401249e-05, "loss": 0.3203, "step": 792200 }, { "epoch": 10.562450840543388, "grad_norm": 4.166855335235596, "learning_rate": 1.2891299668651912e-05, "loss": 0.3999, "step": 792300 }, { "epoch": 10.563783978349843, "grad_norm": 10.918128967285156, "learning_rate": 1.288417012518904e-05, "loss": 0.3568, "step": 792400 }, { "epoch": 10.565117116156298, "grad_norm": 15.831382751464844, "learning_rate": 1.287704201460949e-05, "loss": 0.2679, "step": 792500 }, { "epoch": 10.566450253962753, "grad_norm": 2.063458204269409, "learning_rate": 1.2869915337510013e-05, "loss": 0.3577, "step": 792600 }, { "epoch": 10.567783391769208, "grad_norm": 1.9969619512557983, "learning_rate": 1.2862790094487217e-05, "loss": 0.396, "step": 792700 }, { "epoch": 10.569116529575663, "grad_norm": 3.4216723442077637, "learning_rate": 1.2855666286137603e-05, "loss": 0.3064, "step": 792800 }, { "epoch": 10.570449667382118, "grad_norm": 3.7328574657440186, "learning_rate": 1.2848543913057588e-05, "loss": 0.2901, "step": 792900 }, { "epoch": 10.571782805188572, "grad_norm": 4.0836615562438965, "learning_rate": 1.2841422975843396e-05, "loss": 0.2868, "step": 793000 }, { "epoch": 10.573115942995027, "grad_norm": 3.6737546920776367, "learning_rate": 
1.2834303475091187e-05, "loss": 0.3014, "step": 793100 }, { "epoch": 10.574449080801482, "grad_norm": 0.7196183204650879, "learning_rate": 1.2827185411396976e-05, "loss": 0.2879, "step": 793200 }, { "epoch": 10.575782218607937, "grad_norm": 1.9720029830932617, "learning_rate": 1.282006878535667e-05, "loss": 0.2841, "step": 793300 }, { "epoch": 10.577115356414392, "grad_norm": 2.2272989749908447, "learning_rate": 1.2813024742322675e-05, "loss": 0.3124, "step": 793400 }, { "epoch": 10.578448494220847, "grad_norm": 2.2929346561431885, "learning_rate": 1.2805982109495151e-05, "loss": 0.3184, "step": 793500 }, { "epoch": 10.579781632027302, "grad_norm": 2.724813461303711, "learning_rate": 1.2798869771196116e-05, "loss": 0.2978, "step": 793600 }, { "epoch": 10.581114769833757, "grad_norm": 2.604727268218994, "learning_rate": 1.2791758872921486e-05, "loss": 0.3627, "step": 793700 }, { "epoch": 10.582447907640212, "grad_norm": 3.064371347427368, "learning_rate": 1.278464941526654e-05, "loss": 0.3576, "step": 793800 }, { "epoch": 10.583781045446667, "grad_norm": 1.3188328742980957, "learning_rate": 1.2777541398826496e-05, "loss": 0.3213, "step": 793900 }, { "epoch": 10.585114183253124, "grad_norm": 0.08500589430332184, "learning_rate": 1.2770434824196389e-05, "loss": 0.2608, "step": 794000 }, { "epoch": 10.586447321059579, "grad_norm": 2.2315261363983154, "learning_rate": 1.2763329691971164e-05, "loss": 0.3201, "step": 794100 }, { "epoch": 10.587780458866034, "grad_norm": 0.44438204169273376, "learning_rate": 1.2756226002745643e-05, "loss": 0.2788, "step": 794200 }, { "epoch": 10.589113596672489, "grad_norm": 3.7758708000183105, "learning_rate": 1.2749123757114524e-05, "loss": 0.3194, "step": 794300 }, { "epoch": 10.590446734478943, "grad_norm": 1.0092556476593018, "learning_rate": 1.2742022955672386e-05, "loss": 0.2784, "step": 794400 }, { "epoch": 10.591779872285398, "grad_norm": 0.6101272702217102, "learning_rate": 1.2734923599013693e-05, "loss": 0.3221, "step": 794500 
}, { "epoch": 10.593113010091853, "grad_norm": 2.550858497619629, "learning_rate": 1.272782568773276e-05, "loss": 0.3433, "step": 794600 }, { "epoch": 10.594446147898308, "grad_norm": 4.009382724761963, "learning_rate": 1.2720729222423818e-05, "loss": 0.3233, "step": 794700 }, { "epoch": 10.595779285704763, "grad_norm": 9.46632194519043, "learning_rate": 1.2713634203680954e-05, "loss": 0.2981, "step": 794800 }, { "epoch": 10.597112423511218, "grad_norm": 3.186215877532959, "learning_rate": 1.2706540632098143e-05, "loss": 0.3235, "step": 794900 }, { "epoch": 10.598445561317673, "grad_norm": 2.4085354804992676, "learning_rate": 1.2699448508269237e-05, "loss": 0.2723, "step": 795000 }, { "epoch": 10.599778699124128, "grad_norm": 3.4614179134368896, "learning_rate": 1.2692357832787975e-05, "loss": 0.3372, "step": 795100 }, { "epoch": 10.601111836930583, "grad_norm": 1.083500862121582, "learning_rate": 1.268526860624794e-05, "loss": 0.284, "step": 795200 }, { "epoch": 10.602444974737038, "grad_norm": 3.5160157680511475, "learning_rate": 1.2678180829242643e-05, "loss": 0.2886, "step": 795300 }, { "epoch": 10.603778112543493, "grad_norm": 1.1855789422988892, "learning_rate": 1.2671094502365456e-05, "loss": 0.3364, "step": 795400 }, { "epoch": 10.60511125034995, "grad_norm": 3.569288969039917, "learning_rate": 1.26640096262096e-05, "loss": 0.223, "step": 795500 }, { "epoch": 10.606444388156405, "grad_norm": 3.5463192462921143, "learning_rate": 1.2656926201368212e-05, "loss": 0.2905, "step": 795600 }, { "epoch": 10.60777752596286, "grad_norm": 1.262036681175232, "learning_rate": 1.2649844228434294e-05, "loss": 0.3176, "step": 795700 }, { "epoch": 10.609110663769314, "grad_norm": 3.2757418155670166, "learning_rate": 1.264276370800072e-05, "loss": 0.2623, "step": 795800 }, { "epoch": 10.61044380157577, "grad_norm": 2.193880558013916, "learning_rate": 1.2635684640660266e-05, "loss": 0.2882, "step": 795900 }, { "epoch": 10.611776939382224, "grad_norm": 3.3659770488739014, 
"learning_rate": 1.2628607027005547e-05, "loss": 0.3184, "step": 796000 }, { "epoch": 10.61311007718868, "grad_norm": 7.240360736846924, "learning_rate": 1.2621530867629078e-05, "loss": 0.3092, "step": 796100 }, { "epoch": 10.614443214995134, "grad_norm": 3.2549567222595215, "learning_rate": 1.2614456163123284e-05, "loss": 0.2893, "step": 796200 }, { "epoch": 10.61577635280159, "grad_norm": 3.148874282836914, "learning_rate": 1.2607382914080407e-05, "loss": 0.2598, "step": 796300 }, { "epoch": 10.617109490608044, "grad_norm": 2.940361499786377, "learning_rate": 1.2600311121092603e-05, "loss": 0.3171, "step": 796400 }, { "epoch": 10.618442628414499, "grad_norm": 2.1521308422088623, "learning_rate": 1.2593240784751907e-05, "loss": 0.3016, "step": 796500 }, { "epoch": 10.619775766220954, "grad_norm": 1.4938992261886597, "learning_rate": 1.2586171905650218e-05, "loss": 0.2702, "step": 796600 }, { "epoch": 10.621108904027409, "grad_norm": 3.687347412109375, "learning_rate": 1.2579104484379325e-05, "loss": 0.3414, "step": 796700 }, { "epoch": 10.622442041833864, "grad_norm": 2.8975701332092285, "learning_rate": 1.2572038521530903e-05, "loss": 0.3187, "step": 796800 }, { "epoch": 10.623775179640319, "grad_norm": 4.403350353240967, "learning_rate": 1.2564974017696463e-05, "loss": 0.3463, "step": 796900 }, { "epoch": 10.625108317446774, "grad_norm": 7.266265392303467, "learning_rate": 1.255791097346744e-05, "loss": 0.326, "step": 797000 }, { "epoch": 10.626441455253229, "grad_norm": 20.679649353027344, "learning_rate": 1.255084938943513e-05, "loss": 0.3199, "step": 797100 }, { "epoch": 10.627774593059684, "grad_norm": 3.4465367794036865, "learning_rate": 1.2543789266190702e-05, "loss": 0.2898, "step": 797200 }, { "epoch": 10.62910773086614, "grad_norm": 2.8600986003875732, "learning_rate": 1.2536730604325224e-05, "loss": 0.3505, "step": 797300 }, { "epoch": 10.630440868672595, "grad_norm": 1.345780372619629, "learning_rate": 1.2529673404429585e-05, "loss": 0.273, "step": 
797400 }, { "epoch": 10.63177400647905, "grad_norm": 2.621748685836792, "learning_rate": 1.252261766709463e-05, "loss": 0.3405, "step": 797500 }, { "epoch": 10.633107144285505, "grad_norm": 5.01387882232666, "learning_rate": 1.2515563392911042e-05, "loss": 0.321, "step": 797600 }, { "epoch": 10.63444028209196, "grad_norm": 1.5078420639038086, "learning_rate": 1.250851058246936e-05, "loss": 0.3338, "step": 797700 }, { "epoch": 10.635773419898415, "grad_norm": 5.412436008453369, "learning_rate": 1.2501459236360036e-05, "loss": 0.2884, "step": 797800 }, { "epoch": 10.63710655770487, "grad_norm": 0.10889225453138351, "learning_rate": 1.2494409355173381e-05, "loss": 0.2875, "step": 797900 }, { "epoch": 10.638439695511325, "grad_norm": 4.47400426864624, "learning_rate": 1.2487360939499593e-05, "loss": 0.2869, "step": 798000 }, { "epoch": 10.63977283331778, "grad_norm": 2.1721513271331787, "learning_rate": 1.2480313989928742e-05, "loss": 0.3021, "step": 798100 }, { "epoch": 10.641105971124235, "grad_norm": 4.8158040046691895, "learning_rate": 1.2473268507050789e-05, "loss": 0.3185, "step": 798200 }, { "epoch": 10.64243910893069, "grad_norm": 2.9932963848114014, "learning_rate": 1.2466224491455538e-05, "loss": 0.3101, "step": 798300 }, { "epoch": 10.643772246737145, "grad_norm": 2.17999267578125, "learning_rate": 1.245918194373269e-05, "loss": 0.3165, "step": 798400 }, { "epoch": 10.6451053845436, "grad_norm": 3.767953634262085, "learning_rate": 1.2452140864471858e-05, "loss": 0.306, "step": 798500 }, { "epoch": 10.646438522350055, "grad_norm": 3.4392614364624023, "learning_rate": 1.2445101254262463e-05, "loss": 0.3358, "step": 798600 }, { "epoch": 10.64777166015651, "grad_norm": 4.864134788513184, "learning_rate": 1.2438063113693867e-05, "loss": 0.3002, "step": 798700 }, { "epoch": 10.649104797962966, "grad_norm": 3.2320749759674072, "learning_rate": 1.2431026443355245e-05, "loss": 0.3784, "step": 798800 }, { "epoch": 10.650437935769421, "grad_norm": 17.992494583129883, 
"learning_rate": 1.242399124383572e-05, "loss": 0.3236, "step": 798900 }, { "epoch": 10.651771073575876, "grad_norm": 2.871941566467285, "learning_rate": 1.241695751572425e-05, "loss": 0.3121, "step": 799000 }, { "epoch": 10.653104211382331, "grad_norm": 2.906003952026367, "learning_rate": 1.2409925259609662e-05, "loss": 0.2691, "step": 799100 }, { "epoch": 10.654437349188786, "grad_norm": 3.0280468463897705, "learning_rate": 1.2402894476080683e-05, "loss": 0.3366, "step": 799200 }, { "epoch": 10.655770486995241, "grad_norm": 3.2120656967163086, "learning_rate": 1.2395865165725908e-05, "loss": 0.2931, "step": 799300 }, { "epoch": 10.657103624801696, "grad_norm": 3.2596275806427, "learning_rate": 1.2388837329133807e-05, "loss": 0.3241, "step": 799400 }, { "epoch": 10.65843676260815, "grad_norm": 3.5081937313079834, "learning_rate": 1.2381810966892732e-05, "loss": 0.3034, "step": 799500 }, { "epoch": 10.659769900414606, "grad_norm": 3.3191096782684326, "learning_rate": 1.2374786079590913e-05, "loss": 0.3321, "step": 799600 }, { "epoch": 10.66110303822106, "grad_norm": 4.348423957824707, "learning_rate": 1.2367762667816425e-05, "loss": 0.2964, "step": 799700 }, { "epoch": 10.662436176027516, "grad_norm": 1.081206202507019, "learning_rate": 1.2360740732157273e-05, "loss": 0.3527, "step": 799800 }, { "epoch": 10.66376931383397, "grad_norm": 2.7065517902374268, "learning_rate": 1.2353720273201314e-05, "loss": 0.3155, "step": 799900 }, { "epoch": 10.665102451640426, "grad_norm": 4.001421928405762, "learning_rate": 1.234670129153626e-05, "loss": 0.3417, "step": 800000 }, { "epoch": 10.66643558944688, "grad_norm": 0.6280737519264221, "learning_rate": 1.2339683787749723e-05, "loss": 0.3128, "step": 800100 }, { "epoch": 10.667768727253335, "grad_norm": 2.231419086456299, "learning_rate": 1.233266776242919e-05, "loss": 0.2866, "step": 800200 }, { "epoch": 10.66910186505979, "grad_norm": 1.2662864923477173, "learning_rate": 1.232565321616202e-05, "loss": 0.284, "step": 800300 
}, { "epoch": 10.670435002866245, "grad_norm": 5.442233085632324, "learning_rate": 1.231864014953546e-05, "loss": 0.349, "step": 800400 }, { "epoch": 10.671768140672702, "grad_norm": 2.3947205543518066, "learning_rate": 1.23116285631366e-05, "loss": 0.2868, "step": 800500 }, { "epoch": 10.673101278479157, "grad_norm": 5.178618907928467, "learning_rate": 1.2304618457552434e-05, "loss": 0.3041, "step": 800600 }, { "epoch": 10.674434416285612, "grad_norm": 3.068695068359375, "learning_rate": 1.2297609833369833e-05, "loss": 0.2831, "step": 800700 }, { "epoch": 10.675767554092067, "grad_norm": 6.053105354309082, "learning_rate": 1.2290602691175535e-05, "loss": 0.3295, "step": 800800 }, { "epoch": 10.677100691898522, "grad_norm": 4.802867889404297, "learning_rate": 1.2283597031556154e-05, "loss": 0.3188, "step": 800900 }, { "epoch": 10.678433829704977, "grad_norm": 2.6441433429718018, "learning_rate": 1.2276592855098193e-05, "loss": 0.2831, "step": 801000 }, { "epoch": 10.679766967511432, "grad_norm": 3.9006428718566895, "learning_rate": 1.2269590162387988e-05, "loss": 0.2823, "step": 801100 }, { "epoch": 10.681100105317887, "grad_norm": 0.7509164810180664, "learning_rate": 1.2262588954011815e-05, "loss": 0.3135, "step": 801200 }, { "epoch": 10.682433243124342, "grad_norm": 4.614596843719482, "learning_rate": 1.225558923055579e-05, "loss": 0.313, "step": 801300 }, { "epoch": 10.683766380930797, "grad_norm": 3.1536641120910645, "learning_rate": 1.224859099260589e-05, "loss": 0.311, "step": 801400 }, { "epoch": 10.685099518737252, "grad_norm": 2.890357494354248, "learning_rate": 1.2241664200908492e-05, "loss": 0.3323, "step": 801500 }, { "epoch": 10.686432656543706, "grad_norm": 2.2294139862060547, "learning_rate": 1.2234668920858683e-05, "loss": 0.3228, "step": 801600 }, { "epoch": 10.687765794350161, "grad_norm": 1.0095816850662231, "learning_rate": 1.2227675128066377e-05, "loss": 0.3222, "step": 801700 }, { "epoch": 10.689098932156616, "grad_norm": 1.7796956300735474, 
"learning_rate": 1.222068282311709e-05, "loss": 0.307, "step": 801800 }, { "epoch": 10.690432069963071, "grad_norm": 7.925149917602539, "learning_rate": 1.221369200659619e-05, "loss": 0.3246, "step": 801900 }, { "epoch": 10.691765207769528, "grad_norm": 0.03329342603683472, "learning_rate": 1.2206702679088926e-05, "loss": 0.3037, "step": 802000 }, { "epoch": 10.693098345575983, "grad_norm": 1.3628355264663696, "learning_rate": 1.2199714841180425e-05, "loss": 0.2996, "step": 802100 }, { "epoch": 10.694431483382438, "grad_norm": 1.932572841644287, "learning_rate": 1.2192728493455699e-05, "loss": 0.3506, "step": 802200 }, { "epoch": 10.695764621188893, "grad_norm": 3.174393653869629, "learning_rate": 1.218574363649959e-05, "loss": 0.2672, "step": 802300 }, { "epoch": 10.697097758995348, "grad_norm": 2.993605375289917, "learning_rate": 1.2178760270896876e-05, "loss": 0.318, "step": 802400 }, { "epoch": 10.698430896801803, "grad_norm": 4.2884907722473145, "learning_rate": 1.2171778397232187e-05, "loss": 0.3085, "step": 802500 }, { "epoch": 10.699764034608258, "grad_norm": 2.7866039276123047, "learning_rate": 1.216479801609e-05, "loss": 0.3326, "step": 802600 }, { "epoch": 10.701097172414713, "grad_norm": 3.2083139419555664, "learning_rate": 1.21578191280547e-05, "loss": 0.3604, "step": 802700 }, { "epoch": 10.702430310221168, "grad_norm": 2.607814311981201, "learning_rate": 1.215084173371054e-05, "loss": 0.3319, "step": 802800 }, { "epoch": 10.703763448027622, "grad_norm": 4.172729015350342, "learning_rate": 1.2143865833641646e-05, "loss": 0.3392, "step": 802900 }, { "epoch": 10.705096585834077, "grad_norm": 5.892014980316162, "learning_rate": 1.2136891428432017e-05, "loss": 0.2805, "step": 803000 }, { "epoch": 10.706429723640532, "grad_norm": 9.325193405151367, "learning_rate": 1.2129918518665538e-05, "loss": 0.3564, "step": 803100 }, { "epoch": 10.707762861446987, "grad_norm": 5.400175094604492, "learning_rate": 1.2122947104925926e-05, "loss": 0.3128, "step": 803200 
}, { "epoch": 10.709095999253442, "grad_norm": 2.1238462924957275, "learning_rate": 1.2115977187796855e-05, "loss": 0.3133, "step": 803300 }, { "epoch": 10.710429137059897, "grad_norm": 2.2164177894592285, "learning_rate": 1.2109008767861786e-05, "loss": 0.283, "step": 803400 }, { "epoch": 10.711762274866352, "grad_norm": 1.8670258522033691, "learning_rate": 1.2102041845704104e-05, "loss": 0.2575, "step": 803500 }, { "epoch": 10.713095412672807, "grad_norm": 3.163496732711792, "learning_rate": 1.209507642190706e-05, "loss": 0.2779, "step": 803600 }, { "epoch": 10.714428550479264, "grad_norm": 4.7885637283325195, "learning_rate": 1.2088112497053777e-05, "loss": 0.2997, "step": 803700 }, { "epoch": 10.715761688285719, "grad_norm": 0.5116745233535767, "learning_rate": 1.2081150071727253e-05, "loss": 0.2703, "step": 803800 }, { "epoch": 10.717094826092174, "grad_norm": 2.3367347717285156, "learning_rate": 1.2074189146510371e-05, "loss": 0.2842, "step": 803900 }, { "epoch": 10.718427963898629, "grad_norm": 1.262311577796936, "learning_rate": 1.2067229721985854e-05, "loss": 0.3302, "step": 804000 }, { "epoch": 10.719761101705084, "grad_norm": 2.9287407398223877, "learning_rate": 1.206027179873634e-05, "loss": 0.2901, "step": 804100 }, { "epoch": 10.721094239511539, "grad_norm": 2.6746819019317627, "learning_rate": 1.2053315377344314e-05, "loss": 0.3056, "step": 804200 }, { "epoch": 10.722427377317993, "grad_norm": 2.6940295696258545, "learning_rate": 1.2046360458392157e-05, "loss": 0.2937, "step": 804300 }, { "epoch": 10.723760515124448, "grad_norm": 0.5741795897483826, "learning_rate": 1.2039407042462107e-05, "loss": 0.3028, "step": 804400 }, { "epoch": 10.725093652930903, "grad_norm": 3.203017473220825, "learning_rate": 1.203245513013629e-05, "loss": 0.33, "step": 804500 }, { "epoch": 10.726426790737358, "grad_norm": 9.394989013671875, "learning_rate": 1.2025504721996672e-05, "loss": 0.322, "step": 804600 }, { "epoch": 10.727759928543813, "grad_norm": 
9.242717742919922, "learning_rate": 1.201855581862516e-05, "loss": 0.266, "step": 804700 }, { "epoch": 10.729093066350268, "grad_norm": 7.136580467224121, "learning_rate": 1.2011608420603462e-05, "loss": 0.3147, "step": 804800 }, { "epoch": 10.730426204156723, "grad_norm": 4.3152689933776855, "learning_rate": 1.2004662528513202e-05, "loss": 0.3187, "step": 804900 }, { "epoch": 10.731759341963178, "grad_norm": 6.068830490112305, "learning_rate": 1.1997718142935866e-05, "loss": 0.3117, "step": 805000 }, { "epoch": 10.733092479769633, "grad_norm": 5.950344562530518, "learning_rate": 1.1990775264452823e-05, "loss": 0.342, "step": 805100 }, { "epoch": 10.73442561757609, "grad_norm": 2.451275587081909, "learning_rate": 1.1983833893645301e-05, "loss": 0.3451, "step": 805200 }, { "epoch": 10.735758755382545, "grad_norm": 4.878512859344482, "learning_rate": 1.1976963422252156e-05, "loss": 0.3113, "step": 805300 }, { "epoch": 10.737091893189, "grad_norm": 4.037848949432373, "learning_rate": 1.1970025053447637e-05, "loss": 0.3084, "step": 805400 }, { "epoch": 10.738425030995455, "grad_norm": 2.249532461166382, "learning_rate": 1.1963088194055786e-05, "loss": 0.3167, "step": 805500 }, { "epoch": 10.73975816880191, "grad_norm": 5.766552448272705, "learning_rate": 1.1956152844657333e-05, "loss": 0.324, "step": 805600 }, { "epoch": 10.741091306608364, "grad_norm": 3.4614598751068115, "learning_rate": 1.1949219005832895e-05, "loss": 0.2602, "step": 805700 }, { "epoch": 10.74242444441482, "grad_norm": 2.6854403018951416, "learning_rate": 1.1942286678162922e-05, "loss": 0.2924, "step": 805800 }, { "epoch": 10.743757582221274, "grad_norm": 0.41766905784606934, "learning_rate": 1.1935355862227793e-05, "loss": 0.2734, "step": 805900 }, { "epoch": 10.74509072002773, "grad_norm": 0.9236089587211609, "learning_rate": 1.1928426558607741e-05, "loss": 0.3783, "step": 806000 }, { "epoch": 10.746423857834184, "grad_norm": 1.5133116245269775, "learning_rate": 1.1921498767882838e-05, "loss": 
0.3646, "step": 806100 }, { "epoch": 10.74775699564064, "grad_norm": 2.405696153640747, "learning_rate": 1.191457249063307e-05, "loss": 0.2926, "step": 806200 }, { "epoch": 10.749090133447094, "grad_norm": 2.6231203079223633, "learning_rate": 1.1907647727438283e-05, "loss": 0.3165, "step": 806300 }, { "epoch": 10.750423271253549, "grad_norm": 2.9506239891052246, "learning_rate": 1.1900724478878198e-05, "loss": 0.3308, "step": 806400 }, { "epoch": 10.751756409060004, "grad_norm": 0.8855184316635132, "learning_rate": 1.1893802745532413e-05, "loss": 0.2885, "step": 806500 }, { "epoch": 10.753089546866459, "grad_norm": 48.01579666137695, "learning_rate": 1.1886882527980378e-05, "loss": 0.2836, "step": 806600 }, { "epoch": 10.754422684672914, "grad_norm": 1.3574862480163574, "learning_rate": 1.1879963826801428e-05, "loss": 0.2851, "step": 806700 }, { "epoch": 10.755755822479369, "grad_norm": 1.493225336074829, "learning_rate": 1.187304664257481e-05, "loss": 0.3102, "step": 806800 }, { "epoch": 10.757088960285826, "grad_norm": 1.7320996522903442, "learning_rate": 1.1866130975879574e-05, "loss": 0.3118, "step": 806900 }, { "epoch": 10.75842209809228, "grad_norm": 2.526287317276001, "learning_rate": 1.1859216827294693e-05, "loss": 0.3339, "step": 807000 }, { "epoch": 10.759755235898735, "grad_norm": 3.4694554805755615, "learning_rate": 1.1852304197398991e-05, "loss": 0.311, "step": 807100 }, { "epoch": 10.76108837370519, "grad_norm": 1.2367736101150513, "learning_rate": 1.1845393086771178e-05, "loss": 0.3256, "step": 807200 }, { "epoch": 10.762421511511645, "grad_norm": 0.8102221488952637, "learning_rate": 1.1838483495989824e-05, "loss": 0.3009, "step": 807300 }, { "epoch": 10.7637546493181, "grad_norm": 6.260825157165527, "learning_rate": 1.1831575425633396e-05, "loss": 0.2974, "step": 807400 }, { "epoch": 10.765087787124555, "grad_norm": 1.9674744606018066, "learning_rate": 1.1824668876280186e-05, "loss": 0.2687, "step": 807500 }, { "epoch": 10.76642092493101, 
"grad_norm": 1.9610764980316162, "learning_rate": 1.181776384850841e-05, "loss": 0.2847, "step": 807600 }, { "epoch": 10.767754062737465, "grad_norm": 2.523409605026245, "learning_rate": 1.1810860342896126e-05, "loss": 0.3414, "step": 807700 }, { "epoch": 10.76908720054392, "grad_norm": 4.054784297943115, "learning_rate": 1.1803958360021277e-05, "loss": 0.3061, "step": 807800 }, { "epoch": 10.770420338350375, "grad_norm": 1.2519599199295044, "learning_rate": 1.1797057900461685e-05, "loss": 0.3226, "step": 807900 }, { "epoch": 10.77175347615683, "grad_norm": 3.7967827320098877, "learning_rate": 1.1790158964795018e-05, "loss": 0.2948, "step": 808000 }, { "epoch": 10.773086613963285, "grad_norm": 12.608869552612305, "learning_rate": 1.1783261553598828e-05, "loss": 0.2682, "step": 808100 }, { "epoch": 10.77441975176974, "grad_norm": 2.8293416500091553, "learning_rate": 1.1776365667450575e-05, "loss": 0.2414, "step": 808200 }, { "epoch": 10.775752889576195, "grad_norm": 2.893927812576294, "learning_rate": 1.1769471306927535e-05, "loss": 0.3018, "step": 808300 }, { "epoch": 10.777086027382651, "grad_norm": 0.14119866490364075, "learning_rate": 1.1762578472606886e-05, "loss": 0.2755, "step": 808400 }, { "epoch": 10.778419165189106, "grad_norm": 2.2694971561431885, "learning_rate": 1.175568716506568e-05, "loss": 0.3173, "step": 808500 }, { "epoch": 10.779752302995561, "grad_norm": 2.2998428344726562, "learning_rate": 1.1748797384880836e-05, "loss": 0.2258, "step": 808600 }, { "epoch": 10.781085440802016, "grad_norm": 1.9009779691696167, "learning_rate": 1.1741909132629139e-05, "loss": 0.3305, "step": 808700 }, { "epoch": 10.782418578608471, "grad_norm": 4.644403457641602, "learning_rate": 1.1735022408887266e-05, "loss": 0.2938, "step": 808800 }, { "epoch": 10.783751716414926, "grad_norm": 3.819977045059204, "learning_rate": 1.172813721423173e-05, "loss": 0.3333, "step": 808900 }, { "epoch": 10.785084854221381, "grad_norm": 2.5613977909088135, "learning_rate": 
1.172125354923894e-05, "loss": 0.2918, "step": 809000 }, { "epoch": 10.786417992027836, "grad_norm": 1.5057164430618286, "learning_rate": 1.1714371414485203e-05, "loss": 0.2878, "step": 809100 }, { "epoch": 10.787751129834291, "grad_norm": 3.203032970428467, "learning_rate": 1.170749081054664e-05, "loss": 0.3311, "step": 809200 }, { "epoch": 10.789084267640746, "grad_norm": 0.9881546497344971, "learning_rate": 1.1700611737999293e-05, "loss": 0.3088, "step": 809300 }, { "epoch": 10.7904174054472, "grad_norm": 2.4495656490325928, "learning_rate": 1.1693734197419025e-05, "loss": 0.3172, "step": 809400 }, { "epoch": 10.791750543253656, "grad_norm": 2.066582441329956, "learning_rate": 1.1686858189381637e-05, "loss": 0.2813, "step": 809500 }, { "epoch": 10.79308368106011, "grad_norm": 1.135407567024231, "learning_rate": 1.167998371446276e-05, "loss": 0.3358, "step": 809600 }, { "epoch": 10.794416818866566, "grad_norm": 2.2621631622314453, "learning_rate": 1.1673248217028459e-05, "loss": 0.3117, "step": 809700 }, { "epoch": 10.79574995667302, "grad_norm": 2.115668296813965, "learning_rate": 1.1666376779381966e-05, "loss": 0.2867, "step": 809800 }, { "epoch": 10.797083094479476, "grad_norm": 2.5860581398010254, "learning_rate": 1.1659506876568607e-05, "loss": 0.284, "step": 809900 }, { "epoch": 10.79841623228593, "grad_norm": 30.68425178527832, "learning_rate": 1.1652638509163519e-05, "loss": 0.3152, "step": 810000 }, { "epoch": 10.799749370092387, "grad_norm": 4.012081623077393, "learning_rate": 1.1645771677741697e-05, "loss": 0.3172, "step": 810100 }, { "epoch": 10.801082507898842, "grad_norm": 6.460646629333496, "learning_rate": 1.1638906382878012e-05, "loss": 0.2659, "step": 810200 }, { "epoch": 10.802415645705297, "grad_norm": 3.300213575363159, "learning_rate": 1.1632042625147206e-05, "loss": 0.2947, "step": 810300 }, { "epoch": 10.803748783511752, "grad_norm": 1.9453802108764648, "learning_rate": 1.1625180405123902e-05, "loss": 0.3588, "step": 810400 }, { "epoch": 
10.805081921318207, "grad_norm": 3.7203729152679443, "learning_rate": 1.1618319723382553e-05, "loss": 0.3257, "step": 810500 }, { "epoch": 10.806415059124662, "grad_norm": 5.844013214111328, "learning_rate": 1.1611460580497552e-05, "loss": 0.2746, "step": 810600 }, { "epoch": 10.807748196931117, "grad_norm": 3.7683379650115967, "learning_rate": 1.1604602977043094e-05, "loss": 0.323, "step": 810700 }, { "epoch": 10.809081334737572, "grad_norm": 1.136256456375122, "learning_rate": 1.1597746913593291e-05, "loss": 0.3442, "step": 810800 }, { "epoch": 10.810414472544027, "grad_norm": 1.0717507600784302, "learning_rate": 1.1590892390722102e-05, "loss": 0.3313, "step": 810900 }, { "epoch": 10.811747610350482, "grad_norm": 4.231035232543945, "learning_rate": 1.1584039409003373e-05, "loss": 0.2568, "step": 811000 }, { "epoch": 10.813080748156937, "grad_norm": 1.6712676286697388, "learning_rate": 1.1577187969010817e-05, "loss": 0.2857, "step": 811100 }, { "epoch": 10.814413885963392, "grad_norm": 3.278346061706543, "learning_rate": 1.1570338071318018e-05, "loss": 0.3437, "step": 811200 }, { "epoch": 10.815747023769847, "grad_norm": 0.34539127349853516, "learning_rate": 1.156348971649841e-05, "loss": 0.268, "step": 811300 }, { "epoch": 10.817080161576301, "grad_norm": 1.405105710029602, "learning_rate": 1.1556642905125313e-05, "loss": 0.3158, "step": 811400 }, { "epoch": 10.818413299382756, "grad_norm": 4.890103340148926, "learning_rate": 1.154979763777196e-05, "loss": 0.3209, "step": 811500 }, { "epoch": 10.819746437189213, "grad_norm": 3.924534797668457, "learning_rate": 1.1542953915011372e-05, "loss": 0.3457, "step": 811600 }, { "epoch": 10.821079574995668, "grad_norm": 3.261784791946411, "learning_rate": 1.1536111737416506e-05, "loss": 0.3299, "step": 811700 }, { "epoch": 10.822412712802123, "grad_norm": 5.7847394943237305, "learning_rate": 1.1529339504225432e-05, "loss": 0.3376, "step": 811800 }, { "epoch": 10.823745850608578, "grad_norm": 6.5452446937561035, 
"learning_rate": 1.152250040321435e-05, "loss": 0.3128, "step": 811900 }, { "epoch": 10.825078988415033, "grad_norm": 4.757443904876709, "learning_rate": 1.1515662849081274e-05, "loss": 0.3171, "step": 812000 }, { "epoch": 10.826412126221488, "grad_norm": 7.363179683685303, "learning_rate": 1.1508826842398637e-05, "loss": 0.305, "step": 812100 }, { "epoch": 10.827745264027943, "grad_norm": 5.883237361907959, "learning_rate": 1.1501992383738731e-05, "loss": 0.3086, "step": 812200 }, { "epoch": 10.829078401834398, "grad_norm": 3.791349172592163, "learning_rate": 1.149515947367371e-05, "loss": 0.2674, "step": 812300 }, { "epoch": 10.830411539640853, "grad_norm": 3.1253037452697754, "learning_rate": 1.1488328112775616e-05, "loss": 0.3125, "step": 812400 }, { "epoch": 10.831744677447308, "grad_norm": 2.7885923385620117, "learning_rate": 1.1481498301616328e-05, "loss": 0.3102, "step": 812500 }, { "epoch": 10.833077815253763, "grad_norm": 4.333609104156494, "learning_rate": 1.1474670040767614e-05, "loss": 0.2845, "step": 812600 }, { "epoch": 10.834410953060218, "grad_norm": 0.6123049259185791, "learning_rate": 1.1467843330801152e-05, "loss": 0.2587, "step": 812700 }, { "epoch": 10.835744090866672, "grad_norm": 5.966846942901611, "learning_rate": 1.1461018172288418e-05, "loss": 0.3301, "step": 812800 }, { "epoch": 10.837077228673127, "grad_norm": 0.81282639503479, "learning_rate": 1.1454194565800802e-05, "loss": 0.3269, "step": 812900 }, { "epoch": 10.838410366479582, "grad_norm": 2.327103853225708, "learning_rate": 1.1447372511909554e-05, "loss": 0.322, "step": 813000 }, { "epoch": 10.839743504286037, "grad_norm": 2.8126754760742188, "learning_rate": 1.1440552011185797e-05, "loss": 0.3075, "step": 813100 }, { "epoch": 10.841076642092492, "grad_norm": 2.6240572929382324, "learning_rate": 1.1433733064200519e-05, "loss": 0.2859, "step": 813200 }, { "epoch": 10.842409779898949, "grad_norm": 0.8047534823417664, "learning_rate": 1.1426915671524598e-05, "loss": 0.326, "step": 
813300 }, { "epoch": 10.843742917705404, "grad_norm": 1.690842866897583, "learning_rate": 1.1420099833728736e-05, "loss": 0.3551, "step": 813400 }, { "epoch": 10.845076055511859, "grad_norm": 2.4312024116516113, "learning_rate": 1.1413285551383548e-05, "loss": 0.3784, "step": 813500 }, { "epoch": 10.846409193318314, "grad_norm": 2.3068320751190186, "learning_rate": 1.1406472825059501e-05, "loss": 0.3252, "step": 813600 }, { "epoch": 10.847742331124769, "grad_norm": 0.8116443753242493, "learning_rate": 1.1399729759317273e-05, "loss": 0.3329, "step": 813700 }, { "epoch": 10.849075468931224, "grad_norm": 1.8104802370071411, "learning_rate": 1.1392920131171953e-05, "loss": 0.3267, "step": 813800 }, { "epoch": 10.850408606737679, "grad_norm": 5.469843864440918, "learning_rate": 1.1386112060752729e-05, "loss": 0.279, "step": 813900 }, { "epoch": 10.851741744544134, "grad_norm": 2.499788999557495, "learning_rate": 1.1379305548629515e-05, "loss": 0.2786, "step": 814000 }, { "epoch": 10.853074882350588, "grad_norm": 2.947612762451172, "learning_rate": 1.1372500595372142e-05, "loss": 0.3411, "step": 814100 }, { "epoch": 10.854408020157043, "grad_norm": 2.0335488319396973, "learning_rate": 1.1365697201550301e-05, "loss": 0.3274, "step": 814200 }, { "epoch": 10.855741157963498, "grad_norm": 2.2201638221740723, "learning_rate": 1.135889536773355e-05, "loss": 0.3632, "step": 814300 }, { "epoch": 10.857074295769953, "grad_norm": 1.9626296758651733, "learning_rate": 1.1352095094491317e-05, "loss": 0.2821, "step": 814400 }, { "epoch": 10.858407433576408, "grad_norm": 4.324323654174805, "learning_rate": 1.1345296382392911e-05, "loss": 0.3392, "step": 814500 }, { "epoch": 10.859740571382863, "grad_norm": 0.16841179132461548, "learning_rate": 1.1338499232007475e-05, "loss": 0.3074, "step": 814600 }, { "epoch": 10.861073709189318, "grad_norm": 5.162160873413086, "learning_rate": 1.1331703643904057e-05, "loss": 0.3017, "step": 814700 }, { "epoch": 10.862406846995775, "grad_norm": 
5.104832649230957, "learning_rate": 1.1324909618651567e-05, "loss": 0.3034, "step": 814800 }, { "epoch": 10.86373998480223, "grad_norm": 2.1141059398651123, "learning_rate": 1.131811715681877e-05, "loss": 0.2779, "step": 814900 }, { "epoch": 10.865073122608685, "grad_norm": 2.9498305320739746, "learning_rate": 1.1311326258974324e-05, "loss": 0.3135, "step": 815000 }, { "epoch": 10.86640626041514, "grad_norm": 1.100379228591919, "learning_rate": 1.1304536925686726e-05, "loss": 0.3409, "step": 815100 }, { "epoch": 10.867739398221595, "grad_norm": 2.1752703189849854, "learning_rate": 1.1297749157524348e-05, "loss": 0.3146, "step": 815200 }, { "epoch": 10.86907253602805, "grad_norm": 4.146106719970703, "learning_rate": 1.1290962955055481e-05, "loss": 0.3284, "step": 815300 }, { "epoch": 10.870405673834505, "grad_norm": 1.0299218893051147, "learning_rate": 1.1284178318848207e-05, "loss": 0.2864, "step": 815400 }, { "epoch": 10.87173881164096, "grad_norm": 0.31805235147476196, "learning_rate": 1.1277395249470527e-05, "loss": 0.281, "step": 815500 }, { "epoch": 10.873071949447414, "grad_norm": 3.3890068531036377, "learning_rate": 1.1270613747490295e-05, "loss": 0.2927, "step": 815600 }, { "epoch": 10.87440508725387, "grad_norm": 2.4707107543945312, "learning_rate": 1.1263833813475244e-05, "loss": 0.2904, "step": 815700 }, { "epoch": 10.875738225060324, "grad_norm": 10.398162841796875, "learning_rate": 1.1257055447992964e-05, "loss": 0.3248, "step": 815800 }, { "epoch": 10.87707136286678, "grad_norm": 3.691999912261963, "learning_rate": 1.125027865161093e-05, "loss": 0.3821, "step": 815900 }, { "epoch": 10.878404500673234, "grad_norm": 3.258484363555908, "learning_rate": 1.1243503424896451e-05, "loss": 0.2804, "step": 816000 }, { "epoch": 10.879737638479689, "grad_norm": 2.611931324005127, "learning_rate": 1.1236729768416727e-05, "loss": 0.334, "step": 816100 }, { "epoch": 10.881070776286144, "grad_norm": 2.0388052463531494, "learning_rate": 1.1229957682738864e-05, "loss": 
0.336, "step": 816200 }, { "epoch": 10.882403914092599, "grad_norm": 1.4714787006378174, "learning_rate": 1.1223187168429765e-05, "loss": 0.3042, "step": 816300 }, { "epoch": 10.883737051899054, "grad_norm": 2.6402127742767334, "learning_rate": 1.1216418226056247e-05, "loss": 0.267, "step": 816400 }, { "epoch": 10.88507018970551, "grad_norm": 7.941009044647217, "learning_rate": 1.1209650856184983e-05, "loss": 0.3078, "step": 816500 }, { "epoch": 10.886403327511966, "grad_norm": 3.531055212020874, "learning_rate": 1.1202885059382518e-05, "loss": 0.2994, "step": 816600 }, { "epoch": 10.88773646531842, "grad_norm": 4.1269450187683105, "learning_rate": 1.119612083621527e-05, "loss": 0.2678, "step": 816700 }, { "epoch": 10.889069603124875, "grad_norm": 2.6370692253112793, "learning_rate": 1.1189358187249505e-05, "loss": 0.3596, "step": 816800 }, { "epoch": 10.89040274093133, "grad_norm": 1.9313796758651733, "learning_rate": 1.1182597113051372e-05, "loss": 0.3353, "step": 816900 }, { "epoch": 10.891735878737785, "grad_norm": 6.331199645996094, "learning_rate": 1.1175837614186892e-05, "loss": 0.3172, "step": 817000 }, { "epoch": 10.89306901654424, "grad_norm": 2.463970899581909, "learning_rate": 1.1169079691221947e-05, "loss": 0.2886, "step": 817100 }, { "epoch": 10.894402154350695, "grad_norm": 2.8432042598724365, "learning_rate": 1.1162323344722294e-05, "loss": 0.324, "step": 817200 }, { "epoch": 10.89573529215715, "grad_norm": 1.7638603448867798, "learning_rate": 1.1155568575253556e-05, "loss": 0.2628, "step": 817300 }, { "epoch": 10.897068429963605, "grad_norm": 2.023444175720215, "learning_rate": 1.1148815383381205e-05, "loss": 0.3173, "step": 817400 }, { "epoch": 10.89840156777006, "grad_norm": 2.4429216384887695, "learning_rate": 1.1142063769670596e-05, "loss": 0.3126, "step": 817500 }, { "epoch": 10.899734705576515, "grad_norm": 3.8395512104034424, "learning_rate": 1.1135313734686982e-05, "loss": 0.2775, "step": 817600 }, { "epoch": 10.90106784338297, "grad_norm": 
7.7808332443237305, "learning_rate": 1.1128565278995429e-05, "loss": 0.3078, "step": 817700 }, { "epoch": 10.902400981189425, "grad_norm": 2.6491875648498535, "learning_rate": 1.1121818403160898e-05, "loss": 0.3297, "step": 817800 }, { "epoch": 10.90373411899588, "grad_norm": 3.992133378982544, "learning_rate": 1.1115073107748224e-05, "loss": 0.3074, "step": 817900 }, { "epoch": 10.905067256802337, "grad_norm": 25.896528244018555, "learning_rate": 1.1108329393322098e-05, "loss": 0.297, "step": 818000 }, { "epoch": 10.906400394608792, "grad_norm": 3.5257115364074707, "learning_rate": 1.1101587260447085e-05, "loss": 0.3402, "step": 818100 }, { "epoch": 10.907733532415246, "grad_norm": 3.7502036094665527, "learning_rate": 1.1094846709687624e-05, "loss": 0.2892, "step": 818200 }, { "epoch": 10.909066670221701, "grad_norm": 3.6909077167510986, "learning_rate": 1.108810774160799e-05, "loss": 0.3072, "step": 818300 }, { "epoch": 10.910399808028156, "grad_norm": 2.444272518157959, "learning_rate": 1.1081370356772363e-05, "loss": 0.3414, "step": 818400 }, { "epoch": 10.911732945834611, "grad_norm": 1.8967634439468384, "learning_rate": 1.107463455574477e-05, "loss": 0.3563, "step": 818500 }, { "epoch": 10.913066083641066, "grad_norm": 1.8474873304367065, "learning_rate": 1.1067900339089116e-05, "loss": 0.3013, "step": 818600 }, { "epoch": 10.914399221447521, "grad_norm": 2.3816277980804443, "learning_rate": 1.1061167707369176e-05, "loss": 0.2847, "step": 818700 }, { "epoch": 10.915732359253976, "grad_norm": 2.0434892177581787, "learning_rate": 1.1054436661148555e-05, "loss": 0.296, "step": 818800 }, { "epoch": 10.917065497060431, "grad_norm": 7.5270867347717285, "learning_rate": 1.1047707200990785e-05, "loss": 0.3212, "step": 818900 }, { "epoch": 10.918398634866886, "grad_norm": 8.130825996398926, "learning_rate": 1.1040979327459235e-05, "loss": 0.3012, "step": 819000 }, { "epoch": 10.919731772673341, "grad_norm": 0.7256031632423401, "learning_rate": 1.1034253041117124e-05, 
"loss": 0.307, "step": 819100 }, { "epoch": 10.921064910479796, "grad_norm": 1.9347949028015137, "learning_rate": 1.102752834252756e-05, "loss": 0.2755, "step": 819200 }, { "epoch": 10.92239804828625, "grad_norm": 12.838713645935059, "learning_rate": 1.1020872455492243e-05, "loss": 0.3583, "step": 819300 }, { "epoch": 10.923731186092706, "grad_norm": 1.4516961574554443, "learning_rate": 1.1014150918205003e-05, "loss": 0.3072, "step": 819400 }, { "epoch": 10.92506432389916, "grad_norm": 2.986367702484131, "learning_rate": 1.1007430970353188e-05, "loss": 0.3086, "step": 819500 }, { "epoch": 10.926397461705616, "grad_norm": 2.069673538208008, "learning_rate": 1.1000712612499381e-05, "loss": 0.3625, "step": 819600 }, { "epoch": 10.927730599512072, "grad_norm": 1.2113767862319946, "learning_rate": 1.0993995845206022e-05, "loss": 0.311, "step": 819700 }, { "epoch": 10.929063737318527, "grad_norm": 4.142195701599121, "learning_rate": 1.0987280669035422e-05, "loss": 0.3097, "step": 819800 }, { "epoch": 10.930396875124982, "grad_norm": 5.9206624031066895, "learning_rate": 1.0980567084549759e-05, "loss": 0.3297, "step": 819900 }, { "epoch": 10.931730012931437, "grad_norm": 1.4605904817581177, "learning_rate": 1.0973855092311045e-05, "loss": 0.3022, "step": 820000 }, { "epoch": 10.933063150737892, "grad_norm": 3.253748893737793, "learning_rate": 1.0967144692881219e-05, "loss": 0.3133, "step": 820100 }, { "epoch": 10.934396288544347, "grad_norm": 4.235321521759033, "learning_rate": 1.0960435886822052e-05, "loss": 0.2837, "step": 820200 }, { "epoch": 10.935729426350802, "grad_norm": 3.09824800491333, "learning_rate": 1.0953728674695163e-05, "loss": 0.3029, "step": 820300 }, { "epoch": 10.937062564157257, "grad_norm": 5.2689337730407715, "learning_rate": 1.0947023057062074e-05, "loss": 0.3033, "step": 820400 }, { "epoch": 10.938395701963712, "grad_norm": 2.2329533100128174, "learning_rate": 1.0940319034484149e-05, "loss": 0.3122, "step": 820500 }, { "epoch": 10.939728839770167, 
"grad_norm": 1.3156402111053467, "learning_rate": 1.0933616607522637e-05, "loss": 0.3257, "step": 820600 }, { "epoch": 10.941061977576622, "grad_norm": 6.237192630767822, "learning_rate": 1.0926915776738642e-05, "loss": 0.3317, "step": 820700 }, { "epoch": 10.942395115383077, "grad_norm": 3.5965359210968018, "learning_rate": 1.092021654269314e-05, "loss": 0.3442, "step": 820800 }, { "epoch": 10.943728253189532, "grad_norm": 1.5345112085342407, "learning_rate": 1.0913518905946946e-05, "loss": 0.32, "step": 820900 }, { "epoch": 10.945061390995987, "grad_norm": 4.0490522384643555, "learning_rate": 1.0906822867060791e-05, "loss": 0.3228, "step": 821000 }, { "epoch": 10.946394528802442, "grad_norm": 5.010780334472656, "learning_rate": 1.0900128426595255e-05, "loss": 0.2678, "step": 821100 }, { "epoch": 10.947727666608898, "grad_norm": 3.9136221408843994, "learning_rate": 1.0893435585110743e-05, "loss": 0.3257, "step": 821200 }, { "epoch": 10.949060804415353, "grad_norm": 3.3711130619049072, "learning_rate": 1.088674434316758e-05, "loss": 0.3524, "step": 821300 }, { "epoch": 10.950393942221808, "grad_norm": 1.256905436515808, "learning_rate": 1.0880121589822008e-05, "loss": 0.2859, "step": 821400 }, { "epoch": 10.951727080028263, "grad_norm": 2.0531280040740967, "learning_rate": 1.0873433532632518e-05, "loss": 0.3671, "step": 821500 }, { "epoch": 10.953060217834718, "grad_norm": 2.3459877967834473, "learning_rate": 1.0866747076658878e-05, "loss": 0.2542, "step": 821600 }, { "epoch": 10.954393355641173, "grad_norm": 8.010841369628906, "learning_rate": 1.0860062222460863e-05, "loss": 0.3308, "step": 821700 }, { "epoch": 10.955726493447628, "grad_norm": 0.6159235239028931, "learning_rate": 1.0853378970598103e-05, "loss": 0.3227, "step": 821800 }, { "epoch": 10.957059631254083, "grad_norm": 3.3533554077148438, "learning_rate": 1.08466973216301e-05, "loss": 0.3024, "step": 821900 }, { "epoch": 10.958392769060538, "grad_norm": 4.431017875671387, "learning_rate": 
1.084001727611623e-05, "loss": 0.3442, "step": 822000 }, { "epoch": 10.959725906866993, "grad_norm": 4.422854423522949, "learning_rate": 1.0833338834615708e-05, "loss": 0.3336, "step": 822100 }, { "epoch": 10.961059044673448, "grad_norm": 1.4536125659942627, "learning_rate": 1.082666199768763e-05, "loss": 0.2744, "step": 822200 }, { "epoch": 10.962392182479903, "grad_norm": 5.960454940795898, "learning_rate": 1.0819986765890989e-05, "loss": 0.3253, "step": 822300 }, { "epoch": 10.963725320286358, "grad_norm": 3.2162554264068604, "learning_rate": 1.0813313139784588e-05, "loss": 0.3146, "step": 822400 }, { "epoch": 10.965058458092813, "grad_norm": 0.9323593974113464, "learning_rate": 1.0806641119927126e-05, "loss": 0.2822, "step": 822500 }, { "epoch": 10.966391595899267, "grad_norm": 14.625551223754883, "learning_rate": 1.0799970706877166e-05, "loss": 0.3008, "step": 822600 }, { "epoch": 10.967724733705722, "grad_norm": 3.2562851905822754, "learning_rate": 1.0793301901193138e-05, "loss": 0.3342, "step": 822700 }, { "epoch": 10.969057871512177, "grad_norm": 16.73187255859375, "learning_rate": 1.0786634703433338e-05, "loss": 0.2639, "step": 822800 }, { "epoch": 10.970391009318634, "grad_norm": 2.064263343811035, "learning_rate": 1.0779969114155906e-05, "loss": 0.3345, "step": 822900 }, { "epoch": 10.971724147125089, "grad_norm": 3.054314613342285, "learning_rate": 1.0773305133918871e-05, "loss": 0.2849, "step": 823000 }, { "epoch": 10.973057284931544, "grad_norm": 2.424225091934204, "learning_rate": 1.0766642763280125e-05, "loss": 0.3104, "step": 823100 }, { "epoch": 10.974390422737999, "grad_norm": 0.11269629001617432, "learning_rate": 1.0759982002797415e-05, "loss": 0.3003, "step": 823200 }, { "epoch": 10.975723560544454, "grad_norm": 4.485659122467041, "learning_rate": 1.0753322853028366e-05, "loss": 0.2856, "step": 823300 }, { "epoch": 10.977056698350909, "grad_norm": 8.679529190063477, "learning_rate": 1.0746665314530465e-05, "loss": 0.323, "step": 823400 }, { 
"epoch": 10.978389836157364, "grad_norm": 3.5117721557617188, "learning_rate": 1.0740009387861032e-05, "loss": 0.3557, "step": 823500 }, { "epoch": 10.979722973963819, "grad_norm": 4.135761260986328, "learning_rate": 1.0733355073577312e-05, "loss": 0.284, "step": 823600 }, { "epoch": 10.981056111770274, "grad_norm": 2.4707038402557373, "learning_rate": 1.072670237223638e-05, "loss": 0.3019, "step": 823700 }, { "epoch": 10.982389249576729, "grad_norm": 1.8362681865692139, "learning_rate": 1.0720117787284914e-05, "loss": 0.31, "step": 823800 }, { "epoch": 10.983722387383184, "grad_norm": 2.5388002395629883, "learning_rate": 1.0713468297356905e-05, "loss": 0.2988, "step": 823900 }, { "epoch": 10.985055525189638, "grad_norm": 3.7479779720306396, "learning_rate": 1.0706820422036535e-05, "loss": 0.3002, "step": 824000 }, { "epoch": 10.986388662996093, "grad_norm": 0.7417492866516113, "learning_rate": 1.0700174161880348e-05, "loss": 0.3128, "step": 824100 }, { "epoch": 10.987721800802548, "grad_norm": 5.421111106872559, "learning_rate": 1.0693529517444736e-05, "loss": 0.2938, "step": 824200 }, { "epoch": 10.989054938609003, "grad_norm": 0.052307359874248505, "learning_rate": 1.068688648928597e-05, "loss": 0.3098, "step": 824300 }, { "epoch": 10.99038807641546, "grad_norm": 6.8419294357299805, "learning_rate": 1.0680245077960189e-05, "loss": 0.2876, "step": 824400 }, { "epoch": 10.991721214221915, "grad_norm": 2.6702523231506348, "learning_rate": 1.067360528402339e-05, "loss": 0.3052, "step": 824500 }, { "epoch": 10.99305435202837, "grad_norm": 9.835583686828613, "learning_rate": 1.0666967108031442e-05, "loss": 0.291, "step": 824600 }, { "epoch": 10.994387489834825, "grad_norm": 7.527591705322266, "learning_rate": 1.0660330550540046e-05, "loss": 0.3196, "step": 824700 }, { "epoch": 10.99572062764128, "grad_norm": 6.659862041473389, "learning_rate": 1.065369561210482e-05, "loss": 0.297, "step": 824800 }, { "epoch": 10.997053765447735, "grad_norm": 1.632941722869873, 
"learning_rate": 1.0647062293281227e-05, "loss": 0.3146, "step": 824900 }, { "epoch": 10.99838690325419, "grad_norm": 2.04634690284729, "learning_rate": 1.0640496903589473e-05, "loss": 0.2907, "step": 825000 }, { "epoch": 10.999720041060645, "grad_norm": 2.4033854007720947, "learning_rate": 1.0633866809444963e-05, "loss": 0.3255, "step": 825100 }, { "epoch": 11.0010531788671, "grad_norm": 1.069488763809204, "learning_rate": 1.0627238336572073e-05, "loss": 0.3147, "step": 825200 }, { "epoch": 11.002386316673554, "grad_norm": 1.3256263732910156, "learning_rate": 1.0620611485525734e-05, "loss": 0.2624, "step": 825300 }, { "epoch": 11.00371945448001, "grad_norm": 1.298533320426941, "learning_rate": 1.0613986256860704e-05, "loss": 0.2987, "step": 825400 }, { "epoch": 11.005052592286464, "grad_norm": 3.339860200881958, "learning_rate": 1.0607362651131632e-05, "loss": 0.3072, "step": 825500 }, { "epoch": 11.00638573009292, "grad_norm": 4.347044467926025, "learning_rate": 1.0600740668893026e-05, "loss": 0.293, "step": 825600 }, { "epoch": 11.007718867899374, "grad_norm": 2.723743200302124, "learning_rate": 1.0594120310699263e-05, "loss": 0.2806, "step": 825700 }, { "epoch": 11.00905200570583, "grad_norm": 1.6797460317611694, "learning_rate": 1.0587501577104583e-05, "loss": 0.255, "step": 825800 }, { "epoch": 11.010385143512284, "grad_norm": 1.3919941186904907, "learning_rate": 1.058088446866306e-05, "loss": 0.2533, "step": 825900 }, { "epoch": 11.011718281318739, "grad_norm": 3.390540361404419, "learning_rate": 1.0574268985928682e-05, "loss": 0.2967, "step": 826000 }, { "epoch": 11.013051419125196, "grad_norm": 1.1771074533462524, "learning_rate": 1.056765512945528e-05, "loss": 0.2646, "step": 826100 }, { "epoch": 11.01438455693165, "grad_norm": 3.0193514823913574, "learning_rate": 1.0561042899796524e-05, "loss": 0.321, "step": 826200 }, { "epoch": 11.015717694738106, "grad_norm": 0.8445558547973633, "learning_rate": 1.0554432297505982e-05, "loss": 0.2535, "step": 826300 
}, { "epoch": 11.01705083254456, "grad_norm": 3.7141969203948975, "learning_rate": 1.0547823323137068e-05, "loss": 0.2639, "step": 826400 }, { "epoch": 11.018383970351016, "grad_norm": 3.659806251525879, "learning_rate": 1.0541215977243068e-05, "loss": 0.2751, "step": 826500 }, { "epoch": 11.01971710815747, "grad_norm": 2.536428451538086, "learning_rate": 1.0534610260377128e-05, "loss": 0.276, "step": 826600 }, { "epoch": 11.021050245963925, "grad_norm": 0.8949189186096191, "learning_rate": 1.0528006173092263e-05, "loss": 0.2975, "step": 826700 }, { "epoch": 11.02238338377038, "grad_norm": 3.781651496887207, "learning_rate": 1.0521403715941324e-05, "loss": 0.2967, "step": 826800 }, { "epoch": 11.023716521576835, "grad_norm": 2.8643112182617188, "learning_rate": 1.0514802889477074e-05, "loss": 0.2814, "step": 826900 }, { "epoch": 11.02504965938329, "grad_norm": 1.4366494417190552, "learning_rate": 1.050820369425211e-05, "loss": 0.2965, "step": 827000 }, { "epoch": 11.026382797189745, "grad_norm": 2.6666738986968994, "learning_rate": 1.0501606130818883e-05, "loss": 0.2828, "step": 827100 }, { "epoch": 11.0277159349962, "grad_norm": 0.9301537871360779, "learning_rate": 1.0495010199729725e-05, "loss": 0.3283, "step": 827200 }, { "epoch": 11.029049072802655, "grad_norm": 2.073171615600586, "learning_rate": 1.0488415901536828e-05, "loss": 0.308, "step": 827300 }, { "epoch": 11.03038221060911, "grad_norm": 1.5784838199615479, "learning_rate": 1.0481823236792244e-05, "loss": 0.2782, "step": 827400 }, { "epoch": 11.031715348415565, "grad_norm": 0.5578343272209167, "learning_rate": 1.0475232206047905e-05, "loss": 0.2691, "step": 827500 }, { "epoch": 11.03304848622202, "grad_norm": 2.9138052463531494, "learning_rate": 1.0468642809855568e-05, "loss": 0.252, "step": 827600 }, { "epoch": 11.034381624028477, "grad_norm": 0.050386734306812286, "learning_rate": 1.0462055048766885e-05, "loss": 0.257, "step": 827700 }, { "epoch": 11.035714761834932, "grad_norm": 1.6866273880004883, 
"learning_rate": 1.0455468923333368e-05, "loss": 0.3266, "step": 827800 }, { "epoch": 11.037047899641387, "grad_norm": 9.786652565002441, "learning_rate": 1.0448884434106376e-05, "loss": 0.3359, "step": 827900 }, { "epoch": 11.038381037447841, "grad_norm": 0.17019130289554596, "learning_rate": 1.0442301581637156e-05, "loss": 0.2561, "step": 828000 }, { "epoch": 11.039714175254296, "grad_norm": 1.7570124864578247, "learning_rate": 1.0435720366476803e-05, "loss": 0.2949, "step": 828100 }, { "epoch": 11.041047313060751, "grad_norm": 3.8464221954345703, "learning_rate": 1.0429140789176251e-05, "loss": 0.2656, "step": 828200 }, { "epoch": 11.042380450867206, "grad_norm": 2.9955270290374756, "learning_rate": 1.0422562850286347e-05, "loss": 0.2715, "step": 828300 }, { "epoch": 11.043713588673661, "grad_norm": 2.867802381515503, "learning_rate": 1.0415986550357779e-05, "loss": 0.2773, "step": 828400 }, { "epoch": 11.045046726480116, "grad_norm": 6.097771644592285, "learning_rate": 1.0409411889941074e-05, "loss": 0.2735, "step": 828500 }, { "epoch": 11.046379864286571, "grad_norm": 2.6856791973114014, "learning_rate": 1.0402838869586648e-05, "loss": 0.2702, "step": 828600 }, { "epoch": 11.047713002093026, "grad_norm": 4.9410834312438965, "learning_rate": 1.0396267489844778e-05, "loss": 0.2335, "step": 828700 }, { "epoch": 11.049046139899481, "grad_norm": 2.014739990234375, "learning_rate": 1.0389697751265598e-05, "loss": 0.2772, "step": 828800 }, { "epoch": 11.050379277705936, "grad_norm": 3.1430585384368896, "learning_rate": 1.0383129654399113e-05, "loss": 0.2796, "step": 828900 }, { "epoch": 11.051712415512391, "grad_norm": 2.0894975662231445, "learning_rate": 1.0376628856210198e-05, "loss": 0.2697, "step": 829000 }, { "epoch": 11.053045553318846, "grad_norm": 1.9576820135116577, "learning_rate": 1.0370064027987679e-05, "loss": 0.2969, "step": 829100 }, { "epoch": 11.0543786911253, "grad_norm": 2.538120985031128, "learning_rate": 1.036350084312152e-05, "loss": 0.3238, 
"step": 829200 }, { "epoch": 11.055711828931758, "grad_norm": 0.7407649159431458, "learning_rate": 1.0356939302161183e-05, "loss": 0.302, "step": 829300 }, { "epoch": 11.057044966738212, "grad_norm": 1.146635890007019, "learning_rate": 1.0350379405655945e-05, "loss": 0.2533, "step": 829400 }, { "epoch": 11.058378104544667, "grad_norm": 1.8153681755065918, "learning_rate": 1.0343821154155016e-05, "loss": 0.2797, "step": 829500 }, { "epoch": 11.059711242351122, "grad_norm": 1.3894596099853516, "learning_rate": 1.0337264548207425e-05, "loss": 0.2466, "step": 829600 }, { "epoch": 11.061044380157577, "grad_norm": 3.728987216949463, "learning_rate": 1.0330709588362054e-05, "loss": 0.2796, "step": 829700 }, { "epoch": 11.062377517964032, "grad_norm": 2.2657947540283203, "learning_rate": 1.0324156275167672e-05, "loss": 0.3123, "step": 829800 }, { "epoch": 11.063710655770487, "grad_norm": 1.313319206237793, "learning_rate": 1.0317604609172908e-05, "loss": 0.2752, "step": 829900 }, { "epoch": 11.065043793576942, "grad_norm": 5.39021110534668, "learning_rate": 1.0311054590926237e-05, "loss": 0.2826, "step": 830000 }, { "epoch": 11.066376931383397, "grad_norm": 4.28358793258667, "learning_rate": 1.0304506220976023e-05, "loss": 0.3001, "step": 830100 }, { "epoch": 11.067710069189852, "grad_norm": 4.679657459259033, "learning_rate": 1.0297959499870453e-05, "loss": 0.2675, "step": 830200 }, { "epoch": 11.069043206996307, "grad_norm": 3.121403217315674, "learning_rate": 1.0291414428157594e-05, "loss": 0.3294, "step": 830300 }, { "epoch": 11.070376344802762, "grad_norm": 4.8083319664001465, "learning_rate": 1.0284871006385413e-05, "loss": 0.2776, "step": 830400 }, { "epoch": 11.071709482609217, "grad_norm": 1.9376411437988281, "learning_rate": 1.0278329235101676e-05, "loss": 0.2844, "step": 830500 }, { "epoch": 11.073042620415672, "grad_norm": 1.1377798318862915, "learning_rate": 1.027178911485405e-05, "loss": 0.2947, "step": 830600 }, { "epoch": 11.074375758222127, "grad_norm": 
2.8023107051849365, "learning_rate": 1.0265250646190046e-05, "loss": 0.295, "step": 830700 }, { "epoch": 11.075708896028582, "grad_norm": 3.726297378540039, "learning_rate": 1.025871382965705e-05, "loss": 0.2649, "step": 830800 }, { "epoch": 11.077042033835038, "grad_norm": 2.852745771408081, "learning_rate": 1.0252178665802303e-05, "loss": 0.2259, "step": 830900 }, { "epoch": 11.078375171641493, "grad_norm": 1.550816297531128, "learning_rate": 1.0245645155172916e-05, "loss": 0.278, "step": 831000 }, { "epoch": 11.079708309447948, "grad_norm": 2.166189193725586, "learning_rate": 1.0239113298315838e-05, "loss": 0.2736, "step": 831100 }, { "epoch": 11.081041447254403, "grad_norm": 3.4351553916931152, "learning_rate": 1.0232583095777901e-05, "loss": 0.2928, "step": 831200 }, { "epoch": 11.082374585060858, "grad_norm": 1.430114984512329, "learning_rate": 1.0226054548105795e-05, "loss": 0.2499, "step": 831300 }, { "epoch": 11.083707722867313, "grad_norm": 0.7686190009117126, "learning_rate": 1.0219527655846071e-05, "loss": 0.2775, "step": 831400 }, { "epoch": 11.085040860673768, "grad_norm": 1.2056758403778076, "learning_rate": 1.0213002419545135e-05, "loss": 0.2527, "step": 831500 }, { "epoch": 11.086373998480223, "grad_norm": 0.9953780174255371, "learning_rate": 1.0206478839749273e-05, "loss": 0.2781, "step": 831600 }, { "epoch": 11.087707136286678, "grad_norm": 2.8117306232452393, "learning_rate": 1.0199956917004591e-05, "loss": 0.2735, "step": 831700 }, { "epoch": 11.089040274093133, "grad_norm": 2.549400806427002, "learning_rate": 1.0193436651857118e-05, "loss": 0.2923, "step": 831800 }, { "epoch": 11.090373411899588, "grad_norm": 1.884116530418396, "learning_rate": 1.0186918044852685e-05, "loss": 0.3149, "step": 831900 }, { "epoch": 11.091706549706043, "grad_norm": 1.9008864164352417, "learning_rate": 1.0180401096537015e-05, "loss": 0.2323, "step": 832000 }, { "epoch": 11.093039687512498, "grad_norm": 4.61517333984375, "learning_rate": 1.0173950952131499e-05, 
"loss": 0.3034, "step": 832100 }, { "epoch": 11.094372825318953, "grad_norm": 8.649784088134766, "learning_rate": 1.0167437306229468e-05, "loss": 0.3258, "step": 832200 }, { "epoch": 11.095705963125408, "grad_norm": 1.2456222772598267, "learning_rate": 1.0160925320647063e-05, "loss": 0.2703, "step": 832300 }, { "epoch": 11.097039100931863, "grad_norm": 3.7249605655670166, "learning_rate": 1.0154414995929446e-05, "loss": 0.2701, "step": 832400 }, { "epoch": 11.09837223873832, "grad_norm": 3.7665107250213623, "learning_rate": 1.0147906332621649e-05, "loss": 0.2625, "step": 832500 }, { "epoch": 11.099705376544774, "grad_norm": 3.5548698902130127, "learning_rate": 1.0141399331268555e-05, "loss": 0.299, "step": 832600 }, { "epoch": 11.10103851435123, "grad_norm": 28.592741012573242, "learning_rate": 1.013489399241491e-05, "loss": 0.2939, "step": 832700 }, { "epoch": 11.102371652157684, "grad_norm": 2.4856202602386475, "learning_rate": 1.0128390316605327e-05, "loss": 0.2924, "step": 832800 }, { "epoch": 11.103704789964139, "grad_norm": 1.7876737117767334, "learning_rate": 1.0121888304384248e-05, "loss": 0.3215, "step": 832900 }, { "epoch": 11.105037927770594, "grad_norm": 2.3381052017211914, "learning_rate": 1.0115387956296028e-05, "loss": 0.3118, "step": 833000 }, { "epoch": 11.106371065577049, "grad_norm": 3.171891212463379, "learning_rate": 1.010888927288486e-05, "loss": 0.2952, "step": 833100 }, { "epoch": 11.107704203383504, "grad_norm": 19.871532440185547, "learning_rate": 1.0102392254694765e-05, "loss": 0.2603, "step": 833200 }, { "epoch": 11.109037341189959, "grad_norm": 1.3467493057250977, "learning_rate": 1.0095896902269672e-05, "loss": 0.2548, "step": 833300 }, { "epoch": 11.110370478996414, "grad_norm": 2.1609432697296143, "learning_rate": 1.0089403216153342e-05, "loss": 0.3076, "step": 833400 }, { "epoch": 11.111703616802869, "grad_norm": 0.47421184182167053, "learning_rate": 1.0082911196889413e-05, "loss": 0.2314, "step": 833500 }, { "epoch": 
11.113036754609324, "grad_norm": 7.61360502243042, "learning_rate": 1.0076420845021388e-05, "loss": 0.258, "step": 833600 }, { "epoch": 11.114369892415779, "grad_norm": 1.73141610622406, "learning_rate": 1.0069932161092593e-05, "loss": 0.2696, "step": 833700 }, { "epoch": 11.115703030222233, "grad_norm": 1.2486661672592163, "learning_rate": 1.0063445145646242e-05, "loss": 0.2587, "step": 833800 }, { "epoch": 11.117036168028688, "grad_norm": 3.128002166748047, "learning_rate": 1.0056959799225439e-05, "loss": 0.2507, "step": 833900 }, { "epoch": 11.118369305835143, "grad_norm": 3.1788156032562256, "learning_rate": 1.0050476122373083e-05, "loss": 0.2711, "step": 834000 }, { "epoch": 11.1197024436416, "grad_norm": 23.300334930419922, "learning_rate": 1.0043994115631984e-05, "loss": 0.2777, "step": 834100 }, { "epoch": 11.121035581448055, "grad_norm": 2.4605093002319336, "learning_rate": 1.003757857463414e-05, "loss": 0.279, "step": 834200 }, { "epoch": 11.12236871925451, "grad_norm": 8.307799339294434, "learning_rate": 1.0031099893028724e-05, "loss": 0.3092, "step": 834300 }, { "epoch": 11.123701857060965, "grad_norm": 2.3707435131073, "learning_rate": 1.0024622883156669e-05, "loss": 0.3141, "step": 834400 }, { "epoch": 11.12503499486742, "grad_norm": 2.477108955383301, "learning_rate": 1.0018147545560216e-05, "loss": 0.2365, "step": 834500 }, { "epoch": 11.126368132673875, "grad_norm": 2.6324880123138428, "learning_rate": 1.0011673880781463e-05, "loss": 0.2913, "step": 834600 }, { "epoch": 11.12770127048033, "grad_norm": 0.07332079857587814, "learning_rate": 1.0005201889362365e-05, "loss": 0.2719, "step": 834700 }, { "epoch": 11.129034408286785, "grad_norm": 3.1659555435180664, "learning_rate": 9.998731571844748e-06, "loss": 0.2683, "step": 834800 }, { "epoch": 11.13036754609324, "grad_norm": 2.558363199234009, "learning_rate": 9.992262928770261e-06, "loss": 0.3073, "step": 834900 }, { "epoch": 11.131700683899695, "grad_norm": 7.4522705078125, "learning_rate": 
9.985795960680444e-06, "loss": 0.3123, "step": 835000 }, { "epoch": 11.13303382170615, "grad_norm": 3.7641706466674805, "learning_rate": 9.979330668116719e-06, "loss": 0.2926, "step": 835100 }, { "epoch": 11.134366959512604, "grad_norm": 0.7691207528114319, "learning_rate": 9.972867051620306e-06, "loss": 0.3197, "step": 835200 }, { "epoch": 11.13570009731906, "grad_norm": 0.9745638370513916, "learning_rate": 9.966405111732335e-06, "loss": 0.2732, "step": 835300 }, { "epoch": 11.137033235125514, "grad_norm": 2.790252923965454, "learning_rate": 9.959944848993777e-06, "loss": 0.277, "step": 835400 }, { "epoch": 11.13836637293197, "grad_norm": 22.455598831176758, "learning_rate": 9.953486263945465e-06, "loss": 0.2564, "step": 835500 }, { "epoch": 11.139699510738424, "grad_norm": 6.279967308044434, "learning_rate": 9.947029357128092e-06, "loss": 0.2695, "step": 835600 }, { "epoch": 11.141032648544881, "grad_norm": 1.3097901344299316, "learning_rate": 9.940574129082217e-06, "loss": 0.2588, "step": 835700 }, { "epoch": 11.142365786351336, "grad_norm": 1.4336681365966797, "learning_rate": 9.934120580348232e-06, "loss": 0.2947, "step": 835800 }, { "epoch": 11.14369892415779, "grad_norm": 1.1749593019485474, "learning_rate": 9.92766871146642e-06, "loss": 0.2916, "step": 835900 }, { "epoch": 11.145032061964246, "grad_norm": 0.14731892943382263, "learning_rate": 9.92121852297691e-06, "loss": 0.2731, "step": 836000 }, { "epoch": 11.1463651997707, "grad_norm": 29.833389282226562, "learning_rate": 9.914770015419694e-06, "loss": 0.2933, "step": 836100 }, { "epoch": 11.147698337577156, "grad_norm": 2.6680848598480225, "learning_rate": 9.908323189334626e-06, "loss": 0.2732, "step": 836200 }, { "epoch": 11.14903147538361, "grad_norm": 2.712263345718384, "learning_rate": 9.901878045261394e-06, "loss": 0.2948, "step": 836300 }, { "epoch": 11.150364613190066, "grad_norm": 2.068955183029175, "learning_rate": 9.89543458373957e-06, "loss": 0.3225, "step": 836400 }, { "epoch": 
11.15169775099652, "grad_norm": 3.989469051361084, "learning_rate": 9.888992805308607e-06, "loss": 0.3322, "step": 836500 }, { "epoch": 11.153030888802975, "grad_norm": 2.3316709995269775, "learning_rate": 9.882552710507763e-06, "loss": 0.2529, "step": 836600 }, { "epoch": 11.15436402660943, "grad_norm": 4.883021831512451, "learning_rate": 9.876114299876186e-06, "loss": 0.2483, "step": 836700 }, { "epoch": 11.155697164415885, "grad_norm": 6.403377532958984, "learning_rate": 9.869677573952888e-06, "loss": 0.276, "step": 836800 }, { "epoch": 11.15703030222234, "grad_norm": 1.8322230577468872, "learning_rate": 9.863242533276725e-06, "loss": 0.3137, "step": 836900 }, { "epoch": 11.158363440028795, "grad_norm": 1.287150263786316, "learning_rate": 9.85680917838642e-06, "loss": 0.2932, "step": 837000 }, { "epoch": 11.15969657783525, "grad_norm": 3.138174057006836, "learning_rate": 9.850377509820564e-06, "loss": 0.2742, "step": 837100 }, { "epoch": 11.161029715641705, "grad_norm": 2.0861759185791016, "learning_rate": 9.84394752811758e-06, "loss": 0.2708, "step": 837200 }, { "epoch": 11.162362853448162, "grad_norm": 0.5949241518974304, "learning_rate": 9.837519233815756e-06, "loss": 0.266, "step": 837300 }, { "epoch": 11.163695991254617, "grad_norm": 2.010242223739624, "learning_rate": 9.831092627453287e-06, "loss": 0.2492, "step": 837400 }, { "epoch": 11.165029129061072, "grad_norm": 8.632161140441895, "learning_rate": 9.824667709568154e-06, "loss": 0.2773, "step": 837500 }, { "epoch": 11.166362266867527, "grad_norm": 4.973984241485596, "learning_rate": 9.818244480698244e-06, "loss": 0.2685, "step": 837600 }, { "epoch": 11.167695404673982, "grad_norm": 9.519915580749512, "learning_rate": 9.811822941381286e-06, "loss": 0.285, "step": 837700 }, { "epoch": 11.169028542480437, "grad_norm": 3.0587973594665527, "learning_rate": 9.805403092154875e-06, "loss": 0.3063, "step": 837800 }, { "epoch": 11.170361680286891, "grad_norm": 1.0323870182037354, "learning_rate": 
9.798984933556463e-06, "loss": 0.2532, "step": 837900 }, { "epoch": 11.171694818093346, "grad_norm": 24.503725051879883, "learning_rate": 9.792568466123348e-06, "loss": 0.3165, "step": 838000 }, { "epoch": 11.173027955899801, "grad_norm": 1.536702275276184, "learning_rate": 9.7861536903927e-06, "loss": 0.2913, "step": 838100 }, { "epoch": 11.174361093706256, "grad_norm": 1.3392361402511597, "learning_rate": 9.779740606901548e-06, "loss": 0.3142, "step": 838200 }, { "epoch": 11.175694231512711, "grad_norm": 2.9102783203125, "learning_rate": 9.77339332171292e-06, "loss": 0.2915, "step": 838300 }, { "epoch": 11.177027369319166, "grad_norm": 4.785180568695068, "learning_rate": 9.766983607375473e-06, "loss": 0.2548, "step": 838400 }, { "epoch": 11.178360507125621, "grad_norm": 1.6766622066497803, "learning_rate": 9.760575586882367e-06, "loss": 0.2815, "step": 838500 }, { "epoch": 11.179693644932076, "grad_norm": 1.6690484285354614, "learning_rate": 9.754169260770095e-06, "loss": 0.2184, "step": 838600 }, { "epoch": 11.181026782738531, "grad_norm": 4.910598278045654, "learning_rate": 9.747764629574943e-06, "loss": 0.2355, "step": 838700 }, { "epoch": 11.182359920544986, "grad_norm": 4.085083961486816, "learning_rate": 9.741361693833091e-06, "loss": 0.306, "step": 838800 }, { "epoch": 11.183693058351441, "grad_norm": 1.5116915702819824, "learning_rate": 9.734960454080582e-06, "loss": 0.2622, "step": 838900 }, { "epoch": 11.185026196157898, "grad_norm": 5.088202953338623, "learning_rate": 9.728560910853303e-06, "loss": 0.2447, "step": 839000 }, { "epoch": 11.186359333964353, "grad_norm": 1.8296294212341309, "learning_rate": 9.722163064687009e-06, "loss": 0.2688, "step": 839100 }, { "epoch": 11.187692471770808, "grad_norm": 2.586965560913086, "learning_rate": 9.715766916117295e-06, "loss": 0.249, "step": 839200 }, { "epoch": 11.189025609577262, "grad_norm": 4.2841081619262695, "learning_rate": 9.709372465679633e-06, "loss": 0.2966, "step": 839300 }, { "epoch": 
11.190358747383717, "grad_norm": 3.846435785293579, "learning_rate": 9.702979713909347e-06, "loss": 0.3216, "step": 839400 }, { "epoch": 11.191691885190172, "grad_norm": 0.031239954754710197, "learning_rate": 9.696588661341616e-06, "loss": 0.2756, "step": 839500 }, { "epoch": 11.193025022996627, "grad_norm": 3.373544692993164, "learning_rate": 9.690199308511483e-06, "loss": 0.2393, "step": 839600 }, { "epoch": 11.194358160803082, "grad_norm": 1.7430886030197144, "learning_rate": 9.68381165595385e-06, "loss": 0.2669, "step": 839700 }, { "epoch": 11.195691298609537, "grad_norm": 1.3626940250396729, "learning_rate": 9.677425704203451e-06, "loss": 0.242, "step": 839800 }, { "epoch": 11.197024436415992, "grad_norm": 3.56217622756958, "learning_rate": 9.671041453794905e-06, "loss": 0.2768, "step": 839900 }, { "epoch": 11.198357574222447, "grad_norm": 1.4038139581680298, "learning_rate": 9.664658905262702e-06, "loss": 0.2629, "step": 840000 }, { "epoch": 11.199690712028902, "grad_norm": 1.6342896223068237, "learning_rate": 9.658278059141145e-06, "loss": 0.31, "step": 840100 }, { "epoch": 11.201023849835357, "grad_norm": 4.002181529998779, "learning_rate": 9.651898915964425e-06, "loss": 0.3047, "step": 840200 }, { "epoch": 11.202356987641812, "grad_norm": 0.8550492525100708, "learning_rate": 9.645521476266586e-06, "loss": 0.305, "step": 840300 }, { "epoch": 11.203690125448267, "grad_norm": 4.948855876922607, "learning_rate": 9.639145740581525e-06, "loss": 0.2666, "step": 840400 }, { "epoch": 11.205023263254724, "grad_norm": 0.8640744090080261, "learning_rate": 9.632835441315136e-06, "loss": 0.2385, "step": 840500 }, { "epoch": 11.206356401061178, "grad_norm": 2.9510722160339355, "learning_rate": 9.626463098203318e-06, "loss": 0.2779, "step": 840600 }, { "epoch": 11.207689538867633, "grad_norm": 3.2674505710601807, "learning_rate": 9.620092460699786e-06, "loss": 0.2618, "step": 840700 }, { "epoch": 11.209022676674088, "grad_norm": 0.917504608631134, "learning_rate": 
9.61372352933787e-06, "loss": 0.274, "step": 840800 }, { "epoch": 11.210355814480543, "grad_norm": 0.8952785134315491, "learning_rate": 9.607356304650768e-06, "loss": 0.2451, "step": 840900 }, { "epoch": 11.211688952286998, "grad_norm": 3.646681308746338, "learning_rate": 9.600990787171505e-06, "loss": 0.2812, "step": 841000 }, { "epoch": 11.213022090093453, "grad_norm": 2.4301722049713135, "learning_rate": 9.594626977432977e-06, "loss": 0.2955, "step": 841100 }, { "epoch": 11.214355227899908, "grad_norm": 6.467960834503174, "learning_rate": 9.588264875967976e-06, "loss": 0.2622, "step": 841200 }, { "epoch": 11.215688365706363, "grad_norm": 3.0431182384490967, "learning_rate": 9.581904483309088e-06, "loss": 0.2734, "step": 841300 }, { "epoch": 11.217021503512818, "grad_norm": 1.711445689201355, "learning_rate": 9.57554579998879e-06, "loss": 0.2879, "step": 841400 }, { "epoch": 11.218354641319273, "grad_norm": 1.9534426927566528, "learning_rate": 9.569188826539417e-06, "loss": 0.2701, "step": 841500 }, { "epoch": 11.219687779125728, "grad_norm": 6.571228981018066, "learning_rate": 9.56283356349315e-06, "loss": 0.286, "step": 841600 }, { "epoch": 11.221020916932183, "grad_norm": 1.4750990867614746, "learning_rate": 9.556480011382037e-06, "loss": 0.2961, "step": 841700 }, { "epoch": 11.222354054738638, "grad_norm": 1.3444347381591797, "learning_rate": 9.550128170737981e-06, "loss": 0.2485, "step": 841800 }, { "epoch": 11.223687192545093, "grad_norm": 1.8800301551818848, "learning_rate": 9.543778042092721e-06, "loss": 0.3062, "step": 841900 }, { "epoch": 11.225020330351548, "grad_norm": 0.6489065885543823, "learning_rate": 9.537429625977869e-06, "loss": 0.2322, "step": 842000 }, { "epoch": 11.226353468158003, "grad_norm": 1.9379628896713257, "learning_rate": 9.531082922924926e-06, "loss": 0.3305, "step": 842100 }, { "epoch": 11.22768660596446, "grad_norm": 1.4620108604431152, "learning_rate": 9.524737933465187e-06, "loss": 0.2792, "step": 842200 }, { "epoch": 
11.229019743770914, "grad_norm": 0.8189206123352051, "learning_rate": 9.518394658129854e-06, "loss": 0.2488, "step": 842300 }, { "epoch": 11.23035288157737, "grad_norm": 6.953339576721191, "learning_rate": 9.51205309744994e-06, "loss": 0.2596, "step": 842400 }, { "epoch": 11.231686019383824, "grad_norm": 3.0240089893341064, "learning_rate": 9.505713251956362e-06, "loss": 0.2776, "step": 842500 }, { "epoch": 11.233019157190279, "grad_norm": 1.0682249069213867, "learning_rate": 9.499375122179882e-06, "loss": 0.2842, "step": 842600 }, { "epoch": 11.234352294996734, "grad_norm": 1.2390931844711304, "learning_rate": 9.493038708651082e-06, "loss": 0.2287, "step": 842700 }, { "epoch": 11.235685432803189, "grad_norm": 16.441932678222656, "learning_rate": 9.486704011900435e-06, "loss": 0.3214, "step": 842800 }, { "epoch": 11.237018570609644, "grad_norm": 2.5954973697662354, "learning_rate": 9.480371032458266e-06, "loss": 0.2846, "step": 842900 }, { "epoch": 11.238351708416099, "grad_norm": 1.8718266487121582, "learning_rate": 9.474039770854752e-06, "loss": 0.2484, "step": 843000 }, { "epoch": 11.239684846222554, "grad_norm": 3.6476542949676514, "learning_rate": 9.467710227619923e-06, "loss": 0.2963, "step": 843100 }, { "epoch": 11.241017984029009, "grad_norm": 1.1217670440673828, "learning_rate": 9.461382403283682e-06, "loss": 0.2925, "step": 843200 }, { "epoch": 11.242351121835464, "grad_norm": 1.4542368650436401, "learning_rate": 9.455056298375753e-06, "loss": 0.2764, "step": 843300 }, { "epoch": 11.243684259641919, "grad_norm": 5.865262031555176, "learning_rate": 9.44879514875971e-06, "loss": 0.3093, "step": 843400 }, { "epoch": 11.245017397448374, "grad_norm": 0.7552257776260376, "learning_rate": 9.442472467089595e-06, "loss": 0.255, "step": 843500 }, { "epoch": 11.246350535254829, "grad_norm": 2.289929151535034, "learning_rate": 9.436151506430874e-06, "loss": 0.2527, "step": 843600 }, { "epoch": 11.247683673061283, "grad_norm": 17.918664932250977, "learning_rate": 
9.42983226731273e-06, "loss": 0.3093, "step": 843700 }, { "epoch": 11.24901681086774, "grad_norm": 18.61545181274414, "learning_rate": 9.423514750264199e-06, "loss": 0.3187, "step": 843800 }, { "epoch": 11.250349948674195, "grad_norm": 0.8660023808479309, "learning_rate": 9.417198955814136e-06, "loss": 0.2524, "step": 843900 }, { "epoch": 11.25168308648065, "grad_norm": 2.625521659851074, "learning_rate": 9.410884884491295e-06, "loss": 0.231, "step": 844000 }, { "epoch": 11.253016224287105, "grad_norm": 2.5774643421173096, "learning_rate": 9.404572536824267e-06, "loss": 0.2756, "step": 844100 }, { "epoch": 11.25434936209356, "grad_norm": 5.7416672706604, "learning_rate": 9.398261913341506e-06, "loss": 0.2878, "step": 844200 }, { "epoch": 11.255682499900015, "grad_norm": 3.6808271408081055, "learning_rate": 9.391953014571314e-06, "loss": 0.2811, "step": 844300 }, { "epoch": 11.25701563770647, "grad_norm": 1.7885076999664307, "learning_rate": 9.385708904235473e-06, "loss": 0.2902, "step": 844400 }, { "epoch": 11.258348775512925, "grad_norm": 5.523645401000977, "learning_rate": 9.37940343921447e-06, "loss": 0.3238, "step": 844500 }, { "epoch": 11.25968191331938, "grad_norm": 3.016706943511963, "learning_rate": 9.373099700484798e-06, "loss": 0.2881, "step": 844600 }, { "epoch": 11.261015051125835, "grad_norm": 1.5008896589279175, "learning_rate": 9.366797688574203e-06, "loss": 0.2749, "step": 844700 }, { "epoch": 11.26234818893229, "grad_norm": 2.290311098098755, "learning_rate": 9.360497404010258e-06, "loss": 0.3043, "step": 844800 }, { "epoch": 11.263681326738745, "grad_norm": 3.167597770690918, "learning_rate": 9.354198847320412e-06, "loss": 0.2804, "step": 844900 }, { "epoch": 11.2650144645452, "grad_norm": 1.301090121269226, "learning_rate": 9.347902019031968e-06, "loss": 0.3592, "step": 845000 }, { "epoch": 11.266347602351654, "grad_norm": 2.4355902671813965, "learning_rate": 9.341669862105725e-06, "loss": 0.2558, "step": 845100 }, { "epoch": 11.26768074015811, 
"grad_norm": 4.96457576751709, "learning_rate": 9.335376474904195e-06, "loss": 0.3207, "step": 845200 }, { "epoch": 11.269013877964564, "grad_norm": 1.8069809675216675, "learning_rate": 9.329084817679808e-06, "loss": 0.3093, "step": 845300 }, { "epoch": 11.270347015771021, "grad_norm": 3.3560492992401123, "learning_rate": 9.322794890959288e-06, "loss": 0.2738, "step": 845400 }, { "epoch": 11.271680153577476, "grad_norm": 2.978835105895996, "learning_rate": 9.31650669526919e-06, "loss": 0.2433, "step": 845500 }, { "epoch": 11.273013291383931, "grad_norm": 1.8760792016983032, "learning_rate": 9.310220231135942e-06, "loss": 0.2981, "step": 845600 }, { "epoch": 11.274346429190386, "grad_norm": 15.404279708862305, "learning_rate": 9.303935499085855e-06, "loss": 0.245, "step": 845700 }, { "epoch": 11.27567956699684, "grad_norm": 2.4386837482452393, "learning_rate": 9.297652499645042e-06, "loss": 0.2511, "step": 845800 }, { "epoch": 11.277012704803296, "grad_norm": 2.118218421936035, "learning_rate": 9.291371233339499e-06, "loss": 0.2689, "step": 845900 }, { "epoch": 11.27834584260975, "grad_norm": 2.9508163928985596, "learning_rate": 9.285091700695077e-06, "loss": 0.2933, "step": 846000 }, { "epoch": 11.279678980416206, "grad_norm": 5.764967441558838, "learning_rate": 9.27881390223748e-06, "loss": 0.324, "step": 846100 }, { "epoch": 11.28101211822266, "grad_norm": 1.7009159326553345, "learning_rate": 9.272537838492264e-06, "loss": 0.2948, "step": 846200 }, { "epoch": 11.282345256029116, "grad_norm": 3.703735113143921, "learning_rate": 9.266263509984855e-06, "loss": 0.2201, "step": 846300 }, { "epoch": 11.28367839383557, "grad_norm": 90.09378814697266, "learning_rate": 9.259990917240492e-06, "loss": 0.2859, "step": 846400 }, { "epoch": 11.285011531642025, "grad_norm": 1.9664610624313354, "learning_rate": 9.253720060784311e-06, "loss": 0.2841, "step": 846500 }, { "epoch": 11.28634466944848, "grad_norm": 1.4986262321472168, "learning_rate": 9.247450941141292e-06, "loss": 
0.2225, "step": 846600 }, { "epoch": 11.287677807254935, "grad_norm": 2.321385145187378, "learning_rate": 9.241183558836259e-06, "loss": 0.2607, "step": 846700 }, { "epoch": 11.28901094506139, "grad_norm": 2.2383360862731934, "learning_rate": 9.234917914393913e-06, "loss": 0.2968, "step": 846800 }, { "epoch": 11.290344082867847, "grad_norm": 3.8592305183410645, "learning_rate": 9.228654008338771e-06, "loss": 0.2195, "step": 846900 }, { "epoch": 11.291677220674302, "grad_norm": 3.4205868244171143, "learning_rate": 9.22239184119523e-06, "loss": 0.2451, "step": 847000 }, { "epoch": 11.293010358480757, "grad_norm": 0.5536391735076904, "learning_rate": 9.216131413487566e-06, "loss": 0.2686, "step": 847100 }, { "epoch": 11.294343496287212, "grad_norm": 3.0644478797912598, "learning_rate": 9.209872725739857e-06, "loss": 0.2571, "step": 847200 }, { "epoch": 11.295676634093667, "grad_norm": 3.158924102783203, "learning_rate": 9.203615778476063e-06, "loss": 0.2825, "step": 847300 }, { "epoch": 11.297009771900122, "grad_norm": 5.017327785491943, "learning_rate": 9.197360572220002e-06, "loss": 0.2939, "step": 847400 }, { "epoch": 11.298342909706577, "grad_norm": 2.1625359058380127, "learning_rate": 9.19110710749534e-06, "loss": 0.2877, "step": 847500 }, { "epoch": 11.299676047513032, "grad_norm": 0.6454091668128967, "learning_rate": 9.184855384825592e-06, "loss": 0.2844, "step": 847600 }, { "epoch": 11.301009185319487, "grad_norm": 7.16739559173584, "learning_rate": 9.178605404734146e-06, "loss": 0.2888, "step": 847700 }, { "epoch": 11.302342323125941, "grad_norm": 3.7887094020843506, "learning_rate": 9.172357167744212e-06, "loss": 0.2956, "step": 847800 }, { "epoch": 11.303675460932396, "grad_norm": 3.2403981685638428, "learning_rate": 9.166110674378875e-06, "loss": 0.2797, "step": 847900 }, { "epoch": 11.305008598738851, "grad_norm": 3.0981361865997314, "learning_rate": 9.159865925161096e-06, "loss": 0.3105, "step": 848000 }, { "epoch": 11.306341736545306, "grad_norm": 
5.26640510559082, "learning_rate": 9.153622920613636e-06, "loss": 0.2833, "step": 848100 }, { "epoch": 11.307674874351761, "grad_norm": 2.120572566986084, "learning_rate": 9.147381661259162e-06, "loss": 0.2398, "step": 848200 }, { "epoch": 11.309008012158216, "grad_norm": 0.7636744976043701, "learning_rate": 9.141142147620144e-06, "loss": 0.2492, "step": 848300 }, { "epoch": 11.310341149964671, "grad_norm": 3.5543408393859863, "learning_rate": 9.134904380218964e-06, "loss": 0.2974, "step": 848400 }, { "epoch": 11.311674287771126, "grad_norm": 0.7254189252853394, "learning_rate": 9.128668359577823e-06, "loss": 0.2606, "step": 848500 }, { "epoch": 11.313007425577583, "grad_norm": 4.537469863891602, "learning_rate": 9.122434086218768e-06, "loss": 0.3035, "step": 848600 }, { "epoch": 11.314340563384038, "grad_norm": 2.7434802055358887, "learning_rate": 9.116201560663719e-06, "loss": 0.2825, "step": 848700 }, { "epoch": 11.315673701190493, "grad_norm": 1.281848430633545, "learning_rate": 9.109970783434443e-06, "loss": 0.2827, "step": 848800 }, { "epoch": 11.317006838996948, "grad_norm": 3.4936506748199463, "learning_rate": 9.10374175505256e-06, "loss": 0.3303, "step": 848900 }, { "epoch": 11.318339976803403, "grad_norm": 3.3001224994659424, "learning_rate": 9.097514476039547e-06, "loss": 0.2227, "step": 849000 }, { "epoch": 11.319673114609857, "grad_norm": 3.0205843448638916, "learning_rate": 9.091288946916745e-06, "loss": 0.2634, "step": 849100 }, { "epoch": 11.321006252416312, "grad_norm": 14.123927116394043, "learning_rate": 9.085127397326174e-06, "loss": 0.2509, "step": 849200 }, { "epoch": 11.322339390222767, "grad_norm": 1.6263427734375, "learning_rate": 9.07890535203525e-06, "loss": 0.2887, "step": 849300 }, { "epoch": 11.323672528029222, "grad_norm": 3.502821922302246, "learning_rate": 9.072685058192423e-06, "loss": 0.2747, "step": 849400 }, { "epoch": 11.325005665835677, "grad_norm": 3.2444846630096436, "learning_rate": 9.066466516318437e-06, "loss": 0.2757, 
"step": 849500 }, { "epoch": 11.326338803642132, "grad_norm": 2.609131097793579, "learning_rate": 9.06024972693388e-06, "loss": 0.2876, "step": 849600 }, { "epoch": 11.327671941448587, "grad_norm": 4.152078151702881, "learning_rate": 9.05403469055922e-06, "loss": 0.2291, "step": 849700 }, { "epoch": 11.329005079255042, "grad_norm": 1.3342233896255493, "learning_rate": 9.047821407714743e-06, "loss": 0.2848, "step": 849800 }, { "epoch": 11.330338217061497, "grad_norm": 1.6787984371185303, "learning_rate": 9.041609878920607e-06, "loss": 0.2593, "step": 849900 }, { "epoch": 11.331671354867952, "grad_norm": 3.0143139362335205, "learning_rate": 9.035400104696825e-06, "loss": 0.2418, "step": 850000 }, { "epoch": 11.333004492674407, "grad_norm": 3.215677499771118, "learning_rate": 9.029254157065198e-06, "loss": 0.2687, "step": 850100 }, { "epoch": 11.334337630480864, "grad_norm": 3.9963090419769287, "learning_rate": 9.023047875982894e-06, "loss": 0.2914, "step": 850200 }, { "epoch": 11.335670768287319, "grad_norm": 0.6161304712295532, "learning_rate": 9.016843351024888e-06, "loss": 0.265, "step": 850300 }, { "epoch": 11.337003906093774, "grad_norm": 4.706874370574951, "learning_rate": 9.010640582710633e-06, "loss": 0.2653, "step": 850400 }, { "epoch": 11.338337043900228, "grad_norm": 1.6488815546035767, "learning_rate": 9.004439571559372e-06, "loss": 0.2715, "step": 850500 }, { "epoch": 11.339670181706683, "grad_norm": 3.8818206787109375, "learning_rate": 8.99824031809025e-06, "loss": 0.2811, "step": 850600 }, { "epoch": 11.341003319513138, "grad_norm": 0.7981236577033997, "learning_rate": 8.992042822822241e-06, "loss": 0.376, "step": 850700 }, { "epoch": 11.342336457319593, "grad_norm": 3.189836025238037, "learning_rate": 8.985847086274185e-06, "loss": 0.2445, "step": 850800 }, { "epoch": 11.343669595126048, "grad_norm": 1.4963420629501343, "learning_rate": 8.979653108964781e-06, "loss": 0.2832, "step": 850900 }, { "epoch": 11.345002732932503, "grad_norm": 
4.960803031921387, "learning_rate": 8.973460891412543e-06, "loss": 0.3492, "step": 851000 }, { "epoch": 11.346335870738958, "grad_norm": 2.0531938076019287, "learning_rate": 8.96727043413588e-06, "loss": 0.2872, "step": 851100 }, { "epoch": 11.347669008545413, "grad_norm": 2.2722246646881104, "learning_rate": 8.961081737653036e-06, "loss": 0.2535, "step": 851200 }, { "epoch": 11.349002146351868, "grad_norm": 2.324532985687256, "learning_rate": 8.954894802482109e-06, "loss": 0.3137, "step": 851300 }, { "epoch": 11.350335284158323, "grad_norm": 10.846653938293457, "learning_rate": 8.948709629141046e-06, "loss": 0.2813, "step": 851400 }, { "epoch": 11.351668421964778, "grad_norm": 2.5880603790283203, "learning_rate": 8.942526218147668e-06, "loss": 0.2937, "step": 851500 }, { "epoch": 11.353001559771233, "grad_norm": 3.6314408779144287, "learning_rate": 8.936344570019602e-06, "loss": 0.2549, "step": 851600 }, { "epoch": 11.354334697577688, "grad_norm": 1.2722111940383911, "learning_rate": 8.930164685274363e-06, "loss": 0.2548, "step": 851700 }, { "epoch": 11.355667835384144, "grad_norm": 2.9882984161376953, "learning_rate": 8.923986564429337e-06, "loss": 0.2336, "step": 851800 }, { "epoch": 11.3570009731906, "grad_norm": 2.5494091510772705, "learning_rate": 8.91781020800171e-06, "loss": 0.2708, "step": 851900 }, { "epoch": 11.358334110997054, "grad_norm": 3.0061800479888916, "learning_rate": 8.911635616508555e-06, "loss": 0.2606, "step": 852000 }, { "epoch": 11.35966724880351, "grad_norm": 2.284006357192993, "learning_rate": 8.90546279046679e-06, "loss": 0.2746, "step": 852100 }, { "epoch": 11.361000386609964, "grad_norm": 2.382681131362915, "learning_rate": 8.899291730393185e-06, "loss": 0.2724, "step": 852200 }, { "epoch": 11.36233352441642, "grad_norm": 3.9570600986480713, "learning_rate": 8.893122436804359e-06, "loss": 0.329, "step": 852300 }, { "epoch": 11.363666662222874, "grad_norm": 5.266878604888916, "learning_rate": 8.8869549102168e-06, "loss": 0.3074, 
"step": 852400 }, { "epoch": 11.364999800029329, "grad_norm": 2.88936185836792, "learning_rate": 8.88078915114681e-06, "loss": 0.2603, "step": 852500 }, { "epoch": 11.366332937835784, "grad_norm": 1.6118957996368408, "learning_rate": 8.87462516011058e-06, "loss": 0.27, "step": 852600 }, { "epoch": 11.367666075642239, "grad_norm": 3.074533700942993, "learning_rate": 8.868462937624138e-06, "loss": 0.2802, "step": 852700 }, { "epoch": 11.368999213448694, "grad_norm": 3.0512664318084717, "learning_rate": 8.86230248420337e-06, "loss": 0.279, "step": 852800 }, { "epoch": 11.370332351255149, "grad_norm": 3.8273136615753174, "learning_rate": 8.856143800364011e-06, "loss": 0.2591, "step": 852900 }, { "epoch": 11.371665489061604, "grad_norm": 1.2039111852645874, "learning_rate": 8.849986886621626e-06, "loss": 0.2747, "step": 853000 }, { "epoch": 11.372998626868059, "grad_norm": 2.654236316680908, "learning_rate": 8.843831743491677e-06, "loss": 0.2884, "step": 853100 }, { "epoch": 11.374331764674514, "grad_norm": 4.540812969207764, "learning_rate": 8.837678371489452e-06, "loss": 0.2688, "step": 853200 }, { "epoch": 11.375664902480969, "grad_norm": 2.721336603164673, "learning_rate": 8.831526771130077e-06, "loss": 0.2709, "step": 853300 }, { "epoch": 11.376998040287425, "grad_norm": 1.0754210948944092, "learning_rate": 8.825376942928552e-06, "loss": 0.2812, "step": 853400 }, { "epoch": 11.37833117809388, "grad_norm": 2.674112558364868, "learning_rate": 8.819228887399719e-06, "loss": 0.2592, "step": 853500 }, { "epoch": 11.379664315900335, "grad_norm": 4.452956676483154, "learning_rate": 8.813082605058277e-06, "loss": 0.2662, "step": 853600 }, { "epoch": 11.38099745370679, "grad_norm": 3.0580379962921143, "learning_rate": 8.806938096418771e-06, "loss": 0.2881, "step": 853700 }, { "epoch": 11.382330591513245, "grad_norm": 2.009071111679077, "learning_rate": 8.800795361995615e-06, "loss": 0.2981, "step": 853800 }, { "epoch": 11.3836637293197, "grad_norm": 1.191895842552185, 
"learning_rate": 8.794654402303035e-06, "loss": 0.2623, "step": 853900 }, { "epoch": 11.384996867126155, "grad_norm": 5.177194118499756, "learning_rate": 8.788515217855141e-06, "loss": 0.2602, "step": 854000 }, { "epoch": 11.38633000493261, "grad_norm": 1.9910451173782349, "learning_rate": 8.782377809165888e-06, "loss": 0.3014, "step": 854100 }, { "epoch": 11.387663142739065, "grad_norm": 1.9139107465744019, "learning_rate": 8.776303524279014e-06, "loss": 0.2703, "step": 854200 }, { "epoch": 11.38899628054552, "grad_norm": 2.0826940536499023, "learning_rate": 8.770169650877906e-06, "loss": 0.2426, "step": 854300 }, { "epoch": 11.390329418351975, "grad_norm": 3.4061169624328613, "learning_rate": 8.764037554771284e-06, "loss": 0.3038, "step": 854400 }, { "epoch": 11.39166255615843, "grad_norm": 1.3264139890670776, "learning_rate": 8.757907236472485e-06, "loss": 0.2937, "step": 854500 }, { "epoch": 11.392995693964885, "grad_norm": 12.842106819152832, "learning_rate": 8.751778696494735e-06, "loss": 0.2616, "step": 854600 }, { "epoch": 11.39432883177134, "grad_norm": 1.5853490829467773, "learning_rate": 8.74565193535109e-06, "loss": 0.273, "step": 854700 }, { "epoch": 11.395661969577795, "grad_norm": 1.577087163925171, "learning_rate": 8.739526953554465e-06, "loss": 0.2838, "step": 854800 }, { "epoch": 11.39699510738425, "grad_norm": 3.0685927867889404, "learning_rate": 8.733403751617627e-06, "loss": 0.2611, "step": 854900 }, { "epoch": 11.398328245190706, "grad_norm": 1.0822303295135498, "learning_rate": 8.727282330053194e-06, "loss": 0.2654, "step": 855000 }, { "epoch": 11.399661382997161, "grad_norm": 1.0636709928512573, "learning_rate": 8.721162689373622e-06, "loss": 0.3076, "step": 855100 }, { "epoch": 11.400994520803616, "grad_norm": 1.3338497877120972, "learning_rate": 8.715044830091222e-06, "loss": 0.2626, "step": 855200 }, { "epoch": 11.402327658610071, "grad_norm": 3.75685715675354, "learning_rate": 8.708928752718192e-06, "loss": 0.298, "step": 855300 }, { 
"epoch": 11.403660796416526, "grad_norm": 0.9970495104789734, "learning_rate": 8.702814457766521e-06, "loss": 0.2694, "step": 855400 }, { "epoch": 11.404993934222981, "grad_norm": 1.8472386598587036, "learning_rate": 8.696701945748099e-06, "loss": 0.2745, "step": 855500 }, { "epoch": 11.406327072029436, "grad_norm": 4.601322650909424, "learning_rate": 8.690591217174616e-06, "loss": 0.263, "step": 855600 }, { "epoch": 11.40766020983589, "grad_norm": 1.9304287433624268, "learning_rate": 8.684482272557676e-06, "loss": 0.2877, "step": 855700 }, { "epoch": 11.408993347642346, "grad_norm": 3.9739632606506348, "learning_rate": 8.678375112408695e-06, "loss": 0.3171, "step": 855800 }, { "epoch": 11.4103264854488, "grad_norm": 1.1091670989990234, "learning_rate": 8.672269737238929e-06, "loss": 0.2435, "step": 855900 }, { "epoch": 11.411659623255256, "grad_norm": 4.877865314483643, "learning_rate": 8.66616614755951e-06, "loss": 0.2977, "step": 856000 }, { "epoch": 11.41299276106171, "grad_norm": 4.6479716300964355, "learning_rate": 8.660064343881413e-06, "loss": 0.308, "step": 856100 }, { "epoch": 11.414325898868166, "grad_norm": 5.854506969451904, "learning_rate": 8.653964326715457e-06, "loss": 0.316, "step": 856200 }, { "epoch": 11.41565903667462, "grad_norm": 0.9728363156318665, "learning_rate": 8.647866096572318e-06, "loss": 0.2464, "step": 856300 }, { "epoch": 11.416992174481075, "grad_norm": 4.075482368469238, "learning_rate": 8.641769653962533e-06, "loss": 0.2972, "step": 856400 }, { "epoch": 11.41832531228753, "grad_norm": 0.48635396361351013, "learning_rate": 8.63573593708962e-06, "loss": 0.2978, "step": 856500 }, { "epoch": 11.419658450093987, "grad_norm": 3.427793264389038, "learning_rate": 8.629643053189423e-06, "loss": 0.2751, "step": 856600 }, { "epoch": 11.420991587900442, "grad_norm": 3.0754730701446533, "learning_rate": 8.623551958348144e-06, "loss": 0.2722, "step": 856700 }, { "epoch": 11.422324725706897, "grad_norm": 1.5302963256835938, "learning_rate": 
8.617462653075707e-06, "loss": 0.2639, "step": 856800 }, { "epoch": 11.423657863513352, "grad_norm": 4.453147888183594, "learning_rate": 8.611375137881895e-06, "loss": 0.2649, "step": 856900 }, { "epoch": 11.424991001319807, "grad_norm": 4.044277191162109, "learning_rate": 8.605289413276345e-06, "loss": 0.2904, "step": 857000 }, { "epoch": 11.426324139126262, "grad_norm": 8.453768730163574, "learning_rate": 8.599205479768512e-06, "loss": 0.2756, "step": 857100 }, { "epoch": 11.427657276932717, "grad_norm": 6.5731964111328125, "learning_rate": 8.593123337867738e-06, "loss": 0.299, "step": 857200 }, { "epoch": 11.428990414739172, "grad_norm": 1.2860206365585327, "learning_rate": 8.587042988083194e-06, "loss": 0.2734, "step": 857300 }, { "epoch": 11.430323552545627, "grad_norm": 5.671040058135986, "learning_rate": 8.580964430923913e-06, "loss": 0.3202, "step": 857400 }, { "epoch": 11.431656690352082, "grad_norm": 3.537810802459717, "learning_rate": 8.574887666898775e-06, "loss": 0.3199, "step": 857500 }, { "epoch": 11.432989828158536, "grad_norm": 2.9792160987854004, "learning_rate": 8.568812696516507e-06, "loss": 0.2651, "step": 857600 }, { "epoch": 11.434322965964991, "grad_norm": 3.3904592990875244, "learning_rate": 8.562739520285668e-06, "loss": 0.2594, "step": 857700 }, { "epoch": 11.435656103771446, "grad_norm": 9.198789596557617, "learning_rate": 8.556668138714708e-06, "loss": 0.2739, "step": 857800 }, { "epoch": 11.436989241577901, "grad_norm": 2.381601333618164, "learning_rate": 8.550598552311906e-06, "loss": 0.2474, "step": 857900 }, { "epoch": 11.438322379384356, "grad_norm": 2.4881038665771484, "learning_rate": 8.544530761585371e-06, "loss": 0.2572, "step": 858000 }, { "epoch": 11.439655517190811, "grad_norm": 9.97589111328125, "learning_rate": 8.538464767043084e-06, "loss": 0.2895, "step": 858100 }, { "epoch": 11.440988654997268, "grad_norm": 5.14256477355957, "learning_rate": 8.532400569192876e-06, "loss": 0.2741, "step": 858200 }, { "epoch": 
11.442321792803723, "grad_norm": 16.871395111083984, "learning_rate": 8.526338168542422e-06, "loss": 0.3028, "step": 858300 }, { "epoch": 11.443654930610178, "grad_norm": 3.90175461769104, "learning_rate": 8.520277565599243e-06, "loss": 0.2566, "step": 858400 }, { "epoch": 11.444988068416633, "grad_norm": 5.0227861404418945, "learning_rate": 8.514218760870726e-06, "loss": 0.2882, "step": 858500 }, { "epoch": 11.446321206223088, "grad_norm": 5.322887897491455, "learning_rate": 8.50816175486408e-06, "loss": 0.2719, "step": 858600 }, { "epoch": 11.447654344029543, "grad_norm": 1.9318363666534424, "learning_rate": 8.50210654808638e-06, "loss": 0.2728, "step": 858700 }, { "epoch": 11.448987481835998, "grad_norm": 1.4298526048660278, "learning_rate": 8.496053141044551e-06, "loss": 0.258, "step": 858800 }, { "epoch": 11.450320619642453, "grad_norm": 1.812097191810608, "learning_rate": 8.490001534245368e-06, "loss": 0.2654, "step": 858900 }, { "epoch": 11.451653757448907, "grad_norm": 1.224324107170105, "learning_rate": 8.483951728195465e-06, "loss": 0.2287, "step": 859000 }, { "epoch": 11.452986895255362, "grad_norm": 2.606306314468384, "learning_rate": 8.477903723401276e-06, "loss": 0.3023, "step": 859100 }, { "epoch": 11.454320033061817, "grad_norm": 3.055609703063965, "learning_rate": 8.471857520369153e-06, "loss": 0.3271, "step": 859200 }, { "epoch": 11.455653170868272, "grad_norm": 10.669920921325684, "learning_rate": 8.465813119605265e-06, "loss": 0.3044, "step": 859300 }, { "epoch": 11.456986308674727, "grad_norm": 2.0734565258026123, "learning_rate": 8.459770521615613e-06, "loss": 0.2967, "step": 859400 }, { "epoch": 11.458319446481182, "grad_norm": 4.97818660736084, "learning_rate": 8.453729726906073e-06, "loss": 0.2951, "step": 859500 }, { "epoch": 11.459652584287637, "grad_norm": 5.314402103424072, "learning_rate": 8.447690735982357e-06, "loss": 0.286, "step": 859600 }, { "epoch": 11.460985722094092, "grad_norm": 1.8695772886276245, "learning_rate": 
8.441653549350037e-06, "loss": 0.2582, "step": 859700 }, { "epoch": 11.462318859900549, "grad_norm": 8.314386367797852, "learning_rate": 8.435618167514523e-06, "loss": 0.278, "step": 859800 }, { "epoch": 11.463651997707004, "grad_norm": 1.733999490737915, "learning_rate": 8.42958459098109e-06, "loss": 0.2785, "step": 859900 }, { "epoch": 11.464985135513459, "grad_norm": 0.7130129933357239, "learning_rate": 8.423552820254821e-06, "loss": 0.2535, "step": 860000 }, { "epoch": 11.466318273319914, "grad_norm": 0.4788627326488495, "learning_rate": 8.417522855840713e-06, "loss": 0.3155, "step": 860100 }, { "epoch": 11.467651411126369, "grad_norm": 2.0303492546081543, "learning_rate": 8.411494698243546e-06, "loss": 0.2825, "step": 860200 }, { "epoch": 11.468984548932823, "grad_norm": 6.503332614898682, "learning_rate": 8.405468347967993e-06, "loss": 0.2631, "step": 860300 }, { "epoch": 11.470317686739278, "grad_norm": 3.1238906383514404, "learning_rate": 8.399443805518557e-06, "loss": 0.2684, "step": 860400 }, { "epoch": 11.471650824545733, "grad_norm": 0.14008614420890808, "learning_rate": 8.393421071399593e-06, "loss": 0.2456, "step": 860500 }, { "epoch": 11.472983962352188, "grad_norm": 2.947237730026245, "learning_rate": 8.387400146115308e-06, "loss": 0.2776, "step": 860600 }, { "epoch": 11.474317100158643, "grad_norm": 1.121751070022583, "learning_rate": 8.381441212371334e-06, "loss": 0.2734, "step": 860700 }, { "epoch": 11.475650237965098, "grad_norm": 2.2047579288482666, "learning_rate": 8.375423888167494e-06, "loss": 0.2482, "step": 860800 }, { "epoch": 11.476983375771553, "grad_norm": 2.182953357696533, "learning_rate": 8.369468520480786e-06, "loss": 0.2774, "step": 860900 }, { "epoch": 11.478316513578008, "grad_norm": 0.8537976145744324, "learning_rate": 8.363454799352293e-06, "loss": 0.2836, "step": 861000 }, { "epoch": 11.479649651384463, "grad_norm": 4.856468677520752, "learning_rate": 8.357442889567153e-06, "loss": 0.3054, "step": 861100 }, { "epoch": 
11.480982789190918, "grad_norm": 4.212578773498535, "learning_rate": 8.351432791628688e-06, "loss": 0.2491, "step": 861200 }, { "epoch": 11.482315926997373, "grad_norm": 4.409786224365234, "learning_rate": 8.345424506040018e-06, "loss": 0.2929, "step": 861300 }, { "epoch": 11.48364906480383, "grad_norm": 1.831067681312561, "learning_rate": 8.339418033304157e-06, "loss": 0.2959, "step": 861400 }, { "epoch": 11.484982202610285, "grad_norm": 2.4130806922912598, "learning_rate": 8.333413373923919e-06, "loss": 0.2983, "step": 861500 }, { "epoch": 11.48631534041674, "grad_norm": 3.4049177169799805, "learning_rate": 8.327410528402026e-06, "loss": 0.2754, "step": 861600 }, { "epoch": 11.487648478223194, "grad_norm": 2.2214443683624268, "learning_rate": 8.321409497241013e-06, "loss": 0.2459, "step": 861700 }, { "epoch": 11.48898161602965, "grad_norm": 2.3752386569976807, "learning_rate": 8.31541028094325e-06, "loss": 0.2784, "step": 861800 }, { "epoch": 11.490314753836104, "grad_norm": 3.205519914627075, "learning_rate": 8.30941288001098e-06, "loss": 0.3088, "step": 861900 }, { "epoch": 11.49164789164256, "grad_norm": 4.045821189880371, "learning_rate": 8.303417294946295e-06, "loss": 0.295, "step": 862000 }, { "epoch": 11.492981029449014, "grad_norm": 1.78306245803833, "learning_rate": 8.297423526251119e-06, "loss": 0.2896, "step": 862100 }, { "epoch": 11.49431416725547, "grad_norm": 1.604124665260315, "learning_rate": 8.291431574427231e-06, "loss": 0.2515, "step": 862200 }, { "epoch": 11.495647305061924, "grad_norm": 3.35526180267334, "learning_rate": 8.28544143997627e-06, "loss": 0.2954, "step": 862300 }, { "epoch": 11.496980442868379, "grad_norm": 0.2938272953033447, "learning_rate": 8.279453123399686e-06, "loss": 0.2602, "step": 862400 }, { "epoch": 11.498313580674834, "grad_norm": 2.5773699283599854, "learning_rate": 8.273466625198822e-06, "loss": 0.2838, "step": 862500 }, { "epoch": 11.499646718481289, "grad_norm": 1.1129941940307617, "learning_rate": 
8.267481945874858e-06, "loss": 0.2816, "step": 862600 }, { "epoch": 11.500979856287744, "grad_norm": 3.820784330368042, "learning_rate": 8.261499085928785e-06, "loss": 0.293, "step": 862700 }, { "epoch": 11.502312994094199, "grad_norm": 0.18470068275928497, "learning_rate": 8.255518045861486e-06, "loss": 0.2656, "step": 862800 }, { "epoch": 11.503646131900654, "grad_norm": 3.2013702392578125, "learning_rate": 8.24953882617367e-06, "loss": 0.2786, "step": 862900 }, { "epoch": 11.50497926970711, "grad_norm": 0.6084058284759521, "learning_rate": 8.243561427365901e-06, "loss": 0.3098, "step": 863000 }, { "epoch": 11.506312407513565, "grad_norm": 2.1349751949310303, "learning_rate": 8.237585849938597e-06, "loss": 0.283, "step": 863100 }, { "epoch": 11.50764554532002, "grad_norm": 1.382629156112671, "learning_rate": 8.231612094391995e-06, "loss": 0.2573, "step": 863200 }, { "epoch": 11.508978683126475, "grad_norm": 2.515981912612915, "learning_rate": 8.225640161226205e-06, "loss": 0.2865, "step": 863300 }, { "epoch": 11.51031182093293, "grad_norm": 2.6239519119262695, "learning_rate": 8.219670050941186e-06, "loss": 0.2939, "step": 863400 }, { "epoch": 11.511644958739385, "grad_norm": 2.446830987930298, "learning_rate": 8.213701764036732e-06, "loss": 0.2813, "step": 863500 }, { "epoch": 11.51297809654584, "grad_norm": 6.261186122894287, "learning_rate": 8.20773530101249e-06, "loss": 0.2747, "step": 863600 }, { "epoch": 11.514311234352295, "grad_norm": 1.8703464269638062, "learning_rate": 8.201770662367955e-06, "loss": 0.2616, "step": 863700 }, { "epoch": 11.51564437215875, "grad_norm": 1.4288796186447144, "learning_rate": 8.195807848602463e-06, "loss": 0.2703, "step": 863800 }, { "epoch": 11.516977509965205, "grad_norm": 3.026885986328125, "learning_rate": 8.189846860215207e-06, "loss": 0.2566, "step": 863900 }, { "epoch": 11.51831064777166, "grad_norm": 2.5339653491973877, "learning_rate": 8.183887697705227e-06, "loss": 0.255, "step": 864000 }, { "epoch": 
11.519643785578115, "grad_norm": 5.012900352478027, "learning_rate": 8.177930361571391e-06, "loss": 0.2743, "step": 864100 }, { "epoch": 11.52097692338457, "grad_norm": 2.217170476913452, "learning_rate": 8.171974852312437e-06, "loss": 0.2702, "step": 864200 }, { "epoch": 11.522310061191025, "grad_norm": 1.6684863567352295, "learning_rate": 8.166080698198657e-06, "loss": 0.2951, "step": 864300 }, { "epoch": 11.52364319899748, "grad_norm": 1.1526622772216797, "learning_rate": 8.160128825903864e-06, "loss": 0.2288, "step": 864400 }, { "epoch": 11.524976336803935, "grad_norm": 7.024319171905518, "learning_rate": 8.154178781974233e-06, "loss": 0.2367, "step": 864500 }, { "epoch": 11.52630947461039, "grad_norm": 1.785459041595459, "learning_rate": 8.148230566907891e-06, "loss": 0.2801, "step": 864600 }, { "epoch": 11.527642612416846, "grad_norm": 1.7178466320037842, "learning_rate": 8.142284181202802e-06, "loss": 0.2659, "step": 864700 }, { "epoch": 11.528975750223301, "grad_norm": 5.710700988769531, "learning_rate": 8.136339625356778e-06, "loss": 0.2962, "step": 864800 }, { "epoch": 11.530308888029756, "grad_norm": 1.7916768789291382, "learning_rate": 8.130396899867493e-06, "loss": 0.2841, "step": 864900 }, { "epoch": 11.531642025836211, "grad_norm": 6.752439022064209, "learning_rate": 8.124456005232418e-06, "loss": 0.2302, "step": 865000 }, { "epoch": 11.532975163642666, "grad_norm": 1.2486001253128052, "learning_rate": 8.118516941948943e-06, "loss": 0.2535, "step": 865100 }, { "epoch": 11.534308301449121, "grad_norm": 3.7759299278259277, "learning_rate": 8.112579710514259e-06, "loss": 0.2688, "step": 865200 }, { "epoch": 11.535641439255576, "grad_norm": 3.830681562423706, "learning_rate": 8.106644311425397e-06, "loss": 0.2589, "step": 865300 }, { "epoch": 11.536974577062031, "grad_norm": 3.5803253650665283, "learning_rate": 8.100710745179259e-06, "loss": 0.2885, "step": 865400 }, { "epoch": 11.538307714868486, "grad_norm": 5.756787300109863, "learning_rate": 
8.094779012272588e-06, "loss": 0.2932, "step": 865500 }, { "epoch": 11.53964085267494, "grad_norm": 2.067807674407959, "learning_rate": 8.088849113201962e-06, "loss": 0.3009, "step": 865600 }, { "epoch": 11.540973990481396, "grad_norm": 3.0370497703552246, "learning_rate": 8.082921048463824e-06, "loss": 0.2456, "step": 865700 }, { "epoch": 11.54230712828785, "grad_norm": 8.689926147460938, "learning_rate": 8.076994818554454e-06, "loss": 0.2846, "step": 865800 }, { "epoch": 11.543640266094306, "grad_norm": 3.1895129680633545, "learning_rate": 8.071070423969949e-06, "loss": 0.2885, "step": 865900 }, { "epoch": 11.54497340390076, "grad_norm": 9.283191680908203, "learning_rate": 8.065147865206327e-06, "loss": 0.2665, "step": 866000 }, { "epoch": 11.546306541707215, "grad_norm": 6.104173183441162, "learning_rate": 8.059227142759366e-06, "loss": 0.3297, "step": 866100 }, { "epoch": 11.547639679513672, "grad_norm": 1.613429069519043, "learning_rate": 8.053308257124749e-06, "loss": 0.3016, "step": 866200 }, { "epoch": 11.548972817320127, "grad_norm": 2.44187331199646, "learning_rate": 8.04739120879798e-06, "loss": 0.2491, "step": 866300 }, { "epoch": 11.550305955126582, "grad_norm": 2.940979480743408, "learning_rate": 8.04147599827442e-06, "loss": 0.2762, "step": 866400 }, { "epoch": 11.551639092933037, "grad_norm": 2.9253339767456055, "learning_rate": 8.03556262604927e-06, "loss": 0.2802, "step": 866500 }, { "epoch": 11.552972230739492, "grad_norm": 1.993588924407959, "learning_rate": 8.029651092617589e-06, "loss": 0.3419, "step": 866600 }, { "epoch": 11.554305368545947, "grad_norm": 12.383373260498047, "learning_rate": 8.023741398474252e-06, "loss": 0.3243, "step": 866700 }, { "epoch": 11.555638506352402, "grad_norm": 12.558550834655762, "learning_rate": 8.017892613549059e-06, "loss": 0.3371, "step": 866800 }, { "epoch": 11.556971644158857, "grad_norm": 2.9204294681549072, "learning_rate": 8.011986581061274e-06, "loss": 0.3183, "step": 866900 }, { "epoch": 
11.558304781965312, "grad_norm": 2.08571720123291, "learning_rate": 8.006082389340671e-06, "loss": 0.2262, "step": 867000 }, { "epoch": 11.559637919771767, "grad_norm": 1.5851726531982422, "learning_rate": 8.000180038881501e-06, "loss": 0.2856, "step": 867100 }, { "epoch": 11.560971057578222, "grad_norm": 4.143967628479004, "learning_rate": 7.994279530177917e-06, "loss": 0.2753, "step": 867200 }, { "epoch": 11.562304195384677, "grad_norm": 3.712432384490967, "learning_rate": 7.988380863723892e-06, "loss": 0.2656, "step": 867300 }, { "epoch": 11.563637333191132, "grad_norm": 2.4203031063079834, "learning_rate": 7.982484040013225e-06, "loss": 0.2523, "step": 867400 }, { "epoch": 11.564970470997586, "grad_norm": 2.6472790241241455, "learning_rate": 7.976589059539588e-06, "loss": 0.3202, "step": 867500 }, { "epoch": 11.566303608804041, "grad_norm": 1.6340421438217163, "learning_rate": 7.970695922796491e-06, "loss": 0.3391, "step": 867600 }, { "epoch": 11.567636746610496, "grad_norm": 1.812507152557373, "learning_rate": 7.964804630277292e-06, "loss": 0.2915, "step": 867700 }, { "epoch": 11.568969884416951, "grad_norm": 10.67653751373291, "learning_rate": 7.958915182475195e-06, "loss": 0.283, "step": 867800 }, { "epoch": 11.570303022223408, "grad_norm": 2.5869269371032715, "learning_rate": 7.95302757988323e-06, "loss": 0.2878, "step": 867900 }, { "epoch": 11.571636160029863, "grad_norm": 1.8252224922180176, "learning_rate": 7.947141822994292e-06, "loss": 0.3154, "step": 868000 }, { "epoch": 11.572969297836318, "grad_norm": 1.1929222345352173, "learning_rate": 7.941257912301141e-06, "loss": 0.249, "step": 868100 }, { "epoch": 11.574302435642773, "grad_norm": 1.335221529006958, "learning_rate": 7.935375848296331e-06, "loss": 0.2527, "step": 868200 }, { "epoch": 11.575635573449228, "grad_norm": 2.063171863555908, "learning_rate": 7.929495631472304e-06, "loss": 0.2634, "step": 868300 }, { "epoch": 11.576968711255683, "grad_norm": 2.608980894088745, "learning_rate": 
7.92361726232133e-06, "loss": 0.2789, "step": 868400 }, { "epoch": 11.578301849062138, "grad_norm": 2.36737322807312, "learning_rate": 7.917740741335527e-06, "loss": 0.3087, "step": 868500 }, { "epoch": 11.579634986868593, "grad_norm": 1.5620843172073364, "learning_rate": 7.91186606900686e-06, "loss": 0.2682, "step": 868600 }, { "epoch": 11.580968124675048, "grad_norm": 2.116508960723877, "learning_rate": 7.905993245827148e-06, "loss": 0.3034, "step": 868700 }, { "epoch": 11.582301262481502, "grad_norm": 0.9214315414428711, "learning_rate": 7.900122272288024e-06, "loss": 0.2701, "step": 868800 }, { "epoch": 11.583634400287957, "grad_norm": 3.7070536613464355, "learning_rate": 7.894253148881e-06, "loss": 0.2563, "step": 868900 }, { "epoch": 11.584967538094412, "grad_norm": 1.3277828693389893, "learning_rate": 7.888385876097419e-06, "loss": 0.2942, "step": 869000 }, { "epoch": 11.586300675900867, "grad_norm": 0.7061882019042969, "learning_rate": 7.882520454428473e-06, "loss": 0.301, "step": 869100 }, { "epoch": 11.587633813707322, "grad_norm": 1.869004249572754, "learning_rate": 7.876656884365198e-06, "loss": 0.2675, "step": 869200 }, { "epoch": 11.588966951513777, "grad_norm": 3.8921072483062744, "learning_rate": 7.870795166398466e-06, "loss": 0.2755, "step": 869300 }, { "epoch": 11.590300089320234, "grad_norm": 2.2744288444519043, "learning_rate": 7.864935301018995e-06, "loss": 0.2628, "step": 869400 }, { "epoch": 11.591633227126689, "grad_norm": 1.9491490125656128, "learning_rate": 7.85907728871738e-06, "loss": 0.3017, "step": 869500 }, { "epoch": 11.592966364933144, "grad_norm": 3.441875696182251, "learning_rate": 7.853221129984011e-06, "loss": 0.2923, "step": 869600 }, { "epoch": 11.594299502739599, "grad_norm": 3.360450506210327, "learning_rate": 7.847425359176707e-06, "loss": 0.2715, "step": 869700 }, { "epoch": 11.595632640546054, "grad_norm": 1.036820888519287, "learning_rate": 7.84157289050256e-06, "loss": 0.2532, "step": 869800 }, { "epoch": 
11.596965778352509, "grad_norm": 3.864276170730591, "learning_rate": 7.835722276862089e-06, "loss": 0.2699, "step": 869900 }, { "epoch": 11.598298916158964, "grad_norm": 4.392749309539795, "learning_rate": 7.829873518745078e-06, "loss": 0.3247, "step": 870000 }, { "epoch": 11.599632053965419, "grad_norm": 0.979407548904419, "learning_rate": 7.824026616641166e-06, "loss": 0.255, "step": 870100 }, { "epoch": 11.600965191771873, "grad_norm": 2.405672788619995, "learning_rate": 7.818181571039839e-06, "loss": 0.2735, "step": 870200 }, { "epoch": 11.602298329578328, "grad_norm": 0.5771623253822327, "learning_rate": 7.812338382430428e-06, "loss": 0.2539, "step": 870300 }, { "epoch": 11.603631467384783, "grad_norm": 2.511413335800171, "learning_rate": 7.806497051302116e-06, "loss": 0.2594, "step": 870400 }, { "epoch": 11.604964605191238, "grad_norm": 2.6003243923187256, "learning_rate": 7.800657578143907e-06, "loss": 0.2655, "step": 870500 }, { "epoch": 11.606297742997693, "grad_norm": 210.1833953857422, "learning_rate": 7.794878330390673e-06, "loss": 0.2804, "step": 870600 }, { "epoch": 11.607630880804148, "grad_norm": 0.5098041892051697, "learning_rate": 7.78904255604721e-06, "loss": 0.2137, "step": 870700 }, { "epoch": 11.608964018610603, "grad_norm": 21.66533088684082, "learning_rate": 7.783208641135098e-06, "loss": 0.2607, "step": 870800 }, { "epoch": 11.610297156417058, "grad_norm": 3.247753858566284, "learning_rate": 7.777376586142705e-06, "loss": 0.2313, "step": 870900 }, { "epoch": 11.611630294223513, "grad_norm": 1.3693182468414307, "learning_rate": 7.771546391558306e-06, "loss": 0.239, "step": 871000 }, { "epoch": 11.61296343202997, "grad_norm": 1.861351728439331, "learning_rate": 7.765718057869979e-06, "loss": 0.3048, "step": 871100 }, { "epoch": 11.614296569836425, "grad_norm": 4.54676628112793, "learning_rate": 7.759891585565639e-06, "loss": 0.2721, "step": 871200 }, { "epoch": 11.61562970764288, "grad_norm": 3.1501104831695557, "learning_rate": 
7.754066975133068e-06, "loss": 0.2833, "step": 871300 }, { "epoch": 11.616962845449335, "grad_norm": 0.3681785464286804, "learning_rate": 7.748244227059885e-06, "loss": 0.282, "step": 871400 }, { "epoch": 11.61829598325579, "grad_norm": 9.242782592773438, "learning_rate": 7.742423341833553e-06, "loss": 0.2827, "step": 871500 }, { "epoch": 11.619629121062244, "grad_norm": 2.394941568374634, "learning_rate": 7.736604319941375e-06, "loss": 0.3036, "step": 871600 }, { "epoch": 11.6209622588687, "grad_norm": 1.8571115732192993, "learning_rate": 7.730787161870514e-06, "loss": 0.3165, "step": 871700 }, { "epoch": 11.622295396675154, "grad_norm": 3.9358997344970703, "learning_rate": 7.724971868107932e-06, "loss": 0.2951, "step": 871800 }, { "epoch": 11.62362853448161, "grad_norm": 1.4062660932540894, "learning_rate": 7.71915843914051e-06, "loss": 0.2214, "step": 871900 }, { "epoch": 11.624961672288064, "grad_norm": 1.7834917306900024, "learning_rate": 7.713346875454896e-06, "loss": 0.2607, "step": 872000 }, { "epoch": 11.62629481009452, "grad_norm": 3.5179247856140137, "learning_rate": 7.70753717753763e-06, "loss": 0.3019, "step": 872100 }, { "epoch": 11.627627947900974, "grad_norm": 3.2881357669830322, "learning_rate": 7.70172934587508e-06, "loss": 0.309, "step": 872200 }, { "epoch": 11.628961085707429, "grad_norm": 2.718829870223999, "learning_rate": 7.695923380953458e-06, "loss": 0.2848, "step": 872300 }, { "epoch": 11.630294223513884, "grad_norm": 2.9728355407714844, "learning_rate": 7.690119283258824e-06, "loss": 0.2532, "step": 872400 }, { "epoch": 11.631627361320339, "grad_norm": 0.329740047454834, "learning_rate": 7.684317053277082e-06, "loss": 0.2501, "step": 872500 }, { "epoch": 11.632960499126796, "grad_norm": 2.5982258319854736, "learning_rate": 7.678516691493964e-06, "loss": 0.2676, "step": 872600 }, { "epoch": 11.63429363693325, "grad_norm": 1.5226020812988281, "learning_rate": 7.672718198395057e-06, "loss": 0.239, "step": 872700 }, { "epoch": 
11.635626774739706, "grad_norm": 2.4343161582946777, "learning_rate": 7.666921574465819e-06, "loss": 0.2738, "step": 872800 }, { "epoch": 11.63695991254616, "grad_norm": 2.157003402709961, "learning_rate": 7.661126820191494e-06, "loss": 0.2777, "step": 872900 }, { "epoch": 11.638293050352615, "grad_norm": 3.9683620929718018, "learning_rate": 7.655333936057216e-06, "loss": 0.2728, "step": 873000 }, { "epoch": 11.63962618815907, "grad_norm": 3.597301721572876, "learning_rate": 7.649542922547947e-06, "loss": 0.2921, "step": 873100 }, { "epoch": 11.640959325965525, "grad_norm": 0.05836411565542221, "learning_rate": 7.643753780148487e-06, "loss": 0.2475, "step": 873200 }, { "epoch": 11.64229246377198, "grad_norm": 1.4827470779418945, "learning_rate": 7.637966509343487e-06, "loss": 0.2702, "step": 873300 }, { "epoch": 11.643625601578435, "grad_norm": 1.0609769821166992, "learning_rate": 7.632181110617449e-06, "loss": 0.2848, "step": 873400 }, { "epoch": 11.64495873938489, "grad_norm": 1.0123403072357178, "learning_rate": 7.626397584454692e-06, "loss": 0.2791, "step": 873500 }, { "epoch": 11.646291877191345, "grad_norm": 2.3000385761260986, "learning_rate": 7.620615931339398e-06, "loss": 0.2993, "step": 873600 }, { "epoch": 11.6476250149978, "grad_norm": 2.841310501098633, "learning_rate": 7.614836151755596e-06, "loss": 0.2552, "step": 873700 }, { "epoch": 11.648958152804255, "grad_norm": 6.076557636260986, "learning_rate": 7.609058246187145e-06, "loss": 0.274, "step": 873800 }, { "epoch": 11.65029129061071, "grad_norm": 4.247016906738281, "learning_rate": 7.603282215117767e-06, "loss": 0.3011, "step": 873900 }, { "epoch": 11.651624428417165, "grad_norm": 1.4806047677993774, "learning_rate": 7.597508059030989e-06, "loss": 0.2534, "step": 874000 }, { "epoch": 11.65295756622362, "grad_norm": 3.4310038089752197, "learning_rate": 7.591735778410209e-06, "loss": 0.2676, "step": 874100 }, { "epoch": 11.654290704030075, "grad_norm": 3.10591459274292, "learning_rate": 
7.585965373738693e-06, "loss": 0.2821, "step": 874200 }, { "epoch": 11.655623841836531, "grad_norm": 1.9201483726501465, "learning_rate": 7.580196845499492e-06, "loss": 0.2768, "step": 874300 }, { "epoch": 11.656956979642986, "grad_norm": 4.6450114250183105, "learning_rate": 7.574430194175535e-06, "loss": 0.2561, "step": 874400 }, { "epoch": 11.658290117449441, "grad_norm": 3.5134503841400146, "learning_rate": 7.5686654202495905e-06, "loss": 0.2669, "step": 874500 }, { "epoch": 11.659623255255896, "grad_norm": 4.481507778167725, "learning_rate": 7.562902524204272e-06, "loss": 0.2796, "step": 874600 }, { "epoch": 11.660956393062351, "grad_norm": 5.691135406494141, "learning_rate": 7.557141506522021e-06, "loss": 0.263, "step": 874700 }, { "epoch": 11.662289530868806, "grad_norm": 6.663542747497559, "learning_rate": 7.551382367685149e-06, "loss": 0.2522, "step": 874800 }, { "epoch": 11.663622668675261, "grad_norm": 1.813889503479004, "learning_rate": 7.545625108175771e-06, "loss": 0.2876, "step": 874900 }, { "epoch": 11.664955806481716, "grad_norm": 1.894878625869751, "learning_rate": 7.539869728475877e-06, "loss": 0.3176, "step": 875000 }, { "epoch": 11.666288944288171, "grad_norm": 0.5745949745178223, "learning_rate": 7.534116229067287e-06, "loss": 0.2895, "step": 875100 }, { "epoch": 11.667622082094626, "grad_norm": 3.031857490539551, "learning_rate": 7.528364610431671e-06, "loss": 0.2402, "step": 875200 }, { "epoch": 11.66895521990108, "grad_norm": 5.618890762329102, "learning_rate": 7.52261487305054e-06, "loss": 0.2479, "step": 875300 }, { "epoch": 11.670288357707536, "grad_norm": 2.594421148300171, "learning_rate": 7.516867017405219e-06, "loss": 0.2845, "step": 875400 }, { "epoch": 11.67162149551399, "grad_norm": 1.0490972995758057, "learning_rate": 7.511121043976927e-06, "loss": 0.2717, "step": 875500 }, { "epoch": 11.672954633320446, "grad_norm": 0.21111635863780975, "learning_rate": 7.5053769532467e-06, "loss": 0.2482, "step": 875600 }, { "epoch": 
11.6742877711269, "grad_norm": 5.029289245605469, "learning_rate": 7.499634745695398e-06, "loss": 0.2492, "step": 875700 }, { "epoch": 11.675620908933357, "grad_norm": 4.723931789398193, "learning_rate": 7.493894421803746e-06, "loss": 0.2636, "step": 875800 }, { "epoch": 11.676954046739812, "grad_norm": 2.7320823669433594, "learning_rate": 7.48815598205231e-06, "loss": 0.2391, "step": 875900 }, { "epoch": 11.678287184546267, "grad_norm": 0.9192624688148499, "learning_rate": 7.482419426921486e-06, "loss": 0.283, "step": 876000 }, { "epoch": 11.679620322352722, "grad_norm": 2.2908530235290527, "learning_rate": 7.476684756891532e-06, "loss": 0.2741, "step": 876100 }, { "epoch": 11.680953460159177, "grad_norm": 0.9987959265708923, "learning_rate": 7.470951972442536e-06, "loss": 0.2785, "step": 876200 }, { "epoch": 11.682286597965632, "grad_norm": 2.1121068000793457, "learning_rate": 7.465221074054417e-06, "loss": 0.2964, "step": 876300 }, { "epoch": 11.683619735772087, "grad_norm": 1.1923198699951172, "learning_rate": 7.459492062206942e-06, "loss": 0.2626, "step": 876400 }, { "epoch": 11.684952873578542, "grad_norm": 2.9971323013305664, "learning_rate": 7.453764937379757e-06, "loss": 0.2689, "step": 876500 }, { "epoch": 11.686286011384997, "grad_norm": 2.040814161300659, "learning_rate": 7.448039700052292e-06, "loss": 0.3064, "step": 876600 }, { "epoch": 11.687619149191452, "grad_norm": 3.6144635677337646, "learning_rate": 7.442316350703859e-06, "loss": 0.342, "step": 876700 }, { "epoch": 11.688952286997907, "grad_norm": 2.0952000617980957, "learning_rate": 7.436652095073043e-06, "loss": 0.3138, "step": 876800 }, { "epoch": 11.690285424804362, "grad_norm": 11.020368576049805, "learning_rate": 7.430932504228179e-06, "loss": 0.2776, "step": 876900 }, { "epoch": 11.691618562610817, "grad_norm": 0.3791695833206177, "learning_rate": 7.425214802794503e-06, "loss": 0.2666, "step": 877000 }, { "epoch": 11.692951700417272, "grad_norm": 1.2187587022781372, "learning_rate": 
7.419498991250679e-06, "loss": 0.2904, "step": 877100 }, { "epoch": 11.694284838223727, "grad_norm": 4.556686878204346, "learning_rate": 7.4137850700752185e-06, "loss": 0.2847, "step": 877200 }, { "epoch": 11.695617976030181, "grad_norm": 3.0662841796875, "learning_rate": 7.408073039746478e-06, "loss": 0.2634, "step": 877300 }, { "epoch": 11.696951113836636, "grad_norm": 1.5077787637710571, "learning_rate": 7.402362900742654e-06, "loss": 0.3116, "step": 877400 }, { "epoch": 11.698284251643093, "grad_norm": 2.935046911239624, "learning_rate": 7.396654653541764e-06, "loss": 0.2688, "step": 877500 }, { "epoch": 11.699617389449548, "grad_norm": 1.68976628780365, "learning_rate": 7.390948298621687e-06, "loss": 0.2462, "step": 877600 }, { "epoch": 11.700950527256003, "grad_norm": 2.048398494720459, "learning_rate": 7.385243836460163e-06, "loss": 0.2626, "step": 877700 }, { "epoch": 11.702283665062458, "grad_norm": 2.4775960445404053, "learning_rate": 7.379541267534733e-06, "loss": 0.237, "step": 877800 }, { "epoch": 11.703616802868913, "grad_norm": 1.7761921882629395, "learning_rate": 7.3738405923227995e-06, "loss": 0.2745, "step": 877900 }, { "epoch": 11.704949940675368, "grad_norm": 1.7857154607772827, "learning_rate": 7.368141811301606e-06, "loss": 0.2246, "step": 878000 }, { "epoch": 11.706283078481823, "grad_norm": 6.366359233856201, "learning_rate": 7.362444924948243e-06, "loss": 0.2422, "step": 878100 }, { "epoch": 11.707616216288278, "grad_norm": 2.01678204536438, "learning_rate": 7.356749933739637e-06, "loss": 0.2305, "step": 878200 }, { "epoch": 11.708949354094733, "grad_norm": 3.972714424133301, "learning_rate": 7.3510568381525415e-06, "loss": 0.2838, "step": 878300 }, { "epoch": 11.710282491901188, "grad_norm": 2.507524251937866, "learning_rate": 7.345365638663571e-06, "loss": 0.3091, "step": 878400 }, { "epoch": 11.711615629707643, "grad_norm": 4.3078508377075195, "learning_rate": 7.339733219388709e-06, "loss": 0.2988, "step": 878500 }, { "epoch": 
11.712948767514098, "grad_norm": 0.6124159693717957, "learning_rate": 7.334045794552322e-06, "loss": 0.273, "step": 878600 }, { "epoch": 11.714281905320552, "grad_norm": 3.5428860187530518, "learning_rate": 7.3283602672381645e-06, "loss": 0.2932, "step": 878700 }, { "epoch": 11.715615043127007, "grad_norm": 4.544633865356445, "learning_rate": 7.3226766379222055e-06, "loss": 0.2679, "step": 878800 }, { "epoch": 11.716948180933462, "grad_norm": 1.050838828086853, "learning_rate": 7.316994907080294e-06, "loss": 0.2447, "step": 878900 }, { "epoch": 11.718281318739919, "grad_norm": 0.5961071252822876, "learning_rate": 7.311315075188058e-06, "loss": 0.2757, "step": 879000 }, { "epoch": 11.719614456546374, "grad_norm": 0.5856873989105225, "learning_rate": 7.3056371427210015e-06, "loss": 0.298, "step": 879100 }, { "epoch": 11.720947594352829, "grad_norm": 0.46718209981918335, "learning_rate": 7.299961110154465e-06, "loss": 0.2671, "step": 879200 }, { "epoch": 11.722280732159284, "grad_norm": 1.8555805683135986, "learning_rate": 7.294286977963629e-06, "loss": 0.2633, "step": 879300 }, { "epoch": 11.723613869965739, "grad_norm": 2.2545316219329834, "learning_rate": 7.2886147466235195e-06, "loss": 0.2685, "step": 879400 }, { "epoch": 11.724947007772194, "grad_norm": 30.743560791015625, "learning_rate": 7.282944416608981e-06, "loss": 0.3148, "step": 879500 }, { "epoch": 11.726280145578649, "grad_norm": 1.9947175979614258, "learning_rate": 7.2772759883947295e-06, "loss": 0.2961, "step": 879600 }, { "epoch": 11.727613283385104, "grad_norm": 2.150372266769409, "learning_rate": 7.271609462455302e-06, "loss": 0.3093, "step": 879700 }, { "epoch": 11.728946421191559, "grad_norm": 3.4747819900512695, "learning_rate": 7.265944839265083e-06, "loss": 0.2817, "step": 879800 }, { "epoch": 11.730279558998014, "grad_norm": 1.0805890560150146, "learning_rate": 7.260282119298299e-06, "loss": 0.2688, "step": 879900 }, { "epoch": 11.731612696804468, "grad_norm": 0.8939980864524841, 
"learning_rate": 7.254621303029022e-06, "loss": 0.2724, "step": 880000 }, { "epoch": 11.732945834610923, "grad_norm": 5.220782279968262, "learning_rate": 7.248962390931135e-06, "loss": 0.2589, "step": 880100 }, { "epoch": 11.734278972417378, "grad_norm": 1.7845185995101929, "learning_rate": 7.243305383478407e-06, "loss": 0.2192, "step": 880200 }, { "epoch": 11.735612110223833, "grad_norm": 6.824737071990967, "learning_rate": 7.237650281144425e-06, "loss": 0.3145, "step": 880300 }, { "epoch": 11.736945248030288, "grad_norm": 2.6832690238952637, "learning_rate": 7.231997084402599e-06, "loss": 0.2917, "step": 880400 }, { "epoch": 11.738278385836743, "grad_norm": 1.8710452318191528, "learning_rate": 7.226345793726206e-06, "loss": 0.2694, "step": 880500 }, { "epoch": 11.739611523643198, "grad_norm": 0.9140282869338989, "learning_rate": 7.220696409588358e-06, "loss": 0.2888, "step": 880600 }, { "epoch": 11.740944661449655, "grad_norm": 0.11161131411790848, "learning_rate": 7.215048932461998e-06, "loss": 0.2595, "step": 880700 }, { "epoch": 11.74227779925611, "grad_norm": 2.728036403656006, "learning_rate": 7.209403362819914e-06, "loss": 0.309, "step": 880800 }, { "epoch": 11.743610937062565, "grad_norm": 1.6523206233978271, "learning_rate": 7.2038161283056624e-06, "loss": 0.3125, "step": 880900 }, { "epoch": 11.74494407486902, "grad_norm": 3.0909667015075684, "learning_rate": 7.198174355963234e-06, "loss": 0.2766, "step": 881000 }, { "epoch": 11.746277212675475, "grad_norm": 5.044795513153076, "learning_rate": 7.192534492517773e-06, "loss": 0.2734, "step": 881100 }, { "epoch": 11.74761035048193, "grad_norm": 3.0103859901428223, "learning_rate": 7.186896538441434e-06, "loss": 0.2478, "step": 881200 }, { "epoch": 11.748943488288385, "grad_norm": 1.2637537717819214, "learning_rate": 7.181260494206191e-06, "loss": 0.2987, "step": 881300 }, { "epoch": 11.75027662609484, "grad_norm": 0.5832773447036743, "learning_rate": 7.175626360283895e-06, "loss": 0.2504, "step": 881400 }, 
{ "epoch": 11.751609763901294, "grad_norm": 0.7584011554718018, "learning_rate": 7.169994137146221e-06, "loss": 0.1997, "step": 881500 }, { "epoch": 11.75294290170775, "grad_norm": 2.4686920642852783, "learning_rate": 7.164420118921209e-06, "loss": 0.2207, "step": 881600 }, { "epoch": 11.754276039514204, "grad_norm": 0.3683180809020996, "learning_rate": 7.158791699647518e-06, "loss": 0.3019, "step": 881700 }, { "epoch": 11.75560917732066, "grad_norm": 0.009500913321971893, "learning_rate": 7.153165192567787e-06, "loss": 0.3079, "step": 881800 }, { "epoch": 11.756942315127114, "grad_norm": 0.7710634469985962, "learning_rate": 7.147540598153052e-06, "loss": 0.315, "step": 881900 }, { "epoch": 11.75827545293357, "grad_norm": 1.6923195123672485, "learning_rate": 7.141917916874173e-06, "loss": 0.3212, "step": 882000 }, { "epoch": 11.759608590740024, "grad_norm": 2.778151273727417, "learning_rate": 7.136297149201864e-06, "loss": 0.3103, "step": 882100 }, { "epoch": 11.76094172854648, "grad_norm": 2.8409624099731445, "learning_rate": 7.1306782956067025e-06, "loss": 0.2799, "step": 882200 }, { "epoch": 11.762274866352936, "grad_norm": 5.076458930969238, "learning_rate": 7.125061356559055e-06, "loss": 0.2802, "step": 882300 }, { "epoch": 11.76360800415939, "grad_norm": 1.2239861488342285, "learning_rate": 7.1194463325291685e-06, "loss": 0.2879, "step": 882400 }, { "epoch": 11.764941141965846, "grad_norm": 3.403141498565674, "learning_rate": 7.113833223987096e-06, "loss": 0.317, "step": 882500 }, { "epoch": 11.7662742797723, "grad_norm": 1.9211995601654053, "learning_rate": 7.10822203140277e-06, "loss": 0.264, "step": 882600 }, { "epoch": 11.767607417578756, "grad_norm": 2.877993583679199, "learning_rate": 7.1026127552459404e-06, "loss": 0.213, "step": 882700 }, { "epoch": 11.76894055538521, "grad_norm": 2.2014949321746826, "learning_rate": 7.097005395986185e-06, "loss": 0.2914, "step": 882800 }, { "epoch": 11.770273693191665, "grad_norm": 1.9589903354644775, 
"learning_rate": 7.091399954092943e-06, "loss": 0.2479, "step": 882900 }, { "epoch": 11.77160683099812, "grad_norm": 1.6294783353805542, "learning_rate": 7.0857964300354805e-06, "loss": 0.2495, "step": 883000 }, { "epoch": 11.772939968804575, "grad_norm": 3.1174395084381104, "learning_rate": 7.080194824282913e-06, "loss": 0.3045, "step": 883100 }, { "epoch": 11.77427310661103, "grad_norm": 2.68426513671875, "learning_rate": 7.074595137304182e-06, "loss": 0.2472, "step": 883200 }, { "epoch": 11.775606244417485, "grad_norm": 3.132901668548584, "learning_rate": 7.0689973695680886e-06, "loss": 0.2651, "step": 883300 }, { "epoch": 11.77693938222394, "grad_norm": 1.7609425783157349, "learning_rate": 7.063401521543239e-06, "loss": 0.2776, "step": 883400 }, { "epoch": 11.778272520030395, "grad_norm": 3.808729410171509, "learning_rate": 7.057807593698108e-06, "loss": 0.3147, "step": 883500 }, { "epoch": 11.77960565783685, "grad_norm": 9.037235260009766, "learning_rate": 7.05221558650102e-06, "loss": 0.2816, "step": 883600 }, { "epoch": 11.780938795643305, "grad_norm": 2.978830099105835, "learning_rate": 7.046625500420096e-06, "loss": 0.2913, "step": 883700 }, { "epoch": 11.78227193344976, "grad_norm": 2.4860856533050537, "learning_rate": 7.041037335923331e-06, "loss": 0.2693, "step": 883800 }, { "epoch": 11.783605071256215, "grad_norm": 3.573829174041748, "learning_rate": 7.035451093478548e-06, "loss": 0.3201, "step": 883900 }, { "epoch": 11.784938209062672, "grad_norm": 2.1699657440185547, "learning_rate": 7.029866773553407e-06, "loss": 0.2357, "step": 884000 }, { "epoch": 11.786271346869126, "grad_norm": 1.7024701833724976, "learning_rate": 7.024284376615422e-06, "loss": 0.2388, "step": 884100 }, { "epoch": 11.787604484675581, "grad_norm": 3.369220495223999, "learning_rate": 7.018703903131912e-06, "loss": 0.2719, "step": 884200 }, { "epoch": 11.788937622482036, "grad_norm": 4.459671497344971, "learning_rate": 7.013125353570072e-06, "loss": 0.3202, "step": 884300 }, { 
"epoch": 11.790270760288491, "grad_norm": 0.7341356873512268, "learning_rate": 7.007548728396915e-06, "loss": 0.2515, "step": 884400 }, { "epoch": 11.791603898094946, "grad_norm": 2.1921002864837646, "learning_rate": 7.001974028079298e-06, "loss": 0.2619, "step": 884500 }, { "epoch": 11.792937035901401, "grad_norm": 2.7920329570770264, "learning_rate": 6.996401253083921e-06, "loss": 0.3317, "step": 884600 }, { "epoch": 11.794270173707856, "grad_norm": 1.448196291923523, "learning_rate": 6.990830403877324e-06, "loss": 0.2575, "step": 884700 }, { "epoch": 11.795603311514311, "grad_norm": 4.250967979431152, "learning_rate": 6.985261480925862e-06, "loss": 0.2733, "step": 884800 }, { "epoch": 11.796936449320766, "grad_norm": 1.48957359790802, "learning_rate": 6.979694484695766e-06, "loss": 0.2705, "step": 884900 }, { "epoch": 11.798269587127221, "grad_norm": 1.603066086769104, "learning_rate": 6.974129415653089e-06, "loss": 0.2587, "step": 885000 }, { "epoch": 11.799602724933676, "grad_norm": 14.403501510620117, "learning_rate": 6.968566274263706e-06, "loss": 0.2425, "step": 885100 }, { "epoch": 11.80093586274013, "grad_norm": 1.0629826784133911, "learning_rate": 6.9630050609933505e-06, "loss": 0.3183, "step": 885200 }, { "epoch": 11.802269000546586, "grad_norm": 23.925416946411133, "learning_rate": 6.957445776307597e-06, "loss": 0.3128, "step": 885300 }, { "epoch": 11.80360213835304, "grad_norm": 6.14047384262085, "learning_rate": 6.951888420671844e-06, "loss": 0.2559, "step": 885400 }, { "epoch": 11.804935276159497, "grad_norm": 1.2519170045852661, "learning_rate": 6.946332994551347e-06, "loss": 0.2266, "step": 885500 }, { "epoch": 11.806268413965952, "grad_norm": 2.963927745819092, "learning_rate": 6.940779498411174e-06, "loss": 0.3132, "step": 885600 }, { "epoch": 11.807601551772407, "grad_norm": 3.295853614807129, "learning_rate": 6.9352279327162504e-06, "loss": 0.2479, "step": 885700 }, { "epoch": 11.808934689578862, "grad_norm": 1.7031188011169434, 
"learning_rate": 6.929678297931339e-06, "loss": 0.2428, "step": 885800 }, { "epoch": 11.810267827385317, "grad_norm": 6.181028842926025, "learning_rate": 6.924130594521035e-06, "loss": 0.2948, "step": 885900 }, { "epoch": 11.811600965191772, "grad_norm": 3.6480331420898438, "learning_rate": 6.918584822949775e-06, "loss": 0.2572, "step": 886000 }, { "epoch": 11.812934102998227, "grad_norm": 2.9992990493774414, "learning_rate": 6.913040983681846e-06, "loss": 0.2974, "step": 886100 }, { "epoch": 11.814267240804682, "grad_norm": 2.0895068645477295, "learning_rate": 6.9074990771813306e-06, "loss": 0.2628, "step": 886200 }, { "epoch": 11.815600378611137, "grad_norm": 1.8713104724884033, "learning_rate": 6.90195910391221e-06, "loss": 0.2941, "step": 886300 }, { "epoch": 11.816933516417592, "grad_norm": 2.3353536128997803, "learning_rate": 6.896421064338269e-06, "loss": 0.3016, "step": 886400 }, { "epoch": 11.818266654224047, "grad_norm": 7.600959300994873, "learning_rate": 6.89088495892312e-06, "loss": 0.3028, "step": 886500 }, { "epoch": 11.819599792030502, "grad_norm": 0.8819344639778137, "learning_rate": 6.885350788130235e-06, "loss": 0.2614, "step": 886600 }, { "epoch": 11.820932929836957, "grad_norm": 1.2925019264221191, "learning_rate": 6.879818552422915e-06, "loss": 0.2494, "step": 886700 }, { "epoch": 11.822266067643412, "grad_norm": 0.19378753006458282, "learning_rate": 6.87428825226431e-06, "loss": 0.2588, "step": 886800 }, { "epoch": 11.823599205449867, "grad_norm": 3.2930877208709717, "learning_rate": 6.86875988811739e-06, "loss": 0.2474, "step": 886900 }, { "epoch": 11.824932343256322, "grad_norm": 4.595335006713867, "learning_rate": 6.863233460444987e-06, "loss": 0.2685, "step": 887000 }, { "epoch": 11.826265481062777, "grad_norm": 3.9929897785186768, "learning_rate": 6.8577089697097275e-06, "loss": 0.2331, "step": 887100 }, { "epoch": 11.827598618869233, "grad_norm": 3.422025203704834, "learning_rate": 6.852186416374142e-06, "loss": 0.3249, "step": 887200 
}, { "epoch": 11.828931756675688, "grad_norm": 8.293984413146973, "learning_rate": 6.846665800900531e-06, "loss": 0.2894, "step": 887300 }, { "epoch": 11.830264894482143, "grad_norm": 2.097482204437256, "learning_rate": 6.841147123751071e-06, "loss": 0.2703, "step": 887400 }, { "epoch": 11.831598032288598, "grad_norm": 1.9757205247879028, "learning_rate": 6.835630385387773e-06, "loss": 0.2529, "step": 887500 }, { "epoch": 11.832931170095053, "grad_norm": 2.836461305618286, "learning_rate": 6.830115586272481e-06, "loss": 0.2655, "step": 887600 }, { "epoch": 11.834264307901508, "grad_norm": 6.192403316497803, "learning_rate": 6.824602726866871e-06, "loss": 0.2726, "step": 887700 }, { "epoch": 11.835597445707963, "grad_norm": 2.497969627380371, "learning_rate": 6.8190918076324745e-06, "loss": 0.2706, "step": 887800 }, { "epoch": 11.836930583514418, "grad_norm": 2.1243536472320557, "learning_rate": 6.8135828290306335e-06, "loss": 0.268, "step": 887900 }, { "epoch": 11.838263721320873, "grad_norm": 2.741239547729492, "learning_rate": 6.808075791522545e-06, "loss": 0.2601, "step": 888000 }, { "epoch": 11.839596859127328, "grad_norm": 2.099771499633789, "learning_rate": 6.802570695569246e-06, "loss": 0.2757, "step": 888100 }, { "epoch": 11.840929996933783, "grad_norm": 2.9384939670562744, "learning_rate": 6.7970675416316015e-06, "loss": 0.2845, "step": 888200 }, { "epoch": 11.842263134740238, "grad_norm": 2.1098012924194336, "learning_rate": 6.791566330170321e-06, "loss": 0.3007, "step": 888300 }, { "epoch": 11.843596272546693, "grad_norm": 4.187661647796631, "learning_rate": 6.786067061645958e-06, "loss": 0.3225, "step": 888400 }, { "epoch": 11.844929410353148, "grad_norm": 3.6738345623016357, "learning_rate": 6.7805697365188655e-06, "loss": 0.2657, "step": 888500 }, { "epoch": 11.846262548159602, "grad_norm": 0.9980809688568115, "learning_rate": 6.775074355249301e-06, "loss": 0.3137, "step": 888600 }, { "epoch": 11.84759568596606, "grad_norm": 3.746868848800659, 
"learning_rate": 6.7695809182972865e-06, "loss": 0.2788, "step": 888700 }, { "epoch": 11.848928823772514, "grad_norm": 5.811614036560059, "learning_rate": 6.764089426122735e-06, "loss": 0.212, "step": 888800 }, { "epoch": 11.850261961578969, "grad_norm": 1.0099453926086426, "learning_rate": 6.758654765024305e-06, "loss": 0.2827, "step": 888900 }, { "epoch": 11.851595099385424, "grad_norm": 2.3507275581359863, "learning_rate": 6.753167144324457e-06, "loss": 0.2712, "step": 889000 }, { "epoch": 11.852928237191879, "grad_norm": 2.648359775543213, "learning_rate": 6.747681469776169e-06, "loss": 0.2674, "step": 889100 }, { "epoch": 11.854261374998334, "grad_norm": 0.3788479268550873, "learning_rate": 6.74219774183869e-06, "loss": 0.2763, "step": 889200 }, { "epoch": 11.855594512804789, "grad_norm": 1.5369967222213745, "learning_rate": 6.7367159609711035e-06, "loss": 0.3172, "step": 889300 }, { "epoch": 11.856927650611244, "grad_norm": 4.108091354370117, "learning_rate": 6.731236127632322e-06, "loss": 0.2425, "step": 889400 }, { "epoch": 11.858260788417699, "grad_norm": 2.1124703884124756, "learning_rate": 6.725758242281105e-06, "loss": 0.2981, "step": 889500 }, { "epoch": 11.859593926224154, "grad_norm": 2.145397901535034, "learning_rate": 6.720337055098771e-06, "loss": 0.2817, "step": 889600 }, { "epoch": 11.860927064030609, "grad_norm": 2.1509482860565186, "learning_rate": 6.714863047606981e-06, "loss": 0.2582, "step": 889700 }, { "epoch": 11.862260201837064, "grad_norm": 8.327189445495605, "learning_rate": 6.709390989473448e-06, "loss": 0.3017, "step": 889800 }, { "epoch": 11.863593339643518, "grad_norm": 3.521618127822876, "learning_rate": 6.703920881156281e-06, "loss": 0.2443, "step": 889900 }, { "epoch": 11.864926477449973, "grad_norm": 5.362782955169678, "learning_rate": 6.698452723113423e-06, "loss": 0.2952, "step": 890000 }, { "epoch": 11.866259615256428, "grad_norm": 4.01275634765625, "learning_rate": 6.6929865158026506e-06, "loss": 0.2878, "step": 890100 }, { 
"epoch": 11.867592753062883, "grad_norm": 3.011690855026245, "learning_rate": 6.687522259681584e-06, "loss": 0.2615, "step": 890200 }, { "epoch": 11.868925890869338, "grad_norm": 4.270585536956787, "learning_rate": 6.6820599552076625e-06, "loss": 0.2476, "step": 890300 }, { "epoch": 11.870259028675795, "grad_norm": 2.9336509704589844, "learning_rate": 6.676599602838172e-06, "loss": 0.3148, "step": 890400 }, { "epoch": 11.87159216648225, "grad_norm": 1.4522470235824585, "learning_rate": 6.671141203030243e-06, "loss": 0.3056, "step": 890500 }, { "epoch": 11.872925304288705, "grad_norm": 6.838119029998779, "learning_rate": 6.665684756240835e-06, "loss": 0.2768, "step": 890600 }, { "epoch": 11.87425844209516, "grad_norm": 3.2221872806549072, "learning_rate": 6.660230262926742e-06, "loss": 0.2849, "step": 890700 }, { "epoch": 11.875591579901615, "grad_norm": 3.3815879821777344, "learning_rate": 6.654777723544605e-06, "loss": 0.2748, "step": 890800 }, { "epoch": 11.87692471770807, "grad_norm": 2.3127996921539307, "learning_rate": 6.6493271385508696e-06, "loss": 0.2519, "step": 890900 }, { "epoch": 11.878257855514525, "grad_norm": 2.1568470001220703, "learning_rate": 6.643878508401869e-06, "loss": 0.274, "step": 891000 }, { "epoch": 11.87959099332098, "grad_norm": 4.910637378692627, "learning_rate": 6.638431833553741e-06, "loss": 0.2554, "step": 891100 }, { "epoch": 11.880924131127435, "grad_norm": 1.3070067167282104, "learning_rate": 6.632987114462452e-06, "loss": 0.2799, "step": 891200 }, { "epoch": 11.88225726893389, "grad_norm": 0.9298990368843079, "learning_rate": 6.627544351583822e-06, "loss": 0.277, "step": 891300 }, { "epoch": 11.883590406740344, "grad_norm": 5.693776607513428, "learning_rate": 6.622103545373503e-06, "loss": 0.2557, "step": 891400 }, { "epoch": 11.8849235445468, "grad_norm": 4.251009941101074, "learning_rate": 6.616664696286979e-06, "loss": 0.3232, "step": 891500 }, { "epoch": 11.886256682353254, "grad_norm": 2.2560787200927734, "learning_rate": 
6.6112278047795845e-06, "loss": 0.2373, "step": 891600 }, { "epoch": 11.88758982015971, "grad_norm": 2.2414982318878174, "learning_rate": 6.605792871306465e-06, "loss": 0.3135, "step": 891700 }, { "epoch": 11.888922957966164, "grad_norm": 55.600074768066406, "learning_rate": 6.600359896322613e-06, "loss": 0.2698, "step": 891800 }, { "epoch": 11.89025609577262, "grad_norm": 11.54895305633545, "learning_rate": 6.594928880282884e-06, "loss": 0.3029, "step": 891900 }, { "epoch": 11.891589233579076, "grad_norm": 1.9194616079330444, "learning_rate": 6.589499823641919e-06, "loss": 0.2344, "step": 892000 }, { "epoch": 11.89292237138553, "grad_norm": 1.4370161294937134, "learning_rate": 6.584072726854237e-06, "loss": 0.2758, "step": 892100 }, { "epoch": 11.894255509191986, "grad_norm": 3.036733865737915, "learning_rate": 6.578647590374171e-06, "loss": 0.2353, "step": 892200 }, { "epoch": 11.89558864699844, "grad_norm": 2.6518914699554443, "learning_rate": 6.573224414655897e-06, "loss": 0.2832, "step": 892300 }, { "epoch": 11.896921784804896, "grad_norm": 1.7423714399337769, "learning_rate": 6.567803200153427e-06, "loss": 0.3135, "step": 892400 }, { "epoch": 11.89825492261135, "grad_norm": 3.5193214416503906, "learning_rate": 6.56238394732062e-06, "loss": 0.2624, "step": 892500 }, { "epoch": 11.899588060417805, "grad_norm": 5.316020965576172, "learning_rate": 6.5570208198042284e-06, "loss": 0.272, "step": 892600 }, { "epoch": 11.90092119822426, "grad_norm": 0.9564589262008667, "learning_rate": 6.551605472043587e-06, "loss": 0.2627, "step": 892700 }, { "epoch": 11.902254336030715, "grad_norm": 3.3286662101745605, "learning_rate": 6.546192087308618e-06, "loss": 0.2665, "step": 892800 }, { "epoch": 11.90358747383717, "grad_norm": 0.22393138706684113, "learning_rate": 6.540780666052521e-06, "loss": 0.2868, "step": 892900 }, { "epoch": 11.904920611643625, "grad_norm": 2.13742995262146, "learning_rate": 6.5353712087283025e-06, "loss": 0.2915, "step": 893000 }, { "epoch": 
11.90625374945008, "grad_norm": 3.4224750995635986, "learning_rate": 6.529963715788858e-06, "loss": 0.2243, "step": 893100 }, { "epoch": 11.907586887256535, "grad_norm": 15.364213943481445, "learning_rate": 6.524558187686857e-06, "loss": 0.2995, "step": 893200 }, { "epoch": 11.90892002506299, "grad_norm": 3.288698434829712, "learning_rate": 6.519154624874843e-06, "loss": 0.3164, "step": 893300 }, { "epoch": 11.910253162869445, "grad_norm": 3.4602901935577393, "learning_rate": 6.513753027805187e-06, "loss": 0.3227, "step": 893400 }, { "epoch": 11.9115863006759, "grad_norm": 2.2780284881591797, "learning_rate": 6.508353396930092e-06, "loss": 0.2395, "step": 893500 }, { "epoch": 11.912919438482357, "grad_norm": 2.333223581314087, "learning_rate": 6.502955732701597e-06, "loss": 0.2797, "step": 893600 }, { "epoch": 11.914252576288812, "grad_norm": 8.352416038513184, "learning_rate": 6.497560035571588e-06, "loss": 0.2321, "step": 893700 }, { "epoch": 11.915585714095267, "grad_norm": 3.666830539703369, "learning_rate": 6.492166305991759e-06, "loss": 0.245, "step": 893800 }, { "epoch": 11.916918851901722, "grad_norm": 2.2875680923461914, "learning_rate": 6.486774544413665e-06, "loss": 0.3061, "step": 893900 }, { "epoch": 11.918251989708176, "grad_norm": 3.662961721420288, "learning_rate": 6.481384751288685e-06, "loss": 0.3348, "step": 894000 }, { "epoch": 11.919585127514631, "grad_norm": 2.463247776031494, "learning_rate": 6.475996927068034e-06, "loss": 0.2694, "step": 894100 }, { "epoch": 11.920918265321086, "grad_norm": 4.491716384887695, "learning_rate": 6.4706110722027685e-06, "loss": 0.2487, "step": 894200 }, { "epoch": 11.922251403127541, "grad_norm": 3.146228313446045, "learning_rate": 6.465227187143782e-06, "loss": 0.2523, "step": 894300 }, { "epoch": 11.923584540933996, "grad_norm": 0.3491351902484894, "learning_rate": 6.459845272341772e-06, "loss": 0.3127, "step": 894400 }, { "epoch": 11.924917678740451, "grad_norm": 2.501765489578247, "learning_rate": 
6.454465328247323e-06, "loss": 0.2545, "step": 894500 }, { "epoch": 11.926250816546906, "grad_norm": 2.617704391479492, "learning_rate": 6.449087355310811e-06, "loss": 0.25, "step": 894600 }, { "epoch": 11.927583954353361, "grad_norm": 4.533973693847656, "learning_rate": 6.443711353982466e-06, "loss": 0.3141, "step": 894700 }, { "epoch": 11.928917092159816, "grad_norm": 4.279921054840088, "learning_rate": 6.438337324712346e-06, "loss": 0.2607, "step": 894800 }, { "epoch": 11.930250229966271, "grad_norm": 1.679612636566162, "learning_rate": 6.432965267950358e-06, "loss": 0.2738, "step": 894900 }, { "epoch": 11.931583367772726, "grad_norm": 1.6124738454818726, "learning_rate": 6.427595184146226e-06, "loss": 0.276, "step": 895000 }, { "epoch": 11.932916505579183, "grad_norm": 1.5098003149032593, "learning_rate": 6.422227073749526e-06, "loss": 0.2419, "step": 895100 }, { "epoch": 11.934249643385638, "grad_norm": 3.0955042839050293, "learning_rate": 6.416860937209643e-06, "loss": 0.3011, "step": 895200 }, { "epoch": 11.935582781192092, "grad_norm": 1.8335355520248413, "learning_rate": 6.411496774975816e-06, "loss": 0.2334, "step": 895300 }, { "epoch": 11.936915918998547, "grad_norm": 4.473577499389648, "learning_rate": 6.406134587497138e-06, "loss": 0.2425, "step": 895400 }, { "epoch": 11.938249056805002, "grad_norm": 0.7117503881454468, "learning_rate": 6.40077437522249e-06, "loss": 0.2714, "step": 895500 }, { "epoch": 11.939582194611457, "grad_norm": 4.157454967498779, "learning_rate": 6.395416138600617e-06, "loss": 0.2691, "step": 895600 }, { "epoch": 11.940915332417912, "grad_norm": 0.2763010263442993, "learning_rate": 6.3900598780801e-06, "loss": 0.2716, "step": 895700 }, { "epoch": 11.942248470224367, "grad_norm": 1.2014427185058594, "learning_rate": 6.384705594109343e-06, "loss": 0.2636, "step": 895800 }, { "epoch": 11.943581608030822, "grad_norm": 0.31700971722602844, "learning_rate": 6.3793532871365925e-06, "loss": 0.2232, "step": 895900 }, { "epoch": 
11.944914745837277, "grad_norm": 1.6016260385513306, "learning_rate": 6.374002957609935e-06, "loss": 0.271, "step": 896000 }, { "epoch": 11.946247883643732, "grad_norm": 1.6328692436218262, "learning_rate": 6.368654605977265e-06, "loss": 0.231, "step": 896100 }, { "epoch": 11.947581021450187, "grad_norm": 1.2327414751052856, "learning_rate": 6.36330823268634e-06, "loss": 0.2582, "step": 896200 }, { "epoch": 11.948914159256642, "grad_norm": 1.201438069343567, "learning_rate": 6.357963838184741e-06, "loss": 0.2535, "step": 896300 }, { "epoch": 11.950247297063097, "grad_norm": 3.7756526470184326, "learning_rate": 6.352621422919882e-06, "loss": 0.257, "step": 896400 }, { "epoch": 11.951580434869552, "grad_norm": 0.8409222364425659, "learning_rate": 6.347280987339024e-06, "loss": 0.298, "step": 896500 }, { "epoch": 11.952913572676007, "grad_norm": 7.653677940368652, "learning_rate": 6.341942531889224e-06, "loss": 0.3087, "step": 896600 }, { "epoch": 11.954246710482462, "grad_norm": 7.829451084136963, "learning_rate": 6.336606057017427e-06, "loss": 0.2848, "step": 896700 }, { "epoch": 11.955579848288918, "grad_norm": 4.472172737121582, "learning_rate": 6.331271563170387e-06, "loss": 0.2909, "step": 896800 }, { "epoch": 11.956912986095373, "grad_norm": 2.3284947872161865, "learning_rate": 6.32593905079467e-06, "loss": 0.2971, "step": 896900 }, { "epoch": 11.958246123901828, "grad_norm": 12.280878067016602, "learning_rate": 6.3206085203367105e-06, "loss": 0.2602, "step": 897000 }, { "epoch": 11.959579261708283, "grad_norm": 3.3376433849334717, "learning_rate": 6.315279972242763e-06, "loss": 0.275, "step": 897100 }, { "epoch": 11.960912399514738, "grad_norm": 1.0438425540924072, "learning_rate": 6.309953406958916e-06, "loss": 0.2723, "step": 897200 }, { "epoch": 11.962245537321193, "grad_norm": 2.2277417182922363, "learning_rate": 6.304682060932791e-06, "loss": 0.291, "step": 897300 }, { "epoch": 11.963578675127648, "grad_norm": 3.3058409690856934, "learning_rate": 
6.299359442767516e-06, "loss": 0.2492, "step": 897400 }, { "epoch": 11.964911812934103, "grad_norm": 2.5768532752990723, "learning_rate": 6.294038808745174e-06, "loss": 0.2594, "step": 897500 }, { "epoch": 11.966244950740558, "grad_norm": 0.7247925996780396, "learning_rate": 6.288720159311166e-06, "loss": 0.289, "step": 897600 }, { "epoch": 11.967578088547013, "grad_norm": 16.24053955078125, "learning_rate": 6.283403494910768e-06, "loss": 0.2158, "step": 897700 }, { "epoch": 11.968911226353468, "grad_norm": 1.2331138849258423, "learning_rate": 6.278088815989055e-06, "loss": 0.2739, "step": 897800 }, { "epoch": 11.970244364159923, "grad_norm": 2.9274184703826904, "learning_rate": 6.272776122990974e-06, "loss": 0.2892, "step": 897900 }, { "epoch": 11.971577501966378, "grad_norm": 4.257789134979248, "learning_rate": 6.26746541636129e-06, "loss": 0.2724, "step": 898000 }, { "epoch": 11.972910639772833, "grad_norm": 8.426987648010254, "learning_rate": 6.262156696544583e-06, "loss": 0.2676, "step": 898100 }, { "epoch": 11.974243777579288, "grad_norm": 13.404335975646973, "learning_rate": 6.256849963985288e-06, "loss": 0.2685, "step": 898200 }, { "epoch": 11.975576915385744, "grad_norm": 3.8954765796661377, "learning_rate": 6.25154521912767e-06, "loss": 0.2758, "step": 898300 }, { "epoch": 11.9769100531922, "grad_norm": 1.521412968635559, "learning_rate": 6.246242462415823e-06, "loss": 0.3308, "step": 898400 }, { "epoch": 11.978243190998654, "grad_norm": 2.6033880710601807, "learning_rate": 6.240994692129932e-06, "loss": 0.2586, "step": 898500 }, { "epoch": 11.97957632880511, "grad_norm": 4.195366382598877, "learning_rate": 6.235695893148718e-06, "loss": 0.2857, "step": 898600 }, { "epoch": 11.980909466611564, "grad_norm": 0.010313699021935463, "learning_rate": 6.230399083640145e-06, "loss": 0.2624, "step": 898700 }, { "epoch": 11.982242604418019, "grad_norm": 1.708083152770996, "learning_rate": 6.225104264047626e-06, "loss": 0.2773, "step": 898800 }, { "epoch": 
11.983575742224474, "grad_norm": 2.7527995109558105, "learning_rate": 6.219811434814436e-06, "loss": 0.2973, "step": 898900 }, { "epoch": 11.984908880030929, "grad_norm": 1.703338861465454, "learning_rate": 6.214520596383667e-06, "loss": 0.281, "step": 899000 }, { "epoch": 11.986242017837384, "grad_norm": 4.917289733886719, "learning_rate": 6.209231749198258e-06, "loss": 0.2786, "step": 899100 }, { "epoch": 11.987575155643839, "grad_norm": 1.498780369758606, "learning_rate": 6.203944893700979e-06, "loss": 0.275, "step": 899200 }, { "epoch": 11.988908293450294, "grad_norm": 1.9366953372955322, "learning_rate": 6.198660030334414e-06, "loss": 0.2865, "step": 899300 }, { "epoch": 11.990241431256749, "grad_norm": 4.6695685386657715, "learning_rate": 6.193377159541003e-06, "loss": 0.3415, "step": 899400 }, { "epoch": 11.991574569063204, "grad_norm": 2.071725606918335, "learning_rate": 6.18809628176301e-06, "loss": 0.318, "step": 899500 }, { "epoch": 11.992907706869659, "grad_norm": 1.6134600639343262, "learning_rate": 6.182817397442537e-06, "loss": 0.2722, "step": 899600 }, { "epoch": 11.994240844676114, "grad_norm": 2.8426108360290527, "learning_rate": 6.177540507021513e-06, "loss": 0.2244, "step": 899700 }, { "epoch": 11.995573982482568, "grad_norm": 2.5163934230804443, "learning_rate": 6.172265610941711e-06, "loss": 0.2266, "step": 899800 }, { "epoch": 11.996907120289023, "grad_norm": 1.723705530166626, "learning_rate": 6.166992709644711e-06, "loss": 0.2887, "step": 899900 }, { "epoch": 11.99824025809548, "grad_norm": 2.3086984157562256, "learning_rate": 6.161721803571944e-06, "loss": 0.2939, "step": 900000 }, { "epoch": 11.99824025809548, "eval_accuracy": 0.9054316004735766, "eval_cer": 0.06271105867400348, "eval_loss": 0.36983928084373474, "eval_runtime": 9805.4445, "eval_samples_per_second": 5.685, "eval_steps_per_second": 0.355, "eval_wer": 0.13463274588077237, "step": 900000 }, { "epoch": 11.999573395901935, "grad_norm": 3.0518038272857666, "learning_rate": 
6.1564528931647025e-06, "loss": 0.2738, "step": 900100 }, { "epoch": 12.00090653370839, "grad_norm": 6.160830020904541, "learning_rate": 6.151185978864051e-06, "loss": 0.2714, "step": 900200 }, { "epoch": 12.002239671514845, "grad_norm": 1.0580933094024658, "learning_rate": 6.145921061110934e-06, "loss": 0.2393, "step": 900300 }, { "epoch": 12.0035728093213, "grad_norm": 2.796807050704956, "learning_rate": 6.140658140346112e-06, "loss": 0.2049, "step": 900400 }, { "epoch": 12.004905947127755, "grad_norm": 1.9026405811309814, "learning_rate": 6.135397217010178e-06, "loss": 0.1884, "step": 900500 }, { "epoch": 12.00623908493421, "grad_norm": 3.052219867706299, "learning_rate": 6.130138291543571e-06, "loss": 0.2519, "step": 900600 }, { "epoch": 12.007572222740665, "grad_norm": 0.9173609614372253, "learning_rate": 6.124881364386532e-06, "loss": 0.2392, "step": 900700 }, { "epoch": 12.00890536054712, "grad_norm": 0.7534306049346924, "learning_rate": 6.119678975367981e-06, "loss": 0.2085, "step": 900800 }, { "epoch": 12.010238498353575, "grad_norm": 4.836151123046875, "learning_rate": 6.114426026156139e-06, "loss": 0.2695, "step": 900900 }, { "epoch": 12.01157163616003, "grad_norm": 2.178283214569092, "learning_rate": 6.1091750765692626e-06, "loss": 0.2791, "step": 901000 }, { "epoch": 12.012904773966484, "grad_norm": 1.9135663509368896, "learning_rate": 6.103926127046933e-06, "loss": 0.2683, "step": 901100 }, { "epoch": 12.01423791177294, "grad_norm": 1.405087947845459, "learning_rate": 6.098679178028569e-06, "loss": 0.2397, "step": 901200 }, { "epoch": 12.015571049579394, "grad_norm": 2.0643012523651123, "learning_rate": 6.09343422995346e-06, "loss": 0.2675, "step": 901300 }, { "epoch": 12.01690418738585, "grad_norm": 3.2926788330078125, "learning_rate": 6.08819128326067e-06, "loss": 0.25, "step": 901400 }, { "epoch": 12.018237325192304, "grad_norm": 2.4948537349700928, "learning_rate": 6.082950338389126e-06, "loss": 0.2489, "step": 901500 }, { "epoch": 
12.019570462998761, "grad_norm": 5.5571980476379395, "learning_rate": 6.07771139577759e-06, "loss": 0.263, "step": 901600 }, { "epoch": 12.020903600805216, "grad_norm": 4.182150363922119, "learning_rate": 6.072474455864644e-06, "loss": 0.2582, "step": 901700 }, { "epoch": 12.02223673861167, "grad_norm": 1.409656047821045, "learning_rate": 6.067239519088711e-06, "loss": 0.2417, "step": 901800 }, { "epoch": 12.023569876418126, "grad_norm": 1.2480206489562988, "learning_rate": 6.06200658588805e-06, "loss": 0.2744, "step": 901900 }, { "epoch": 12.02490301422458, "grad_norm": 4.862720966339111, "learning_rate": 6.056775656700732e-06, "loss": 0.2145, "step": 902000 }, { "epoch": 12.026236152031036, "grad_norm": 3.4372942447662354, "learning_rate": 6.051546731964681e-06, "loss": 0.2388, "step": 902100 }, { "epoch": 12.02756928983749, "grad_norm": 0.5858887434005737, "learning_rate": 6.046319812117646e-06, "loss": 0.2012, "step": 902200 }, { "epoch": 12.028902427643946, "grad_norm": 1.6273142099380493, "learning_rate": 6.041094897597205e-06, "loss": 0.2368, "step": 902300 }, { "epoch": 12.0302355654504, "grad_norm": 3.7770581245422363, "learning_rate": 6.035871988840786e-06, "loss": 0.273, "step": 902400 }, { "epoch": 12.031568703256855, "grad_norm": 4.6929521560668945, "learning_rate": 6.030651086285601e-06, "loss": 0.2636, "step": 902500 }, { "epoch": 12.03290184106331, "grad_norm": 2.2504334449768066, "learning_rate": 6.025432190368763e-06, "loss": 0.244, "step": 902600 }, { "epoch": 12.034234978869765, "grad_norm": 1.1599302291870117, "learning_rate": 6.020215301527173e-06, "loss": 0.2607, "step": 902700 }, { "epoch": 12.03556811667622, "grad_norm": 2.6435039043426514, "learning_rate": 6.015000420197557e-06, "loss": 0.2736, "step": 902800 }, { "epoch": 12.036901254482675, "grad_norm": 3.292186975479126, "learning_rate": 6.009787546816502e-06, "loss": 0.2519, "step": 902900 }, { "epoch": 12.03823439228913, "grad_norm": 2.4300005435943604, "learning_rate": 
6.004576681820409e-06, "loss": 0.2824, "step": 903000 }, { "epoch": 12.039567530095585, "grad_norm": 1.5446580648422241, "learning_rate": 5.999367825645517e-06, "loss": 0.2366, "step": 903100 }, { "epoch": 12.040900667902042, "grad_norm": 2.7782516479492188, "learning_rate": 5.994160978727894e-06, "loss": 0.2769, "step": 903200 }, { "epoch": 12.042233805708497, "grad_norm": 1.3104634284973145, "learning_rate": 5.988956141503446e-06, "loss": 0.2668, "step": 903300 }, { "epoch": 12.043566943514952, "grad_norm": 2.2692058086395264, "learning_rate": 5.983753314407894e-06, "loss": 0.2533, "step": 903400 }, { "epoch": 12.044900081321407, "grad_norm": 1.2946538925170898, "learning_rate": 5.97855249787681e-06, "loss": 0.2549, "step": 903500 }, { "epoch": 12.046233219127862, "grad_norm": 0.2296280711889267, "learning_rate": 5.9733536923455865e-06, "loss": 0.265, "step": 903600 }, { "epoch": 12.047566356934317, "grad_norm": 1.9479546546936035, "learning_rate": 5.968156898249455e-06, "loss": 0.2697, "step": 903700 }, { "epoch": 12.048899494740771, "grad_norm": 82.79035186767578, "learning_rate": 5.962962116023484e-06, "loss": 0.2668, "step": 903800 }, { "epoch": 12.050232632547226, "grad_norm": 1.756920337677002, "learning_rate": 5.957769346102532e-06, "loss": 0.2132, "step": 903900 }, { "epoch": 12.051565770353681, "grad_norm": 4.616285800933838, "learning_rate": 5.952578588921354e-06, "loss": 0.2364, "step": 904000 }, { "epoch": 12.052898908160136, "grad_norm": 1.8441754579544067, "learning_rate": 5.947389844914498e-06, "loss": 0.2505, "step": 904100 }, { "epoch": 12.054232045966591, "grad_norm": 1.6996898651123047, "learning_rate": 5.942203114516338e-06, "loss": 0.2786, "step": 904200 }, { "epoch": 12.055565183773046, "grad_norm": 1.6546986103057861, "learning_rate": 5.937018398161097e-06, "loss": 0.2732, "step": 904300 }, { "epoch": 12.056898321579501, "grad_norm": 1.3335598707199097, "learning_rate": 5.931835696282821e-06, "loss": 0.2583, "step": 904400 }, { "epoch": 
12.058231459385956, "grad_norm": 0.10461355745792389, "learning_rate": 5.92665500931539e-06, "loss": 0.2626, "step": 904500 }, { "epoch": 12.059564597192411, "grad_norm": 10.811370849609375, "learning_rate": 5.921476337692517e-06, "loss": 0.2604, "step": 904600 }, { "epoch": 12.060897734998866, "grad_norm": 2.2312686443328857, "learning_rate": 5.9162996818477525e-06, "loss": 0.242, "step": 904700 }, { "epoch": 12.062230872805323, "grad_norm": 0.03286326676607132, "learning_rate": 5.911125042214445e-06, "loss": 0.2519, "step": 904800 }, { "epoch": 12.063564010611778, "grad_norm": 4.100107669830322, "learning_rate": 5.905952419225818e-06, "loss": 0.2817, "step": 904900 }, { "epoch": 12.064897148418233, "grad_norm": 3.8710758686065674, "learning_rate": 5.900781813314914e-06, "loss": 0.2362, "step": 905000 }, { "epoch": 12.066230286224688, "grad_norm": 3.0347137451171875, "learning_rate": 5.895613224914581e-06, "loss": 0.2909, "step": 905100 }, { "epoch": 12.067563424031142, "grad_norm": 4.896530628204346, "learning_rate": 5.890446654457528e-06, "loss": 0.2302, "step": 905200 }, { "epoch": 12.068896561837597, "grad_norm": 1.0871081352233887, "learning_rate": 5.885282102376277e-06, "loss": 0.2452, "step": 905300 }, { "epoch": 12.070229699644052, "grad_norm": 0.9228593707084656, "learning_rate": 5.880119569103194e-06, "loss": 0.2538, "step": 905400 }, { "epoch": 12.071562837450507, "grad_norm": 11.739236831665039, "learning_rate": 5.8749590550704756e-06, "loss": 0.2802, "step": 905500 }, { "epoch": 12.072895975256962, "grad_norm": 0.8779415488243103, "learning_rate": 5.86980056071013e-06, "loss": 0.2292, "step": 905600 }, { "epoch": 12.074229113063417, "grad_norm": 1.7563613653182983, "learning_rate": 5.864644086454015e-06, "loss": 0.262, "step": 905700 }, { "epoch": 12.075562250869872, "grad_norm": 2.6778564453125, "learning_rate": 5.859489632733819e-06, "loss": 0.2285, "step": 905800 }, { "epoch": 12.076895388676327, "grad_norm": 34.216331481933594, "learning_rate": 
5.854337199981049e-06, "loss": 0.2531, "step": 905900 }, { "epoch": 12.078228526482782, "grad_norm": 5.8331403732299805, "learning_rate": 5.849186788627057e-06, "loss": 0.2942, "step": 906000 }, { "epoch": 12.079561664289237, "grad_norm": 3.313847064971924, "learning_rate": 5.844089872988781e-06, "loss": 0.2628, "step": 906100 }, { "epoch": 12.080894802095692, "grad_norm": 0.9568344354629517, "learning_rate": 5.838943485500966e-06, "loss": 0.2719, "step": 906200 }, { "epoch": 12.082227939902147, "grad_norm": 1.7648638486862183, "learning_rate": 5.833799120700633e-06, "loss": 0.2221, "step": 906300 }, { "epoch": 12.083561077708604, "grad_norm": 12.508292198181152, "learning_rate": 5.828708192419425e-06, "loss": 0.2277, "step": 906400 }, { "epoch": 12.084894215515058, "grad_norm": 1.5671731233596802, "learning_rate": 5.823567854048301e-06, "loss": 0.2779, "step": 906500 }, { "epoch": 12.086227353321513, "grad_norm": 6.937047958374023, "learning_rate": 5.818429539651859e-06, "loss": 0.2456, "step": 906600 }, { "epoch": 12.087560491127968, "grad_norm": 5.497538089752197, "learning_rate": 5.8132932496602675e-06, "loss": 0.2669, "step": 906700 }, { "epoch": 12.088893628934423, "grad_norm": 3.1304783821105957, "learning_rate": 5.808158984503519e-06, "loss": 0.2249, "step": 906800 }, { "epoch": 12.090226766740878, "grad_norm": 2.1791040897369385, "learning_rate": 5.803026744611446e-06, "loss": 0.2506, "step": 906900 }, { "epoch": 12.091559904547333, "grad_norm": 2.211010456085205, "learning_rate": 5.797896530413696e-06, "loss": 0.2385, "step": 907000 }, { "epoch": 12.092893042353788, "grad_norm": 1.1169394254684448, "learning_rate": 5.792768342339764e-06, "loss": 0.2295, "step": 907100 }, { "epoch": 12.094226180160243, "grad_norm": 15.717145919799805, "learning_rate": 5.787642180818944e-06, "loss": 0.2319, "step": 907200 }, { "epoch": 12.095559317966698, "grad_norm": 4.260777473449707, "learning_rate": 5.782518046280406e-06, "loss": 0.2515, "step": 907300 }, { "epoch": 
12.096892455773153, "grad_norm": 0.8469980955123901, "learning_rate": 5.777395939153124e-06, "loss": 0.2644, "step": 907400 }, { "epoch": 12.098225593579608, "grad_norm": 2.910144567489624, "learning_rate": 5.7722758598658875e-06, "loss": 0.244, "step": 907500 }, { "epoch": 12.099558731386063, "grad_norm": 1.3542451858520508, "learning_rate": 5.7671578088473445e-06, "loss": 0.2184, "step": 907600 }, { "epoch": 12.100891869192518, "grad_norm": 2.7820029258728027, "learning_rate": 5.762041786525964e-06, "loss": 0.2411, "step": 907700 }, { "epoch": 12.102225006998973, "grad_norm": 2.198718309402466, "learning_rate": 5.756927793330039e-06, "loss": 0.2381, "step": 907800 }, { "epoch": 12.103558144805428, "grad_norm": 2.8619070053100586, "learning_rate": 5.751815829687699e-06, "loss": 0.2478, "step": 907900 }, { "epoch": 12.104891282611884, "grad_norm": 1.1147148609161377, "learning_rate": 5.746705896026905e-06, "loss": 0.2293, "step": 908000 }, { "epoch": 12.10622442041834, "grad_norm": 1.5387470722198486, "learning_rate": 5.741597992775435e-06, "loss": 0.2168, "step": 908100 }, { "epoch": 12.107557558224794, "grad_norm": 0.8768924474716187, "learning_rate": 5.736492120360913e-06, "loss": 0.3028, "step": 908200 }, { "epoch": 12.10889069603125, "grad_norm": 1.2227578163146973, "learning_rate": 5.731388279210785e-06, "loss": 0.2559, "step": 908300 }, { "epoch": 12.110223833837704, "grad_norm": 1.245158314704895, "learning_rate": 5.726286469752328e-06, "loss": 0.2442, "step": 908400 }, { "epoch": 12.11155697164416, "grad_norm": 7.015786647796631, "learning_rate": 5.721186692412659e-06, "loss": 0.277, "step": 908500 }, { "epoch": 12.112890109450614, "grad_norm": 0.9381494522094727, "learning_rate": 5.716088947618694e-06, "loss": 0.2481, "step": 908600 }, { "epoch": 12.114223247257069, "grad_norm": 1.5771160125732422, "learning_rate": 5.710993235797216e-06, "loss": 0.2376, "step": 908700 }, { "epoch": 12.115556385063524, "grad_norm": 3.1201553344726562, "learning_rate": 
5.705899557374827e-06, "loss": 0.2587, "step": 908800 }, { "epoch": 12.116889522869979, "grad_norm": 2.4153194427490234, "learning_rate": 5.70080791277794e-06, "loss": 0.2733, "step": 908900 }, { "epoch": 12.118222660676434, "grad_norm": 0.9636184573173523, "learning_rate": 5.695718302432816e-06, "loss": 0.2706, "step": 909000 }, { "epoch": 12.119555798482889, "grad_norm": 1.9535919427871704, "learning_rate": 5.6906307267655425e-06, "loss": 0.2658, "step": 909100 }, { "epoch": 12.120888936289344, "grad_norm": 0.6547178626060486, "learning_rate": 5.685545186202034e-06, "loss": 0.2038, "step": 909200 }, { "epoch": 12.122222074095799, "grad_norm": 3.3384149074554443, "learning_rate": 5.680461681168036e-06, "loss": 0.2766, "step": 909300 }, { "epoch": 12.123555211902254, "grad_norm": 4.48566198348999, "learning_rate": 5.675380212089134e-06, "loss": 0.2549, "step": 909400 }, { "epoch": 12.124888349708709, "grad_norm": 3.968554735183716, "learning_rate": 5.670300779390706e-06, "loss": 0.2428, "step": 909500 }, { "epoch": 12.126221487515165, "grad_norm": 0.7101783752441406, "learning_rate": 5.665223383498018e-06, "loss": 0.2731, "step": 909600 }, { "epoch": 12.12755462532162, "grad_norm": 3.3943307399749756, "learning_rate": 5.660148024836111e-06, "loss": 0.2716, "step": 909700 }, { "epoch": 12.128887763128075, "grad_norm": 5.220906734466553, "learning_rate": 5.655074703829885e-06, "loss": 0.239, "step": 909800 }, { "epoch": 12.13022090093453, "grad_norm": 3.059144973754883, "learning_rate": 5.650003420904064e-06, "loss": 0.2552, "step": 909900 }, { "epoch": 12.131554038740985, "grad_norm": 2.5066394805908203, "learning_rate": 5.6449341764832e-06, "loss": 0.2399, "step": 910000 }, { "epoch": 12.13288717654744, "grad_norm": 2.4255685806274414, "learning_rate": 5.639866970991669e-06, "loss": 0.253, "step": 910100 }, { "epoch": 12.134220314353895, "grad_norm": 2.682727336883545, "learning_rate": 5.634801804853697e-06, "loss": 0.2959, "step": 910200 }, { "epoch": 
12.13555345216035, "grad_norm": 3.0601229667663574, "learning_rate": 5.629738678493302e-06, "loss": 0.2692, "step": 910300 }, { "epoch": 12.136886589966805, "grad_norm": 3.610278844833374, "learning_rate": 5.624728193095563e-06, "loss": 0.2491, "step": 910400 }, { "epoch": 12.13821972777326, "grad_norm": 1.040045976638794, "learning_rate": 5.6196691271534325e-06, "loss": 0.2778, "step": 910500 }, { "epoch": 12.139552865579715, "grad_norm": 0.9340599775314331, "learning_rate": 5.614612102255756e-06, "loss": 0.2255, "step": 910600 }, { "epoch": 12.14088600338617, "grad_norm": 0.9979805946350098, "learning_rate": 5.609557118825871e-06, "loss": 0.2756, "step": 910700 }, { "epoch": 12.142219141192625, "grad_norm": 2.783565044403076, "learning_rate": 5.604504177286982e-06, "loss": 0.2481, "step": 910800 }, { "epoch": 12.14355227899908, "grad_norm": 2.1818482875823975, "learning_rate": 5.599453278062114e-06, "loss": 0.2641, "step": 910900 }, { "epoch": 12.144885416805534, "grad_norm": 1.4976085424423218, "learning_rate": 5.594404421574089e-06, "loss": 0.2459, "step": 911000 }, { "epoch": 12.14621855461199, "grad_norm": 3.3093416690826416, "learning_rate": 5.589357608245597e-06, "loss": 0.2607, "step": 911100 }, { "epoch": 12.147551692418446, "grad_norm": 3.840735912322998, "learning_rate": 5.584312838499134e-06, "loss": 0.2621, "step": 911200 }, { "epoch": 12.148884830224901, "grad_norm": 1.5330376625061035, "learning_rate": 5.579270112757037e-06, "loss": 0.2923, "step": 911300 }, { "epoch": 12.150217968031356, "grad_norm": 3.7095179557800293, "learning_rate": 5.574229431441471e-06, "loss": 0.2607, "step": 911400 }, { "epoch": 12.151551105837811, "grad_norm": 1.5614701509475708, "learning_rate": 5.569190794974416e-06, "loss": 0.2631, "step": 911500 }, { "epoch": 12.152884243644266, "grad_norm": 9.701653480529785, "learning_rate": 5.564154203777691e-06, "loss": 0.2611, "step": 911600 }, { "epoch": 12.15421738145072, "grad_norm": 3.020928144454956, "learning_rate": 
5.559119658272951e-06, "loss": 0.2439, "step": 911700 }, { "epoch": 12.155550519257176, "grad_norm": 0.8986079692840576, "learning_rate": 5.5540871588816665e-06, "loss": 0.2615, "step": 911800 }, { "epoch": 12.15688365706363, "grad_norm": 21.51932716369629, "learning_rate": 5.549056706025147e-06, "loss": 0.2214, "step": 911900 }, { "epoch": 12.158216794870086, "grad_norm": 1.8510868549346924, "learning_rate": 5.5440283001245295e-06, "loss": 0.2396, "step": 912000 }, { "epoch": 12.15954993267654, "grad_norm": 1.1224286556243896, "learning_rate": 5.539001941600756e-06, "loss": 0.2879, "step": 912100 }, { "epoch": 12.160883070482996, "grad_norm": 3.0365657806396484, "learning_rate": 5.533977630874641e-06, "loss": 0.261, "step": 912200 }, { "epoch": 12.16221620828945, "grad_norm": 2.0213980674743652, "learning_rate": 5.528955368366805e-06, "loss": 0.2311, "step": 912300 }, { "epoch": 12.163549346095905, "grad_norm": 1.3839588165283203, "learning_rate": 5.523935154497673e-06, "loss": 0.2316, "step": 912400 }, { "epoch": 12.16488248390236, "grad_norm": 3.0505473613739014, "learning_rate": 5.518916989687538e-06, "loss": 0.2605, "step": 912500 }, { "epoch": 12.166215621708815, "grad_norm": 4.623411178588867, "learning_rate": 5.513900874356502e-06, "loss": 0.2689, "step": 912600 }, { "epoch": 12.16754875951527, "grad_norm": 2.8127095699310303, "learning_rate": 5.508886808924497e-06, "loss": 0.2497, "step": 912700 }, { "epoch": 12.168881897321727, "grad_norm": 0.04829666018486023, "learning_rate": 5.5038747938112935e-06, "loss": 0.2102, "step": 912800 }, { "epoch": 12.170215035128182, "grad_norm": 2.397094964981079, "learning_rate": 5.498864829436465e-06, "loss": 0.2491, "step": 912900 }, { "epoch": 12.171548172934637, "grad_norm": 4.972393989562988, "learning_rate": 5.493856916219434e-06, "loss": 0.2395, "step": 913000 }, { "epoch": 12.172881310741092, "grad_norm": 1.1719332933425903, "learning_rate": 5.488851054579464e-06, "loss": 0.268, "step": 913100 }, { "epoch": 
12.174214448547547, "grad_norm": 1.3145138025283813, "learning_rate": 5.483847244935611e-06, "loss": 0.2401, "step": 913200 }, { "epoch": 12.175547586354002, "grad_norm": 3.9388346672058105, "learning_rate": 5.478845487706786e-06, "loss": 0.2288, "step": 913300 }, { "epoch": 12.176880724160457, "grad_norm": 2.8045613765716553, "learning_rate": 5.473845783311721e-06, "loss": 0.259, "step": 913400 }, { "epoch": 12.178213861966912, "grad_norm": 4.266867637634277, "learning_rate": 5.4688481321689735e-06, "loss": 0.2354, "step": 913500 }, { "epoch": 12.179546999773367, "grad_norm": 1.1176652908325195, "learning_rate": 5.463852534696929e-06, "loss": 0.2532, "step": 913600 }, { "epoch": 12.180880137579821, "grad_norm": 2.708777666091919, "learning_rate": 5.458858991313814e-06, "loss": 0.2478, "step": 913700 }, { "epoch": 12.182213275386276, "grad_norm": 1.95732843875885, "learning_rate": 5.453867502437657e-06, "loss": 0.2521, "step": 913800 }, { "epoch": 12.183546413192731, "grad_norm": 3.5208001136779785, "learning_rate": 5.448878068486337e-06, "loss": 0.2608, "step": 913900 }, { "epoch": 12.184879550999186, "grad_norm": 2.780874252319336, "learning_rate": 5.443890689877553e-06, "loss": 0.3098, "step": 914000 }, { "epoch": 12.186212688805641, "grad_norm": 1.5728164911270142, "learning_rate": 5.4389053670288315e-06, "loss": 0.191, "step": 914100 }, { "epoch": 12.187545826612096, "grad_norm": 3.0754129886627197, "learning_rate": 5.4339221003575376e-06, "loss": 0.2607, "step": 914200 }, { "epoch": 12.188878964418551, "grad_norm": 1.2941920757293701, "learning_rate": 5.428940890280839e-06, "loss": 0.2542, "step": 914300 }, { "epoch": 12.190212102225008, "grad_norm": 3.1201815605163574, "learning_rate": 5.423961737215748e-06, "loss": 0.2349, "step": 914400 }, { "epoch": 12.191545240031463, "grad_norm": 1.7374802827835083, "learning_rate": 5.418984641579125e-06, "loss": 0.2388, "step": 914500 }, { "epoch": 12.192878377837918, "grad_norm": 0.4944462776184082, "learning_rate": 
5.414009603787612e-06, "loss": 0.2262, "step": 914600 }, { "epoch": 12.194211515644373, "grad_norm": 56.775917053222656, "learning_rate": 5.4090366242577175e-06, "loss": 0.2847, "step": 914700 }, { "epoch": 12.195544653450828, "grad_norm": 10.95918083190918, "learning_rate": 5.404065703405754e-06, "loss": 0.271, "step": 914800 }, { "epoch": 12.196877791257283, "grad_norm": 1.8033764362335205, "learning_rate": 5.399096841647879e-06, "loss": 0.2321, "step": 914900 }, { "epoch": 12.198210929063737, "grad_norm": 2.601595401763916, "learning_rate": 5.3941300394000706e-06, "loss": 0.2552, "step": 915000 }, { "epoch": 12.199544066870192, "grad_norm": 0.4837692081928253, "learning_rate": 5.389165297078134e-06, "loss": 0.2063, "step": 915100 }, { "epoch": 12.200877204676647, "grad_norm": 7.863617897033691, "learning_rate": 5.384202615097693e-06, "loss": 0.2388, "step": 915200 }, { "epoch": 12.202210342483102, "grad_norm": 2.714421510696411, "learning_rate": 5.379241993874207e-06, "loss": 0.2799, "step": 915300 }, { "epoch": 12.203543480289557, "grad_norm": 1.6290998458862305, "learning_rate": 5.3743330092193275e-06, "loss": 0.2875, "step": 915400 }, { "epoch": 12.204876618096012, "grad_norm": 4.437595844268799, "learning_rate": 5.369376490137541e-06, "loss": 0.2458, "step": 915500 }, { "epoch": 12.206209755902467, "grad_norm": 0.2260209619998932, "learning_rate": 5.364422033053896e-06, "loss": 0.2742, "step": 915600 }, { "epoch": 12.207542893708922, "grad_norm": 1.0661331415176392, "learning_rate": 5.3594696383831955e-06, "loss": 0.2087, "step": 915700 }, { "epoch": 12.208876031515377, "grad_norm": 1.5363876819610596, "learning_rate": 5.354519306540017e-06, "loss": 0.2383, "step": 915800 }, { "epoch": 12.210209169321832, "grad_norm": 3.1213538646698, "learning_rate": 5.3495710379387915e-06, "loss": 0.237, "step": 915900 }, { "epoch": 12.211542307128289, "grad_norm": 2.4504308700561523, "learning_rate": 5.344624832993776e-06, "loss": 0.2553, "step": 916000 }, { "epoch": 
12.212875444934744, "grad_norm": 1.7475353479385376, "learning_rate": 5.3396806921190485e-06, "loss": 0.2277, "step": 916100 }, { "epoch": 12.214208582741199, "grad_norm": 0.18215541541576385, "learning_rate": 5.334738615728515e-06, "loss": 0.2655, "step": 916200 }, { "epoch": 12.215541720547654, "grad_norm": 2.274644613265991, "learning_rate": 5.329798604235924e-06, "loss": 0.2834, "step": 916300 }, { "epoch": 12.216874858354108, "grad_norm": 3.4861905574798584, "learning_rate": 5.324860658054814e-06, "loss": 0.2481, "step": 916400 }, { "epoch": 12.218207996160563, "grad_norm": 2.6161258220672607, "learning_rate": 5.3199247775985856e-06, "loss": 0.2653, "step": 916500 }, { "epoch": 12.219541133967018, "grad_norm": 3.314802885055542, "learning_rate": 5.314990963280468e-06, "loss": 0.2551, "step": 916600 }, { "epoch": 12.220874271773473, "grad_norm": 11.221961975097656, "learning_rate": 5.310059215513484e-06, "loss": 0.2646, "step": 916700 }, { "epoch": 12.222207409579928, "grad_norm": 2.5686957836151123, "learning_rate": 5.305129534710517e-06, "loss": 0.2511, "step": 916800 }, { "epoch": 12.223540547386383, "grad_norm": 2.173985481262207, "learning_rate": 5.300201921284259e-06, "loss": 0.3163, "step": 916900 }, { "epoch": 12.224873685192838, "grad_norm": 1.7591909170150757, "learning_rate": 5.295276375647234e-06, "loss": 0.2781, "step": 917000 }, { "epoch": 12.226206822999293, "grad_norm": 1.9224470853805542, "learning_rate": 5.290402122747204e-06, "loss": 0.2421, "step": 917100 }, { "epoch": 12.227539960805748, "grad_norm": 1.0554269552230835, "learning_rate": 5.285480693237352e-06, "loss": 0.2377, "step": 917200 }, { "epoch": 12.228873098612203, "grad_norm": 0.05096810683608055, "learning_rate": 5.2805613327491505e-06, "loss": 0.2614, "step": 917300 }, { "epoch": 12.230206236418658, "grad_norm": 14.433694839477539, "learning_rate": 5.275644041694434e-06, "loss": 0.2771, "step": 917400 }, { "epoch": 12.231539374225113, "grad_norm": 2.096698045730591, 
"learning_rate": 5.270728820484862e-06, "loss": 0.2748, "step": 917500 }, { "epoch": 12.23287251203157, "grad_norm": 2.6465489864349365, "learning_rate": 5.265815669531917e-06, "loss": 0.2773, "step": 917600 }, { "epoch": 12.234205649838024, "grad_norm": 19.497411727905273, "learning_rate": 5.260904589246905e-06, "loss": 0.2576, "step": 917700 }, { "epoch": 12.23553878764448, "grad_norm": 1.1925921440124512, "learning_rate": 5.256044659879862e-06, "loss": 0.2635, "step": 917800 }, { "epoch": 12.236871925450934, "grad_norm": 1.556823492050171, "learning_rate": 5.251137701447063e-06, "loss": 0.2671, "step": 917900 }, { "epoch": 12.23820506325739, "grad_norm": 2.5649526119232178, "learning_rate": 5.246232814910989e-06, "loss": 0.2771, "step": 918000 }, { "epoch": 12.239538201063844, "grad_norm": 2.7975122928619385, "learning_rate": 5.24133000068228e-06, "loss": 0.2378, "step": 918100 }, { "epoch": 12.2408713388703, "grad_norm": 5.720521926879883, "learning_rate": 5.236429259171384e-06, "loss": 0.3001, "step": 918200 }, { "epoch": 12.242204476676754, "grad_norm": 1.348763346672058, "learning_rate": 5.231530590788562e-06, "loss": 0.2239, "step": 918300 }, { "epoch": 12.243537614483209, "grad_norm": 0.6535266637802124, "learning_rate": 5.226633995943925e-06, "loss": 0.2226, "step": 918400 }, { "epoch": 12.244870752289664, "grad_norm": 4.039662837982178, "learning_rate": 5.221739475047394e-06, "loss": 0.1981, "step": 918500 }, { "epoch": 12.246203890096119, "grad_norm": 9.670918464660645, "learning_rate": 5.216847028508724e-06, "loss": 0.2455, "step": 918600 }, { "epoch": 12.247537027902574, "grad_norm": 2.3351125717163086, "learning_rate": 5.21195665673751e-06, "loss": 0.2539, "step": 918700 }, { "epoch": 12.248870165709029, "grad_norm": 0.96206134557724, "learning_rate": 5.207068360143136e-06, "loss": 0.2422, "step": 918800 }, { "epoch": 12.250203303515484, "grad_norm": 2.372965097427368, "learning_rate": 5.202182139134838e-06, "loss": 0.2747, "step": 918900 }, { 
"epoch": 12.251536441321939, "grad_norm": 4.333166599273682, "learning_rate": 5.197297994121696e-06, "loss": 0.2845, "step": 919000 }, { "epoch": 12.252869579128394, "grad_norm": 3.8385298252105713, "learning_rate": 5.192415925512576e-06, "loss": 0.3011, "step": 919100 }, { "epoch": 12.25420271693485, "grad_norm": 2.087268590927124, "learning_rate": 5.187535933716191e-06, "loss": 0.2376, "step": 919200 }, { "epoch": 12.255535854741305, "grad_norm": 1.104325771331787, "learning_rate": 5.182658019141084e-06, "loss": 0.2913, "step": 919300 }, { "epoch": 12.25686899254776, "grad_norm": 3.6475210189819336, "learning_rate": 5.177782182195616e-06, "loss": 0.2443, "step": 919400 }, { "epoch": 12.258202130354215, "grad_norm": 2.300957679748535, "learning_rate": 5.17290842328798e-06, "loss": 0.24, "step": 919500 }, { "epoch": 12.25953526816067, "grad_norm": 2.7129993438720703, "learning_rate": 5.168036742826192e-06, "loss": 0.2475, "step": 919600 }, { "epoch": 12.260868405967125, "grad_norm": 0.861116349697113, "learning_rate": 5.163167141218087e-06, "loss": 0.2527, "step": 919700 }, { "epoch": 12.26220154377358, "grad_norm": 3.91090726852417, "learning_rate": 5.158299618871335e-06, "loss": 0.2233, "step": 919800 }, { "epoch": 12.263534681580035, "grad_norm": 3.5548765659332275, "learning_rate": 5.15343417619343e-06, "loss": 0.278, "step": 919900 }, { "epoch": 12.26486781938649, "grad_norm": 8.14737606048584, "learning_rate": 5.148570813591693e-06, "loss": 0.2348, "step": 920000 }, { "epoch": 12.266200957192945, "grad_norm": 1.246731162071228, "learning_rate": 5.143709531473278e-06, "loss": 0.2574, "step": 920100 }, { "epoch": 12.2675340949994, "grad_norm": 4.886107444763184, "learning_rate": 5.138850330245137e-06, "loss": 0.298, "step": 920200 }, { "epoch": 12.268867232805855, "grad_norm": 1.448932409286499, "learning_rate": 5.133993210314074e-06, "loss": 0.2147, "step": 920300 }, { "epoch": 12.27020037061231, "grad_norm": 1.846472978591919, "learning_rate": 
5.1291381720867266e-06, "loss": 0.2527, "step": 920400 }, { "epoch": 12.271533508418765, "grad_norm": 2.760423421859741, "learning_rate": 5.124285215969527e-06, "loss": 0.214, "step": 920500 }, { "epoch": 12.27286664622522, "grad_norm": 4.494177341461182, "learning_rate": 5.119434342368751e-06, "loss": 0.248, "step": 920600 }, { "epoch": 12.274199784031675, "grad_norm": 2.486684560775757, "learning_rate": 5.1145855516905046e-06, "loss": 0.2392, "step": 920700 }, { "epoch": 12.275532921838131, "grad_norm": 3.638136863708496, "learning_rate": 5.109738844340709e-06, "loss": 0.3002, "step": 920800 }, { "epoch": 12.276866059644586, "grad_norm": 0.21215258538722992, "learning_rate": 5.104894220725115e-06, "loss": 0.2817, "step": 920900 }, { "epoch": 12.278199197451041, "grad_norm": 3.296415090560913, "learning_rate": 5.100051681249308e-06, "loss": 0.2577, "step": 921000 }, { "epoch": 12.279532335257496, "grad_norm": 0.2349209040403366, "learning_rate": 5.095211226318678e-06, "loss": 0.2372, "step": 921100 }, { "epoch": 12.280865473063951, "grad_norm": 3.624521255493164, "learning_rate": 5.090372856338447e-06, "loss": 0.2622, "step": 921200 }, { "epoch": 12.282198610870406, "grad_norm": 1.4416910409927368, "learning_rate": 5.085536571713697e-06, "loss": 0.2368, "step": 921300 }, { "epoch": 12.283531748676861, "grad_norm": 1.6884653568267822, "learning_rate": 5.0807023728492785e-06, "loss": 0.2049, "step": 921400 }, { "epoch": 12.284864886483316, "grad_norm": 4.711012363433838, "learning_rate": 5.075870260149904e-06, "loss": 0.2804, "step": 921500 }, { "epoch": 12.28619802428977, "grad_norm": 3.3392958641052246, "learning_rate": 5.071040234020107e-06, "loss": 0.2454, "step": 921600 }, { "epoch": 12.287531162096226, "grad_norm": 4.042043685913086, "learning_rate": 5.0662122948642334e-06, "loss": 0.3093, "step": 921700 }, { "epoch": 12.28886429990268, "grad_norm": 4.031702995300293, "learning_rate": 5.06138644308648e-06, "loss": 0.2911, "step": 921800 }, { "epoch": 
12.290197437709136, "grad_norm": 2.4868180751800537, "learning_rate": 5.056562679090834e-06, "loss": 0.3072, "step": 921900 }, { "epoch": 12.29153057551559, "grad_norm": 1.3522409200668335, "learning_rate": 5.051741003281129e-06, "loss": 0.2423, "step": 922000 }, { "epoch": 12.292863713322046, "grad_norm": 1.3871009349822998, "learning_rate": 5.046921416061024e-06, "loss": 0.2545, "step": 922100 }, { "epoch": 12.2941968511285, "grad_norm": 2.297294855117798, "learning_rate": 5.042103917833999e-06, "loss": 0.2364, "step": 922200 }, { "epoch": 12.295529988934955, "grad_norm": 7.891490459442139, "learning_rate": 5.037288509003359e-06, "loss": 0.2829, "step": 922300 }, { "epoch": 12.29686312674141, "grad_norm": 2.086728811264038, "learning_rate": 5.032475189972247e-06, "loss": 0.2567, "step": 922400 }, { "epoch": 12.298196264547867, "grad_norm": 2.2017531394958496, "learning_rate": 5.027663961143588e-06, "loss": 0.2593, "step": 922500 }, { "epoch": 12.299529402354322, "grad_norm": 1.8701095581054688, "learning_rate": 5.022854822920195e-06, "loss": 0.2983, "step": 922600 }, { "epoch": 12.300862540160777, "grad_norm": 5.090763092041016, "learning_rate": 5.0180477757046664e-06, "loss": 0.2343, "step": 922700 }, { "epoch": 12.302195677967232, "grad_norm": 2.590134382247925, "learning_rate": 5.013242819899419e-06, "loss": 0.2448, "step": 922800 }, { "epoch": 12.303528815773687, "grad_norm": 4.712742805480957, "learning_rate": 5.00843995590672e-06, "loss": 0.2216, "step": 922900 }, { "epoch": 12.304861953580142, "grad_norm": 0.18358176946640015, "learning_rate": 5.003639184128648e-06, "loss": 0.2692, "step": 923000 }, { "epoch": 12.306195091386597, "grad_norm": 1.5282623767852783, "learning_rate": 4.998840504967105e-06, "loss": 0.244, "step": 923100 }, { "epoch": 12.307528229193052, "grad_norm": 1.4500377178192139, "learning_rate": 4.994043918823834e-06, "loss": 0.262, "step": 923200 }, { "epoch": 12.308861366999507, "grad_norm": 0.7345616817474365, "learning_rate": 
4.989249426100374e-06, "loss": 0.2668, "step": 923300 }, { "epoch": 12.310194504805962, "grad_norm": 2.358548164367676, "learning_rate": 4.98445702719811e-06, "loss": 0.2266, "step": 923400 }, { "epoch": 12.311527642612416, "grad_norm": 0.5862047672271729, "learning_rate": 4.979666722518249e-06, "loss": 0.2539, "step": 923500 }, { "epoch": 12.312860780418871, "grad_norm": 2.172884702682495, "learning_rate": 4.974878512461819e-06, "loss": 0.2273, "step": 923600 }, { "epoch": 12.314193918225326, "grad_norm": 3.8990635871887207, "learning_rate": 4.970092397429673e-06, "loss": 0.3173, "step": 923700 }, { "epoch": 12.315527056031781, "grad_norm": 3.7319490909576416, "learning_rate": 4.965308377822497e-06, "loss": 0.2517, "step": 923800 }, { "epoch": 12.316860193838236, "grad_norm": 0.8295384645462036, "learning_rate": 4.960526454040774e-06, "loss": 0.2293, "step": 923900 }, { "epoch": 12.318193331644693, "grad_norm": 6.955110549926758, "learning_rate": 4.955746626484855e-06, "loss": 0.2779, "step": 924000 }, { "epoch": 12.319526469451148, "grad_norm": 2.785040855407715, "learning_rate": 4.9509688955548846e-06, "loss": 0.2143, "step": 924100 }, { "epoch": 12.320859607257603, "grad_norm": 0.9796993732452393, "learning_rate": 4.946193261650831e-06, "loss": 0.2823, "step": 924200 }, { "epoch": 12.322192745064058, "grad_norm": 2.5978450775146484, "learning_rate": 4.9414197251725035e-06, "loss": 0.2622, "step": 924300 }, { "epoch": 12.323525882870513, "grad_norm": 1.4930555820465088, "learning_rate": 4.936648286519523e-06, "loss": 0.2591, "step": 924400 }, { "epoch": 12.324859020676968, "grad_norm": 1.9559129476547241, "learning_rate": 4.931878946091343e-06, "loss": 0.2264, "step": 924500 }, { "epoch": 12.326192158483423, "grad_norm": 5.774669170379639, "learning_rate": 4.927111704287234e-06, "loss": 0.2678, "step": 924600 }, { "epoch": 12.327525296289878, "grad_norm": 1.562207818031311, "learning_rate": 4.922346561506306e-06, "loss": 0.2399, "step": 924700 }, { "epoch": 
12.328858434096333, "grad_norm": 3.0923686027526855, "learning_rate": 4.917583518147456e-06, "loss": 0.2436, "step": 924800 }, { "epoch": 12.330191571902787, "grad_norm": 2.2186479568481445, "learning_rate": 4.912822574609463e-06, "loss": 0.2098, "step": 924900 }, { "epoch": 12.331524709709242, "grad_norm": 1.256603479385376, "learning_rate": 4.908063731290874e-06, "loss": 0.2615, "step": 925000 }, { "epoch": 12.332857847515697, "grad_norm": 0.8009347319602966, "learning_rate": 4.903306988590095e-06, "loss": 0.2448, "step": 925100 }, { "epoch": 12.334190985322152, "grad_norm": 1.6510899066925049, "learning_rate": 4.898552346905341e-06, "loss": 0.2219, "step": 925200 }, { "epoch": 12.335524123128607, "grad_norm": 2.717055320739746, "learning_rate": 4.893799806634659e-06, "loss": 0.2409, "step": 925300 }, { "epoch": 12.336857260935062, "grad_norm": 4.232410907745361, "learning_rate": 4.889049368175915e-06, "loss": 0.2726, "step": 925400 }, { "epoch": 12.338190398741517, "grad_norm": 8.753009796142578, "learning_rate": 4.884301031926807e-06, "loss": 0.2626, "step": 925500 }, { "epoch": 12.339523536547972, "grad_norm": 3.1286182403564453, "learning_rate": 4.879554798284842e-06, "loss": 0.226, "step": 925600 }, { "epoch": 12.340856674354429, "grad_norm": 6.334001064300537, "learning_rate": 4.87481066764736e-06, "loss": 0.2606, "step": 925700 }, { "epoch": 12.342189812160884, "grad_norm": 2.7978384494781494, "learning_rate": 4.870068640411528e-06, "loss": 0.2622, "step": 925800 }, { "epoch": 12.343522949967339, "grad_norm": 3.7585608959198, "learning_rate": 4.865376105793603e-06, "loss": 0.2615, "step": 925900 }, { "epoch": 12.344856087773794, "grad_norm": 2.5759189128875732, "learning_rate": 4.860638265507931e-06, "loss": 0.2483, "step": 926000 }, { "epoch": 12.346189225580249, "grad_norm": 3.9736249446868896, "learning_rate": 4.855902529810386e-06, "loss": 0.2815, "step": 926100 }, { "epoch": 12.347522363386704, "grad_norm": 1.8395107984542847, "learning_rate": 
4.851168899097427e-06, "loss": 0.2617, "step": 926200 }, { "epoch": 12.348855501193158, "grad_norm": 1.4062070846557617, "learning_rate": 4.846437373765321e-06, "loss": 0.2288, "step": 926300 }, { "epoch": 12.350188638999613, "grad_norm": 3.756873846054077, "learning_rate": 4.841707954210192e-06, "loss": 0.2381, "step": 926400 }, { "epoch": 12.351521776806068, "grad_norm": 1.0961089134216309, "learning_rate": 4.836980640827965e-06, "loss": 0.2621, "step": 926500 }, { "epoch": 12.352854914612523, "grad_norm": 6.83694314956665, "learning_rate": 4.832255434014397e-06, "loss": 0.2839, "step": 926600 }, { "epoch": 12.354188052418978, "grad_norm": 1.8762657642364502, "learning_rate": 4.827532334165077e-06, "loss": 0.24, "step": 926700 }, { "epoch": 12.355521190225433, "grad_norm": 1.9596199989318848, "learning_rate": 4.822811341675393e-06, "loss": 0.271, "step": 926800 }, { "epoch": 12.356854328031888, "grad_norm": 0.9748072028160095, "learning_rate": 4.8180924569405795e-06, "loss": 0.2402, "step": 926900 }, { "epoch": 12.358187465838343, "grad_norm": 0.9437841773033142, "learning_rate": 4.813422837684889e-06, "loss": 0.2975, "step": 927000 }, { "epoch": 12.359520603644798, "grad_norm": 1.701424241065979, "learning_rate": 4.808708148557391e-06, "loss": 0.2408, "step": 927100 }, { "epoch": 12.360853741451255, "grad_norm": 5.789949417114258, "learning_rate": 4.80404268372682e-06, "loss": 0.2522, "step": 927200 }, { "epoch": 12.36218687925771, "grad_norm": 2.802722215652466, "learning_rate": 4.799332191769673e-06, "loss": 0.2847, "step": 927300 }, { "epoch": 12.363520017064165, "grad_norm": 5.255577087402344, "learning_rate": 4.794623809532994e-06, "loss": 0.2953, "step": 927400 }, { "epoch": 12.36485315487062, "grad_norm": 2.9553253650665283, "learning_rate": 4.789917537410955e-06, "loss": 0.2955, "step": 927500 }, { "epoch": 12.366186292677074, "grad_norm": 2.043102264404297, "learning_rate": 4.785213375797554e-06, "loss": 0.2593, "step": 927600 }, { "epoch": 
12.36751943048353, "grad_norm": 5.574523448944092, "learning_rate": 4.780511325086608e-06, "loss": 0.2565, "step": 927700 }, { "epoch": 12.368852568289984, "grad_norm": 1.9252631664276123, "learning_rate": 4.7758113856717515e-06, "loss": 0.2834, "step": 927800 }, { "epoch": 12.37018570609644, "grad_norm": 2.0755529403686523, "learning_rate": 4.771113557946451e-06, "loss": 0.2766, "step": 927900 }, { "epoch": 12.371518843902894, "grad_norm": 2.0479514598846436, "learning_rate": 4.766417842303995e-06, "loss": 0.2635, "step": 928000 }, { "epoch": 12.37285198170935, "grad_norm": 1.4333826303482056, "learning_rate": 4.761724239137494e-06, "loss": 0.3148, "step": 928100 }, { "epoch": 12.374185119515804, "grad_norm": 2.8338000774383545, "learning_rate": 4.7570327488398824e-06, "loss": 0.2724, "step": 928200 }, { "epoch": 12.375518257322259, "grad_norm": 3.8832480907440186, "learning_rate": 4.752343371803923e-06, "loss": 0.2989, "step": 928300 }, { "epoch": 12.376851395128714, "grad_norm": 18.53417205810547, "learning_rate": 4.7476561084221755e-06, "loss": 0.2301, "step": 928400 }, { "epoch": 12.378184532935169, "grad_norm": 0.8471727967262268, "learning_rate": 4.742970959087063e-06, "loss": 0.299, "step": 928500 }, { "epoch": 12.379517670741624, "grad_norm": 1.611725926399231, "learning_rate": 4.738287924190814e-06, "loss": 0.2079, "step": 928600 }, { "epoch": 12.380850808548079, "grad_norm": 0.7274547815322876, "learning_rate": 4.733607004125459e-06, "loss": 0.2296, "step": 928700 }, { "epoch": 12.382183946354534, "grad_norm": 4.0114946365356445, "learning_rate": 4.728928199282877e-06, "loss": 0.2004, "step": 928800 }, { "epoch": 12.38351708416099, "grad_norm": 2.080698251724243, "learning_rate": 4.724251510054769e-06, "loss": 0.2891, "step": 928900 }, { "epoch": 12.384850221967445, "grad_norm": 2.098275661468506, "learning_rate": 4.719576936832644e-06, "loss": 0.2536, "step": 929000 }, { "epoch": 12.3861833597739, "grad_norm": 1.6766067743301392, "learning_rate": 
4.714904480007858e-06, "loss": 0.2367, "step": 929100 }, { "epoch": 12.387516497580355, "grad_norm": 2.288574695587158, "learning_rate": 4.710234139971556e-06, "loss": 0.2507, "step": 929200 }, { "epoch": 12.38884963538681, "grad_norm": 0.31259217858314514, "learning_rate": 4.705565917114728e-06, "loss": 0.2776, "step": 929300 }, { "epoch": 12.390182773193265, "grad_norm": 3.0954737663269043, "learning_rate": 4.7008998118281894e-06, "loss": 0.272, "step": 929400 }, { "epoch": 12.39151591099972, "grad_norm": 4.1209211349487305, "learning_rate": 4.696235824502568e-06, "loss": 0.2317, "step": 929500 }, { "epoch": 12.392849048806175, "grad_norm": 2.861140727996826, "learning_rate": 4.691573955528321e-06, "loss": 0.2328, "step": 929600 }, { "epoch": 12.39418218661263, "grad_norm": 5.256686687469482, "learning_rate": 4.686914205295729e-06, "loss": 0.279, "step": 929700 }, { "epoch": 12.395515324419085, "grad_norm": 0.685003936290741, "learning_rate": 4.682256574194871e-06, "loss": 0.2257, "step": 929800 }, { "epoch": 12.39684846222554, "grad_norm": 7.815999984741211, "learning_rate": 4.677601062615694e-06, "loss": 0.2589, "step": 929900 }, { "epoch": 12.398181600031995, "grad_norm": 3.609636068344116, "learning_rate": 4.672947670947937e-06, "loss": 0.2838, "step": 930000 }, { "epoch": 12.39951473783845, "grad_norm": 1.04523766040802, "learning_rate": 4.668296399581158e-06, "loss": 0.2446, "step": 930100 }, { "epoch": 12.400847875644905, "grad_norm": 3.069114923477173, "learning_rate": 4.663647248904752e-06, "loss": 0.2508, "step": 930200 }, { "epoch": 12.40218101345136, "grad_norm": 1.8934342861175537, "learning_rate": 4.6590002193079285e-06, "loss": 0.2637, "step": 930300 }, { "epoch": 12.403514151257816, "grad_norm": 2.2012417316436768, "learning_rate": 4.654355311179729e-06, "loss": 0.2502, "step": 930400 }, { "epoch": 12.404847289064271, "grad_norm": 1.9401131868362427, "learning_rate": 4.6497125249090026e-06, "loss": 0.2619, "step": 930500 }, { "epoch": 
12.406180426870726, "grad_norm": 1.1660860776901245, "learning_rate": 4.645071860884441e-06, "loss": 0.2321, "step": 930600 }, { "epoch": 12.407513564677181, "grad_norm": 0.6979244947433472, "learning_rate": 4.640433319494525e-06, "loss": 0.2373, "step": 930700 }, { "epoch": 12.408846702483636, "grad_norm": 3.9631173610687256, "learning_rate": 4.635796901127606e-06, "loss": 0.2622, "step": 930800 }, { "epoch": 12.410179840290091, "grad_norm": 2.757805347442627, "learning_rate": 4.631162606171807e-06, "loss": 0.2238, "step": 930900 }, { "epoch": 12.411512978096546, "grad_norm": 1.1563937664031982, "learning_rate": 4.626530435015105e-06, "loss": 0.2704, "step": 931000 }, { "epoch": 12.412846115903001, "grad_norm": 2.5787436962127686, "learning_rate": 4.621900388045289e-06, "loss": 0.2395, "step": 931100 }, { "epoch": 12.414179253709456, "grad_norm": 4.9129319190979, "learning_rate": 4.617272465649977e-06, "loss": 0.2821, "step": 931200 }, { "epoch": 12.415512391515911, "grad_norm": 1.2534772157669067, "learning_rate": 4.6126466682166e-06, "loss": 0.2583, "step": 931300 }, { "epoch": 12.416845529322366, "grad_norm": 3.9108164310455322, "learning_rate": 4.608022996132424e-06, "loss": 0.2188, "step": 931400 }, { "epoch": 12.41817866712882, "grad_norm": 1.1614277362823486, "learning_rate": 4.603447654724322e-06, "loss": 0.2619, "step": 931500 }, { "epoch": 12.419511804935276, "grad_norm": 1.790312647819519, "learning_rate": 4.598828213236439e-06, "loss": 0.2704, "step": 931600 }, { "epoch": 12.42084494274173, "grad_norm": 4.160953044891357, "learning_rate": 4.594210898254584e-06, "loss": 0.2547, "step": 931700 }, { "epoch": 12.422178080548186, "grad_norm": 2.721505641937256, "learning_rate": 4.589595710165314e-06, "loss": 0.217, "step": 931800 }, { "epoch": 12.42351121835464, "grad_norm": 2.430011749267578, "learning_rate": 4.584982649354977e-06, "loss": 0.2489, "step": 931900 }, { "epoch": 12.424844356161096, "grad_norm": 0.5692229270935059, "learning_rate": 
4.580371716209784e-06, "loss": 0.2207, "step": 932000 }, { "epoch": 12.426177493967552, "grad_norm": 0.41217827796936035, "learning_rate": 4.5757629111157506e-06, "loss": 0.2412, "step": 932100 }, { "epoch": 12.427510631774007, "grad_norm": 2.8846263885498047, "learning_rate": 4.5711562344586936e-06, "loss": 0.2987, "step": 932200 }, { "epoch": 12.428843769580462, "grad_norm": 1.840562343597412, "learning_rate": 4.566551686624278e-06, "loss": 0.258, "step": 932300 }, { "epoch": 12.430176907386917, "grad_norm": 4.336565017700195, "learning_rate": 4.561949267997981e-06, "loss": 0.2574, "step": 932400 }, { "epoch": 12.431510045193372, "grad_norm": 1.5288326740264893, "learning_rate": 4.557348978965107e-06, "loss": 0.27, "step": 932500 }, { "epoch": 12.432843182999827, "grad_norm": 1.5450669527053833, "learning_rate": 4.552750819910779e-06, "loss": 0.2575, "step": 932600 }, { "epoch": 12.434176320806282, "grad_norm": 0.039984218776226044, "learning_rate": 4.548154791219927e-06, "loss": 0.2433, "step": 932700 }, { "epoch": 12.435509458612737, "grad_norm": 4.528310775756836, "learning_rate": 4.54356089327733e-06, "loss": 0.243, "step": 932800 }, { "epoch": 12.436842596419192, "grad_norm": 39.97502899169922, "learning_rate": 4.538969126467566e-06, "loss": 0.2681, "step": 932900 }, { "epoch": 12.438175734225647, "grad_norm": 1.2854477167129517, "learning_rate": 4.534379491175045e-06, "loss": 0.2546, "step": 933000 }, { "epoch": 12.439508872032102, "grad_norm": 4.377691268920898, "learning_rate": 4.5297919877839985e-06, "loss": 0.2512, "step": 933100 }, { "epoch": 12.440842009838557, "grad_norm": 2.0400209426879883, "learning_rate": 4.5252066166784875e-06, "loss": 0.265, "step": 933200 }, { "epoch": 12.442175147645012, "grad_norm": 5.209951877593994, "learning_rate": 4.520623378242361e-06, "loss": 0.2908, "step": 933300 }, { "epoch": 12.443508285451466, "grad_norm": 2.4825692176818848, "learning_rate": 4.516042272859332e-06, "loss": 0.2527, "step": 933400 }, { "epoch": 
12.444841423257921, "grad_norm": 1.321142554283142, "learning_rate": 4.5114633009129204e-06, "loss": 0.2859, "step": 933500 }, { "epoch": 12.446174561064378, "grad_norm": 1.581779956817627, "learning_rate": 4.506932220604041e-06, "loss": 0.249, "step": 933600 }, { "epoch": 12.447507698870833, "grad_norm": 1.5424952507019043, "learning_rate": 4.502357495336747e-06, "loss": 0.2447, "step": 933700 }, { "epoch": 12.448840836677288, "grad_norm": 2.2278249263763428, "learning_rate": 4.497784904651712e-06, "loss": 0.2316, "step": 933800 }, { "epoch": 12.450173974483743, "grad_norm": 1.9155102968215942, "learning_rate": 4.493214448931727e-06, "loss": 0.2573, "step": 933900 }, { "epoch": 12.451507112290198, "grad_norm": 2.3618733882904053, "learning_rate": 4.488646128559425e-06, "loss": 0.2466, "step": 934000 }, { "epoch": 12.452840250096653, "grad_norm": 0.8664543032646179, "learning_rate": 4.484079943917252e-06, "loss": 0.2401, "step": 934100 }, { "epoch": 12.454173387903108, "grad_norm": 4.6638946533203125, "learning_rate": 4.4795158953874725e-06, "loss": 0.2407, "step": 934200 }, { "epoch": 12.455506525709563, "grad_norm": 2.286673069000244, "learning_rate": 4.474953983352174e-06, "loss": 0.2359, "step": 934300 }, { "epoch": 12.456839663516018, "grad_norm": 2.4250259399414062, "learning_rate": 4.470394208193272e-06, "loss": 0.2421, "step": 934400 }, { "epoch": 12.458172801322473, "grad_norm": 2.1804039478302, "learning_rate": 4.465836570292478e-06, "loss": 0.2327, "step": 934500 }, { "epoch": 12.459505939128928, "grad_norm": 3.185941219329834, "learning_rate": 4.4612810700313615e-06, "loss": 0.2735, "step": 934600 }, { "epoch": 12.460839076935383, "grad_norm": 5.314849376678467, "learning_rate": 4.456727707791298e-06, "loss": 0.2225, "step": 934700 }, { "epoch": 12.462172214741837, "grad_norm": 2.2434229850769043, "learning_rate": 4.452176483953464e-06, "loss": 0.2536, "step": 934800 }, { "epoch": 12.463505352548292, "grad_norm": 2.077296495437622, "learning_rate": 
4.44762739889888e-06, "loss": 0.2613, "step": 934900 }, { "epoch": 12.464838490354747, "grad_norm": 4.423823833465576, "learning_rate": 4.443080453008382e-06, "loss": 0.2977, "step": 935000 }, { "epoch": 12.466171628161202, "grad_norm": 4.033376693725586, "learning_rate": 4.438535646662626e-06, "loss": 0.2823, "step": 935100 }, { "epoch": 12.467504765967657, "grad_norm": 2.1964223384857178, "learning_rate": 4.433992980242096e-06, "loss": 0.2433, "step": 935200 }, { "epoch": 12.468837903774114, "grad_norm": 5.976814270019531, "learning_rate": 4.429452454127076e-06, "loss": 0.2844, "step": 935300 }, { "epoch": 12.470171041580569, "grad_norm": 119.49600982666016, "learning_rate": 4.424914068697685e-06, "loss": 0.2805, "step": 935400 }, { "epoch": 12.471504179387024, "grad_norm": 1.7310266494750977, "learning_rate": 4.420377824333882e-06, "loss": 0.2596, "step": 935500 }, { "epoch": 12.472837317193479, "grad_norm": 3.2412543296813965, "learning_rate": 4.415889051843195e-06, "loss": 0.2648, "step": 935600 }, { "epoch": 12.474170454999934, "grad_norm": 2.0588762760162354, "learning_rate": 4.4113570693295016e-06, "loss": 0.2659, "step": 935700 }, { "epoch": 12.475503592806389, "grad_norm": 3.2087998390197754, "learning_rate": 4.406827229016338e-06, "loss": 0.245, "step": 935800 }, { "epoch": 12.476836730612844, "grad_norm": 1.4978392124176025, "learning_rate": 4.402299531282929e-06, "loss": 0.2476, "step": 935900 }, { "epoch": 12.478169868419299, "grad_norm": 4.070075988769531, "learning_rate": 4.397773976508305e-06, "loss": 0.2443, "step": 936000 }, { "epoch": 12.479503006225753, "grad_norm": 2.3430750370025635, "learning_rate": 4.393250565071337e-06, "loss": 0.2625, "step": 936100 }, { "epoch": 12.480836144032208, "grad_norm": 1.2129617929458618, "learning_rate": 4.388729297350716e-06, "loss": 0.2357, "step": 936200 }, { "epoch": 12.482169281838663, "grad_norm": 1.755907654762268, "learning_rate": 4.384210173724941e-06, "loss": 0.2175, "step": 936300 }, { "epoch": 
12.483502419645118, "grad_norm": 3.576152801513672, "learning_rate": 4.379693194572344e-06, "loss": 0.2359, "step": 936400 }, { "epoch": 12.484835557451573, "grad_norm": 2.499882221221924, "learning_rate": 4.3751783602710805e-06, "loss": 0.2275, "step": 936500 }, { "epoch": 12.486168695258028, "grad_norm": 1.5315368175506592, "learning_rate": 4.370665671199091e-06, "loss": 0.2506, "step": 936600 }, { "epoch": 12.487501833064483, "grad_norm": 0.866172730922699, "learning_rate": 4.366155127734194e-06, "loss": 0.2802, "step": 936700 }, { "epoch": 12.488834970870938, "grad_norm": 2.7350802421569824, "learning_rate": 4.36164673025398e-06, "loss": 0.2417, "step": 936800 }, { "epoch": 12.490168108677395, "grad_norm": 4.260119438171387, "learning_rate": 4.35714047913588e-06, "loss": 0.2728, "step": 936900 }, { "epoch": 12.49150124648385, "grad_norm": 3.0618271827697754, "learning_rate": 4.352681405173337e-06, "loss": 0.2156, "step": 937000 }, { "epoch": 12.492834384290305, "grad_norm": 0.6054587960243225, "learning_rate": 4.348179426438015e-06, "loss": 0.2525, "step": 937100 }, { "epoch": 12.49416752209676, "grad_norm": 2.008774518966675, "learning_rate": 4.343679595192243e-06, "loss": 0.2484, "step": 937200 }, { "epoch": 12.495500659903215, "grad_norm": 0.791273295879364, "learning_rate": 4.339226878013361e-06, "loss": 0.2633, "step": 937300 }, { "epoch": 12.49683379770967, "grad_norm": 1.9838759899139404, "learning_rate": 4.334731321392367e-06, "loss": 0.2569, "step": 937400 }, { "epoch": 12.498166935516124, "grad_norm": 1.5638457536697388, "learning_rate": 4.330237913386771e-06, "loss": 0.2796, "step": 937500 }, { "epoch": 12.49950007332258, "grad_norm": 0.23070238530635834, "learning_rate": 4.3257466543727334e-06, "loss": 0.2358, "step": 937600 }, { "epoch": 12.500833211129034, "grad_norm": 2.126694679260254, "learning_rate": 4.321257544726246e-06, "loss": 0.2192, "step": 937700 }, { "epoch": 12.50216634893549, "grad_norm": 3.165987014770508, "learning_rate": 
4.316770584823142e-06, "loss": 0.2249, "step": 937800 }, { "epoch": 12.503499486741944, "grad_norm": 4.013132572174072, "learning_rate": 4.312285775039038e-06, "loss": 0.2693, "step": 937900 }, { "epoch": 12.5048326245484, "grad_norm": 1.6522722244262695, "learning_rate": 4.307803115749402e-06, "loss": 0.2555, "step": 938000 }, { "epoch": 12.506165762354854, "grad_norm": 3.460148572921753, "learning_rate": 4.303322607329486e-06, "loss": 0.2719, "step": 938100 }, { "epoch": 12.507498900161309, "grad_norm": 2.4282307624816895, "learning_rate": 4.298844250154409e-06, "loss": 0.2493, "step": 938200 }, { "epoch": 12.508832037967764, "grad_norm": 4.603011131286621, "learning_rate": 4.294368044599077e-06, "loss": 0.2539, "step": 938300 }, { "epoch": 12.510165175774219, "grad_norm": 1.284596562385559, "learning_rate": 4.289893991038215e-06, "loss": 0.2245, "step": 938400 }, { "epoch": 12.511498313580676, "grad_norm": 3.112994909286499, "learning_rate": 4.285422089846383e-06, "loss": 0.2451, "step": 938500 }, { "epoch": 12.51283145138713, "grad_norm": 0.8201691508293152, "learning_rate": 4.280952341397954e-06, "loss": 0.2151, "step": 938600 }, { "epoch": 12.514164589193586, "grad_norm": 1.3048896789550781, "learning_rate": 4.276484746067122e-06, "loss": 0.2452, "step": 938700 }, { "epoch": 12.51549772700004, "grad_norm": 4.152522563934326, "learning_rate": 4.272019304227902e-06, "loss": 0.2456, "step": 938800 }, { "epoch": 12.516830864806495, "grad_norm": 4.652343273162842, "learning_rate": 4.267556016254129e-06, "loss": 0.2261, "step": 938900 }, { "epoch": 12.51816400261295, "grad_norm": 2.40323543548584, "learning_rate": 4.263094882519435e-06, "loss": 0.265, "step": 939000 }, { "epoch": 12.519497140419405, "grad_norm": 2.8905298709869385, "learning_rate": 4.2586359033973166e-06, "loss": 0.2556, "step": 939100 }, { "epoch": 12.52083027822586, "grad_norm": 2.753084421157837, "learning_rate": 4.254179079261061e-06, "loss": 0.259, "step": 939200 }, { "epoch": 
12.522163416032315, "grad_norm": 4.2441301345825195, "learning_rate": 4.2497244104837706e-06, "loss": 0.2123, "step": 939300 }, { "epoch": 12.52349655383877, "grad_norm": 2.613142728805542, "learning_rate": 4.2452718974383834e-06, "loss": 0.2617, "step": 939400 }, { "epoch": 12.524829691645225, "grad_norm": 5.27469539642334, "learning_rate": 4.240821540497644e-06, "loss": 0.217, "step": 939500 }, { "epoch": 12.52616282945168, "grad_norm": 6.218994140625, "learning_rate": 4.2363733400341245e-06, "loss": 0.2197, "step": 939600 }, { "epoch": 12.527495967258135, "grad_norm": 0.5270727872848511, "learning_rate": 4.231927296420221e-06, "loss": 0.2479, "step": 939700 }, { "epoch": 12.52882910506459, "grad_norm": 3.18322491645813, "learning_rate": 4.227483410028131e-06, "loss": 0.2947, "step": 939800 }, { "epoch": 12.530162242871045, "grad_norm": 2.8765716552734375, "learning_rate": 4.223041681229889e-06, "loss": 0.2254, "step": 939900 }, { "epoch": 12.531495380677502, "grad_norm": 1.8159868717193604, "learning_rate": 4.21860211039734e-06, "loss": 0.2492, "step": 940000 }, { "epoch": 12.532828518483957, "grad_norm": 4.150238513946533, "learning_rate": 4.214164697902152e-06, "loss": 0.2431, "step": 940100 }, { "epoch": 12.534161656290411, "grad_norm": 3.0745246410369873, "learning_rate": 4.209729444115813e-06, "loss": 0.2867, "step": 940200 }, { "epoch": 12.535494794096866, "grad_norm": 2.4128482341766357, "learning_rate": 4.205296349409635e-06, "loss": 0.2554, "step": 940300 }, { "epoch": 12.536827931903321, "grad_norm": 3.1932404041290283, "learning_rate": 4.200865414154719e-06, "loss": 0.289, "step": 940400 }, { "epoch": 12.538161069709776, "grad_norm": 2.84053635597229, "learning_rate": 4.19643663872203e-06, "loss": 0.3155, "step": 940500 }, { "epoch": 12.539494207516231, "grad_norm": 4.148738861083984, "learning_rate": 4.192010023482336e-06, "loss": 0.2611, "step": 940600 }, { "epoch": 12.540827345322686, "grad_norm": 1.7728614807128906, "learning_rate": 
4.187585568806199e-06, "loss": 0.2412, "step": 940700 }, { "epoch": 12.542160483129141, "grad_norm": 1.7447460889816284, "learning_rate": 4.183163275064031e-06, "loss": 0.2758, "step": 940800 }, { "epoch": 12.543493620935596, "grad_norm": 2.9625704288482666, "learning_rate": 4.1787431426260526e-06, "loss": 0.2657, "step": 940900 }, { "epoch": 12.544826758742051, "grad_norm": 0.49437278509140015, "learning_rate": 4.174325171862303e-06, "loss": 0.2498, "step": 941000 }, { "epoch": 12.546159896548506, "grad_norm": 0.29064279794692993, "learning_rate": 4.169909363142648e-06, "loss": 0.219, "step": 941100 }, { "epoch": 12.54749303435496, "grad_norm": 2.7932188510894775, "learning_rate": 4.16549571683675e-06, "loss": 0.2528, "step": 941200 }, { "epoch": 12.548826172161416, "grad_norm": 1.882507562637329, "learning_rate": 4.161084233314108e-06, "loss": 0.2313, "step": 941300 }, { "epoch": 12.55015930996787, "grad_norm": 0.19162194430828094, "learning_rate": 4.156674912944054e-06, "loss": 0.2753, "step": 941400 }, { "epoch": 12.551492447774326, "grad_norm": 3.0537216663360596, "learning_rate": 4.152267756095705e-06, "loss": 0.2546, "step": 941500 }, { "epoch": 12.55282558558078, "grad_norm": 3.658240556716919, "learning_rate": 4.147862763138018e-06, "loss": 0.3117, "step": 941600 }, { "epoch": 12.554158723387236, "grad_norm": 2.019237756729126, "learning_rate": 4.143459934439771e-06, "loss": 0.2503, "step": 941700 }, { "epoch": 12.555491861193692, "grad_norm": 1.8761464357376099, "learning_rate": 4.139059270369549e-06, "loss": 0.2469, "step": 941800 }, { "epoch": 12.556824999000147, "grad_norm": 2.2106101512908936, "learning_rate": 4.1346607712957675e-06, "loss": 0.2613, "step": 941900 }, { "epoch": 12.558158136806602, "grad_norm": 0.03157365322113037, "learning_rate": 4.1302644375866564e-06, "loss": 0.2807, "step": 942000 }, { "epoch": 12.559491274613057, "grad_norm": 0.9323731064796448, "learning_rate": 4.125870269610251e-06, "loss": 0.2523, "step": 942100 }, { "epoch": 
12.560824412419512, "grad_norm": 2.268608808517456, "learning_rate": 4.121478267734425e-06, "loss": 0.2893, "step": 942200 }, { "epoch": 12.562157550225967, "grad_norm": 2.9315402507781982, "learning_rate": 4.117088432326861e-06, "loss": 0.2453, "step": 942300 }, { "epoch": 12.563490688032422, "grad_norm": 2.105062484741211, "learning_rate": 4.112744629713746e-06, "loss": 0.2703, "step": 942400 }, { "epoch": 12.564823825838877, "grad_norm": 1.6621636152267456, "learning_rate": 4.1083591066711805e-06, "loss": 0.2797, "step": 942500 }, { "epoch": 12.566156963645332, "grad_norm": 1.7947099208831787, "learning_rate": 4.10397575119519e-06, "loss": 0.2252, "step": 942600 }, { "epoch": 12.567490101451787, "grad_norm": 3.652238368988037, "learning_rate": 4.09959456365271e-06, "loss": 0.2501, "step": 942700 }, { "epoch": 12.568823239258242, "grad_norm": 9.840374946594238, "learning_rate": 4.095215544410529e-06, "loss": 0.2674, "step": 942800 }, { "epoch": 12.570156377064697, "grad_norm": 1.0966650247573853, "learning_rate": 4.09083869383524e-06, "loss": 0.2429, "step": 942900 }, { "epoch": 12.571489514871152, "grad_norm": 0.8062249422073364, "learning_rate": 4.086464012293265e-06, "loss": 0.2244, "step": 943000 }, { "epoch": 12.572822652677607, "grad_norm": 2.5355494022369385, "learning_rate": 4.0821352145325376e-06, "loss": 0.2333, "step": 943100 }, { "epoch": 12.574155790484062, "grad_norm": 3.007452964782715, "learning_rate": 4.07776485045624e-06, "loss": 0.2506, "step": 943200 }, { "epoch": 12.575488928290518, "grad_norm": 2.0988407135009766, "learning_rate": 4.073396656507758e-06, "loss": 0.2726, "step": 943300 }, { "epoch": 12.576822066096973, "grad_norm": 1.7699944972991943, "learning_rate": 4.0690306330527794e-06, "loss": 0.3036, "step": 943400 }, { "epoch": 12.578155203903428, "grad_norm": 1.411070466041565, "learning_rate": 4.064666780456814e-06, "loss": 0.229, "step": 943500 }, { "epoch": 12.579488341709883, "grad_norm": 7.6503214836120605, "learning_rate": 
4.060305099085199e-06, "loss": 0.2322, "step": 943600 }, { "epoch": 12.580821479516338, "grad_norm": 0.4021323621273041, "learning_rate": 4.055945589303055e-06, "loss": 0.2488, "step": 943700 }, { "epoch": 12.582154617322793, "grad_norm": 2.769026517868042, "learning_rate": 4.0515882514753745e-06, "loss": 0.2925, "step": 943800 }, { "epoch": 12.583487755129248, "grad_norm": 1.3371021747589111, "learning_rate": 4.047233085966932e-06, "loss": 0.2572, "step": 943900 }, { "epoch": 12.584820892935703, "grad_norm": 0.4436841309070587, "learning_rate": 4.042880093142321e-06, "loss": 0.2757, "step": 944000 }, { "epoch": 12.586154030742158, "grad_norm": 2.042196035385132, "learning_rate": 4.038529273365963e-06, "loss": 0.2584, "step": 944100 }, { "epoch": 12.587487168548613, "grad_norm": 2.7866461277008057, "learning_rate": 4.034180627002098e-06, "loss": 0.293, "step": 944200 }, { "epoch": 12.588820306355068, "grad_norm": 1.577594518661499, "learning_rate": 4.029834154414781e-06, "loss": 0.2887, "step": 944300 }, { "epoch": 12.590153444161523, "grad_norm": 1.550360083580017, "learning_rate": 4.0254898559678884e-06, "loss": 0.2172, "step": 944400 }, { "epoch": 12.591486581967978, "grad_norm": 3.3531389236450195, "learning_rate": 4.0211477320251e-06, "loss": 0.2454, "step": 944500 }, { "epoch": 12.592819719774432, "grad_norm": 2.047915458679199, "learning_rate": 4.01680778294993e-06, "loss": 0.2936, "step": 944600 }, { "epoch": 12.594152857580887, "grad_norm": 4.711468696594238, "learning_rate": 4.012470009105706e-06, "loss": 0.2781, "step": 944700 }, { "epoch": 12.595485995387342, "grad_norm": 2.0772082805633545, "learning_rate": 4.008134410855574e-06, "loss": 0.2057, "step": 944800 }, { "epoch": 12.596819133193797, "grad_norm": 1.1733373403549194, "learning_rate": 4.003800988562496e-06, "loss": 0.2251, "step": 944900 }, { "epoch": 12.598152271000254, "grad_norm": 3.37178897857666, "learning_rate": 3.999469742589259e-06, "loss": 0.2293, "step": 945000 }, { "epoch": 
12.599485408806709, "grad_norm": 3.9488959312438965, "learning_rate": 3.995140673298443e-06, "loss": 0.2735, "step": 945100 }, { "epoch": 12.600818546613164, "grad_norm": 1.0321086645126343, "learning_rate": 3.990813781052478e-06, "loss": 0.2975, "step": 945200 }, { "epoch": 12.602151684419619, "grad_norm": 1.5605601072311401, "learning_rate": 3.9864890662136046e-06, "loss": 0.27, "step": 945300 }, { "epoch": 12.603484822226074, "grad_norm": 3.524538993835449, "learning_rate": 3.982166529143859e-06, "loss": 0.2661, "step": 945400 }, { "epoch": 12.604817960032529, "grad_norm": 3.2315356731414795, "learning_rate": 3.977846170205113e-06, "loss": 0.2681, "step": 945500 }, { "epoch": 12.606151097838984, "grad_norm": 0.6471185684204102, "learning_rate": 3.973527989759059e-06, "loss": 0.2106, "step": 945600 }, { "epoch": 12.607484235645439, "grad_norm": 2.7659342288970947, "learning_rate": 3.969211988167199e-06, "loss": 0.2602, "step": 945700 }, { "epoch": 12.608817373451894, "grad_norm": 29.723154067993164, "learning_rate": 3.964898165790861e-06, "loss": 0.2576, "step": 945800 }, { "epoch": 12.610150511258349, "grad_norm": 2.1810498237609863, "learning_rate": 3.9605865229911715e-06, "loss": 0.2692, "step": 945900 }, { "epoch": 12.611483649064803, "grad_norm": 1.874487042427063, "learning_rate": 3.956277060129093e-06, "loss": 0.2646, "step": 946000 }, { "epoch": 12.612816786871258, "grad_norm": 0.9802221655845642, "learning_rate": 3.951969777565409e-06, "loss": 0.2769, "step": 946100 }, { "epoch": 12.614149924677713, "grad_norm": 8.491118431091309, "learning_rate": 3.947664675660703e-06, "loss": 0.2307, "step": 946200 }, { "epoch": 12.615483062484168, "grad_norm": 1.2303508520126343, "learning_rate": 3.943361754775384e-06, "loss": 0.2525, "step": 946300 }, { "epoch": 12.616816200290623, "grad_norm": 3.4735355377197266, "learning_rate": 3.939061015269682e-06, "loss": 0.2436, "step": 946400 }, { "epoch": 12.61814933809708, "grad_norm": 7.405066967010498, "learning_rate": 
3.93476245750364e-06, "loss": 0.2436, "step": 946500 }, { "epoch": 12.619482475903535, "grad_norm": 2.2136666774749756, "learning_rate": 3.930466081837122e-06, "loss": 0.26, "step": 946600 }, { "epoch": 12.62081561370999, "grad_norm": 4.915460586547852, "learning_rate": 3.92617188862981e-06, "loss": 0.2345, "step": 946700 }, { "epoch": 12.622148751516445, "grad_norm": 4.909228801727295, "learning_rate": 3.921879878241188e-06, "loss": 0.2599, "step": 946800 }, { "epoch": 12.6234818893229, "grad_norm": 1.989874243736267, "learning_rate": 3.9175900510305804e-06, "loss": 0.2727, "step": 946900 }, { "epoch": 12.624815027129355, "grad_norm": 2.353018283843994, "learning_rate": 3.913302407357114e-06, "loss": 0.2907, "step": 947000 }, { "epoch": 12.62614816493581, "grad_norm": 3.7120625972747803, "learning_rate": 3.9090169475797355e-06, "loss": 0.2205, "step": 947100 }, { "epoch": 12.627481302742265, "grad_norm": 2.0131614208221436, "learning_rate": 3.904733672057223e-06, "loss": 0.2771, "step": 947200 }, { "epoch": 12.62881444054872, "grad_norm": 0.4839431643486023, "learning_rate": 3.9004525811481384e-06, "loss": 0.2621, "step": 947300 }, { "epoch": 12.630147578355174, "grad_norm": 1.0553988218307495, "learning_rate": 3.896173675210884e-06, "loss": 0.2815, "step": 947400 }, { "epoch": 12.63148071616163, "grad_norm": 2.3871636390686035, "learning_rate": 3.891896954603699e-06, "loss": 0.2768, "step": 947500 }, { "epoch": 12.632813853968084, "grad_norm": 4.010435104370117, "learning_rate": 3.8876224196845915e-06, "loss": 0.2804, "step": 947600 }, { "epoch": 12.63414699177454, "grad_norm": 2.824028491973877, "learning_rate": 3.883350070811425e-06, "loss": 0.2602, "step": 947700 }, { "epoch": 12.635480129580994, "grad_norm": 0.4512970447540283, "learning_rate": 3.879079908341859e-06, "loss": 0.227, "step": 947800 }, { "epoch": 12.63681326738745, "grad_norm": 4.466464519500732, "learning_rate": 3.874811932633386e-06, "loss": 0.2597, "step": 947900 }, { "epoch": 
12.638146405193904, "grad_norm": 1.7986429929733276, "learning_rate": 3.870546144043303e-06, "loss": 0.2472, "step": 948000 }, { "epoch": 12.639479543000359, "grad_norm": 0.728628933429718, "learning_rate": 3.866282542928737e-06, "loss": 0.2666, "step": 948100 }, { "epoch": 12.640812680806816, "grad_norm": 0.5938609838485718, "learning_rate": 3.86202112964661e-06, "loss": 0.2662, "step": 948200 }, { "epoch": 12.64214581861327, "grad_norm": 1.5069218873977661, "learning_rate": 3.857761904553673e-06, "loss": 0.2314, "step": 948300 }, { "epoch": 12.643478956419726, "grad_norm": 4.116724491119385, "learning_rate": 3.853504868006517e-06, "loss": 0.228, "step": 948400 }, { "epoch": 12.64481209422618, "grad_norm": 2.8679308891296387, "learning_rate": 3.849250020361504e-06, "loss": 0.2479, "step": 948500 }, { "epoch": 12.646145232032636, "grad_norm": 2.209404706954956, "learning_rate": 3.844997361974852e-06, "loss": 0.2195, "step": 948600 }, { "epoch": 12.64747836983909, "grad_norm": 2.309967517852783, "learning_rate": 3.840746893202559e-06, "loss": 0.3027, "step": 948700 }, { "epoch": 12.648811507645545, "grad_norm": 3.273979902267456, "learning_rate": 3.836498614400482e-06, "loss": 0.2765, "step": 948800 }, { "epoch": 12.650144645452, "grad_norm": 3.147834539413452, "learning_rate": 3.832252525924272e-06, "loss": 0.2472, "step": 948900 }, { "epoch": 12.651477783258455, "grad_norm": 3.9513957500457764, "learning_rate": 3.828008628129389e-06, "loss": 0.2842, "step": 949000 }, { "epoch": 12.65281092106491, "grad_norm": 7.015556335449219, "learning_rate": 3.823766921371121e-06, "loss": 0.3203, "step": 949100 }, { "epoch": 12.654144058871365, "grad_norm": 0.80588698387146, "learning_rate": 3.8195274060045716e-06, "loss": 0.2681, "step": 949200 }, { "epoch": 12.65547719667782, "grad_norm": 3.4522786140441895, "learning_rate": 3.81529008238466e-06, "loss": 0.287, "step": 949300 }, { "epoch": 12.656810334484275, "grad_norm": 2.719113349914551, "learning_rate": 
3.811054950866121e-06, "loss": 0.2403, "step": 949400 }, { "epoch": 12.65814347229073, "grad_norm": 5.678623676300049, "learning_rate": 3.8068220118035124e-06, "loss": 0.2247, "step": 949500 }, { "epoch": 12.659476610097185, "grad_norm": 1.4147008657455444, "learning_rate": 3.802591265551191e-06, "loss": 0.2068, "step": 949600 }, { "epoch": 12.660809747903642, "grad_norm": 3.229229688644409, "learning_rate": 3.7983627124633425e-06, "loss": 0.2159, "step": 949700 }, { "epoch": 12.662142885710097, "grad_norm": 0.43252503871917725, "learning_rate": 3.7941363528939844e-06, "loss": 0.2683, "step": 949800 }, { "epoch": 12.663476023516552, "grad_norm": 4.795872688293457, "learning_rate": 3.7899121871969167e-06, "loss": 0.2471, "step": 949900 }, { "epoch": 12.664809161323006, "grad_norm": 3.523634433746338, "learning_rate": 3.78573242457791e-06, "loss": 0.2383, "step": 950000 }, { "epoch": 12.666142299129461, "grad_norm": 2.208688974380493, "learning_rate": 3.781512625738618e-06, "loss": 0.1939, "step": 950100 }, { "epoch": 12.667475436935916, "grad_norm": 3.0435631275177, "learning_rate": 3.777295021828434e-06, "loss": 0.2658, "step": 950200 }, { "epoch": 12.668808574742371, "grad_norm": 4.536346435546875, "learning_rate": 3.77307961320045e-06, "loss": 0.2742, "step": 950300 }, { "epoch": 12.670141712548826, "grad_norm": 4.0227789878845215, "learning_rate": 3.7688664002075655e-06, "loss": 0.2498, "step": 950400 }, { "epoch": 12.671474850355281, "grad_norm": 1.6626524925231934, "learning_rate": 3.7646553832024965e-06, "loss": 0.2886, "step": 950500 }, { "epoch": 12.672807988161736, "grad_norm": 2.912806272506714, "learning_rate": 3.7604465625377805e-06, "loss": 0.2422, "step": 950600 }, { "epoch": 12.674141125968191, "grad_norm": 0.6262388229370117, "learning_rate": 3.756239938565772e-06, "loss": 0.2481, "step": 950700 }, { "epoch": 12.675474263774646, "grad_norm": 1.4657864570617676, "learning_rate": 3.7520355116386196e-06, "loss": 0.2822, "step": 950800 }, { "epoch": 
12.676807401581101, "grad_norm": 2.022388458251953, "learning_rate": 3.747833282108305e-06, "loss": 0.2394, "step": 950900 }, { "epoch": 12.678140539387556, "grad_norm": 2.951547622680664, "learning_rate": 3.74363325032665e-06, "loss": 0.2529, "step": 951000 }, { "epoch": 12.67947367719401, "grad_norm": 1.2674009799957275, "learning_rate": 3.739435416645247e-06, "loss": 0.2544, "step": 951100 }, { "epoch": 12.680806815000466, "grad_norm": 43.91072463989258, "learning_rate": 3.735239781415528e-06, "loss": 0.2323, "step": 951200 }, { "epoch": 12.68213995280692, "grad_norm": 2.5937023162841797, "learning_rate": 3.7310882684677773e-06, "loss": 0.2788, "step": 951300 }, { "epoch": 12.683473090613377, "grad_norm": 2.2791953086853027, "learning_rate": 3.7268970092017127e-06, "loss": 0.2251, "step": 951400 }, { "epoch": 12.684806228419832, "grad_norm": 1.9958133697509766, "learning_rate": 3.722791709074922e-06, "loss": 0.2468, "step": 951500 }, { "epoch": 12.686139366226287, "grad_norm": 2.7356152534484863, "learning_rate": 3.718604805161793e-06, "loss": 0.2775, "step": 951600 }, { "epoch": 12.687472504032742, "grad_norm": 2.824702739715576, "learning_rate": 3.714420101444226e-06, "loss": 0.2417, "step": 951700 }, { "epoch": 12.688805641839197, "grad_norm": 10.121207237243652, "learning_rate": 3.710237598272548e-06, "loss": 0.2441, "step": 951800 }, { "epoch": 12.690138779645652, "grad_norm": 0.9980538487434387, "learning_rate": 3.7060572959969054e-06, "loss": 0.2506, "step": 951900 }, { "epoch": 12.691471917452107, "grad_norm": 25.14413833618164, "learning_rate": 3.701879194967258e-06, "loss": 0.227, "step": 952000 }, { "epoch": 12.692805055258562, "grad_norm": 1.4132399559020996, "learning_rate": 3.6977032955333965e-06, "loss": 0.2976, "step": 952100 }, { "epoch": 12.694138193065017, "grad_norm": 4.45799446105957, "learning_rate": 3.693529598044888e-06, "loss": 0.2164, "step": 952200 }, { "epoch": 12.695471330871472, "grad_norm": 0.9709843397140503, "learning_rate": 
3.6893581028511623e-06, "loss": 0.2668, "step": 952300 }, { "epoch": 12.696804468677927, "grad_norm": 3.4656338691711426, "learning_rate": 3.6851888103014444e-06, "loss": 0.2337, "step": 952400 }, { "epoch": 12.698137606484382, "grad_norm": 5.8816609382629395, "learning_rate": 3.681021720744764e-06, "loss": 0.2746, "step": 952500 }, { "epoch": 12.699470744290837, "grad_norm": 1.154234528541565, "learning_rate": 3.6768568345299756e-06, "loss": 0.2347, "step": 952600 }, { "epoch": 12.700803882097292, "grad_norm": 1.6751922369003296, "learning_rate": 3.672694152005761e-06, "loss": 0.2647, "step": 952700 }, { "epoch": 12.702137019903747, "grad_norm": 2.9805240631103516, "learning_rate": 3.6685336735205976e-06, "loss": 0.2844, "step": 952800 }, { "epoch": 12.703470157710203, "grad_norm": 2.489335060119629, "learning_rate": 3.6643753994227967e-06, "loss": 0.2629, "step": 952900 }, { "epoch": 12.704803295516658, "grad_norm": 11.05298137664795, "learning_rate": 3.660219330060467e-06, "loss": 0.2332, "step": 953000 }, { "epoch": 12.706136433323113, "grad_norm": 5.837963104248047, "learning_rate": 3.656065465781536e-06, "loss": 0.2714, "step": 953100 }, { "epoch": 12.707469571129568, "grad_norm": 1.98056161403656, "learning_rate": 3.6519138069337764e-06, "loss": 0.269, "step": 953200 }, { "epoch": 12.708802708936023, "grad_norm": 2.1161022186279297, "learning_rate": 3.6477643538647264e-06, "loss": 0.2868, "step": 953300 }, { "epoch": 12.710135846742478, "grad_norm": 2.1575355529785156, "learning_rate": 3.6436171069217783e-06, "loss": 0.2233, "step": 953400 }, { "epoch": 12.711468984548933, "grad_norm": 2.6656105518341064, "learning_rate": 3.6394720664521243e-06, "loss": 0.2563, "step": 953500 }, { "epoch": 12.712802122355388, "grad_norm": 3.5907647609710693, "learning_rate": 3.63532923280277e-06, "loss": 0.251, "step": 953600 }, { "epoch": 12.714135260161843, "grad_norm": 2.7048683166503906, "learning_rate": 3.631188606320545e-06, "loss": 0.2779, "step": 953700 }, { "epoch": 
12.715468397968298, "grad_norm": 3.8718221187591553, "learning_rate": 3.627050187352098e-06, "loss": 0.2414, "step": 953800 }, { "epoch": 12.716801535774753, "grad_norm": 1.0046050548553467, "learning_rate": 3.6229139762438624e-06, "loss": 0.311, "step": 953900 }, { "epoch": 12.718134673581208, "grad_norm": 0.7894651889801025, "learning_rate": 3.6187799733421247e-06, "loss": 0.27, "step": 954000 }, { "epoch": 12.719467811387663, "grad_norm": 1.4283474683761597, "learning_rate": 3.6146481789929676e-06, "loss": 0.2559, "step": 954100 }, { "epoch": 12.720800949194118, "grad_norm": 1.9218014478683472, "learning_rate": 3.610518593542288e-06, "loss": 0.2177, "step": 954200 }, { "epoch": 12.722134087000573, "grad_norm": 2.218454122543335, "learning_rate": 3.6063912173358127e-06, "loss": 0.2253, "step": 954300 }, { "epoch": 12.723467224807028, "grad_norm": 2.7843313217163086, "learning_rate": 3.6022660507190587e-06, "loss": 0.2141, "step": 954400 }, { "epoch": 12.724800362613482, "grad_norm": 6.044347763061523, "learning_rate": 3.5981430940373725e-06, "loss": 0.2918, "step": 954500 }, { "epoch": 12.72613350041994, "grad_norm": 7.650594711303711, "learning_rate": 3.5940223476359325e-06, "loss": 0.2301, "step": 954600 }, { "epoch": 12.727466638226394, "grad_norm": 4.33224630355835, "learning_rate": 3.589903811859695e-06, "loss": 0.2593, "step": 954700 }, { "epoch": 12.728799776032849, "grad_norm": 0.9965423941612244, "learning_rate": 3.5857874870534614e-06, "loss": 0.3109, "step": 954800 }, { "epoch": 12.730132913839304, "grad_norm": 11.444055557250977, "learning_rate": 3.5816733735618325e-06, "loss": 0.1953, "step": 954900 }, { "epoch": 12.731466051645759, "grad_norm": 1.630806803703308, "learning_rate": 3.5775614717292326e-06, "loss": 0.2742, "step": 955000 }, { "epoch": 12.732799189452214, "grad_norm": 5.19965124130249, "learning_rate": 3.5734517818998934e-06, "loss": 0.2534, "step": 955100 }, { "epoch": 12.734132327258669, "grad_norm": 1.650588035583496, "learning_rate": 
3.569344304417873e-06, "loss": 0.2243, "step": 955200 }, { "epoch": 12.735465465065124, "grad_norm": 6.192601680755615, "learning_rate": 3.5652390396270297e-06, "loss": 0.2752, "step": 955300 }, { "epoch": 12.736798602871579, "grad_norm": 1.091105580329895, "learning_rate": 3.5611359878710314e-06, "loss": 0.2275, "step": 955400 }, { "epoch": 12.738131740678034, "grad_norm": 2.3326525688171387, "learning_rate": 3.557035149493404e-06, "loss": 0.2601, "step": 955500 }, { "epoch": 12.739464878484489, "grad_norm": 4.331932067871094, "learning_rate": 3.552936524837429e-06, "loss": 0.2745, "step": 955600 }, { "epoch": 12.740798016290944, "grad_norm": 2.242626190185547, "learning_rate": 3.5488401142462425e-06, "loss": 0.2853, "step": 955700 }, { "epoch": 12.742131154097398, "grad_norm": 5.460253715515137, "learning_rate": 3.544745918062776e-06, "loss": 0.2461, "step": 955800 }, { "epoch": 12.743464291903853, "grad_norm": 3.4809815883636475, "learning_rate": 3.5406539366297898e-06, "loss": 0.2431, "step": 955900 }, { "epoch": 12.744797429710308, "grad_norm": 0.49865415692329407, "learning_rate": 3.5365641702898555e-06, "loss": 0.2385, "step": 956000 }, { "epoch": 12.746130567516765, "grad_norm": 1.4386488199234009, "learning_rate": 3.532476619385343e-06, "loss": 0.227, "step": 956100 }, { "epoch": 12.74746370532322, "grad_norm": 0.05840674415230751, "learning_rate": 3.5283912842584553e-06, "loss": 0.2569, "step": 956200 }, { "epoch": 12.748796843129675, "grad_norm": 0.9940827488899231, "learning_rate": 3.524308165251202e-06, "loss": 0.2802, "step": 956300 }, { "epoch": 12.75012998093613, "grad_norm": 0.8568811416625977, "learning_rate": 3.52022726270541e-06, "loss": 0.2559, "step": 956400 }, { "epoch": 12.751463118742585, "grad_norm": 0.8735173940658569, "learning_rate": 3.516148576962722e-06, "loss": 0.2914, "step": 956500 }, { "epoch": 12.75279625654904, "grad_norm": 1.5425689220428467, "learning_rate": 3.512072108364596e-06, "loss": 0.2243, "step": 956600 }, { "epoch": 
12.754129394355495, "grad_norm": 0.7964770197868347, "learning_rate": 3.5079978572522907e-06, "loss": 0.239, "step": 956700 }, { "epoch": 12.75546253216195, "grad_norm": 3.186533212661743, "learning_rate": 3.503925823966888e-06, "loss": 0.2638, "step": 956800 }, { "epoch": 12.756795669968405, "grad_norm": 1.0787135362625122, "learning_rate": 3.499856008849305e-06, "loss": 0.238, "step": 956900 }, { "epoch": 12.75812880777486, "grad_norm": 5.341506004333496, "learning_rate": 3.495788412240235e-06, "loss": 0.3254, "step": 957000 }, { "epoch": 12.759461945581315, "grad_norm": 2.0803868770599365, "learning_rate": 3.49172303448021e-06, "loss": 0.268, "step": 957100 }, { "epoch": 12.76079508338777, "grad_norm": 4.519742965698242, "learning_rate": 3.48765987590957e-06, "loss": 0.2426, "step": 957200 }, { "epoch": 12.762128221194224, "grad_norm": 2.318047046661377, "learning_rate": 3.483598936868474e-06, "loss": 0.2671, "step": 957300 }, { "epoch": 12.76346135900068, "grad_norm": 2.571186065673828, "learning_rate": 3.479540217696886e-06, "loss": 0.2706, "step": 957400 }, { "epoch": 12.764794496807134, "grad_norm": 3.0463786125183105, "learning_rate": 3.4754837187346012e-06, "loss": 0.2418, "step": 957500 }, { "epoch": 12.76612763461359, "grad_norm": 4.013525485992432, "learning_rate": 3.471429440321198e-06, "loss": 0.3006, "step": 957600 }, { "epoch": 12.767460772420044, "grad_norm": 3.082514524459839, "learning_rate": 3.467377382796094e-06, "loss": 0.2562, "step": 957700 }, { "epoch": 12.768793910226501, "grad_norm": 1.5277072191238403, "learning_rate": 3.4633275464985226e-06, "loss": 0.2489, "step": 957800 }, { "epoch": 12.770127048032956, "grad_norm": 2.794179916381836, "learning_rate": 3.459279931767515e-06, "loss": 0.2502, "step": 957900 }, { "epoch": 12.77146018583941, "grad_norm": 2.3275372982025146, "learning_rate": 3.4552345389419338e-06, "loss": 0.2464, "step": 958000 }, { "epoch": 12.772793323645866, "grad_norm": 2.2611711025238037, "learning_rate": 
3.4511913683604313e-06, "loss": 0.2459, "step": 958100 }, { "epoch": 12.77412646145232, "grad_norm": 5.498658180236816, "learning_rate": 3.4471504203615034e-06, "loss": 0.2402, "step": 958200 }, { "epoch": 12.775459599258776, "grad_norm": 0.9207426905632019, "learning_rate": 3.443111695283443e-06, "loss": 0.2207, "step": 958300 }, { "epoch": 12.77679273706523, "grad_norm": 1.1915078163146973, "learning_rate": 3.439075193464356e-06, "loss": 0.2196, "step": 958400 }, { "epoch": 12.778125874871685, "grad_norm": 1.7746065855026245, "learning_rate": 3.4350409152421625e-06, "loss": 0.2366, "step": 958500 }, { "epoch": 12.77945901267814, "grad_norm": 2.728477716445923, "learning_rate": 3.4310088609546055e-06, "loss": 0.2493, "step": 958600 }, { "epoch": 12.780792150484595, "grad_norm": 0.9841799736022949, "learning_rate": 3.4269790309392345e-06, "loss": 0.257, "step": 958700 }, { "epoch": 12.78212528829105, "grad_norm": 0.7740300893783569, "learning_rate": 3.422951425533414e-06, "loss": 0.2752, "step": 958800 }, { "epoch": 12.783458426097505, "grad_norm": 4.564949035644531, "learning_rate": 3.4189260450743266e-06, "loss": 0.2569, "step": 958900 }, { "epoch": 12.78479156390396, "grad_norm": 4.127772331237793, "learning_rate": 3.41490288989895e-06, "loss": 0.239, "step": 959000 }, { "epoch": 12.786124701710415, "grad_norm": 1.6236968040466309, "learning_rate": 3.4108819603441076e-06, "loss": 0.2499, "step": 959100 }, { "epoch": 12.78745783951687, "grad_norm": 4.573429107666016, "learning_rate": 3.406863256746411e-06, "loss": 0.2877, "step": 959200 }, { "epoch": 12.788790977323327, "grad_norm": 3.8354969024658203, "learning_rate": 3.4028467794422903e-06, "loss": 0.2319, "step": 959300 }, { "epoch": 12.790124115129782, "grad_norm": 7.993900299072266, "learning_rate": 3.3988325287679965e-06, "loss": 0.2788, "step": 959400 }, { "epoch": 12.791457252936237, "grad_norm": 0.897704005241394, "learning_rate": 3.394900723707319e-06, "loss": 0.2564, "step": 959500 }, { "epoch": 
12.792790390742692, "grad_norm": 5.671728610992432, "learning_rate": 3.3908908827513464e-06, "loss": 0.2861, "step": 959600 }, { "epoch": 12.794123528549147, "grad_norm": 2.4153761863708496, "learning_rate": 3.386883269426113e-06, "loss": 0.2533, "step": 959700 }, { "epoch": 12.795456666355602, "grad_norm": 0.09218544512987137, "learning_rate": 3.3828778840671147e-06, "loss": 0.2435, "step": 959800 }, { "epoch": 12.796789804162056, "grad_norm": 1.8965486288070679, "learning_rate": 3.3788747270096865e-06, "loss": 0.2253, "step": 959900 }, { "epoch": 12.798122941968511, "grad_norm": 4.113499164581299, "learning_rate": 3.37487379858894e-06, "loss": 0.2809, "step": 960000 }, { "epoch": 12.799456079774966, "grad_norm": 3.0210025310516357, "learning_rate": 3.370875099139824e-06, "loss": 0.2452, "step": 960100 }, { "epoch": 12.800789217581421, "grad_norm": 1.9931222200393677, "learning_rate": 3.3668786289971177e-06, "loss": 0.2475, "step": 960200 }, { "epoch": 12.802122355387876, "grad_norm": 2.385469675064087, "learning_rate": 3.3628843884953696e-06, "loss": 0.2413, "step": 960300 }, { "epoch": 12.803455493194331, "grad_norm": 2.6931769847869873, "learning_rate": 3.3588923779689764e-06, "loss": 0.2948, "step": 960400 }, { "epoch": 12.804788631000786, "grad_norm": 2.81099796295166, "learning_rate": 3.354902597752123e-06, "loss": 0.2904, "step": 960500 }, { "epoch": 12.806121768807241, "grad_norm": 0.8931247591972351, "learning_rate": 3.35091504817884e-06, "loss": 0.2173, "step": 960600 }, { "epoch": 12.807454906613696, "grad_norm": 5.276648998260498, "learning_rate": 3.346929729582946e-06, "loss": 0.2445, "step": 960700 }, { "epoch": 12.808788044420151, "grad_norm": 1.9552611112594604, "learning_rate": 3.342946642298075e-06, "loss": 0.2848, "step": 960800 }, { "epoch": 12.810121182226606, "grad_norm": 1.8057249784469604, "learning_rate": 3.3389657866576795e-06, "loss": 0.2295, "step": 960900 }, { "epoch": 12.811454320033063, "grad_norm": 1.5897176265716553, 
"learning_rate": 3.3349871629950236e-06, "loss": 0.2687, "step": 961000 }, { "epoch": 12.812787457839518, "grad_norm": 1.0766253471374512, "learning_rate": 3.3310107716431904e-06, "loss": 0.2331, "step": 961100 }, { "epoch": 12.814120595645973, "grad_norm": 4.0713653564453125, "learning_rate": 3.3270366129350637e-06, "loss": 0.2785, "step": 961200 }, { "epoch": 12.815453733452427, "grad_norm": 3.319166660308838, "learning_rate": 3.323064687203361e-06, "loss": 0.2186, "step": 961300 }, { "epoch": 12.816786871258882, "grad_norm": 0.8296047449111938, "learning_rate": 3.3190949947805828e-06, "loss": 0.2306, "step": 961400 }, { "epoch": 12.818120009065337, "grad_norm": 9.970751762390137, "learning_rate": 3.315127535999056e-06, "loss": 0.2562, "step": 961500 }, { "epoch": 12.819453146871792, "grad_norm": 2.7808010578155518, "learning_rate": 3.311162311190953e-06, "loss": 0.2541, "step": 961600 }, { "epoch": 12.820786284678247, "grad_norm": 2.5030531883239746, "learning_rate": 3.3071993206882002e-06, "loss": 0.2919, "step": 961700 }, { "epoch": 12.822119422484702, "grad_norm": 6.093588352203369, "learning_rate": 3.303238564822576e-06, "loss": 0.2417, "step": 961800 }, { "epoch": 12.823452560291157, "grad_norm": 3.9066433906555176, "learning_rate": 3.2992800439256686e-06, "loss": 0.2673, "step": 961900 }, { "epoch": 12.824785698097612, "grad_norm": 5.195038318634033, "learning_rate": 3.295323758328863e-06, "loss": 0.2604, "step": 962000 }, { "epoch": 12.826118835904067, "grad_norm": 1.6778606176376343, "learning_rate": 3.291369708363381e-06, "loss": 0.2526, "step": 962100 }, { "epoch": 12.827451973710522, "grad_norm": 0.36562010645866394, "learning_rate": 3.2874178943602274e-06, "loss": 0.3026, "step": 962200 }, { "epoch": 12.828785111516977, "grad_norm": 1.604463815689087, "learning_rate": 3.2834683166502446e-06, "loss": 0.2441, "step": 962300 }, { "epoch": 12.830118249323432, "grad_norm": 2.3600351810455322, "learning_rate": 3.2795209755640745e-06, "loss": 0.282, "step": 
962400 }, { "epoch": 12.831451387129889, "grad_norm": 1.1945091485977173, "learning_rate": 3.275575871432176e-06, "loss": 0.2535, "step": 962500 }, { "epoch": 12.832784524936343, "grad_norm": 1.88932204246521, "learning_rate": 3.2716330045848254e-06, "loss": 0.307, "step": 962600 }, { "epoch": 12.834117662742798, "grad_norm": 1.5000025033950806, "learning_rate": 3.2676923753521083e-06, "loss": 0.2978, "step": 962700 }, { "epoch": 12.835450800549253, "grad_norm": 2.9707469940185547, "learning_rate": 3.2637539840639107e-06, "loss": 0.2427, "step": 962800 }, { "epoch": 12.836783938355708, "grad_norm": 2.830761194229126, "learning_rate": 3.2598178310499526e-06, "loss": 0.2464, "step": 962900 }, { "epoch": 12.838117076162163, "grad_norm": 1.150719165802002, "learning_rate": 3.255883916639757e-06, "loss": 0.2381, "step": 963000 }, { "epoch": 12.839450213968618, "grad_norm": 1.5358749628067017, "learning_rate": 3.2519522411626534e-06, "loss": 0.3189, "step": 963100 }, { "epoch": 12.840783351775073, "grad_norm": 0.11532138288021088, "learning_rate": 3.248022804947789e-06, "loss": 0.2581, "step": 963200 }, { "epoch": 12.842116489581528, "grad_norm": 3.805968999862671, "learning_rate": 3.244095608324127e-06, "loss": 0.2408, "step": 963300 }, { "epoch": 12.843449627387983, "grad_norm": 2.574373483657837, "learning_rate": 3.2401706516204375e-06, "loss": 0.2384, "step": 963400 }, { "epoch": 12.844782765194438, "grad_norm": 7.662262916564941, "learning_rate": 3.236247935165312e-06, "loss": 0.2337, "step": 963500 }, { "epoch": 12.846115903000893, "grad_norm": 4.314692497253418, "learning_rate": 3.23232745928714e-06, "loss": 0.2479, "step": 963600 }, { "epoch": 12.847449040807348, "grad_norm": 0.2563561797142029, "learning_rate": 3.228409224314123e-06, "loss": 0.27, "step": 963700 }, { "epoch": 12.848782178613803, "grad_norm": 2.3308119773864746, "learning_rate": 3.2244932305743112e-06, "loss": 0.2761, "step": 963800 }, { "epoch": 12.850115316420258, "grad_norm": 
3.0224945545196533, "learning_rate": 3.220579478395513e-06, "loss": 0.2564, "step": 963900 }, { "epoch": 12.851448454226713, "grad_norm": 1.6242492198944092, "learning_rate": 3.216667968105387e-06, "loss": 0.2481, "step": 964000 }, { "epoch": 12.852781592033168, "grad_norm": 1.7367541790008545, "learning_rate": 3.2127587000313894e-06, "loss": 0.2502, "step": 964100 }, { "epoch": 12.854114729839624, "grad_norm": 5.389195919036865, "learning_rate": 3.2088516745007935e-06, "loss": 0.2722, "step": 964200 }, { "epoch": 12.85544786764608, "grad_norm": 0.5790653824806213, "learning_rate": 3.204946891840683e-06, "loss": 0.2449, "step": 964300 }, { "epoch": 12.856781005452534, "grad_norm": 4.226961612701416, "learning_rate": 3.2010443523779574e-06, "loss": 0.3229, "step": 964400 }, { "epoch": 12.85811414325899, "grad_norm": 4.231327056884766, "learning_rate": 3.1971440564393183e-06, "loss": 0.2456, "step": 964500 }, { "epoch": 12.859447281065444, "grad_norm": 1.5779876708984375, "learning_rate": 3.193246004351288e-06, "loss": 0.2972, "step": 964600 }, { "epoch": 12.860780418871899, "grad_norm": 1.684570074081421, "learning_rate": 3.1893501964401984e-06, "loss": 0.2214, "step": 964700 }, { "epoch": 12.862113556678354, "grad_norm": 3.2900025844573975, "learning_rate": 3.1854566330321957e-06, "loss": 0.2386, "step": 964800 }, { "epoch": 12.863446694484809, "grad_norm": 2.444035530090332, "learning_rate": 3.1815653144532388e-06, "loss": 0.2739, "step": 964900 }, { "epoch": 12.864779832291264, "grad_norm": 0.9829068183898926, "learning_rate": 3.177676241029097e-06, "loss": 0.238, "step": 965000 }, { "epoch": 12.866112970097719, "grad_norm": 3.7439088821411133, "learning_rate": 3.1737894130853407e-06, "loss": 0.2434, "step": 965100 }, { "epoch": 12.867446107904174, "grad_norm": 5.41604471206665, "learning_rate": 3.169904830947382e-06, "loss": 0.2819, "step": 965200 }, { "epoch": 12.868779245710629, "grad_norm": 1.3865383863449097, "learning_rate": 3.166022494940408e-06, "loss": 
0.2476, "step": 965300 }, { "epoch": 12.870112383517084, "grad_norm": 0.8095299601554871, "learning_rate": 3.1621424053894456e-06, "loss": 0.2323, "step": 965400 }, { "epoch": 12.871445521323539, "grad_norm": 3.0242760181427, "learning_rate": 3.1582645626193207e-06, "loss": 0.2457, "step": 965500 }, { "epoch": 12.872778659129994, "grad_norm": 2.0668857097625732, "learning_rate": 3.154388966954671e-06, "loss": 0.2498, "step": 965600 }, { "epoch": 12.87411179693645, "grad_norm": 2.255464792251587, "learning_rate": 3.1505156187199537e-06, "loss": 0.2745, "step": 965700 }, { "epoch": 12.875444934742905, "grad_norm": 0.6036825180053711, "learning_rate": 3.146644518239439e-06, "loss": 0.2555, "step": 965800 }, { "epoch": 12.87677807254936, "grad_norm": 0.169227734208107, "learning_rate": 3.1428143432321576e-06, "loss": 0.2606, "step": 965900 }, { "epoch": 12.878111210355815, "grad_norm": 0.6598296165466309, "learning_rate": 3.1389477167464444e-06, "loss": 0.2573, "step": 966000 }, { "epoch": 12.87944434816227, "grad_norm": 1.456014633178711, "learning_rate": 3.1350833389833555e-06, "loss": 0.223, "step": 966100 }, { "epoch": 12.880777485968725, "grad_norm": 4.618704319000244, "learning_rate": 3.131221210266406e-06, "loss": 0.2552, "step": 966200 }, { "epoch": 12.88211062377518, "grad_norm": 1.886242151260376, "learning_rate": 3.127361330918911e-06, "loss": 0.224, "step": 966300 }, { "epoch": 12.883443761581635, "grad_norm": 0.6606713533401489, "learning_rate": 3.123503701264021e-06, "loss": 0.2993, "step": 966400 }, { "epoch": 12.88477689938809, "grad_norm": 3.065469980239868, "learning_rate": 3.119648321624685e-06, "loss": 0.2346, "step": 966500 }, { "epoch": 12.886110037194545, "grad_norm": 4.301302433013916, "learning_rate": 3.1157951923236515e-06, "loss": 0.2331, "step": 966600 }, { "epoch": 12.887443175001, "grad_norm": 1.2215876579284668, "learning_rate": 3.111944313683496e-06, "loss": 0.2156, "step": 966700 }, { "epoch": 12.888776312807455, "grad_norm": 
3.3157708644866943, "learning_rate": 3.1080956860266063e-06, "loss": 0.2517, "step": 966800 }, { "epoch": 12.89010945061391, "grad_norm": 2.0455520153045654, "learning_rate": 3.1042493096751724e-06, "loss": 0.2199, "step": 966900 }, { "epoch": 12.891442588420365, "grad_norm": 3.6217589378356934, "learning_rate": 3.1004051849512127e-06, "loss": 0.2573, "step": 967000 }, { "epoch": 12.89277572622682, "grad_norm": 0.9140611886978149, "learning_rate": 3.09656331217653e-06, "loss": 0.2396, "step": 967100 }, { "epoch": 12.894108864033274, "grad_norm": 1.9710097312927246, "learning_rate": 3.0927620767279985e-06, "loss": 0.188, "step": 967200 }, { "epoch": 12.89544200183973, "grad_norm": 1.1969482898712158, "learning_rate": 3.088963049042086e-06, "loss": 0.2633, "step": 967300 }, { "epoch": 12.896775139646186, "grad_norm": 1.4829864501953125, "learning_rate": 3.0851278889828503e-06, "loss": 0.2836, "step": 967400 }, { "epoch": 12.898108277452641, "grad_norm": 4.1477203369140625, "learning_rate": 3.0812949821518754e-06, "loss": 0.2769, "step": 967500 }, { "epoch": 12.899441415259096, "grad_norm": 26.71065902709961, "learning_rate": 3.0774643288700243e-06, "loss": 0.2398, "step": 967600 }, { "epoch": 12.90077455306555, "grad_norm": 2.410588026046753, "learning_rate": 3.0736359294579906e-06, "loss": 0.2105, "step": 967700 }, { "epoch": 12.902107690872006, "grad_norm": 2.165375232696533, "learning_rate": 3.0698097842362783e-06, "loss": 0.2455, "step": 967800 }, { "epoch": 12.90344082867846, "grad_norm": 1.9898093938827515, "learning_rate": 3.0659858935252006e-06, "loss": 0.2484, "step": 967900 }, { "epoch": 12.904773966484916, "grad_norm": 2.1487722396850586, "learning_rate": 3.062164257644885e-06, "loss": 0.2737, "step": 968000 }, { "epoch": 12.90610710429137, "grad_norm": 3.069239377975464, "learning_rate": 3.058344876915259e-06, "loss": 0.2488, "step": 968100 }, { "epoch": 12.907440242097826, "grad_norm": 3.168151378631592, "learning_rate": 3.05452775165607e-06, "loss": 
0.2305, "step": 968200 }, { "epoch": 12.90877337990428, "grad_norm": 3.0387067794799805, "learning_rate": 3.050712882186879e-06, "loss": 0.2612, "step": 968300 }, { "epoch": 12.910106517710735, "grad_norm": 1.2617368698120117, "learning_rate": 3.046900268827054e-06, "loss": 0.2739, "step": 968400 }, { "epoch": 12.91143965551719, "grad_norm": 8.907123565673828, "learning_rate": 3.0430899118957734e-06, "loss": 0.2646, "step": 968500 }, { "epoch": 12.912772793323645, "grad_norm": 0.9636139273643494, "learning_rate": 3.039281811712038e-06, "loss": 0.2606, "step": 968600 }, { "epoch": 12.9141059311301, "grad_norm": 7.339158058166504, "learning_rate": 3.03547596859463e-06, "loss": 0.2464, "step": 968700 }, { "epoch": 12.915439068936555, "grad_norm": 1.9552009105682373, "learning_rate": 3.0316723828621805e-06, "loss": 0.2339, "step": 968800 }, { "epoch": 12.916772206743012, "grad_norm": 2.581087112426758, "learning_rate": 3.0278710548331125e-06, "loss": 0.2297, "step": 968900 }, { "epoch": 12.918105344549467, "grad_norm": 1.9679183959960938, "learning_rate": 3.0240719848256504e-06, "loss": 0.2526, "step": 969000 }, { "epoch": 12.919438482355922, "grad_norm": 0.5132949948310852, "learning_rate": 3.020275173157847e-06, "loss": 0.2282, "step": 969100 }, { "epoch": 12.920771620162377, "grad_norm": 1.067435383796692, "learning_rate": 3.016480620147558e-06, "loss": 0.2464, "step": 969200 }, { "epoch": 12.922104757968832, "grad_norm": 3.247602701187134, "learning_rate": 3.0126883261124527e-06, "loss": 0.2503, "step": 969300 }, { "epoch": 12.923437895775287, "grad_norm": 3.9520392417907715, "learning_rate": 3.0088982913700136e-06, "loss": 0.2373, "step": 969400 }, { "epoch": 12.924771033581742, "grad_norm": 1.051625370979309, "learning_rate": 3.00511051623752e-06, "loss": 0.2789, "step": 969500 }, { "epoch": 12.926104171388197, "grad_norm": 1.2630882263183594, "learning_rate": 3.0013250010320758e-06, "loss": 0.208, "step": 969600 }, { "epoch": 12.927437309194652, "grad_norm": 
3.6759703159332275, "learning_rate": 2.9975417460706067e-06, "loss": 0.2527, "step": 969700 }, { "epoch": 12.928770447001106, "grad_norm": 2.675844669342041, "learning_rate": 2.9937607516698164e-06, "loss": 0.2329, "step": 969800 }, { "epoch": 12.930103584807561, "grad_norm": 4.684080600738525, "learning_rate": 2.9899820181462444e-06, "loss": 0.2609, "step": 969900 }, { "epoch": 12.931436722614016, "grad_norm": 1.8627375364303589, "learning_rate": 2.9862055458162352e-06, "loss": 0.2251, "step": 970000 }, { "epoch": 12.932769860420471, "grad_norm": 2.828932046890259, "learning_rate": 2.982431334995945e-06, "loss": 0.2343, "step": 970100 }, { "epoch": 12.934102998226926, "grad_norm": 5.442868709564209, "learning_rate": 2.978659386001332e-06, "loss": 0.2396, "step": 970200 }, { "epoch": 12.935436136033381, "grad_norm": 1.410388708114624, "learning_rate": 2.974889699148187e-06, "loss": 0.2432, "step": 970300 }, { "epoch": 12.936769273839836, "grad_norm": 30.551998138427734, "learning_rate": 2.9711222747520796e-06, "loss": 0.261, "step": 970400 }, { "epoch": 12.938102411646291, "grad_norm": 1.5001500844955444, "learning_rate": 2.967357113128412e-06, "loss": 0.2184, "step": 970500 }, { "epoch": 12.939435549452746, "grad_norm": 2.2352519035339355, "learning_rate": 2.963594214592391e-06, "loss": 0.2358, "step": 970600 }, { "epoch": 12.940768687259203, "grad_norm": 1.0332177877426147, "learning_rate": 2.9598335794590393e-06, "loss": 0.2338, "step": 970700 }, { "epoch": 12.942101825065658, "grad_norm": 2.4741623401641846, "learning_rate": 2.9560752080431807e-06, "loss": 0.2306, "step": 970800 }, { "epoch": 12.943434962872113, "grad_norm": 2.1590285301208496, "learning_rate": 2.952319100659461e-06, "loss": 0.2753, "step": 970900 }, { "epoch": 12.944768100678568, "grad_norm": 2.411669969558716, "learning_rate": 2.9485652576223144e-06, "loss": 0.247, "step": 971000 }, { "epoch": 12.946101238485022, "grad_norm": 1.6627914905548096, "learning_rate": 2.9448136792460236e-06, 
"loss": 0.2248, "step": 971100 }, { "epoch": 12.947434376291477, "grad_norm": 0.5757569074630737, "learning_rate": 2.9410643658446366e-06, "loss": 0.2224, "step": 971200 }, { "epoch": 12.948767514097932, "grad_norm": 1.2518645524978638, "learning_rate": 2.937317317732047e-06, "loss": 0.2545, "step": 971300 }, { "epoch": 12.950100651904387, "grad_norm": 5.232341766357422, "learning_rate": 2.9335725352219423e-06, "loss": 0.2764, "step": 971400 }, { "epoch": 12.951433789710842, "grad_norm": 1.9546564817428589, "learning_rate": 2.9298300186278227e-06, "loss": 0.2195, "step": 971500 }, { "epoch": 12.952766927517297, "grad_norm": 4.25957727432251, "learning_rate": 2.9260897682630007e-06, "loss": 0.2913, "step": 971600 }, { "epoch": 12.954100065323752, "grad_norm": 0.9457753896713257, "learning_rate": 2.922351784440609e-06, "loss": 0.2333, "step": 971700 }, { "epoch": 12.955433203130207, "grad_norm": 2.0337398052215576, "learning_rate": 2.9186534134212684e-06, "loss": 0.2269, "step": 971800 }, { "epoch": 12.956766340936662, "grad_norm": 3.578605890274048, "learning_rate": 2.9149199409490866e-06, "loss": 0.2022, "step": 971900 }, { "epoch": 12.958099478743117, "grad_norm": 2.2911765575408936, "learning_rate": 2.9111887359544275e-06, "loss": 0.2704, "step": 972000 }, { "epoch": 12.959432616549572, "grad_norm": 1.1931428909301758, "learning_rate": 2.907459798749664e-06, "loss": 0.21, "step": 972100 }, { "epoch": 12.960765754356029, "grad_norm": 3.6876413822174072, "learning_rate": 2.9037331296469507e-06, "loss": 0.2329, "step": 972200 }, { "epoch": 12.962098892162484, "grad_norm": 1.6973955631256104, "learning_rate": 2.90000872895829e-06, "loss": 0.2425, "step": 972300 }, { "epoch": 12.963432029968939, "grad_norm": 3.0317232608795166, "learning_rate": 2.8962865969954765e-06, "loss": 0.2402, "step": 972400 }, { "epoch": 12.964765167775393, "grad_norm": 6.590473651885986, "learning_rate": 2.892566734070108e-06, "loss": 0.2605, "step": 972500 }, { "epoch": 12.966098305581848, 
"grad_norm": 3.158972978591919, "learning_rate": 2.8888491404936014e-06, "loss": 0.2633, "step": 972600 }, { "epoch": 12.967431443388303, "grad_norm": 1.5034518241882324, "learning_rate": 2.8851338165771847e-06, "loss": 0.2309, "step": 972700 }, { "epoch": 12.968764581194758, "grad_norm": 3.8069708347320557, "learning_rate": 2.881420762631889e-06, "loss": 0.2359, "step": 972800 }, { "epoch": 12.970097719001213, "grad_norm": 2.02595591545105, "learning_rate": 2.877709978968569e-06, "loss": 0.2588, "step": 972900 }, { "epoch": 12.971430856807668, "grad_norm": 0.8056592345237732, "learning_rate": 2.8740014658978664e-06, "loss": 0.2954, "step": 973000 }, { "epoch": 12.972763994614123, "grad_norm": 3.362772226333618, "learning_rate": 2.870295223730246e-06, "loss": 0.2455, "step": 973100 }, { "epoch": 12.974097132420578, "grad_norm": 7.759967803955078, "learning_rate": 2.8665912527759997e-06, "loss": 0.2742, "step": 973200 }, { "epoch": 12.975430270227033, "grad_norm": 0.6899523138999939, "learning_rate": 2.8628895533451992e-06, "loss": 0.2502, "step": 973300 }, { "epoch": 12.976763408033488, "grad_norm": 1.210829496383667, "learning_rate": 2.8591901257477405e-06, "loss": 0.2311, "step": 973400 }, { "epoch": 12.978096545839943, "grad_norm": 1.070743203163147, "learning_rate": 2.8554929702933285e-06, "loss": 0.2272, "step": 973500 }, { "epoch": 12.979429683646398, "grad_norm": 1.5172107219696045, "learning_rate": 2.8517980872914796e-06, "loss": 0.2867, "step": 973600 }, { "epoch": 12.980762821452853, "grad_norm": 1.9205819368362427, "learning_rate": 2.848105477051516e-06, "loss": 0.2186, "step": 973700 }, { "epoch": 12.982095959259308, "grad_norm": 3.764533281326294, "learning_rate": 2.8444151398825768e-06, "loss": 0.2293, "step": 973800 }, { "epoch": 12.983429097065764, "grad_norm": 3.2406704425811768, "learning_rate": 2.840727076093592e-06, "loss": 0.3046, "step": 973900 }, { "epoch": 12.98476223487222, "grad_norm": 2.4158923625946045, "learning_rate": 
2.837041285993328e-06, "loss": 0.2489, "step": 974000 }, { "epoch": 12.986095372678674, "grad_norm": 2.0488009452819824, "learning_rate": 2.833357769890341e-06, "loss": 0.2383, "step": 974100 }, { "epoch": 12.98742851048513, "grad_norm": 1.276706337928772, "learning_rate": 2.8296765280930014e-06, "loss": 0.2504, "step": 974200 }, { "epoch": 12.988761648291584, "grad_norm": 3.9043667316436768, "learning_rate": 2.8259975609095056e-06, "loss": 0.2592, "step": 974300 }, { "epoch": 12.99009478609804, "grad_norm": 2.3680944442749023, "learning_rate": 2.8223208686478243e-06, "loss": 0.2955, "step": 974400 }, { "epoch": 12.991427923904494, "grad_norm": 2.266467809677124, "learning_rate": 2.8186464516157638e-06, "loss": 0.2742, "step": 974500 }, { "epoch": 12.992761061710949, "grad_norm": 0.30117085576057434, "learning_rate": 2.8149743101209524e-06, "loss": 0.2832, "step": 974600 }, { "epoch": 12.994094199517404, "grad_norm": 2.868476629257202, "learning_rate": 2.8113044444707903e-06, "loss": 0.2349, "step": 974700 }, { "epoch": 12.995427337323859, "grad_norm": 3.3314387798309326, "learning_rate": 2.807636854972515e-06, "loss": 0.2805, "step": 974800 }, { "epoch": 12.996760475130314, "grad_norm": 1.8869407176971436, "learning_rate": 2.803971541933161e-06, "loss": 0.2074, "step": 974900 }, { "epoch": 12.998093612936769, "grad_norm": 2.700432538986206, "learning_rate": 2.8003085056595832e-06, "loss": 0.2352, "step": 975000 }, { "epoch": 12.999426750743224, "grad_norm": 6.901564121246338, "learning_rate": 2.796647746458436e-06, "loss": 0.252, "step": 975100 }, { "epoch": 13.000759888549679, "grad_norm": 0.004742998164147139, "learning_rate": 2.7929892646361944e-06, "loss": 0.2466, "step": 975200 }, { "epoch": 13.002093026356134, "grad_norm": 1.5193774700164795, "learning_rate": 2.7893696112649404e-06, "loss": 0.2455, "step": 975300 }, { "epoch": 13.00342616416259, "grad_norm": 1.773594617843628, "learning_rate": 2.785715662337701e-06, "loss": 0.2107, "step": 975400 }, { 
"epoch": 13.004759301969045, "grad_norm": 2.121253728866577, "learning_rate": 2.782063991704562e-06, "loss": 0.2033, "step": 975500 }, { "epoch": 13.0060924397755, "grad_norm": 1.781933307647705, "learning_rate": 2.778414599671225e-06, "loss": 0.3033, "step": 975600 }, { "epoch": 13.007425577581955, "grad_norm": 3.8563241958618164, "learning_rate": 2.7747674865432028e-06, "loss": 0.2649, "step": 975700 }, { "epoch": 13.00875871538841, "grad_norm": 4.357295989990234, "learning_rate": 2.7711226526258337e-06, "loss": 0.2895, "step": 975800 }, { "epoch": 13.010091853194865, "grad_norm": 4.504706382751465, "learning_rate": 2.7674800982242378e-06, "loss": 0.2312, "step": 975900 }, { "epoch": 13.01142499100132, "grad_norm": 0.4866066575050354, "learning_rate": 2.7638398236433637e-06, "loss": 0.2313, "step": 976000 }, { "epoch": 13.012758128807775, "grad_norm": 2.807159423828125, "learning_rate": 2.760201829187965e-06, "loss": 0.2214, "step": 976100 }, { "epoch": 13.01409126661423, "grad_norm": 2.503917694091797, "learning_rate": 2.756566115162604e-06, "loss": 0.2484, "step": 976200 }, { "epoch": 13.015424404420685, "grad_norm": 1.6308655738830566, "learning_rate": 2.752932681871645e-06, "loss": 0.2227, "step": 976300 }, { "epoch": 13.01675754222714, "grad_norm": 1.8451423645019531, "learning_rate": 2.74930152961928e-06, "loss": 0.2527, "step": 976400 }, { "epoch": 13.018090680033595, "grad_norm": 1.7896599769592285, "learning_rate": 2.7456726587094836e-06, "loss": 0.23, "step": 976500 }, { "epoch": 13.01942381784005, "grad_norm": 2.2094814777374268, "learning_rate": 2.7420460694460525e-06, "loss": 0.2445, "step": 976600 }, { "epoch": 13.020756955646505, "grad_norm": 0.9879699945449829, "learning_rate": 2.73842176213261e-06, "loss": 0.2703, "step": 976700 }, { "epoch": 13.02209009345296, "grad_norm": 0.8369930386543274, "learning_rate": 2.7347997370725575e-06, "loss": 0.2629, "step": 976800 }, { "epoch": 13.023423231259414, "grad_norm": 2.0615055561065674, "learning_rate": 
2.731179994569125e-06, "loss": 0.2922, "step": 976900 }, { "epoch": 13.024756369065871, "grad_norm": 1.4877842664718628, "learning_rate": 2.7275625349253443e-06, "loss": 0.2415, "step": 977000 }, { "epoch": 13.026089506872326, "grad_norm": 2.320891857147217, "learning_rate": 2.723983498906225e-06, "loss": 0.259, "step": 977100 }, { "epoch": 13.027422644678781, "grad_norm": 2.3077609539031982, "learning_rate": 2.720370583053934e-06, "loss": 0.201, "step": 977200 }, { "epoch": 13.028755782485236, "grad_norm": 1.8207155466079712, "learning_rate": 2.7167599509662234e-06, "loss": 0.2513, "step": 977300 }, { "epoch": 13.030088920291691, "grad_norm": 5.5597920417785645, "learning_rate": 2.7131516029453685e-06, "loss": 0.225, "step": 977400 }, { "epoch": 13.031422058098146, "grad_norm": 3.1071159839630127, "learning_rate": 2.709545539293444e-06, "loss": 0.2308, "step": 977500 }, { "epoch": 13.0327551959046, "grad_norm": 1.2155574560165405, "learning_rate": 2.705941760312346e-06, "loss": 0.2217, "step": 977600 }, { "epoch": 13.034088333711056, "grad_norm": 1.0107080936431885, "learning_rate": 2.7023402663037622e-06, "loss": 0.1987, "step": 977700 }, { "epoch": 13.03542147151751, "grad_norm": 1.5587199926376343, "learning_rate": 2.698741057569195e-06, "loss": 0.1865, "step": 977800 }, { "epoch": 13.036754609323966, "grad_norm": 0.09095676988363266, "learning_rate": 2.6951441344099737e-06, "loss": 0.2677, "step": 977900 }, { "epoch": 13.03808774713042, "grad_norm": 1.117909550666809, "learning_rate": 2.6915494971272102e-06, "loss": 0.2145, "step": 978000 }, { "epoch": 13.039420884936876, "grad_norm": 6.169051170349121, "learning_rate": 2.687957146021838e-06, "loss": 0.2559, "step": 978100 }, { "epoch": 13.04075402274333, "grad_norm": 2.011521100997925, "learning_rate": 2.6843670813945988e-06, "loss": 0.2467, "step": 978200 }, { "epoch": 13.042087160549785, "grad_norm": 3.1226046085357666, "learning_rate": 2.6807793035460426e-06, "loss": 0.2496, "step": 978300 }, { "epoch": 
13.04342029835624, "grad_norm": 0.6129396557807922, "learning_rate": 2.6771938127765293e-06, "loss": 0.2347, "step": 978400 }, { "epoch": 13.044753436162695, "grad_norm": 1.9656240940093994, "learning_rate": 2.673610609386219e-06, "loss": 0.1861, "step": 978500 }, { "epoch": 13.046086573969152, "grad_norm": 3.8746039867401123, "learning_rate": 2.6700296936750847e-06, "loss": 0.2659, "step": 978600 }, { "epoch": 13.047419711775607, "grad_norm": 1.8118975162506104, "learning_rate": 2.666451065942913e-06, "loss": 0.2292, "step": 978700 }, { "epoch": 13.048752849582062, "grad_norm": 4.512861251831055, "learning_rate": 2.662874726489295e-06, "loss": 0.2942, "step": 978800 }, { "epoch": 13.050085987388517, "grad_norm": 3.3059325218200684, "learning_rate": 2.659300675613634e-06, "loss": 0.2322, "step": 978900 }, { "epoch": 13.051419125194972, "grad_norm": 2.8773844242095947, "learning_rate": 2.6557289136151373e-06, "loss": 0.2393, "step": 979000 }, { "epoch": 13.052752263001427, "grad_norm": 2.436704158782959, "learning_rate": 2.652159440792816e-06, "loss": 0.2389, "step": 979100 }, { "epoch": 13.054085400807882, "grad_norm": 3.399959087371826, "learning_rate": 2.648592257445491e-06, "loss": 0.2455, "step": 979200 }, { "epoch": 13.055418538614337, "grad_norm": 1.1682134866714478, "learning_rate": 2.645027363871817e-06, "loss": 0.2044, "step": 979300 }, { "epoch": 13.056751676420792, "grad_norm": 3.2186293601989746, "learning_rate": 2.6414647603702145e-06, "loss": 0.2698, "step": 979400 }, { "epoch": 13.058084814227247, "grad_norm": 2.028212308883667, "learning_rate": 2.637904447238939e-06, "loss": 0.244, "step": 979500 }, { "epoch": 13.059417952033701, "grad_norm": 1.035264253616333, "learning_rate": 2.634346424776055e-06, "loss": 0.2577, "step": 979600 }, { "epoch": 13.060751089840156, "grad_norm": 3.568134307861328, "learning_rate": 2.6307906932794212e-06, "loss": 0.2403, "step": 979700 }, { "epoch": 13.062084227646611, "grad_norm": 0.7878082990646362, "learning_rate": 
2.627237253046716e-06, "loss": 0.2322, "step": 979800 }, { "epoch": 13.063417365453066, "grad_norm": 0.9244498610496521, "learning_rate": 2.6236861043754244e-06, "loss": 0.2216, "step": 979900 }, { "epoch": 13.064750503259521, "grad_norm": 6.423386096954346, "learning_rate": 2.620137247562833e-06, "loss": 0.2565, "step": 980000 }, { "epoch": 13.066083641065976, "grad_norm": 2.74898099899292, "learning_rate": 2.6165906829060393e-06, "loss": 0.2132, "step": 980100 }, { "epoch": 13.067416778872433, "grad_norm": 2.980236053466797, "learning_rate": 2.6130464107019568e-06, "loss": 0.2238, "step": 980200 }, { "epoch": 13.068749916678888, "grad_norm": 3.7837836742401123, "learning_rate": 2.609504431247295e-06, "loss": 0.2083, "step": 980300 }, { "epoch": 13.070083054485343, "grad_norm": 2.105623483657837, "learning_rate": 2.605964744838586e-06, "loss": 0.2209, "step": 980400 }, { "epoch": 13.071416192291798, "grad_norm": 4.0249552726745605, "learning_rate": 2.6024273517721464e-06, "loss": 0.2259, "step": 980500 }, { "epoch": 13.072749330098253, "grad_norm": 0.9676557779312134, "learning_rate": 2.5988922523441263e-06, "loss": 0.2326, "step": 980600 }, { "epoch": 13.074082467904708, "grad_norm": 0.9647080898284912, "learning_rate": 2.595359446850478e-06, "loss": 0.2229, "step": 980700 }, { "epoch": 13.075415605711163, "grad_norm": 4.592409610748291, "learning_rate": 2.5918289355869463e-06, "loss": 0.2486, "step": 980800 }, { "epoch": 13.076748743517618, "grad_norm": 4.142221927642822, "learning_rate": 2.588300718849097e-06, "loss": 0.228, "step": 980900 }, { "epoch": 13.078081881324072, "grad_norm": 1.2074427604675293, "learning_rate": 2.584774796932301e-06, "loss": 0.228, "step": 981000 }, { "epoch": 13.079415019130527, "grad_norm": 7.173422336578369, "learning_rate": 2.581251170131739e-06, "loss": 0.2779, "step": 981100 }, { "epoch": 13.080748156936982, "grad_norm": 2.368518829345703, "learning_rate": 2.577729838742401e-06, "loss": 0.2411, "step": 981200 }, { "epoch": 
13.082081294743437, "grad_norm": 3.078491449356079, "learning_rate": 2.574210803059078e-06, "loss": 0.2612, "step": 981300 }, { "epoch": 13.083414432549892, "grad_norm": 3.826683759689331, "learning_rate": 2.570694063376372e-06, "loss": 0.2412, "step": 981400 }, { "epoch": 13.084747570356347, "grad_norm": 3.9290459156036377, "learning_rate": 2.5672147530549426e-06, "loss": 0.2013, "step": 981500 }, { "epoch": 13.086080708162802, "grad_norm": 3.6379024982452393, "learning_rate": 2.563702583289168e-06, "loss": 0.2364, "step": 981600 }, { "epoch": 13.087413845969257, "grad_norm": 2.112459182739258, "learning_rate": 2.560192710403716e-06, "loss": 0.2504, "step": 981700 }, { "epoch": 13.088746983775714, "grad_norm": 4.233604907989502, "learning_rate": 2.5566851346924347e-06, "loss": 0.2516, "step": 981800 }, { "epoch": 13.090080121582169, "grad_norm": 1.012691617012024, "learning_rate": 2.553179856448973e-06, "loss": 0.2158, "step": 981900 }, { "epoch": 13.091413259388624, "grad_norm": 0.2684175372123718, "learning_rate": 2.5496768759667687e-06, "loss": 0.2372, "step": 982000 }, { "epoch": 13.092746397195079, "grad_norm": 5.06481409072876, "learning_rate": 2.546176193539085e-06, "loss": 0.235, "step": 982100 }, { "epoch": 13.094079535001534, "grad_norm": 3.182952404022217, "learning_rate": 2.54267780945899e-06, "loss": 0.2615, "step": 982200 }, { "epoch": 13.095412672807988, "grad_norm": 2.037128448486328, "learning_rate": 2.5391817240193525e-06, "loss": 0.2592, "step": 982300 }, { "epoch": 13.096745810614443, "grad_norm": 2.066495656967163, "learning_rate": 2.5356879375128593e-06, "loss": 0.2027, "step": 982400 }, { "epoch": 13.098078948420898, "grad_norm": 2.4772679805755615, "learning_rate": 2.532196450232003e-06, "loss": 0.2257, "step": 982500 }, { "epoch": 13.099412086227353, "grad_norm": 1.180071473121643, "learning_rate": 2.528707262469069e-06, "loss": 0.2642, "step": 982600 }, { "epoch": 13.100745224033808, "grad_norm": 1.9269696474075317, "learning_rate": 
2.5252203745161573e-06, "loss": 0.2478, "step": 982700 }, { "epoch": 13.102078361840263, "grad_norm": 210.022705078125, "learning_rate": 2.521735786665201e-06, "loss": 0.2565, "step": 982800 }, { "epoch": 13.103411499646718, "grad_norm": 1.2286723852157593, "learning_rate": 2.518253499207901e-06, "loss": 0.2, "step": 982900 }, { "epoch": 13.104744637453173, "grad_norm": 1.3339831829071045, "learning_rate": 2.514773512435793e-06, "loss": 0.237, "step": 983000 }, { "epoch": 13.106077775259628, "grad_norm": 0.7086319327354431, "learning_rate": 2.511295826640194e-06, "loss": 0.2545, "step": 983100 }, { "epoch": 13.107410913066083, "grad_norm": 1.8975627422332764, "learning_rate": 2.5078204421122617e-06, "loss": 0.2713, "step": 983200 }, { "epoch": 13.108744050872538, "grad_norm": 1.7255421876907349, "learning_rate": 2.504347359142947e-06, "loss": 0.2266, "step": 983300 }, { "epoch": 13.110077188678993, "grad_norm": 0.44330620765686035, "learning_rate": 2.5008765780229893e-06, "loss": 0.229, "step": 983400 }, { "epoch": 13.11141032648545, "grad_norm": 2.1011791229248047, "learning_rate": 2.4974080990429605e-06, "loss": 0.203, "step": 983500 }, { "epoch": 13.112743464291905, "grad_norm": 4.372251987457275, "learning_rate": 2.4939419224932313e-06, "loss": 0.2646, "step": 983600 }, { "epoch": 13.11407660209836, "grad_norm": 2.249999523162842, "learning_rate": 2.4904780486639765e-06, "loss": 0.2369, "step": 983700 }, { "epoch": 13.115409739904814, "grad_norm": 3.4752724170684814, "learning_rate": 2.487016477845181e-06, "loss": 0.2545, "step": 983800 }, { "epoch": 13.11674287771127, "grad_norm": 1.5243782997131348, "learning_rate": 2.483557210326646e-06, "loss": 0.2273, "step": 983900 }, { "epoch": 13.118076015517724, "grad_norm": 3.6801116466522217, "learning_rate": 2.4801002463979463e-06, "loss": 0.244, "step": 984000 }, { "epoch": 13.11940915332418, "grad_norm": 7.059962749481201, "learning_rate": 2.476645586348514e-06, "loss": 0.2078, "step": 984100 }, { "epoch": 
13.120742291130634, "grad_norm": 1.8893111944198608, "learning_rate": 2.4731932304675542e-06, "loss": 0.201, "step": 984200 }, { "epoch": 13.12207542893709, "grad_norm": 2.8072938919067383, "learning_rate": 2.469743179044083e-06, "loss": 0.2089, "step": 984300 }, { "epoch": 13.123408566743544, "grad_norm": 3.2659077644348145, "learning_rate": 2.4662954323669283e-06, "loss": 0.2727, "step": 984400 }, { "epoch": 13.124741704549999, "grad_norm": 5.177024841308594, "learning_rate": 2.462849990724727e-06, "loss": 0.2597, "step": 984500 }, { "epoch": 13.126074842356454, "grad_norm": 4.923074245452881, "learning_rate": 2.459406854405918e-06, "loss": 0.2273, "step": 984600 }, { "epoch": 13.127407980162909, "grad_norm": 1.6677277088165283, "learning_rate": 2.455966023698757e-06, "loss": 0.2178, "step": 984700 }, { "epoch": 13.128741117969364, "grad_norm": 2.3764092922210693, "learning_rate": 2.4525274988912907e-06, "loss": 0.2314, "step": 984800 }, { "epoch": 13.130074255775819, "grad_norm": 0.9175522923469543, "learning_rate": 2.449091280271385e-06, "loss": 0.2149, "step": 984900 }, { "epoch": 13.131407393582275, "grad_norm": 2.4859044551849365, "learning_rate": 2.4456573681267103e-06, "loss": 0.2575, "step": 985000 }, { "epoch": 13.13274053138873, "grad_norm": 1.408011555671692, "learning_rate": 2.44222576274474e-06, "loss": 0.2067, "step": 985100 }, { "epoch": 13.134073669195185, "grad_norm": 3.9604804515838623, "learning_rate": 2.43879646441276e-06, "loss": 0.2721, "step": 985200 }, { "epoch": 13.13540680700164, "grad_norm": 1.1790401935577393, "learning_rate": 2.4353694734178622e-06, "loss": 0.2319, "step": 985300 }, { "epoch": 13.136739944808095, "grad_norm": 5.384906768798828, "learning_rate": 2.431944790046933e-06, "loss": 0.2481, "step": 985400 }, { "epoch": 13.13807308261455, "grad_norm": 3.194575309753418, "learning_rate": 2.4285224145866892e-06, "loss": 0.2497, "step": 985500 }, { "epoch": 13.139406220421005, "grad_norm": 2.365767240524292, "learning_rate": 
2.4251023473236424e-06, "loss": 0.2607, "step": 985600 }, { "epoch": 13.14073935822746, "grad_norm": 7.928754806518555, "learning_rate": 2.4216845885440963e-06, "loss": 0.2332, "step": 985700 }, { "epoch": 13.142072496033915, "grad_norm": 5.303425312042236, "learning_rate": 2.4182691385341825e-06, "loss": 0.2045, "step": 985800 }, { "epoch": 13.14340563384037, "grad_norm": 0.9255412817001343, "learning_rate": 2.4148559975798325e-06, "loss": 0.2184, "step": 985900 }, { "epoch": 13.144738771646825, "grad_norm": 1.362631916999817, "learning_rate": 2.4114451659667803e-06, "loss": 0.2948, "step": 986000 }, { "epoch": 13.14607190945328, "grad_norm": 1.0444462299346924, "learning_rate": 2.4080707177668505e-06, "loss": 0.2264, "step": 986100 }, { "epoch": 13.147405047259735, "grad_norm": 0.4490780532360077, "learning_rate": 2.404664482592305e-06, "loss": 0.2519, "step": 986200 }, { "epoch": 13.14873818506619, "grad_norm": 0.5962821841239929, "learning_rate": 2.401260557612258e-06, "loss": 0.2313, "step": 986300 }, { "epoch": 13.150071322872645, "grad_norm": 0.540228545665741, "learning_rate": 2.3978589431116828e-06, "loss": 0.2348, "step": 986400 }, { "epoch": 13.1514044606791, "grad_norm": 5.585846424102783, "learning_rate": 2.3944596393753503e-06, "loss": 0.2467, "step": 986500 }, { "epoch": 13.152737598485555, "grad_norm": 0.7395691871643066, "learning_rate": 2.3910626466878273e-06, "loss": 0.2806, "step": 986600 }, { "epoch": 13.154070736292011, "grad_norm": 20.20760726928711, "learning_rate": 2.387667965333512e-06, "loss": 0.254, "step": 986700 }, { "epoch": 13.155403874098466, "grad_norm": 0.8865936994552612, "learning_rate": 2.384275595596601e-06, "loss": 0.246, "step": 986800 }, { "epoch": 13.156737011904921, "grad_norm": 8.258091926574707, "learning_rate": 2.380885537761077e-06, "loss": 0.2749, "step": 986900 }, { "epoch": 13.158070149711376, "grad_norm": 3.2992358207702637, "learning_rate": 2.377497792110753e-06, "loss": 0.2545, "step": 987000 }, { "epoch": 
13.159403287517831, "grad_norm": 3.1090099811553955, "learning_rate": 2.3741123589292426e-06, "loss": 0.2571, "step": 987100 }, { "epoch": 13.160736425324286, "grad_norm": 1.0370755195617676, "learning_rate": 2.370729238499961e-06, "loss": 0.2503, "step": 987200 }, { "epoch": 13.162069563130741, "grad_norm": 3.607137441635132, "learning_rate": 2.36734843110613e-06, "loss": 0.2655, "step": 987300 }, { "epoch": 13.163402700937196, "grad_norm": 1.3913695812225342, "learning_rate": 2.3639699370307886e-06, "loss": 0.2294, "step": 987400 }, { "epoch": 13.16473583874365, "grad_norm": 1.3934696912765503, "learning_rate": 2.3605937565567583e-06, "loss": 0.2544, "step": 987500 }, { "epoch": 13.166068976550106, "grad_norm": 1.865551233291626, "learning_rate": 2.357219889966702e-06, "loss": 0.2482, "step": 987600 }, { "epoch": 13.16740211435656, "grad_norm": 2.054616928100586, "learning_rate": 2.353882041611244e-06, "loss": 0.2201, "step": 987700 }, { "epoch": 13.168735252163016, "grad_norm": 2.082685708999634, "learning_rate": 2.350512780490377e-06, "loss": 0.1998, "step": 987800 }, { "epoch": 13.17006838996947, "grad_norm": 2.442512273788452, "learning_rate": 2.347145834097426e-06, "loss": 0.2133, "step": 987900 }, { "epoch": 13.171401527775926, "grad_norm": 0.21898366510868073, "learning_rate": 2.3437812027142623e-06, "loss": 0.2386, "step": 988000 }, { "epoch": 13.17273466558238, "grad_norm": 1.9082317352294922, "learning_rate": 2.3404524983218477e-06, "loss": 0.2612, "step": 988100 }, { "epoch": 13.174067803388837, "grad_norm": 0.5068109631538391, "learning_rate": 2.3370924746459533e-06, "loss": 0.1753, "step": 988200 }, { "epoch": 13.175400941195292, "grad_norm": 0.9515410661697388, "learning_rate": 2.3337347668214757e-06, "loss": 0.2069, "step": 988300 }, { "epoch": 13.176734079001747, "grad_norm": 2.5892653465270996, "learning_rate": 2.3303793751295177e-06, "loss": 0.1875, "step": 988400 }, { "epoch": 13.178067216808202, "grad_norm": 58.8388557434082, "learning_rate": 
2.327026299850974e-06, "loss": 0.2473, "step": 988500 }, { "epoch": 13.179400354614657, "grad_norm": 4.362705707550049, "learning_rate": 2.323675541266547e-06, "loss": 0.2613, "step": 988600 }, { "epoch": 13.180733492421112, "grad_norm": 2.6033926010131836, "learning_rate": 2.3203270996567718e-06, "loss": 0.221, "step": 988700 }, { "epoch": 13.182066630227567, "grad_norm": 5.422058582305908, "learning_rate": 2.3169809753019513e-06, "loss": 0.2456, "step": 988800 }, { "epoch": 13.183399768034022, "grad_norm": 1.5958895683288574, "learning_rate": 2.3136371684822242e-06, "loss": 0.1917, "step": 988900 }, { "epoch": 13.184732905840477, "grad_norm": 2.266005039215088, "learning_rate": 2.310295679477511e-06, "loss": 0.2203, "step": 989000 }, { "epoch": 13.186066043646932, "grad_norm": 5.624663352966309, "learning_rate": 2.3069565085675602e-06, "loss": 0.2449, "step": 989100 }, { "epoch": 13.187399181453387, "grad_norm": 3.8455193042755127, "learning_rate": 2.303619656031922e-06, "loss": 0.2451, "step": 989200 }, { "epoch": 13.188732319259842, "grad_norm": 1.1332341432571411, "learning_rate": 2.300285122149933e-06, "loss": 0.254, "step": 989300 }, { "epoch": 13.190065457066297, "grad_norm": 1.8392179012298584, "learning_rate": 2.2969529072007567e-06, "loss": 0.2056, "step": 989400 }, { "epoch": 13.191398594872751, "grad_norm": 6.067431449890137, "learning_rate": 2.293623011463356e-06, "loss": 0.2284, "step": 989500 }, { "epoch": 13.192731732679206, "grad_norm": 2.554819345474243, "learning_rate": 2.2902954352164986e-06, "loss": 0.2631, "step": 989600 }, { "epoch": 13.194064870485661, "grad_norm": 3.073650360107422, "learning_rate": 2.2870034198197687e-06, "loss": 0.2524, "step": 989700 }, { "epoch": 13.195398008292116, "grad_norm": 2.1299760341644287, "learning_rate": 2.2836804601876717e-06, "loss": 0.2825, "step": 989800 }, { "epoch": 13.196731146098573, "grad_norm": 17.496633529663086, "learning_rate": 2.2803598208784813e-06, "loss": 0.2375, "step": 989900 }, { "epoch": 
13.198064283905028, "grad_norm": 3.82346248626709, "learning_rate": 2.2770415021701862e-06, "loss": 0.2594, "step": 990000 }, { "epoch": 13.199397421711483, "grad_norm": 2.3872146606445312, "learning_rate": 2.273725504340587e-06, "loss": 0.2599, "step": 990100 }, { "epoch": 13.200730559517938, "grad_norm": 2.0518124103546143, "learning_rate": 2.2704118276672846e-06, "loss": 0.2317, "step": 990200 }, { "epoch": 13.202063697324393, "grad_norm": 1.4452645778656006, "learning_rate": 2.2671004724276943e-06, "loss": 0.2572, "step": 990300 }, { "epoch": 13.203396835130848, "grad_norm": 1.2954167127609253, "learning_rate": 2.2637914388990377e-06, "loss": 0.2565, "step": 990400 }, { "epoch": 13.204729972937303, "grad_norm": 4.50738000869751, "learning_rate": 2.260484727358323e-06, "loss": 0.2695, "step": 990500 }, { "epoch": 13.206063110743758, "grad_norm": 2.170823335647583, "learning_rate": 2.257180338082382e-06, "loss": 0.2096, "step": 990600 }, { "epoch": 13.207396248550213, "grad_norm": 1.616028904914856, "learning_rate": 2.2538782713478533e-06, "loss": 0.2222, "step": 990700 }, { "epoch": 13.208729386356667, "grad_norm": 1.8514529466629028, "learning_rate": 2.250578527431173e-06, "loss": 0.2059, "step": 990800 }, { "epoch": 13.210062524163122, "grad_norm": 3.065424919128418, "learning_rate": 2.247281106608583e-06, "loss": 0.2444, "step": 990900 }, { "epoch": 13.211395661969577, "grad_norm": 1.3139287233352661, "learning_rate": 2.243986009156139e-06, "loss": 0.2659, "step": 991000 }, { "epoch": 13.212728799776032, "grad_norm": 6.735456466674805, "learning_rate": 2.2406932353496845e-06, "loss": 0.2793, "step": 991100 }, { "epoch": 13.214061937582487, "grad_norm": 3.1129536628723145, "learning_rate": 2.2374027854648883e-06, "loss": 0.2583, "step": 991200 }, { "epoch": 13.215395075388942, "grad_norm": 7.355184555053711, "learning_rate": 2.2341146597772233e-06, "loss": 0.2343, "step": 991300 }, { "epoch": 13.216728213195397, "grad_norm": 3.852482318878174, "learning_rate": 
2.2308288585619498e-06, "loss": 0.2376, "step": 991400 }, { "epoch": 13.218061351001854, "grad_norm": 9.703104972839355, "learning_rate": 2.2275453820941472e-06, "loss": 0.2431, "step": 991500 }, { "epoch": 13.219394488808309, "grad_norm": 2.166896343231201, "learning_rate": 2.224264230648696e-06, "loss": 0.2468, "step": 991600 }, { "epoch": 13.220727626614764, "grad_norm": 2.1845285892486572, "learning_rate": 2.22098540450029e-06, "loss": 0.251, "step": 991700 }, { "epoch": 13.222060764421219, "grad_norm": 2.334709644317627, "learning_rate": 2.217708903923419e-06, "loss": 0.2378, "step": 991800 }, { "epoch": 13.223393902227674, "grad_norm": 3.368220806121826, "learning_rate": 2.214434729192384e-06, "loss": 0.2303, "step": 991900 }, { "epoch": 13.224727040034129, "grad_norm": 1.930102825164795, "learning_rate": 2.2111628805812824e-06, "loss": 0.2334, "step": 992000 }, { "epoch": 13.226060177840584, "grad_norm": 2.7748382091522217, "learning_rate": 2.207893358364025e-06, "loss": 0.2395, "step": 992100 }, { "epoch": 13.227393315647038, "grad_norm": 3.6990904808044434, "learning_rate": 2.2046261628143293e-06, "loss": 0.2177, "step": 992200 }, { "epoch": 13.228726453453493, "grad_norm": 1.5688179731369019, "learning_rate": 2.2013612942057106e-06, "loss": 0.2373, "step": 992300 }, { "epoch": 13.230059591259948, "grad_norm": 1.8772776126861572, "learning_rate": 2.1980987528115028e-06, "loss": 0.2188, "step": 992400 }, { "epoch": 13.231392729066403, "grad_norm": 4.930783748626709, "learning_rate": 2.1948385389048175e-06, "loss": 0.236, "step": 992500 }, { "epoch": 13.232725866872858, "grad_norm": 3.6578433513641357, "learning_rate": 2.1915806527586036e-06, "loss": 0.2303, "step": 992600 }, { "epoch": 13.234059004679313, "grad_norm": 2.0709176063537598, "learning_rate": 2.1883250946456056e-06, "loss": 0.1994, "step": 992700 }, { "epoch": 13.235392142485768, "grad_norm": 0.2774977385997772, "learning_rate": 2.1850718648383526e-06, "loss": 0.21, "step": 992800 }, { "epoch": 
13.236725280292223, "grad_norm": 1.0290982723236084, "learning_rate": 2.1818209636092034e-06, "loss": 0.2378, "step": 992900 }, { "epoch": 13.238058418098678, "grad_norm": 4.7850141525268555, "learning_rate": 2.1785723912303134e-06, "loss": 0.19, "step": 993000 }, { "epoch": 13.239391555905135, "grad_norm": 3.3726329803466797, "learning_rate": 2.175326147973645e-06, "loss": 0.2057, "step": 993100 }, { "epoch": 13.24072469371159, "grad_norm": 0.6237396597862244, "learning_rate": 2.172082234110958e-06, "loss": 0.2615, "step": 993200 }, { "epoch": 13.242057831518045, "grad_norm": 2.3383848667144775, "learning_rate": 2.168840649913828e-06, "loss": 0.2361, "step": 993300 }, { "epoch": 13.2433909693245, "grad_norm": 3.888592481613159, "learning_rate": 2.165601395653621e-06, "loss": 0.2043, "step": 993400 }, { "epoch": 13.244724107130954, "grad_norm": 1.6457338333129883, "learning_rate": 2.1623644716015313e-06, "loss": 0.2541, "step": 993500 }, { "epoch": 13.24605724493741, "grad_norm": 2.261824369430542, "learning_rate": 2.159129878028534e-06, "loss": 0.2775, "step": 993600 }, { "epoch": 13.247390382743864, "grad_norm": 5.889023303985596, "learning_rate": 2.155897615205417e-06, "loss": 0.2572, "step": 993700 }, { "epoch": 13.24872352055032, "grad_norm": 1.2785365581512451, "learning_rate": 2.1526676834027827e-06, "loss": 0.2492, "step": 993800 }, { "epoch": 13.250056658356774, "grad_norm": 3.0757341384887695, "learning_rate": 2.1494400828910253e-06, "loss": 0.2006, "step": 993900 }, { "epoch": 13.25138979616323, "grad_norm": 3.1122896671295166, "learning_rate": 2.1462148139403524e-06, "loss": 0.2534, "step": 994000 }, { "epoch": 13.252722933969684, "grad_norm": 8.591680526733398, "learning_rate": 2.142991876820781e-06, "loss": 0.2475, "step": 994100 }, { "epoch": 13.254056071776139, "grad_norm": 2.5910472869873047, "learning_rate": 2.139803466307507e-06, "loss": 0.251, "step": 994200 }, { "epoch": 13.255389209582594, "grad_norm": 7.684698104858398, "learning_rate": 
2.1365851703343233e-06, "loss": 0.2229, "step": 994300 }, { "epoch": 13.256722347389049, "grad_norm": 1.0554276704788208, "learning_rate": 2.1333692069983957e-06, "loss": 0.2224, "step": 994400 }, { "epoch": 13.258055485195504, "grad_norm": 0.20810054242610931, "learning_rate": 2.130155576568956e-06, "loss": 0.2469, "step": 994500 }, { "epoch": 13.259388623001959, "grad_norm": 5.9576497077941895, "learning_rate": 2.126944279315033e-06, "loss": 0.232, "step": 994600 }, { "epoch": 13.260721760808416, "grad_norm": 2.364408254623413, "learning_rate": 2.123735315505474e-06, "loss": 0.2616, "step": 994700 }, { "epoch": 13.26205489861487, "grad_norm": 41.66487503051758, "learning_rate": 2.1205286854089247e-06, "loss": 0.2358, "step": 994800 }, { "epoch": 13.263388036421325, "grad_norm": 1.341072916984558, "learning_rate": 2.1173243892938267e-06, "loss": 0.2052, "step": 994900 }, { "epoch": 13.26472117422778, "grad_norm": 1.5500035285949707, "learning_rate": 2.114122427428439e-06, "loss": 0.2204, "step": 995000 }, { "epoch": 13.266054312034235, "grad_norm": 0.9197058081626892, "learning_rate": 2.1109228000808134e-06, "loss": 0.2349, "step": 995100 }, { "epoch": 13.26738744984069, "grad_norm": 2.73844051361084, "learning_rate": 2.1077255075188196e-06, "loss": 0.269, "step": 995200 }, { "epoch": 13.268720587647145, "grad_norm": 1.6655995845794678, "learning_rate": 2.1045305500101297e-06, "loss": 0.2429, "step": 995300 }, { "epoch": 13.2700537254536, "grad_norm": 2.3821632862091064, "learning_rate": 2.1013379278222e-06, "loss": 0.1925, "step": 995400 }, { "epoch": 13.271386863260055, "grad_norm": 1.1907219886779785, "learning_rate": 2.0981476412223133e-06, "loss": 0.2815, "step": 995500 }, { "epoch": 13.27272000106651, "grad_norm": 4.6126790046691895, "learning_rate": 2.094959690477556e-06, "loss": 0.2317, "step": 995600 }, { "epoch": 13.274053138872965, "grad_norm": 1.6449415683746338, "learning_rate": 2.0917740758548076e-06, "loss": 0.2386, "step": 995700 }, { "epoch": 
13.27538627667942, "grad_norm": 0.7122129797935486, "learning_rate": 2.0885907976207584e-06, "loss": 0.2375, "step": 995800 }, { "epoch": 13.276719414485875, "grad_norm": 2.5788447856903076, "learning_rate": 2.085409856041902e-06, "loss": 0.2826, "step": 995900 }, { "epoch": 13.27805255229233, "grad_norm": 5.994018077850342, "learning_rate": 2.0822312513845387e-06, "loss": 0.264, "step": 996000 }, { "epoch": 13.279385690098785, "grad_norm": 0.9393078088760376, "learning_rate": 2.0790549839147723e-06, "loss": 0.2931, "step": 996100 }, { "epoch": 13.28071882790524, "grad_norm": 0.7129513025283813, "learning_rate": 2.0758810538985127e-06, "loss": 0.2429, "step": 996200 }, { "epoch": 13.282051965711696, "grad_norm": 2.564594030380249, "learning_rate": 2.072709461601462e-06, "loss": 0.2492, "step": 996300 }, { "epoch": 13.283385103518151, "grad_norm": 1.6812554597854614, "learning_rate": 2.06954020728914e-06, "loss": 0.2545, "step": 996400 }, { "epoch": 13.284718241324606, "grad_norm": 1.8822720050811768, "learning_rate": 2.0663732912268685e-06, "loss": 0.2426, "step": 996500 }, { "epoch": 13.286051379131061, "grad_norm": 1.201397180557251, "learning_rate": 2.0632087136797682e-06, "loss": 0.246, "step": 996600 }, { "epoch": 13.287384516937516, "grad_norm": 5.3328752517700195, "learning_rate": 2.060046474912778e-06, "loss": 0.2423, "step": 996700 }, { "epoch": 13.288717654743971, "grad_norm": 1.7554446458816528, "learning_rate": 2.0568865751906153e-06, "loss": 0.236, "step": 996800 }, { "epoch": 13.290050792550426, "grad_norm": 3.1043825149536133, "learning_rate": 2.0537290147778222e-06, "loss": 0.2481, "step": 996900 }, { "epoch": 13.291383930356881, "grad_norm": 1.1213667392730713, "learning_rate": 2.050573793938747e-06, "loss": 0.2768, "step": 997000 }, { "epoch": 13.292717068163336, "grad_norm": 1.7872288227081299, "learning_rate": 2.047420912937529e-06, "loss": 0.2361, "step": 997100 }, { "epoch": 13.294050205969791, "grad_norm": 1.318896770477295, "learning_rate": 
2.0442703720381162e-06, "loss": 0.2399, "step": 997200 }, { "epoch": 13.295383343776246, "grad_norm": 0.04535510018467903, "learning_rate": 2.041122171504265e-06, "loss": 0.2479, "step": 997300 }, { "epoch": 13.2967164815827, "grad_norm": 1.894174337387085, "learning_rate": 2.0379763115995307e-06, "loss": 0.251, "step": 997400 }, { "epoch": 13.298049619389156, "grad_norm": 4.947009086608887, "learning_rate": 2.0348327925872733e-06, "loss": 0.2756, "step": 997500 }, { "epoch": 13.29938275719561, "grad_norm": 0.012298863381147385, "learning_rate": 2.0316916147306687e-06, "loss": 0.2447, "step": 997600 }, { "epoch": 13.300715895002066, "grad_norm": 1.967058777809143, "learning_rate": 2.0285527782926727e-06, "loss": 0.2232, "step": 997700 }, { "epoch": 13.30204903280852, "grad_norm": 1.6988046169281006, "learning_rate": 2.025416283536059e-06, "loss": 0.2203, "step": 997800 }, { "epoch": 13.303382170614977, "grad_norm": 5.130369186401367, "learning_rate": 2.022282130723424e-06, "loss": 0.2564, "step": 997900 }, { "epoch": 13.304715308421432, "grad_norm": 5.976224422454834, "learning_rate": 2.0191503201171246e-06, "loss": 0.2428, "step": 998000 }, { "epoch": 13.306048446227887, "grad_norm": 0.9794806838035583, "learning_rate": 2.0160208519793676e-06, "loss": 0.2589, "step": 998100 }, { "epoch": 13.307381584034342, "grad_norm": 1.4271641969680786, "learning_rate": 2.012893726572117e-06, "loss": 0.2035, "step": 998200 }, { "epoch": 13.308714721840797, "grad_norm": 2.594515085220337, "learning_rate": 2.0098001803826726e-06, "loss": 0.2484, "step": 998300 }, { "epoch": 13.310047859647252, "grad_norm": 10.906722068786621, "learning_rate": 2.0066777177878217e-06, "loss": 0.237, "step": 998400 }, { "epoch": 13.311380997453707, "grad_norm": 87.84608459472656, "learning_rate": 2.0035575987056677e-06, "loss": 0.2175, "step": 998500 }, { "epoch": 13.312714135260162, "grad_norm": 1.099711298942566, "learning_rate": 2.000439823397422e-06, "loss": 0.2027, "step": 998600 }, { "epoch": 
13.314047273066617, "grad_norm": 1.4639660120010376, "learning_rate": 1.997324392124089e-06, "loss": 0.2384, "step": 998700 }, { "epoch": 13.315380410873072, "grad_norm": 5.278416633605957, "learning_rate": 1.994211305146494e-06, "loss": 0.2568, "step": 998800 }, { "epoch": 13.316713548679527, "grad_norm": 2.002200126647949, "learning_rate": 1.991100562725241e-06, "loss": 0.1987, "step": 998900 }, { "epoch": 13.318046686485982, "grad_norm": 1.1180830001831055, "learning_rate": 1.9879921651207523e-06, "loss": 0.2163, "step": 999000 }, { "epoch": 13.319379824292437, "grad_norm": 1.6272788047790527, "learning_rate": 1.984886112593266e-06, "loss": 0.238, "step": 999100 }, { "epoch": 13.320712962098892, "grad_norm": 4.01228141784668, "learning_rate": 1.981782405402798e-06, "loss": 0.2272, "step": 999200 }, { "epoch": 13.322046099905346, "grad_norm": 3.346054792404175, "learning_rate": 1.9786810438091894e-06, "loss": 0.2319, "step": 999300 }, { "epoch": 13.323379237711801, "grad_norm": 8.027875900268555, "learning_rate": 1.9755820280720704e-06, "loss": 0.2379, "step": 999400 }, { "epoch": 13.324712375518258, "grad_norm": 0.7599044442176819, "learning_rate": 1.972485358450883e-06, "loss": 0.233, "step": 999500 }, { "epoch": 13.326045513324713, "grad_norm": 3.3822529315948486, "learning_rate": 1.9693910352048794e-06, "loss": 0.2178, "step": 999600 }, { "epoch": 13.327378651131168, "grad_norm": 3.2835707664489746, "learning_rate": 1.9662990585930894e-06, "loss": 0.1985, "step": 999700 }, { "epoch": 13.328711788937623, "grad_norm": 1.3854175806045532, "learning_rate": 1.963209428874373e-06, "loss": 0.2014, "step": 999800 }, { "epoch": 13.330044926744078, "grad_norm": 3.287252902984619, "learning_rate": 1.960122146307386e-06, "loss": 0.2591, "step": 999900 }, { "epoch": 13.331378064550533, "grad_norm": 4.800164222717285, "learning_rate": 1.9570372111505818e-06, "loss": 0.2347, "step": 1000000 }, { "epoch": 13.332711202356988, "grad_norm": 6.500070095062256, "learning_rate": 
1.953954623662224e-06, "loss": 0.2154, "step": 1000100 }, { "epoch": 13.334044340163443, "grad_norm": 0.5439701676368713, "learning_rate": 1.9508743841003794e-06, "loss": 0.2347, "step": 1000200 }, { "epoch": 13.335377477969898, "grad_norm": 2.4459052085876465, "learning_rate": 1.947796492722912e-06, "loss": 0.2299, "step": 1000300 }, { "epoch": 13.336710615776353, "grad_norm": 2.5971617698669434, "learning_rate": 1.944720949787485e-06, "loss": 0.2532, "step": 1000400 }, { "epoch": 13.338043753582808, "grad_norm": 1.5353102684020996, "learning_rate": 1.9416477555515932e-06, "loss": 0.2342, "step": 1000500 }, { "epoch": 13.339376891389263, "grad_norm": 3.6087124347686768, "learning_rate": 1.9385769102725005e-06, "loss": 0.2547, "step": 1000600 }, { "epoch": 13.340710029195717, "grad_norm": 0.899368405342102, "learning_rate": 1.935508414207291e-06, "loss": 0.2378, "step": 1000700 }, { "epoch": 13.342043167002172, "grad_norm": 1.830380916595459, "learning_rate": 1.9324422676128495e-06, "loss": 0.2275, "step": 1000800 }, { "epoch": 13.343376304808627, "grad_norm": 7.580560684204102, "learning_rate": 1.929378470745864e-06, "loss": 0.2692, "step": 1000900 }, { "epoch": 13.344709442615082, "grad_norm": 5.285236835479736, "learning_rate": 1.926317023862829e-06, "loss": 0.2314, "step": 1001000 }, { "epoch": 13.346042580421539, "grad_norm": 0.8989763855934143, "learning_rate": 1.92325792722004e-06, "loss": 0.2366, "step": 1001100 }, { "epoch": 13.347375718227994, "grad_norm": 5.324594020843506, "learning_rate": 1.9202011810735854e-06, "loss": 0.2396, "step": 1001200 }, { "epoch": 13.348708856034449, "grad_norm": 4.387541770935059, "learning_rate": 1.9171467856793768e-06, "loss": 0.2482, "step": 1001300 }, { "epoch": 13.350041993840904, "grad_norm": 0.9239872694015503, "learning_rate": 1.9140947412931097e-06, "loss": 0.1961, "step": 1001400 }, { "epoch": 13.351375131647359, "grad_norm": 3.1589109897613525, "learning_rate": 1.9110450481703e-06, "loss": 0.2362, "step": 1001500 
}, { "epoch": 13.352708269453814, "grad_norm": 0.9545846581459045, "learning_rate": 1.9079977065662603e-06, "loss": 0.2539, "step": 1001600 }, { "epoch": 13.354041407260269, "grad_norm": 1.9261771440505981, "learning_rate": 1.9049527167360858e-06, "loss": 0.2233, "step": 1001700 }, { "epoch": 13.355374545066724, "grad_norm": 1.5301603078842163, "learning_rate": 1.9019100789347132e-06, "loss": 0.2652, "step": 1001800 }, { "epoch": 13.356707682873179, "grad_norm": 0.790209174156189, "learning_rate": 1.8989001846274e-06, "loss": 0.2143, "step": 1001900 }, { "epoch": 13.358040820679633, "grad_norm": 0.7608899474143982, "learning_rate": 1.8958622281209393e-06, "loss": 0.2477, "step": 1002000 }, { "epoch": 13.359373958486088, "grad_norm": 1.8537259101867676, "learning_rate": 1.8928266244043046e-06, "loss": 0.2252, "step": 1002100 }, { "epoch": 13.360707096292543, "grad_norm": 2.8588929176330566, "learning_rate": 1.8897933737316231e-06, "loss": 0.2486, "step": 1002200 }, { "epoch": 13.362040234098998, "grad_norm": 2.7853643894195557, "learning_rate": 1.8867624763568314e-06, "loss": 0.2401, "step": 1002300 }, { "epoch": 13.363373371905453, "grad_norm": 1.1804156303405762, "learning_rate": 1.8837339325336633e-06, "loss": 0.2201, "step": 1002400 }, { "epoch": 13.364706509711908, "grad_norm": 7.8570451736450195, "learning_rate": 1.8807077425156594e-06, "loss": 0.2209, "step": 1002500 }, { "epoch": 13.366039647518363, "grad_norm": 3.739088296890259, "learning_rate": 1.8776839065561634e-06, "loss": 0.2472, "step": 1002600 }, { "epoch": 13.36737278532482, "grad_norm": 1.9458091259002686, "learning_rate": 1.87466242490832e-06, "loss": 0.2356, "step": 1002700 }, { "epoch": 13.368705923131275, "grad_norm": 0.8443073034286499, "learning_rate": 1.8716432978250864e-06, "loss": 0.2657, "step": 1002800 }, { "epoch": 13.37003906093773, "grad_norm": 1.438545823097229, "learning_rate": 1.8686265255591971e-06, "loss": 0.2513, "step": 1002900 }, { "epoch": 13.371372198744185, "grad_norm": 
2.4785239696502686, "learning_rate": 1.8656121083632205e-06, "loss": 0.252, "step": 1003000 }, { "epoch": 13.37270533655064, "grad_norm": 3.4199953079223633, "learning_rate": 1.8626000464895176e-06, "loss": 0.2295, "step": 1003100 }, { "epoch": 13.374038474357095, "grad_norm": 1.7976092100143433, "learning_rate": 1.859590340190237e-06, "loss": 0.2672, "step": 1003200 }, { "epoch": 13.37537161216355, "grad_norm": 6.328517913818359, "learning_rate": 1.8565829897173469e-06, "loss": 0.239, "step": 1003300 }, { "epoch": 13.376704749970004, "grad_norm": 0.614324152469635, "learning_rate": 1.8535779953226163e-06, "loss": 0.2336, "step": 1003400 }, { "epoch": 13.37803788777646, "grad_norm": 5.269018650054932, "learning_rate": 1.8505753572576066e-06, "loss": 0.2479, "step": 1003500 }, { "epoch": 13.379371025582914, "grad_norm": 4.311187267303467, "learning_rate": 1.8475750757736943e-06, "loss": 0.2349, "step": 1003600 }, { "epoch": 13.38070416338937, "grad_norm": 4.325336933135986, "learning_rate": 1.8445771511220578e-06, "loss": 0.2184, "step": 1003700 }, { "epoch": 13.382037301195824, "grad_norm": 0.8371071219444275, "learning_rate": 1.8415815835536632e-06, "loss": 0.2672, "step": 1003800 }, { "epoch": 13.38337043900228, "grad_norm": 4.210418701171875, "learning_rate": 1.8385883733192932e-06, "loss": 0.2404, "step": 1003900 }, { "epoch": 13.384703576808734, "grad_norm": 1.8549855947494507, "learning_rate": 1.8355975206695374e-06, "loss": 0.2191, "step": 1004000 }, { "epoch": 13.386036714615189, "grad_norm": 0.7883515954017639, "learning_rate": 1.8326090258547723e-06, "loss": 0.2317, "step": 1004100 }, { "epoch": 13.387369852421644, "grad_norm": 2.8919899463653564, "learning_rate": 1.8296228891251942e-06, "loss": 0.2193, "step": 1004200 }, { "epoch": 13.3887029902281, "grad_norm": 0.5774446129798889, "learning_rate": 1.8266391107307766e-06, "loss": 0.2041, "step": 1004300 }, { "epoch": 13.390036128034556, "grad_norm": 12.554656982421875, "learning_rate": 
1.8236576909213299e-06, "loss": 0.2278, "step": 1004400 }, { "epoch": 13.39136926584101, "grad_norm": 3.6649811267852783, "learning_rate": 1.820678629946444e-06, "loss": 0.2232, "step": 1004500 }, { "epoch": 13.392702403647466, "grad_norm": 1.9041813611984253, "learning_rate": 1.8177019280555097e-06, "loss": 0.1935, "step": 1004600 }, { "epoch": 13.39403554145392, "grad_norm": 0.2038881629705429, "learning_rate": 1.8147275854977307e-06, "loss": 0.2538, "step": 1004700 }, { "epoch": 13.395368679260375, "grad_norm": 3.0678465366363525, "learning_rate": 1.8117556025221116e-06, "loss": 0.2291, "step": 1004800 }, { "epoch": 13.39670181706683, "grad_norm": 1.1052043437957764, "learning_rate": 1.8087859793774563e-06, "loss": 0.2336, "step": 1004900 }, { "epoch": 13.398034954873285, "grad_norm": 1.1058032512664795, "learning_rate": 1.8058483772598178e-06, "loss": 0.1978, "step": 1005000 }, { "epoch": 13.39936809267974, "grad_norm": 1.7634062767028809, "learning_rate": 1.8028834509182024e-06, "loss": 0.2605, "step": 1005100 }, { "epoch": 13.400701230486195, "grad_norm": 2.0324172973632812, "learning_rate": 1.7999208851503102e-06, "loss": 0.2378, "step": 1005200 }, { "epoch": 13.40203436829265, "grad_norm": 2.0849997997283936, "learning_rate": 1.7969606802041394e-06, "loss": 0.281, "step": 1005300 }, { "epoch": 13.403367506099105, "grad_norm": 0.8214668035507202, "learning_rate": 1.7940028363275184e-06, "loss": 0.1941, "step": 1005400 }, { "epoch": 13.40470064390556, "grad_norm": 2.6234562397003174, "learning_rate": 1.791047353768066e-06, "loss": 0.237, "step": 1005500 }, { "epoch": 13.406033781712015, "grad_norm": 4.378609657287598, "learning_rate": 1.7880942327732076e-06, "loss": 0.24, "step": 1005600 }, { "epoch": 13.40736691951847, "grad_norm": 3.680432081222534, "learning_rate": 1.785143473590175e-06, "loss": 0.2158, "step": 1005700 }, { "epoch": 13.408700057324925, "grad_norm": 0.6212829947471619, "learning_rate": 1.7821950764659812e-06, "loss": 0.2918, "step": 1005800 
}, { "epoch": 13.410033195131382, "grad_norm": 2.7185847759246826, "learning_rate": 1.7792490416474683e-06, "loss": 0.1945, "step": 1005900 }, { "epoch": 13.411366332937837, "grad_norm": 4.130110263824463, "learning_rate": 1.7763053693812658e-06, "loss": 0.2188, "step": 1006000 }, { "epoch": 13.412699470744291, "grad_norm": 0.7660539746284485, "learning_rate": 1.77336405991381e-06, "loss": 0.2695, "step": 1006100 }, { "epoch": 13.414032608550746, "grad_norm": 1.4541934728622437, "learning_rate": 1.7704251134913341e-06, "loss": 0.2486, "step": 1006200 }, { "epoch": 13.415365746357201, "grad_norm": 1.6634230613708496, "learning_rate": 1.7674885303598875e-06, "loss": 0.2213, "step": 1006300 }, { "epoch": 13.416698884163656, "grad_norm": 2.7851316928863525, "learning_rate": 1.764554310765294e-06, "loss": 0.2253, "step": 1006400 }, { "epoch": 13.418032021970111, "grad_norm": 1.0742967128753662, "learning_rate": 1.7616224549532133e-06, "loss": 0.2532, "step": 1006500 }, { "epoch": 13.419365159776566, "grad_norm": 1.9832504987716675, "learning_rate": 1.7586929631690896e-06, "loss": 0.2531, "step": 1006600 }, { "epoch": 13.420698297583021, "grad_norm": 1.8850820064544678, "learning_rate": 1.755765835658163e-06, "loss": 0.2018, "step": 1006700 }, { "epoch": 13.422031435389476, "grad_norm": 16.38494300842285, "learning_rate": 1.7528410726654876e-06, "loss": 0.2201, "step": 1006800 }, { "epoch": 13.423364573195931, "grad_norm": 1.217800498008728, "learning_rate": 1.7499186744359142e-06, "loss": 0.1824, "step": 1006900 }, { "epoch": 13.424697711002386, "grad_norm": 2.924734354019165, "learning_rate": 1.7469986412140971e-06, "loss": 0.28, "step": 1007000 }, { "epoch": 13.426030848808841, "grad_norm": 2.7734363079071045, "learning_rate": 1.744110138215389e-06, "loss": 0.2079, "step": 1007100 }, { "epoch": 13.427363986615296, "grad_norm": 1.1117273569107056, "learning_rate": 1.7411948120860844e-06, "loss": 0.2639, "step": 1007200 }, { "epoch": 13.42869712442175, "grad_norm": 
1.8629313707351685, "learning_rate": 1.7382818516948684e-06, "loss": 0.2391, "step": 1007300 }, { "epoch": 13.430030262228206, "grad_norm": 0.16442987322807312, "learning_rate": 1.7353712572856061e-06, "loss": 0.2282, "step": 1007400 }, { "epoch": 13.431363400034662, "grad_norm": 3.464031219482422, "learning_rate": 1.7324630291019694e-06, "loss": 0.2562, "step": 1007500 }, { "epoch": 13.432696537841117, "grad_norm": 4.830044746398926, "learning_rate": 1.7295571673874133e-06, "loss": 0.2409, "step": 1007600 }, { "epoch": 13.434029675647572, "grad_norm": 0.4861234128475189, "learning_rate": 1.7266536723852166e-06, "loss": 0.2583, "step": 1007700 }, { "epoch": 13.435362813454027, "grad_norm": 1.3872150182724, "learning_rate": 1.723752544338455e-06, "loss": 0.2507, "step": 1007800 }, { "epoch": 13.436695951260482, "grad_norm": 2.6880526542663574, "learning_rate": 1.720853783489994e-06, "loss": 0.2318, "step": 1007900 }, { "epoch": 13.438029089066937, "grad_norm": 2.6838223934173584, "learning_rate": 1.7179573900825062e-06, "loss": 0.232, "step": 1008000 }, { "epoch": 13.439362226873392, "grad_norm": 1.1057159900665283, "learning_rate": 1.7150633643584779e-06, "loss": 0.2266, "step": 1008100 }, { "epoch": 13.440695364679847, "grad_norm": 1.6614058017730713, "learning_rate": 1.7121717065601782e-06, "loss": 0.2406, "step": 1008200 }, { "epoch": 13.442028502486302, "grad_norm": 4.284942626953125, "learning_rate": 1.7092824169296973e-06, "loss": 0.2312, "step": 1008300 }, { "epoch": 13.443361640292757, "grad_norm": 5.880484580993652, "learning_rate": 1.7063954957089144e-06, "loss": 0.2567, "step": 1008400 }, { "epoch": 13.444694778099212, "grad_norm": 1.665380835533142, "learning_rate": 1.7035109431395068e-06, "loss": 0.2446, "step": 1008500 }, { "epoch": 13.446027915905667, "grad_norm": 5.280551433563232, "learning_rate": 1.7006287594629644e-06, "loss": 0.2232, "step": 1008600 }, { "epoch": 13.447361053712122, "grad_norm": 2.0433716773986816, "learning_rate": 
1.697748944920574e-06, "loss": 0.2611, "step": 1008700 }, { "epoch": 13.448694191518577, "grad_norm": 2.0265660285949707, "learning_rate": 1.6948714997534264e-06, "loss": 0.2341, "step": 1008800 }, { "epoch": 13.450027329325032, "grad_norm": 1.6255310773849487, "learning_rate": 1.6919964242024189e-06, "loss": 0.2413, "step": 1008900 }, { "epoch": 13.451360467131487, "grad_norm": 4.54021692276001, "learning_rate": 1.689123718508222e-06, "loss": 0.227, "step": 1009000 }, { "epoch": 13.452693604937942, "grad_norm": 4.409527778625488, "learning_rate": 1.6862533829113536e-06, "loss": 0.2408, "step": 1009100 }, { "epoch": 13.454026742744398, "grad_norm": 2.775481700897217, "learning_rate": 1.6833854176521012e-06, "loss": 0.2323, "step": 1009200 }, { "epoch": 13.455359880550853, "grad_norm": 1.2475905418395996, "learning_rate": 1.6805198229705532e-06, "loss": 0.301, "step": 1009300 }, { "epoch": 13.456693018357308, "grad_norm": 2.9411191940307617, "learning_rate": 1.6776565991066174e-06, "loss": 0.2675, "step": 1009400 }, { "epoch": 13.458026156163763, "grad_norm": 2.748649835586548, "learning_rate": 1.6747957462999918e-06, "loss": 0.2395, "step": 1009500 }, { "epoch": 13.459359293970218, "grad_norm": 0.6863798499107361, "learning_rate": 1.6719372647901787e-06, "loss": 0.2534, "step": 1009600 }, { "epoch": 13.460692431776673, "grad_norm": 2.2179465293884277, "learning_rate": 1.6690811548164763e-06, "loss": 0.2311, "step": 1009700 }, { "epoch": 13.462025569583128, "grad_norm": 0.03598444163799286, "learning_rate": 1.6662274166180003e-06, "loss": 0.2264, "step": 1009800 }, { "epoch": 13.463358707389583, "grad_norm": 3.4713759422302246, "learning_rate": 1.6633760504336393e-06, "loss": 0.2267, "step": 1009900 }, { "epoch": 13.464691845196038, "grad_norm": 2.630997896194458, "learning_rate": 1.6605270565021158e-06, "loss": 0.2018, "step": 1010000 }, { "epoch": 13.466024983002493, "grad_norm": 6.974677085876465, "learning_rate": 1.6576804350619356e-06, "loss": 0.24, "step": 
1010100 }, { "epoch": 13.467358120808948, "grad_norm": 1.1127938032150269, "learning_rate": 1.654836186351405e-06, "loss": 0.3177, "step": 1010200 }, { "epoch": 13.468691258615403, "grad_norm": 2.9912655353546143, "learning_rate": 1.6519943106086365e-06, "loss": 0.2401, "step": 1010300 }, { "epoch": 13.470024396421858, "grad_norm": 14.222843170166016, "learning_rate": 1.6491548080715436e-06, "loss": 0.2614, "step": 1010400 }, { "epoch": 13.471357534228313, "grad_norm": 3.4551589488983154, "learning_rate": 1.646317678977839e-06, "loss": 0.2334, "step": 1010500 }, { "epoch": 13.472690672034767, "grad_norm": 5.066672325134277, "learning_rate": 1.6434829235650462e-06, "loss": 0.2675, "step": 1010600 }, { "epoch": 13.474023809841224, "grad_norm": 3.008920669555664, "learning_rate": 1.6406788541337393e-06, "loss": 0.1975, "step": 1010700 }, { "epoch": 13.475356947647679, "grad_norm": 1.040297269821167, "learning_rate": 1.6378488230517795e-06, "loss": 0.2576, "step": 1010800 }, { "epoch": 13.476690085454134, "grad_norm": 3.969312906265259, "learning_rate": 1.6350211663597081e-06, "loss": 0.2849, "step": 1010900 }, { "epoch": 13.478023223260589, "grad_norm": 3.103792190551758, "learning_rate": 1.6321958842942553e-06, "loss": 0.2272, "step": 1011000 }, { "epoch": 13.479356361067044, "grad_norm": 1.607137680053711, "learning_rate": 1.6293729770919318e-06, "loss": 0.2494, "step": 1011100 }, { "epoch": 13.480689498873499, "grad_norm": 3.5486419200897217, "learning_rate": 1.6265524449890756e-06, "loss": 0.2406, "step": 1011200 }, { "epoch": 13.482022636679954, "grad_norm": 0.7889184355735779, "learning_rate": 1.623734288221811e-06, "loss": 0.2188, "step": 1011300 }, { "epoch": 13.483355774486409, "grad_norm": 9.9981107711792, "learning_rate": 1.620918507026059e-06, "loss": 0.2508, "step": 1011400 }, { "epoch": 13.484688912292864, "grad_norm": 3.344499111175537, "learning_rate": 1.6181051016375481e-06, "loss": 0.2359, "step": 1011500 }, { "epoch": 13.486022050099319, 
"grad_norm": 4.091126441955566, "learning_rate": 1.6152940722918131e-06, "loss": 0.2736, "step": 1011600 }, { "epoch": 13.487355187905774, "grad_norm": 0.8635809421539307, "learning_rate": 1.6124854192241823e-06, "loss": 0.2181, "step": 1011700 }, { "epoch": 13.488688325712229, "grad_norm": 5.281364917755127, "learning_rate": 1.609679142669791e-06, "loss": 0.2415, "step": 1011800 }, { "epoch": 13.490021463518683, "grad_norm": 2.1436667442321777, "learning_rate": 1.6068752428635647e-06, "loss": 0.2483, "step": 1011900 }, { "epoch": 13.491354601325138, "grad_norm": 4.863543510437012, "learning_rate": 1.604073720040239e-06, "loss": 0.2206, "step": 1012000 }, { "epoch": 13.492687739131593, "grad_norm": 5.987667083740234, "learning_rate": 1.6012745744343493e-06, "loss": 0.2448, "step": 1012100 }, { "epoch": 13.494020876938048, "grad_norm": 10.220489501953125, "learning_rate": 1.598477806280232e-06, "loss": 0.302, "step": 1012200 }, { "epoch": 13.495354014744503, "grad_norm": 0.4943493902683258, "learning_rate": 1.5956834158120226e-06, "loss": 0.2149, "step": 1012300 }, { "epoch": 13.49668715255096, "grad_norm": 2.8406100273132324, "learning_rate": 1.5928914032636643e-06, "loss": 0.2601, "step": 1012400 }, { "epoch": 13.498020290357415, "grad_norm": 0.05239993333816528, "learning_rate": 1.5901017688688835e-06, "loss": 0.2435, "step": 1012500 }, { "epoch": 13.49935342816387, "grad_norm": 4.603099346160889, "learning_rate": 1.5873145128612298e-06, "loss": 0.2467, "step": 1012600 }, { "epoch": 13.500686565970325, "grad_norm": 1.281956672668457, "learning_rate": 1.584529635474047e-06, "loss": 0.2321, "step": 1012700 }, { "epoch": 13.50201970377678, "grad_norm": 4.109086513519287, "learning_rate": 1.5817471369404612e-06, "loss": 0.2424, "step": 1012800 }, { "epoch": 13.503352841583235, "grad_norm": 4.7668986320495605, "learning_rate": 1.578994806910654e-06, "loss": 0.2244, "step": 1012900 }, { "epoch": 13.50468597938969, "grad_norm": 1.7165402173995972, "learning_rate": 
1.5762170429885682e-06, "loss": 0.2494, "step": 1013000 }, { "epoch": 13.506019117196145, "grad_norm": 2.348994731903076, "learning_rate": 1.5734416586159894e-06, "loss": 0.2505, "step": 1013100 }, { "epoch": 13.5073522550026, "grad_norm": 2.3115532398223877, "learning_rate": 1.5706686540252658e-06, "loss": 0.2529, "step": 1013200 }, { "epoch": 13.508685392809054, "grad_norm": 1.167907476425171, "learning_rate": 1.5678980294485446e-06, "loss": 0.2028, "step": 1013300 }, { "epoch": 13.51001853061551, "grad_norm": 4.032381534576416, "learning_rate": 1.5651297851177738e-06, "loss": 0.1805, "step": 1013400 }, { "epoch": 13.511351668421964, "grad_norm": 2.085862636566162, "learning_rate": 1.5623639212647045e-06, "loss": 0.2637, "step": 1013500 }, { "epoch": 13.51268480622842, "grad_norm": 0.8097604513168335, "learning_rate": 1.5596004381208917e-06, "loss": 0.2534, "step": 1013600 }, { "epoch": 13.514017944034874, "grad_norm": 12.55018424987793, "learning_rate": 1.556839335917667e-06, "loss": 0.2561, "step": 1013700 }, { "epoch": 13.51535108184133, "grad_norm": 0.6078125238418579, "learning_rate": 1.5540806148861952e-06, "loss": 0.2357, "step": 1013800 }, { "epoch": 13.516684219647786, "grad_norm": 3.3405168056488037, "learning_rate": 1.5513242752574319e-06, "loss": 0.2535, "step": 1013900 }, { "epoch": 13.51801735745424, "grad_norm": 0.49315568804740906, "learning_rate": 1.5485703172621157e-06, "loss": 0.2279, "step": 1014000 }, { "epoch": 13.519350495260696, "grad_norm": 4.163716793060303, "learning_rate": 1.545818741130809e-06, "loss": 0.2266, "step": 1014100 }, { "epoch": 13.52068363306715, "grad_norm": 0.0033327024430036545, "learning_rate": 1.5430695470938604e-06, "loss": 0.2842, "step": 1014200 }, { "epoch": 13.522016770873606, "grad_norm": 2.0866501331329346, "learning_rate": 1.5403227353814264e-06, "loss": 0.2514, "step": 1014300 }, { "epoch": 13.52334990868006, "grad_norm": 1.8098331689834595, "learning_rate": 1.5375783062234593e-06, "loss": 0.2426, "step": 
1014400 }, { "epoch": 13.524683046486516, "grad_norm": 12.320226669311523, "learning_rate": 1.534836259849719e-06, "loss": 0.2398, "step": 1014500 }, { "epoch": 13.52601618429297, "grad_norm": 1.275500774383545, "learning_rate": 1.5320965964897482e-06, "loss": 0.2344, "step": 1014600 }, { "epoch": 13.527349322099425, "grad_norm": 2.129239797592163, "learning_rate": 1.5293593163729202e-06, "loss": 0.2318, "step": 1014700 }, { "epoch": 13.52868245990588, "grad_norm": 2.831033229827881, "learning_rate": 1.5266244197283785e-06, "loss": 0.2204, "step": 1014800 }, { "epoch": 13.530015597712335, "grad_norm": 1.493262767791748, "learning_rate": 1.5238919067850865e-06, "loss": 0.2358, "step": 1014900 }, { "epoch": 13.53134873551879, "grad_norm": 2.149043560028076, "learning_rate": 1.5211617777717979e-06, "loss": 0.2062, "step": 1015000 }, { "epoch": 13.532681873325245, "grad_norm": 0.9247037768363953, "learning_rate": 1.5184340329170732e-06, "loss": 0.2273, "step": 1015100 }, { "epoch": 13.5340150111317, "grad_norm": 0.13423053920269012, "learning_rate": 1.5157086724492697e-06, "loss": 0.2625, "step": 1015200 }, { "epoch": 13.535348148938155, "grad_norm": 0.983063817024231, "learning_rate": 1.5129856965965517e-06, "loss": 0.1892, "step": 1015300 }, { "epoch": 13.53668128674461, "grad_norm": 2.1841163635253906, "learning_rate": 1.5102651055868667e-06, "loss": 0.226, "step": 1015400 }, { "epoch": 13.538014424551065, "grad_norm": 1.9584037065505981, "learning_rate": 1.507546899647979e-06, "loss": 0.2237, "step": 1015500 }, { "epoch": 13.539347562357522, "grad_norm": 1.3948334455490112, "learning_rate": 1.504831079007447e-06, "loss": 0.2498, "step": 1015600 }, { "epoch": 13.540680700163977, "grad_norm": 3.2156498432159424, "learning_rate": 1.5021176438926354e-06, "loss": 0.2753, "step": 1015700 }, { "epoch": 13.542013837970432, "grad_norm": 2.3233084678649902, "learning_rate": 1.4994065945307023e-06, "loss": 0.2355, "step": 1015800 }, { "epoch": 13.543346975776887, "grad_norm": 
1.2755969762802124, "learning_rate": 1.4967250059710803e-06, "loss": 0.2165, "step": 1015900 }, { "epoch": 13.544680113583341, "grad_norm": 2.8963449001312256, "learning_rate": 1.4940187049323983e-06, "loss": 0.2416, "step": 1016000 }, { "epoch": 13.546013251389796, "grad_norm": 3.4061532020568848, "learning_rate": 1.4913147903246094e-06, "loss": 0.2554, "step": 1016100 }, { "epoch": 13.547346389196251, "grad_norm": 12.57809066772461, "learning_rate": 1.4886132623740788e-06, "loss": 0.2262, "step": 1016200 }, { "epoch": 13.548679527002706, "grad_norm": 3.4070842266082764, "learning_rate": 1.4859141213069693e-06, "loss": 0.2281, "step": 1016300 }, { "epoch": 13.550012664809161, "grad_norm": 1.6993954181671143, "learning_rate": 1.4832173673492466e-06, "loss": 0.2271, "step": 1016400 }, { "epoch": 13.551345802615616, "grad_norm": 1.8138377666473389, "learning_rate": 1.4805230007266768e-06, "loss": 0.2841, "step": 1016500 }, { "epoch": 13.552678940422071, "grad_norm": 1.4859635829925537, "learning_rate": 1.477831021664816e-06, "loss": 0.2447, "step": 1016600 }, { "epoch": 13.554012078228526, "grad_norm": 1.524917721748352, "learning_rate": 1.475141430389031e-06, "loss": 0.1965, "step": 1016700 }, { "epoch": 13.555345216034981, "grad_norm": 0.22031576931476593, "learning_rate": 1.4724542271244911e-06, "loss": 0.2906, "step": 1016800 }, { "epoch": 13.556678353841436, "grad_norm": 1.4786688089370728, "learning_rate": 1.4697694120961536e-06, "loss": 0.2292, "step": 1016900 }, { "epoch": 13.55801149164789, "grad_norm": 2.6946494579315186, "learning_rate": 1.4670869855287849e-06, "loss": 0.2056, "step": 1017000 }, { "epoch": 13.559344629454348, "grad_norm": 2.716026544570923, "learning_rate": 1.4644069476469591e-06, "loss": 0.2681, "step": 1017100 }, { "epoch": 13.560677767260803, "grad_norm": 1.9024349451065063, "learning_rate": 1.4617292986750197e-06, "loss": 0.2109, "step": 1017200 }, { "epoch": 13.562010905067257, "grad_norm": 2.2203056812286377, "learning_rate": 
1.4590540388371475e-06, "loss": 0.2171, "step": 1017300 }, { "epoch": 13.563344042873712, "grad_norm": 0.9676151871681213, "learning_rate": 1.456381168357307e-06, "loss": 0.231, "step": 1017400 }, { "epoch": 13.564677180680167, "grad_norm": 2.068114757537842, "learning_rate": 1.453710687459252e-06, "loss": 0.3011, "step": 1017500 }, { "epoch": 13.566010318486622, "grad_norm": 4.294790267944336, "learning_rate": 1.4510425963665574e-06, "loss": 0.2351, "step": 1017600 }, { "epoch": 13.567343456293077, "grad_norm": 1.2438205480575562, "learning_rate": 1.4483768953025779e-06, "loss": 0.2297, "step": 1017700 }, { "epoch": 13.568676594099532, "grad_norm": 1.1350047588348389, "learning_rate": 1.4457135844904845e-06, "loss": 0.2259, "step": 1017800 }, { "epoch": 13.570009731905987, "grad_norm": 3.8517580032348633, "learning_rate": 1.4430526641532427e-06, "loss": 0.2468, "step": 1017900 }, { "epoch": 13.571342869712442, "grad_norm": 2.222165822982788, "learning_rate": 1.4403941345136074e-06, "loss": 0.2046, "step": 1018000 }, { "epoch": 13.572676007518897, "grad_norm": 2.264735698699951, "learning_rate": 1.4377379957941439e-06, "loss": 0.2404, "step": 1018100 }, { "epoch": 13.574009145325352, "grad_norm": 7.155026912689209, "learning_rate": 1.4351107738561098e-06, "loss": 0.2571, "step": 1018200 }, { "epoch": 13.575342283131807, "grad_norm": 1.5524216890335083, "learning_rate": 1.4324593937291497e-06, "loss": 0.2344, "step": 1018300 }, { "epoch": 13.576675420938262, "grad_norm": 4.619513988494873, "learning_rate": 1.4298104051866302e-06, "loss": 0.2353, "step": 1018400 }, { "epoch": 13.578008558744717, "grad_norm": 4.335543155670166, "learning_rate": 1.4271638084503237e-06, "loss": 0.2189, "step": 1018500 }, { "epoch": 13.579341696551172, "grad_norm": 3.266209125518799, "learning_rate": 1.4245196037417996e-06, "loss": 0.2761, "step": 1018600 }, { "epoch": 13.580674834357627, "grad_norm": 2.4398388862609863, "learning_rate": 1.421877791282411e-06, "loss": 0.2453, "step": 
1018700 }, { "epoch": 13.582007972164083, "grad_norm": 1.6173663139343262, "learning_rate": 1.419238371293331e-06, "loss": 0.265, "step": 1018800 }, { "epoch": 13.583341109970538, "grad_norm": 1.126875877380371, "learning_rate": 1.4166013439955161e-06, "loss": 0.2095, "step": 1018900 }, { "epoch": 13.584674247776993, "grad_norm": 2.478405714035034, "learning_rate": 1.4139667096097364e-06, "loss": 0.2847, "step": 1019000 }, { "epoch": 13.586007385583448, "grad_norm": 4.412572860717773, "learning_rate": 1.4113344683565554e-06, "loss": 0.2444, "step": 1019100 }, { "epoch": 13.587340523389903, "grad_norm": 1.0348798036575317, "learning_rate": 1.4087046204563337e-06, "loss": 0.236, "step": 1019200 }, { "epoch": 13.588673661196358, "grad_norm": 2.7359347343444824, "learning_rate": 1.406077166129225e-06, "loss": 0.1808, "step": 1019300 }, { "epoch": 13.590006799002813, "grad_norm": 4.612929344177246, "learning_rate": 1.4034521055952098e-06, "loss": 0.2514, "step": 1019400 }, { "epoch": 13.591339936809268, "grad_norm": 1.4685773849487305, "learning_rate": 1.4008294390740361e-06, "loss": 0.2254, "step": 1019500 }, { "epoch": 13.592673074615723, "grad_norm": 3.3266844749450684, "learning_rate": 1.3982091667852682e-06, "loss": 0.1815, "step": 1019600 }, { "epoch": 13.594006212422178, "grad_norm": 0.9221118092536926, "learning_rate": 1.3955912889482703e-06, "loss": 0.2, "step": 1019700 }, { "epoch": 13.595339350228633, "grad_norm": 8.032780647277832, "learning_rate": 1.3930019487595236e-06, "loss": 0.2152, "step": 1019800 }, { "epoch": 13.596672488035088, "grad_norm": 1.3585059642791748, "learning_rate": 1.3903888365333584e-06, "loss": 0.2112, "step": 1019900 }, { "epoch": 13.598005625841543, "grad_norm": 2.0480728149414062, "learning_rate": 1.3877781194136564e-06, "loss": 0.2208, "step": 1020000 }, { "epoch": 13.599338763647998, "grad_norm": 4.565343856811523, "learning_rate": 1.3851697976189758e-06, "loss": 0.2481, "step": 1020100 }, { "epoch": 13.600671901454453, 
"grad_norm": 2.8952016830444336, "learning_rate": 1.3825638713676759e-06, "loss": 0.2322, "step": 1020200 }, { "epoch": 13.60200503926091, "grad_norm": 3.6253256797790527, "learning_rate": 1.3799603408779215e-06, "loss": 0.2726, "step": 1020300 }, { "epoch": 13.603338177067364, "grad_norm": 3.3369033336639404, "learning_rate": 1.3773592063676722e-06, "loss": 0.1988, "step": 1020400 }, { "epoch": 13.60467131487382, "grad_norm": 2.4745054244995117, "learning_rate": 1.3747604680546765e-06, "loss": 0.2641, "step": 1020500 }, { "epoch": 13.606004452680274, "grad_norm": 1.6995848417282104, "learning_rate": 1.3721641261565076e-06, "loss": 0.2458, "step": 1020600 }, { "epoch": 13.607337590486729, "grad_norm": 3.2344322204589844, "learning_rate": 1.3695701808905114e-06, "loss": 0.2906, "step": 1020700 }, { "epoch": 13.608670728293184, "grad_norm": 5.636976718902588, "learning_rate": 1.366978632473851e-06, "loss": 0.2541, "step": 1020800 }, { "epoch": 13.610003866099639, "grad_norm": 2.6445152759552, "learning_rate": 1.3643894811234759e-06, "loss": 0.2143, "step": 1020900 }, { "epoch": 13.611337003906094, "grad_norm": 4.073627948760986, "learning_rate": 1.3618027270561495e-06, "loss": 0.2777, "step": 1021000 }, { "epoch": 13.612670141712549, "grad_norm": 3.0688014030456543, "learning_rate": 1.359218370488422e-06, "loss": 0.2975, "step": 1021100 }, { "epoch": 13.614003279519004, "grad_norm": 1.9567162990570068, "learning_rate": 1.3566364116366536e-06, "loss": 0.2055, "step": 1021200 }, { "epoch": 13.615336417325459, "grad_norm": 1.9227718114852905, "learning_rate": 1.3540568507169882e-06, "loss": 0.2065, "step": 1021300 }, { "epoch": 13.616669555131914, "grad_norm": 2.0606436729431152, "learning_rate": 1.3514796879453862e-06, "loss": 0.2532, "step": 1021400 }, { "epoch": 13.618002692938369, "grad_norm": 1.1467031240463257, "learning_rate": 1.348904923537595e-06, "loss": 0.2071, "step": 1021500 }, { "epoch": 13.619335830744824, "grad_norm": 1.91815984249115, "learning_rate": 
1.3463325577091656e-06, "loss": 0.2648, "step": 1021600 }, { "epoch": 13.620668968551279, "grad_norm": 4.980091094970703, "learning_rate": 1.3437625906754524e-06, "loss": 0.2795, "step": 1021700 }, { "epoch": 13.622002106357733, "grad_norm": 3.0017123222351074, "learning_rate": 1.3411950226516067e-06, "loss": 0.221, "step": 1021800 }, { "epoch": 13.623335244164188, "grad_norm": 1.4372702836990356, "learning_rate": 1.3386298538525632e-06, "loss": 0.2815, "step": 1021900 }, { "epoch": 13.624668381970645, "grad_norm": 1.7390023469924927, "learning_rate": 1.3360670844930867e-06, "loss": 0.2162, "step": 1022000 }, { "epoch": 13.6260015197771, "grad_norm": 2.133512258529663, "learning_rate": 1.3335067147877222e-06, "loss": 0.241, "step": 1022100 }, { "epoch": 13.627334657583555, "grad_norm": 2.624282121658325, "learning_rate": 1.3309487449508051e-06, "loss": 0.2245, "step": 1022200 }, { "epoch": 13.62866779539001, "grad_norm": 5.677690505981445, "learning_rate": 1.328418719012916e-06, "loss": 0.2315, "step": 1022300 }, { "epoch": 13.630000933196465, "grad_norm": 0.8731240630149841, "learning_rate": 1.325865525551121e-06, "loss": 0.2334, "step": 1022400 }, { "epoch": 13.63133407100292, "grad_norm": 1.2446554899215698, "learning_rate": 1.3233147325974714e-06, "loss": 0.2455, "step": 1022500 }, { "epoch": 13.632667208809375, "grad_norm": 1.4413827657699585, "learning_rate": 1.3207663403655136e-06, "loss": 0.2226, "step": 1022600 }, { "epoch": 13.63400034661583, "grad_norm": 0.9179291129112244, "learning_rate": 1.3182203490685963e-06, "loss": 0.2431, "step": 1022700 }, { "epoch": 13.635333484422285, "grad_norm": 3.6306746006011963, "learning_rate": 1.315676758919856e-06, "loss": 0.2572, "step": 1022800 }, { "epoch": 13.63666662222874, "grad_norm": 6.894445419311523, "learning_rate": 1.3131355701322356e-06, "loss": 0.2313, "step": 1022900 }, { "epoch": 13.637999760035195, "grad_norm": 2.2408010959625244, "learning_rate": 1.310596782918485e-06, "loss": 0.2721, "step": 1023000 
}, { "epoch": 13.63933289784165, "grad_norm": 0.3820740580558777, "learning_rate": 1.3080603974911244e-06, "loss": 0.1877, "step": 1023100 }, { "epoch": 13.640666035648104, "grad_norm": 14.322356224060059, "learning_rate": 1.3055264140625067e-06, "loss": 0.2508, "step": 1023200 }, { "epoch": 13.64199917345456, "grad_norm": 4.514532566070557, "learning_rate": 1.302994832844766e-06, "loss": 0.2426, "step": 1023300 }, { "epoch": 13.643332311261014, "grad_norm": 0.7727705836296082, "learning_rate": 1.300465654049836e-06, "loss": 0.2757, "step": 1023400 }, { "epoch": 13.644665449067471, "grad_norm": 2.188720464706421, "learning_rate": 1.297938877889454e-06, "loss": 0.2261, "step": 1023500 }, { "epoch": 13.645998586873926, "grad_norm": 0.9018987417221069, "learning_rate": 1.2954145045751508e-06, "loss": 0.2218, "step": 1023600 }, { "epoch": 13.647331724680381, "grad_norm": 4.280423164367676, "learning_rate": 1.2928925343182607e-06, "loss": 0.2525, "step": 1023700 }, { "epoch": 13.648664862486836, "grad_norm": 8.22768497467041, "learning_rate": 1.2903729673299181e-06, "loss": 0.2268, "step": 1023800 }, { "epoch": 13.64999800029329, "grad_norm": 1.9903533458709717, "learning_rate": 1.2878558038210475e-06, "loss": 0.2484, "step": 1023900 }, { "epoch": 13.651331138099746, "grad_norm": 1.8018858432769775, "learning_rate": 1.2853410440023772e-06, "loss": 0.234, "step": 1024000 }, { "epoch": 13.6526642759062, "grad_norm": 3.1900289058685303, "learning_rate": 1.2828286880844453e-06, "loss": 0.2577, "step": 1024100 }, { "epoch": 13.653997413712656, "grad_norm": 7.487944602966309, "learning_rate": 1.280318736277567e-06, "loss": 0.2572, "step": 1024200 }, { "epoch": 13.65533055151911, "grad_norm": 0.5133689045906067, "learning_rate": 1.2778111887918708e-06, "loss": 0.2406, "step": 1024300 }, { "epoch": 13.656663689325566, "grad_norm": 1.3485945463180542, "learning_rate": 1.2753060458372822e-06, "loss": 0.2922, "step": 1024400 }, { "epoch": 13.65799682713202, "grad_norm": 
2.416781187057495, "learning_rate": 1.2728033076235236e-06, "loss": 0.2633, "step": 1024500 }, { "epoch": 13.659329964938475, "grad_norm": 1.0433067083358765, "learning_rate": 1.2703279657875643e-06, "loss": 0.214, "step": 1024600 }, { "epoch": 13.66066310274493, "grad_norm": 2.182941198348999, "learning_rate": 1.267830013631196e-06, "loss": 0.2619, "step": 1024700 }, { "epoch": 13.661996240551385, "grad_norm": 4.010409832000732, "learning_rate": 1.2653344668415267e-06, "loss": 0.2075, "step": 1024800 }, { "epoch": 13.66332937835784, "grad_norm": 2.3803224563598633, "learning_rate": 1.2628413256274762e-06, "loss": 0.2476, "step": 1024900 }, { "epoch": 13.664662516164295, "grad_norm": 1.7638188600540161, "learning_rate": 1.2603505901977675e-06, "loss": 0.2505, "step": 1025000 }, { "epoch": 13.66599565397075, "grad_norm": 2.3411829471588135, "learning_rate": 1.2578622607609102e-06, "loss": 0.2363, "step": 1025100 }, { "epoch": 13.667328791777207, "grad_norm": 2.5686991214752197, "learning_rate": 1.2553763375252148e-06, "loss": 0.2562, "step": 1025200 }, { "epoch": 13.668661929583662, "grad_norm": 1.814324140548706, "learning_rate": 1.252892820698811e-06, "loss": 0.2281, "step": 1025300 }, { "epoch": 13.669995067390117, "grad_norm": 1.1019353866577148, "learning_rate": 1.2504117104895963e-06, "loss": 0.2374, "step": 1025400 }, { "epoch": 13.671328205196572, "grad_norm": 0.6828590631484985, "learning_rate": 1.2479330071052874e-06, "loss": 0.2553, "step": 1025500 }, { "epoch": 13.672661343003027, "grad_norm": 2.6213815212249756, "learning_rate": 1.245456710753392e-06, "loss": 0.2385, "step": 1025600 }, { "epoch": 13.673994480809482, "grad_norm": 3.6125173568725586, "learning_rate": 1.242982821641221e-06, "loss": 0.2353, "step": 1025700 }, { "epoch": 13.675327618615936, "grad_norm": 0.9118366241455078, "learning_rate": 1.2405113399758749e-06, "loss": 0.2649, "step": 1025800 }, { "epoch": 13.676660756422391, "grad_norm": 2.799250602722168, "learning_rate": 
1.2380422659642688e-06, "loss": 0.2103, "step": 1025900 }, { "epoch": 13.677993894228846, "grad_norm": 1.0269471406936646, "learning_rate": 1.2355755998130936e-06, "loss": 0.2378, "step": 1026000 }, { "epoch": 13.679327032035301, "grad_norm": 2.2869434356689453, "learning_rate": 1.2331113417288541e-06, "loss": 0.2658, "step": 1026100 }, { "epoch": 13.680660169841756, "grad_norm": 4.2690887451171875, "learning_rate": 1.2306494919178523e-06, "loss": 0.2269, "step": 1026200 }, { "epoch": 13.681993307648211, "grad_norm": 3.078319787979126, "learning_rate": 1.2281900505861899e-06, "loss": 0.2036, "step": 1026300 }, { "epoch": 13.683326445454666, "grad_norm": 3.199495553970337, "learning_rate": 1.2257330179397553e-06, "loss": 0.2843, "step": 1026400 }, { "epoch": 13.684659583261121, "grad_norm": 1.129878282546997, "learning_rate": 1.223278394184254e-06, "loss": 0.204, "step": 1026500 }, { "epoch": 13.685992721067576, "grad_norm": 0.4340502917766571, "learning_rate": 1.220826179525165e-06, "loss": 0.2217, "step": 1026600 }, { "epoch": 13.687325858874033, "grad_norm": 1.4574376344680786, "learning_rate": 1.2183763741677978e-06, "loss": 0.1903, "step": 1026700 }, { "epoch": 13.688658996680488, "grad_norm": 3.727329969406128, "learning_rate": 1.2159289783172278e-06, "loss": 0.2101, "step": 1026800 }, { "epoch": 13.689992134486943, "grad_norm": 3.7099997997283936, "learning_rate": 1.2134839921783513e-06, "loss": 0.2774, "step": 1026900 }, { "epoch": 13.691325272293398, "grad_norm": 1.546053409576416, "learning_rate": 1.211041415955848e-06, "loss": 0.2227, "step": 1027000 }, { "epoch": 13.692658410099853, "grad_norm": 0.2020559310913086, "learning_rate": 1.2086012498542109e-06, "loss": 0.2278, "step": 1027100 }, { "epoch": 13.693991547906307, "grad_norm": 2.6912925243377686, "learning_rate": 1.2061634940777167e-06, "loss": 0.2146, "step": 1027200 }, { "epoch": 13.695324685712762, "grad_norm": 3.087265729904175, "learning_rate": 1.203728148830452e-06, "loss": 0.2287, "step": 
1027300 }, { "epoch": 13.696657823519217, "grad_norm": 8.115093231201172, "learning_rate": 1.201295214316287e-06, "loss": 0.2054, "step": 1027400 }, { "epoch": 13.697990961325672, "grad_norm": 0.7551224231719971, "learning_rate": 1.1988646907389057e-06, "loss": 0.2416, "step": 1027500 }, { "epoch": 13.699324099132127, "grad_norm": 2.421635150909424, "learning_rate": 1.1964365783017883e-06, "loss": 0.2185, "step": 1027600 }, { "epoch": 13.700657236938582, "grad_norm": 1.9235447645187378, "learning_rate": 1.1940108772082025e-06, "loss": 0.2054, "step": 1027700 }, { "epoch": 13.701990374745037, "grad_norm": 1.4193453788757324, "learning_rate": 1.1915875876612193e-06, "loss": 0.2789, "step": 1027800 }, { "epoch": 13.703323512551492, "grad_norm": 0.9692482352256775, "learning_rate": 1.1891667098637127e-06, "loss": 0.2353, "step": 1027900 }, { "epoch": 13.704656650357947, "grad_norm": 3.675288438796997, "learning_rate": 1.1867482440183509e-06, "loss": 0.2831, "step": 1028000 }, { "epoch": 13.705989788164402, "grad_norm": 2.6083357334136963, "learning_rate": 1.1843321903276016e-06, "loss": 0.2199, "step": 1028100 }, { "epoch": 13.707322925970857, "grad_norm": 0.5835309028625488, "learning_rate": 1.18191854899372e-06, "loss": 0.2146, "step": 1028200 }, { "epoch": 13.708656063777312, "grad_norm": 1.7736082077026367, "learning_rate": 1.1795073202187778e-06, "loss": 0.242, "step": 1028300 }, { "epoch": 13.709989201583767, "grad_norm": 8.174287796020508, "learning_rate": 1.1770985042046334e-06, "loss": 0.2647, "step": 1028400 }, { "epoch": 13.711322339390223, "grad_norm": 1.419973611831665, "learning_rate": 1.1746921011529422e-06, "loss": 0.2164, "step": 1028500 }, { "epoch": 13.712655477196678, "grad_norm": 1.7583025693893433, "learning_rate": 1.1722881112651662e-06, "loss": 0.2255, "step": 1028600 }, { "epoch": 13.713988615003133, "grad_norm": 2.115246057510376, "learning_rate": 1.169886534742558e-06, "loss": 0.2284, "step": 1028700 }, { "epoch": 13.715321752809588, 
"grad_norm": 6.986651420593262, "learning_rate": 1.1674873717861602e-06, "loss": 0.2093, "step": 1028800 }, { "epoch": 13.716654890616043, "grad_norm": 2.2412428855895996, "learning_rate": 1.1650906225968382e-06, "loss": 0.236, "step": 1028900 }, { "epoch": 13.717988028422498, "grad_norm": 2.371774911880493, "learning_rate": 1.1626962873752357e-06, "loss": 0.2531, "step": 1029000 }, { "epoch": 13.719321166228953, "grad_norm": 3.4392151832580566, "learning_rate": 1.160304366321795e-06, "loss": 0.2239, "step": 1029100 }, { "epoch": 13.720654304035408, "grad_norm": 3.0710790157318115, "learning_rate": 1.1579148596367595e-06, "loss": 0.2221, "step": 1029200 }, { "epoch": 13.721987441841863, "grad_norm": 2.8711650371551514, "learning_rate": 1.1555277675201759e-06, "loss": 0.2535, "step": 1029300 }, { "epoch": 13.723320579648318, "grad_norm": 1.0238970518112183, "learning_rate": 1.1531430901718842e-06, "loss": 0.2089, "step": 1029400 }, { "epoch": 13.724653717454773, "grad_norm": 1.4846059083938599, "learning_rate": 1.150760827791515e-06, "loss": 0.2644, "step": 1029500 }, { "epoch": 13.725986855261228, "grad_norm": 2.8586456775665283, "learning_rate": 1.148380980578515e-06, "loss": 0.2413, "step": 1029600 }, { "epoch": 13.727319993067683, "grad_norm": 2.557223081588745, "learning_rate": 1.1460035487321087e-06, "loss": 0.2399, "step": 1029700 }, { "epoch": 13.728653130874138, "grad_norm": 4.516556739807129, "learning_rate": 1.1436285324513262e-06, "loss": 0.2303, "step": 1029800 }, { "epoch": 13.729986268680593, "grad_norm": 1.0812755823135376, "learning_rate": 1.1412796459814822e-06, "loss": 0.2196, "step": 1029900 }, { "epoch": 13.73131940648705, "grad_norm": 1.4370964765548706, "learning_rate": 1.1389094372676246e-06, "loss": 0.2673, "step": 1030000 }, { "epoch": 13.732652544293504, "grad_norm": 3.9859459400177, "learning_rate": 1.1365416447132925e-06, "loss": 0.2055, "step": 1030100 }, { "epoch": 13.73398568209996, "grad_norm": 0.7819865345954895, "learning_rate": 
1.1341762685167112e-06, "loss": 0.2889, "step": 1030200 }, { "epoch": 13.735318819906414, "grad_norm": 2.772371292114258, "learning_rate": 1.1318133088758987e-06, "loss": 0.2212, "step": 1030300 }, { "epoch": 13.73665195771287, "grad_norm": 2.717047691345215, "learning_rate": 1.1294527659886766e-06, "loss": 0.2721, "step": 1030400 }, { "epoch": 13.737985095519324, "grad_norm": 2.3621532917022705, "learning_rate": 1.127094640052657e-06, "loss": 0.2383, "step": 1030500 }, { "epoch": 13.739318233325779, "grad_norm": 5.8931474685668945, "learning_rate": 1.1247389312652622e-06, "loss": 0.2373, "step": 1030600 }, { "epoch": 13.740651371132234, "grad_norm": 2.9569621086120605, "learning_rate": 1.122385639823701e-06, "loss": 0.2328, "step": 1030700 }, { "epoch": 13.741984508938689, "grad_norm": 1.6080334186553955, "learning_rate": 1.1200347659249855e-06, "loss": 0.2751, "step": 1030800 }, { "epoch": 13.743317646745144, "grad_norm": 1.6390502452850342, "learning_rate": 1.1176863097659218e-06, "loss": 0.2353, "step": 1030900 }, { "epoch": 13.744650784551599, "grad_norm": 2.246633529663086, "learning_rate": 1.1153402715431093e-06, "loss": 0.1856, "step": 1031000 }, { "epoch": 13.745983922358054, "grad_norm": 2.348642587661743, "learning_rate": 1.1129966514529644e-06, "loss": 0.211, "step": 1031100 }, { "epoch": 13.747317060164509, "grad_norm": 3.060925006866455, "learning_rate": 1.1106554496916766e-06, "loss": 0.2184, "step": 1031200 }, { "epoch": 13.748650197970964, "grad_norm": 0.8270769715309143, "learning_rate": 1.108316666455249e-06, "loss": 0.2749, "step": 1031300 }, { "epoch": 13.749983335777419, "grad_norm": 4.706063747406006, "learning_rate": 1.105980301939472e-06, "loss": 0.2465, "step": 1031400 }, { "epoch": 13.751316473583874, "grad_norm": 3.0009799003601074, "learning_rate": 1.1036463563399458e-06, "loss": 0.2861, "step": 1031500 }, { "epoch": 13.752649611390328, "grad_norm": 1.865051031112671, "learning_rate": 1.1013148298520604e-06, "loss": 0.2574, "step": 
1031600 }, { "epoch": 13.753982749196785, "grad_norm": 2.1810996532440186, "learning_rate": 1.0989857226709965e-06, "loss": 0.2422, "step": 1031700 }, { "epoch": 13.75531588700324, "grad_norm": 2.638638496398926, "learning_rate": 1.0966590349917416e-06, "loss": 0.2679, "step": 1031800 }, { "epoch": 13.756649024809695, "grad_norm": 1.723937749862671, "learning_rate": 1.0943347670090864e-06, "loss": 0.2533, "step": 1031900 }, { "epoch": 13.75798216261615, "grad_norm": 1.0293951034545898, "learning_rate": 1.092012918917602e-06, "loss": 0.2339, "step": 1032000 }, { "epoch": 13.759315300422605, "grad_norm": 1.2109655141830444, "learning_rate": 1.089693490911673e-06, "loss": 0.2741, "step": 1032100 }, { "epoch": 13.76064843822906, "grad_norm": 1.5366119146347046, "learning_rate": 1.0873764831854738e-06, "loss": 0.221, "step": 1032200 }, { "epoch": 13.761981576035515, "grad_norm": 3.731661796569824, "learning_rate": 1.085061895932966e-06, "loss": 0.2395, "step": 1032300 }, { "epoch": 13.76331471384197, "grad_norm": 2.8567028045654297, "learning_rate": 1.082749729347935e-06, "loss": 0.271, "step": 1032400 }, { "epoch": 13.764647851648425, "grad_norm": 6.3987345695495605, "learning_rate": 1.0804399836239454e-06, "loss": 0.2317, "step": 1032500 }, { "epoch": 13.76598098945488, "grad_norm": 10.789350509643555, "learning_rate": 1.0781326589543529e-06, "loss": 0.2366, "step": 1032600 }, { "epoch": 13.767314127261335, "grad_norm": 1.3194111585617065, "learning_rate": 1.0758277555323226e-06, "loss": 0.2448, "step": 1032700 }, { "epoch": 13.76864726506779, "grad_norm": 3.7683866024017334, "learning_rate": 1.0735252735508205e-06, "loss": 0.2454, "step": 1032800 }, { "epoch": 13.769980402874245, "grad_norm": 1.2984403371810913, "learning_rate": 1.0712252132025958e-06, "loss": 0.2389, "step": 1032900 }, { "epoch": 13.7713135406807, "grad_norm": 1.5686649084091187, "learning_rate": 1.0689275746802074e-06, "loss": 0.2104, "step": 1033000 }, { "epoch": 13.772646678487154, "grad_norm": 
0.9920790791511536, "learning_rate": 1.0666323581760017e-06, "loss": 0.2608, "step": 1033100 }, { "epoch": 13.773979816293611, "grad_norm": 1.9647215604782104, "learning_rate": 1.0643395638821285e-06, "loss": 0.2094, "step": 1033200 }, { "epoch": 13.775312954100066, "grad_norm": 5.520052433013916, "learning_rate": 1.0620491919905339e-06, "loss": 0.2649, "step": 1033300 }, { "epoch": 13.776646091906521, "grad_norm": 0.8091443777084351, "learning_rate": 1.0597612426929615e-06, "loss": 0.2566, "step": 1033400 }, { "epoch": 13.777979229712976, "grad_norm": 1.9689315557479858, "learning_rate": 1.0574757161809513e-06, "loss": 0.2528, "step": 1033500 }, { "epoch": 13.779312367519431, "grad_norm": 8.078714370727539, "learning_rate": 1.05519261264584e-06, "loss": 0.2127, "step": 1033600 }, { "epoch": 13.780645505325886, "grad_norm": 2.84047532081604, "learning_rate": 1.0529119322787583e-06, "loss": 0.2631, "step": 1033700 }, { "epoch": 13.78197864313234, "grad_norm": 3.646965742111206, "learning_rate": 1.0506336752706402e-06, "loss": 0.2157, "step": 1033800 }, { "epoch": 13.783311780938796, "grad_norm": 2.0719738006591797, "learning_rate": 1.0483578418122197e-06, "loss": 0.2542, "step": 1033900 }, { "epoch": 13.78464491874525, "grad_norm": 0.7857186794281006, "learning_rate": 1.0460844320940144e-06, "loss": 0.2315, "step": 1034000 }, { "epoch": 13.785978056551706, "grad_norm": 5.19677209854126, "learning_rate": 1.0438361441651444e-06, "loss": 0.2426, "step": 1034100 }, { "epoch": 13.78731119435816, "grad_norm": 1.2621859312057495, "learning_rate": 1.0415675582559947e-06, "loss": 0.2138, "step": 1034200 }, { "epoch": 13.788644332164615, "grad_norm": 2.222111225128174, "learning_rate": 1.0393013966555221e-06, "loss": 0.2599, "step": 1034300 }, { "epoch": 13.78997746997107, "grad_norm": 1.8756122589111328, "learning_rate": 1.0370376595534447e-06, "loss": 0.2082, "step": 1034400 }, { "epoch": 13.791310607777525, "grad_norm": 3.949198007583618, "learning_rate": 
1.0347763471392745e-06, "loss": 0.2354, "step": 1034500 }, { "epoch": 13.79264374558398, "grad_norm": 2.2317943572998047, "learning_rate": 1.0325174596023234e-06, "loss": 0.2568, "step": 1034600 }, { "epoch": 13.793976883390435, "grad_norm": 4.672517776489258, "learning_rate": 1.0302609971316967e-06, "loss": 0.2467, "step": 1034700 }, { "epoch": 13.79531002119689, "grad_norm": 2.3380441665649414, "learning_rate": 1.0280069599163e-06, "loss": 0.2493, "step": 1034800 }, { "epoch": 13.796643159003347, "grad_norm": 2.370701551437378, "learning_rate": 1.0257553481448289e-06, "loss": 0.2544, "step": 1034900 }, { "epoch": 13.797976296809802, "grad_norm": 1.4164094924926758, "learning_rate": 1.0235286418596811e-06, "loss": 0.2361, "step": 1035000 }, { "epoch": 13.799309434616257, "grad_norm": 2.265474557876587, "learning_rate": 1.0212818572822246e-06, "loss": 0.2536, "step": 1035100 }, { "epoch": 13.800642572422712, "grad_norm": 0.8586745262145996, "learning_rate": 1.0190374987117035e-06, "loss": 0.2323, "step": 1035200 }, { "epoch": 13.801975710229167, "grad_norm": 4.279275894165039, "learning_rate": 1.016795566336004e-06, "loss": 0.1762, "step": 1035300 }, { "epoch": 13.803308848035622, "grad_norm": 1.914923906326294, "learning_rate": 1.0145560603428195e-06, "loss": 0.2504, "step": 1035400 }, { "epoch": 13.804641985842077, "grad_norm": 3.479409694671631, "learning_rate": 1.012318980919633e-06, "loss": 0.252, "step": 1035500 }, { "epoch": 13.805975123648532, "grad_norm": 2.1075286865234375, "learning_rate": 1.010084328253722e-06, "loss": 0.2589, "step": 1035600 }, { "epoch": 13.807308261454986, "grad_norm": 3.9589266777038574, "learning_rate": 1.0078521025321663e-06, "loss": 0.226, "step": 1035700 }, { "epoch": 13.808641399261441, "grad_norm": 1.108304500579834, "learning_rate": 1.00562230394184e-06, "loss": 0.2013, "step": 1035800 }, { "epoch": 13.809974537067896, "grad_norm": 3.1798582077026367, "learning_rate": 1.0033949326694203e-06, "loss": 0.236, "step": 1035900 }, 
{ "epoch": 13.811307674874351, "grad_norm": 2.494469165802002, "learning_rate": 1.0011699889013714e-06, "loss": 0.2446, "step": 1036000 }, { "epoch": 13.812640812680806, "grad_norm": 2.857994794845581, "learning_rate": 9.989474728239544e-07, "loss": 0.2331, "step": 1036100 }, { "epoch": 13.813973950487261, "grad_norm": 0.9496856331825256, "learning_rate": 9.967273846232373e-07, "loss": 0.2007, "step": 1036200 }, { "epoch": 13.815307088293716, "grad_norm": 0.9074098467826843, "learning_rate": 9.94509724485081e-07, "loss": 0.2276, "step": 1036300 }, { "epoch": 13.816640226100173, "grad_norm": 1.0981276035308838, "learning_rate": 9.922944925951338e-07, "loss": 0.239, "step": 1036400 }, { "epoch": 13.817973363906628, "grad_norm": 1.9313292503356934, "learning_rate": 9.90081689138851e-07, "loss": 0.2674, "step": 1036500 }, { "epoch": 13.819306501713083, "grad_norm": 1.1998950242996216, "learning_rate": 9.878713143014838e-07, "loss": 0.2531, "step": 1036600 }, { "epoch": 13.820639639519538, "grad_norm": 3.398129463195801, "learning_rate": 9.85663368268075e-07, "loss": 0.2182, "step": 1036700 }, { "epoch": 13.821972777325993, "grad_norm": 3.047816753387451, "learning_rate": 9.834578512234694e-07, "loss": 0.2411, "step": 1036800 }, { "epoch": 13.823305915132448, "grad_norm": 1.125472068786621, "learning_rate": 9.812547633523061e-07, "loss": 0.2074, "step": 1036900 }, { "epoch": 13.824639052938902, "grad_norm": 2.6379191875457764, "learning_rate": 9.790541048390145e-07, "loss": 0.209, "step": 1037000 }, { "epoch": 13.825972190745357, "grad_norm": 5.317410469055176, "learning_rate": 9.76855875867837e-07, "loss": 0.2691, "step": 1037100 }, { "epoch": 13.827305328551812, "grad_norm": 1.1834243535995483, "learning_rate": 9.74660076622793e-07, "loss": 0.2443, "step": 1037200 }, { "epoch": 13.828638466358267, "grad_norm": 0.6929134726524353, "learning_rate": 9.724667072877092e-07, "loss": 0.231, "step": 1037300 }, { "epoch": 13.829971604164722, "grad_norm": 2.5540196895599365, 
"learning_rate": 9.702757680462116e-07, "loss": 0.2811, "step": 1037400 }, { "epoch": 13.831304741971177, "grad_norm": 2.382902145385742, "learning_rate": 9.680872590817202e-07, "loss": 0.2337, "step": 1037500 }, { "epoch": 13.832637879777632, "grad_norm": 4.915927410125732, "learning_rate": 9.659011805774454e-07, "loss": 0.2486, "step": 1037600 }, { "epoch": 13.833971017584087, "grad_norm": 2.1470441818237305, "learning_rate": 9.637175327164006e-07, "loss": 0.219, "step": 1037700 }, { "epoch": 13.835304155390542, "grad_norm": 5.943410873413086, "learning_rate": 9.615363156813928e-07, "loss": 0.2261, "step": 1037800 }, { "epoch": 13.836637293196997, "grad_norm": 1.9992451667785645, "learning_rate": 9.593575296550295e-07, "loss": 0.2331, "step": 1037900 }, { "epoch": 13.837970431003452, "grad_norm": 1.6973466873168945, "learning_rate": 9.571811748197045e-07, "loss": 0.2669, "step": 1038000 }, { "epoch": 13.839303568809909, "grad_norm": 1.0688226222991943, "learning_rate": 9.550072513576259e-07, "loss": 0.2323, "step": 1038100 }, { "epoch": 13.840636706616364, "grad_norm": 1.3914613723754883, "learning_rate": 9.528357594507808e-07, "loss": 0.2562, "step": 1038200 }, { "epoch": 13.841969844422819, "grad_norm": 4.100545883178711, "learning_rate": 9.506666992809643e-07, "loss": 0.2376, "step": 1038300 }, { "epoch": 13.843302982229273, "grad_norm": 0.3005801737308502, "learning_rate": 9.485000710297542e-07, "loss": 0.2226, "step": 1038400 }, { "epoch": 13.844636120035728, "grad_norm": 0.8584132194519043, "learning_rate": 9.463358748785422e-07, "loss": 0.2071, "step": 1038500 }, { "epoch": 13.845969257842183, "grad_norm": 10.454839706420898, "learning_rate": 9.4417411100851e-07, "loss": 0.2743, "step": 1038600 }, { "epoch": 13.847302395648638, "grad_norm": 0.8472445607185364, "learning_rate": 9.420147796006262e-07, "loss": 0.2548, "step": 1038700 }, { "epoch": 13.848635533455093, "grad_norm": 5.765646934509277, "learning_rate": 9.398578808356628e-07, "loss": 0.2425, 
"step": 1038800 }, { "epoch": 13.849968671261548, "grad_norm": 1.3738350868225098, "learning_rate": 9.377034148941955e-07, "loss": 0.253, "step": 1038900 }, { "epoch": 13.851301809068003, "grad_norm": 1.0782455205917358, "learning_rate": 9.355513819565864e-07, "loss": 0.2491, "step": 1039000 }, { "epoch": 13.852634946874458, "grad_norm": 1.970872402191162, "learning_rate": 9.334017822029984e-07, "loss": 0.2786, "step": 1039100 }, { "epoch": 13.853968084680913, "grad_norm": 3.155871868133545, "learning_rate": 9.312546158133872e-07, "loss": 0.2042, "step": 1039200 }, { "epoch": 13.855301222487368, "grad_norm": 1.0710601806640625, "learning_rate": 9.291098829674993e-07, "loss": 0.2232, "step": 1039300 }, { "epoch": 13.856634360293823, "grad_norm": 1.2443015575408936, "learning_rate": 9.26967583844901e-07, "loss": 0.2404, "step": 1039400 }, { "epoch": 13.857967498100278, "grad_norm": 1.618241548538208, "learning_rate": 9.248491052287233e-07, "loss": 0.2395, "step": 1039500 }, { "epoch": 13.859300635906735, "grad_norm": 0.93046635389328, "learning_rate": 9.227116497488153e-07, "loss": 0.2356, "step": 1039600 }, { "epoch": 13.86063377371319, "grad_norm": 2.1959242820739746, "learning_rate": 9.205766285278283e-07, "loss": 0.1973, "step": 1039700 }, { "epoch": 13.861966911519644, "grad_norm": 2.0554134845733643, "learning_rate": 9.184440417445061e-07, "loss": 0.2578, "step": 1039800 }, { "epoch": 13.8633000493261, "grad_norm": 2.0135433673858643, "learning_rate": 9.163138895773693e-07, "loss": 0.2367, "step": 1039900 }, { "epoch": 13.864633187132554, "grad_norm": 3.0449602603912354, "learning_rate": 9.141861722047551e-07, "loss": 0.2297, "step": 1040000 }, { "epoch": 13.86596632493901, "grad_norm": 1.7716082334518433, "learning_rate": 9.120608898047877e-07, "loss": 0.2435, "step": 1040100 }, { "epoch": 13.867299462745464, "grad_norm": 7.4309234619140625, "learning_rate": 9.099380425553916e-07, "loss": 0.2458, "step": 1040200 }, { "epoch": 13.86863260055192, "grad_norm": 
1.4297988414764404, "learning_rate": 9.078176306342844e-07, "loss": 0.2504, "step": 1040300 }, { "epoch": 13.869965738358374, "grad_norm": 0.9229739308357239, "learning_rate": 9.056996542189744e-07, "loss": 0.2572, "step": 1040400 }, { "epoch": 13.871298876164829, "grad_norm": 2.5573058128356934, "learning_rate": 9.035841134867762e-07, "loss": 0.2569, "step": 1040500 }, { "epoch": 13.872632013971284, "grad_norm": 1.8080121278762817, "learning_rate": 9.014710086148014e-07, "loss": 0.2617, "step": 1040600 }, { "epoch": 13.873965151777739, "grad_norm": 2.5742673873901367, "learning_rate": 8.993603397799422e-07, "loss": 0.2154, "step": 1040700 }, { "epoch": 13.875298289584194, "grad_norm": 1.9859967231750488, "learning_rate": 8.972521071589035e-07, "loss": 0.2641, "step": 1040800 }, { "epoch": 13.876631427390649, "grad_norm": 3.812946081161499, "learning_rate": 8.951463109281776e-07, "loss": 0.1894, "step": 1040900 }, { "epoch": 13.877964565197104, "grad_norm": 3.531074285507202, "learning_rate": 8.93042951264057e-07, "loss": 0.2074, "step": 1041000 }, { "epoch": 13.879297703003559, "grad_norm": 0.7459983229637146, "learning_rate": 8.909420283426273e-07, "loss": 0.2284, "step": 1041100 }, { "epoch": 13.880630840810014, "grad_norm": 2.1985416412353516, "learning_rate": 8.888435423397745e-07, "loss": 0.2462, "step": 1041200 }, { "epoch": 13.88196397861647, "grad_norm": 1.1022251844406128, "learning_rate": 8.867474934311715e-07, "loss": 0.2579, "step": 1041300 }, { "epoch": 13.883297116422925, "grad_norm": 2.653452157974243, "learning_rate": 8.846538817922978e-07, "loss": 0.2426, "step": 1041400 }, { "epoch": 13.88463025422938, "grad_norm": 1.3582204580307007, "learning_rate": 8.825627075984167e-07, "loss": 0.1752, "step": 1041500 }, { "epoch": 13.885963392035835, "grad_norm": 1.7051554918289185, "learning_rate": 8.804948463235507e-07, "loss": 0.247, "step": 1041600 }, { "epoch": 13.88729652984229, "grad_norm": 10.177267074584961, "learning_rate": 8.784085231658423e-07, 
"loss": 0.285, "step": 1041700 }, { "epoch": 13.888629667648745, "grad_norm": 2.702916383743286, "learning_rate": 8.763246379759848e-07, "loss": 0.1979, "step": 1041800 }, { "epoch": 13.8899628054552, "grad_norm": 2.602324962615967, "learning_rate": 8.742431909284155e-07, "loss": 0.2695, "step": 1041900 }, { "epoch": 13.891295943261655, "grad_norm": 2.7711105346679688, "learning_rate": 8.721641821973946e-07, "loss": 0.2431, "step": 1042000 }, { "epoch": 13.89262908106811, "grad_norm": 14.995798110961914, "learning_rate": 8.700876119569701e-07, "loss": 0.2403, "step": 1042100 }, { "epoch": 13.893962218874565, "grad_norm": 2.0246875286102295, "learning_rate": 8.680134803809859e-07, "loss": 0.2905, "step": 1042200 }, { "epoch": 13.89529535668102, "grad_norm": 3.4604060649871826, "learning_rate": 8.659417876430831e-07, "loss": 0.2451, "step": 1042300 }, { "epoch": 13.896628494487475, "grad_norm": 2.7317333221435547, "learning_rate": 8.638725339166964e-07, "loss": 0.2689, "step": 1042400 }, { "epoch": 13.89796163229393, "grad_norm": 14.503413200378418, "learning_rate": 8.618057193750506e-07, "loss": 0.2392, "step": 1042500 }, { "epoch": 13.899294770100385, "grad_norm": 10.807990074157715, "learning_rate": 8.597413441911839e-07, "loss": 0.2165, "step": 1042600 }, { "epoch": 13.90062790790684, "grad_norm": 3.182060718536377, "learning_rate": 8.576794085379081e-07, "loss": 0.2595, "step": 1042700 }, { "epoch": 13.901961045713296, "grad_norm": 2.527010202407837, "learning_rate": 8.556199125878517e-07, "loss": 0.2213, "step": 1042800 }, { "epoch": 13.903294183519751, "grad_norm": 5.657831192016602, "learning_rate": 8.535628565134234e-07, "loss": 0.2247, "step": 1042900 }, { "epoch": 13.904627321326206, "grad_norm": 3.7111949920654297, "learning_rate": 8.51508240486839e-07, "loss": 0.2694, "step": 1043000 }, { "epoch": 13.905960459132661, "grad_norm": 3.752286195755005, "learning_rate": 8.494765743585131e-07, "loss": 0.2545, "step": 1043100 }, { "epoch": 13.907293596939116, 
"grad_norm": 4.389616966247559, "learning_rate": 8.474268145386521e-07, "loss": 0.2213, "step": 1043200 }, { "epoch": 13.908626734745571, "grad_norm": 2.9177143573760986, "learning_rate": 8.453794952803206e-07, "loss": 0.2059, "step": 1043300 }, { "epoch": 13.909959872552026, "grad_norm": 1.3055039644241333, "learning_rate": 8.433346167549183e-07, "loss": 0.2646, "step": 1043400 }, { "epoch": 13.91129301035848, "grad_norm": 0.9196100234985352, "learning_rate": 8.412921791336281e-07, "loss": 0.2872, "step": 1043500 }, { "epoch": 13.912626148164936, "grad_norm": 1.1953692436218262, "learning_rate": 8.392521825874466e-07, "loss": 0.2236, "step": 1043600 }, { "epoch": 13.91395928597139, "grad_norm": 2.6467983722686768, "learning_rate": 8.372146272871473e-07, "loss": 0.2285, "step": 1043700 }, { "epoch": 13.915292423777846, "grad_norm": 1.2031904458999634, "learning_rate": 8.351795134033069e-07, "loss": 0.2339, "step": 1043800 }, { "epoch": 13.9166255615843, "grad_norm": 1.0598710775375366, "learning_rate": 8.331468411063059e-07, "loss": 0.2587, "step": 1043900 }, { "epoch": 13.917958699390756, "grad_norm": 2.5300345420837402, "learning_rate": 8.311166105663048e-07, "loss": 0.2387, "step": 1044000 }, { "epoch": 13.91929183719721, "grad_norm": 0.36686766147613525, "learning_rate": 8.290888219532777e-07, "loss": 0.2543, "step": 1044100 }, { "epoch": 13.920624975003665, "grad_norm": 2.618995189666748, "learning_rate": 8.270634754369788e-07, "loss": 0.2151, "step": 1044200 }, { "epoch": 13.92195811281012, "grad_norm": 0.9891971945762634, "learning_rate": 8.250405711869624e-07, "loss": 0.2175, "step": 1044300 }, { "epoch": 13.923291250616575, "grad_norm": 1.4780974388122559, "learning_rate": 8.230201093725831e-07, "loss": 0.2, "step": 1044400 }, { "epoch": 13.924624388423032, "grad_norm": 5.640321731567383, "learning_rate": 8.210020901629888e-07, "loss": 0.2286, "step": 1044500 }, { "epoch": 13.925957526229487, "grad_norm": 2.919233560562134, "learning_rate": 
8.18986513727118e-07, "loss": 0.24, "step": 1044600 }, { "epoch": 13.927290664035942, "grad_norm": 1.181450366973877, "learning_rate": 8.169934994755246e-07, "loss": 0.2522, "step": 1044700 }, { "epoch": 13.928623801842397, "grad_norm": 0.6383752822875977, "learning_rate": 8.149827846611691e-07, "loss": 0.2095, "step": 1044800 }, { "epoch": 13.929956939648852, "grad_norm": 2.0391695499420166, "learning_rate": 8.129745131244537e-07, "loss": 0.2277, "step": 1044900 }, { "epoch": 13.931290077455307, "grad_norm": 3.5429561138153076, "learning_rate": 8.109686850335074e-07, "loss": 0.2026, "step": 1045000 }, { "epoch": 13.932623215261762, "grad_norm": 4.570250511169434, "learning_rate": 8.089653005562525e-07, "loss": 0.2084, "step": 1045100 }, { "epoch": 13.933956353068217, "grad_norm": 27.52559471130371, "learning_rate": 8.069643598604015e-07, "loss": 0.2562, "step": 1045200 }, { "epoch": 13.935289490874672, "grad_norm": 6.213932991027832, "learning_rate": 8.049658631134738e-07, "loss": 0.2301, "step": 1045300 }, { "epoch": 13.936622628681127, "grad_norm": 2.627608299255371, "learning_rate": 8.029698104827721e-07, "loss": 0.2575, "step": 1045400 }, { "epoch": 13.937955766487581, "grad_norm": 1.7339298725128174, "learning_rate": 8.009762021353961e-07, "loss": 0.2491, "step": 1045500 }, { "epoch": 13.939288904294036, "grad_norm": 2.704718828201294, "learning_rate": 7.989850382382524e-07, "loss": 0.2485, "step": 1045600 }, { "epoch": 13.940622042100491, "grad_norm": 3.379755735397339, "learning_rate": 7.969963189580376e-07, "loss": 0.2044, "step": 1045700 }, { "epoch": 13.941955179906946, "grad_norm": 0.18967510759830475, "learning_rate": 7.950100444612285e-07, "loss": 0.2347, "step": 1045800 }, { "epoch": 13.943288317713401, "grad_norm": 2.678295850753784, "learning_rate": 7.930262149141188e-07, "loss": 0.2519, "step": 1045900 }, { "epoch": 13.944621455519858, "grad_norm": 0.3462173044681549, "learning_rate": 7.910448304827855e-07, "loss": 0.2634, "step": 1046000 }, { 
"epoch": 13.945954593326313, "grad_norm": 2.0623433589935303, "learning_rate": 7.890658913331028e-07, "loss": 0.202, "step": 1046100 }, { "epoch": 13.947287731132768, "grad_norm": 1.238069772720337, "learning_rate": 7.870893976307481e-07, "loss": 0.2086, "step": 1046200 }, { "epoch": 13.948620868939223, "grad_norm": 1.9739729166030884, "learning_rate": 7.851153495411756e-07, "loss": 0.2346, "step": 1046300 }, { "epoch": 13.949954006745678, "grad_norm": 4.792996406555176, "learning_rate": 7.831437472296532e-07, "loss": 0.2359, "step": 1046400 }, { "epoch": 13.951287144552133, "grad_norm": 3.27915358543396, "learning_rate": 7.811745908612422e-07, "loss": 0.2161, "step": 1046500 }, { "epoch": 13.952620282358588, "grad_norm": 2.856339693069458, "learning_rate": 7.79207880600784e-07, "loss": 0.2544, "step": 1046600 }, { "epoch": 13.953953420165043, "grad_norm": 3.881521224975586, "learning_rate": 7.772436166129304e-07, "loss": 0.257, "step": 1046700 }, { "epoch": 13.955286557971498, "grad_norm": 2.569241523742676, "learning_rate": 7.752817990621263e-07, "loss": 0.2052, "step": 1046800 }, { "epoch": 13.956619695777952, "grad_norm": 1.3072630167007446, "learning_rate": 7.733224281126039e-07, "loss": 0.2164, "step": 1046900 }, { "epoch": 13.957952833584407, "grad_norm": 10.446572303771973, "learning_rate": 7.713655039283951e-07, "loss": 0.2617, "step": 1047000 }, { "epoch": 13.959285971390862, "grad_norm": 7.678469657897949, "learning_rate": 7.694110266733356e-07, "loss": 0.2626, "step": 1047100 }, { "epoch": 13.960619109197317, "grad_norm": 8.280781745910645, "learning_rate": 7.674589965110379e-07, "loss": 0.2415, "step": 1047200 }, { "epoch": 13.961952247003772, "grad_norm": 3.71421217918396, "learning_rate": 7.655094136049245e-07, "loss": 0.2504, "step": 1047300 }, { "epoch": 13.963285384810227, "grad_norm": 2.567356586456299, "learning_rate": 7.635622781182084e-07, "loss": 0.2747, "step": 1047400 }, { "epoch": 13.964618522616682, "grad_norm": 94.87999725341797, 
"learning_rate": 7.616175902138955e-07, "loss": 0.257, "step": 1047500 }, { "epoch": 13.965951660423137, "grad_norm": 0.6203433871269226, "learning_rate": 7.596753500547959e-07, "loss": 0.1998, "step": 1047600 }, { "epoch": 13.967284798229594, "grad_norm": 2.2508506774902344, "learning_rate": 7.577355578034961e-07, "loss": 0.2306, "step": 1047700 }, { "epoch": 13.968617936036049, "grad_norm": 4.054562568664551, "learning_rate": 7.557982136223962e-07, "loss": 0.2266, "step": 1047800 }, { "epoch": 13.969951073842504, "grad_norm": 2.6808149814605713, "learning_rate": 7.538633176736898e-07, "loss": 0.2488, "step": 1047900 }, { "epoch": 13.971284211648959, "grad_norm": 3.763068199157715, "learning_rate": 7.519308701193506e-07, "loss": 0.294, "step": 1048000 }, { "epoch": 13.972617349455414, "grad_norm": 0.7521647810935974, "learning_rate": 7.500008711211593e-07, "loss": 0.2031, "step": 1048100 }, { "epoch": 13.973950487261869, "grad_norm": 0.8613218069076538, "learning_rate": 7.480733208406931e-07, "loss": 0.2455, "step": 1048200 }, { "epoch": 13.975283625068323, "grad_norm": 0.5617145299911499, "learning_rate": 7.461482194393199e-07, "loss": 0.2798, "step": 1048300 }, { "epoch": 13.976616762874778, "grad_norm": 0.5242614150047302, "learning_rate": 7.442255670782005e-07, "loss": 0.2871, "step": 1048400 }, { "epoch": 13.977949900681233, "grad_norm": 5.239567279815674, "learning_rate": 7.423053639182964e-07, "loss": 0.2329, "step": 1048500 }, { "epoch": 13.979283038487688, "grad_norm": 4.005058288574219, "learning_rate": 7.403876101203588e-07, "loss": 0.2398, "step": 1048600 }, { "epoch": 13.980616176294143, "grad_norm": 1.0750148296356201, "learning_rate": 7.384723058449295e-07, "loss": 0.2802, "step": 1048700 }, { "epoch": 13.981949314100598, "grad_norm": 2.2542102336883545, "learning_rate": 7.365594512523665e-07, "loss": 0.2593, "step": 1048800 }, { "epoch": 13.983282451907053, "grad_norm": 2.9490160942077637, "learning_rate": 7.346490465027988e-07, "loss": 0.2534, 
"step": 1048900 }, { "epoch": 13.984615589713508, "grad_norm": 2.3143787384033203, "learning_rate": 7.327410917561584e-07, "loss": 0.2328, "step": 1049000 }, { "epoch": 13.985948727519963, "grad_norm": 1.3683110475540161, "learning_rate": 7.308355871721739e-07, "loss": 0.211, "step": 1049100 }, { "epoch": 13.98728186532642, "grad_norm": 0.4024651050567627, "learning_rate": 7.289325329103713e-07, "loss": 0.2568, "step": 1049200 }, { "epoch": 13.988615003132875, "grad_norm": 3.266940116882324, "learning_rate": 7.270319291300697e-07, "loss": 0.2198, "step": 1049300 }, { "epoch": 13.98994814093933, "grad_norm": 1.5958442687988281, "learning_rate": 7.251337759903753e-07, "loss": 0.2336, "step": 1049400 }, { "epoch": 13.991281278745785, "grad_norm": 5.085100173950195, "learning_rate": 7.232380736502009e-07, "loss": 0.216, "step": 1049500 }, { "epoch": 13.99261441655224, "grad_norm": 29.255556106567383, "learning_rate": 7.213448222682428e-07, "loss": 0.2294, "step": 1049600 }, { "epoch": 13.993947554358694, "grad_norm": 7.8316969871521, "learning_rate": 7.194540220030044e-07, "loss": 0.2577, "step": 1049700 }, { "epoch": 13.99528069216515, "grad_norm": 2.3986003398895264, "learning_rate": 7.175656730127755e-07, "loss": 0.23, "step": 1049800 }, { "epoch": 13.996613829971604, "grad_norm": 4.129421234130859, "learning_rate": 7.156797754556433e-07, "loss": 0.2369, "step": 1049900 }, { "epoch": 13.99794696777806, "grad_norm": 0.4779958128929138, "learning_rate": 7.137963294894844e-07, "loss": 0.2465, "step": 1050000 }, { "epoch": 13.999280105584514, "grad_norm": 1.5396126508712769, "learning_rate": 7.119153352719798e-07, "loss": 0.2356, "step": 1050100 }, { "epoch": 14.00061324339097, "grad_norm": 2.511469841003418, "learning_rate": 7.100367929606033e-07, "loss": 0.2348, "step": 1050200 }, { "epoch": 14.001946381197424, "grad_norm": 0.08594519644975662, "learning_rate": 7.081607027126124e-07, "loss": 0.2261, "step": 1050300 }, { "epoch": 14.003279519003879, "grad_norm": 
5.516632080078125, "learning_rate": 7.062870646850717e-07, "loss": 0.2173, "step": 1050400 }, { "epoch": 14.004612656810334, "grad_norm": 5.163456916809082, "learning_rate": 7.044158790348388e-07, "loss": 0.2141, "step": 1050500 }, { "epoch": 14.005945794616789, "grad_norm": 1.2369407415390015, "learning_rate": 7.025471459185584e-07, "loss": 0.2399, "step": 1050600 }, { "epoch": 14.007278932423244, "grad_norm": 3.7262589931488037, "learning_rate": 7.006808654926789e-07, "loss": 0.2127, "step": 1050700 }, { "epoch": 14.008612070229699, "grad_norm": 3.6226911544799805, "learning_rate": 6.988170379134384e-07, "loss": 0.1778, "step": 1050800 }, { "epoch": 14.009945208036156, "grad_norm": null, "learning_rate": 6.969742649397604e-07, "loss": 0.2498, "step": 1050900 }, { "epoch": 14.01127834584261, "grad_norm": 2.3343920707702637, "learning_rate": 6.951153189893345e-07, "loss": 0.2334, "step": 1051000 }, { "epoch": 14.012611483649065, "grad_norm": 2.0446863174438477, "learning_rate": 6.932588263514816e-07, "loss": 0.2283, "step": 1051100 }, { "epoch": 14.01394462145552, "grad_norm": 2.164741039276123, "learning_rate": 6.914047871816143e-07, "loss": 0.2023, "step": 1051200 }, { "epoch": 14.015277759261975, "grad_norm": 4.625085353851318, "learning_rate": 6.895532016349482e-07, "loss": 0.213, "step": 1051300 }, { "epoch": 14.01661089706843, "grad_norm": 1.8046081066131592, "learning_rate": 6.87704069866496e-07, "loss": 0.2022, "step": 1051400 }, { "epoch": 14.017944034874885, "grad_norm": 1.7922197580337524, "learning_rate": 6.858573920310573e-07, "loss": 0.2069, "step": 1051500 }, { "epoch": 14.01927717268134, "grad_norm": 1.7987767457962036, "learning_rate": 6.840315983724665e-07, "loss": 0.2416, "step": 1051600 }, { "epoch": 14.020610310487795, "grad_norm": 2.5505318641662598, "learning_rate": 6.821898043234631e-07, "loss": 0.2127, "step": 1051700 }, { "epoch": 14.02194344829425, "grad_norm": 3.0610506534576416, "learning_rate": 6.803504646691139e-07, "loss": 0.2017,
"step": 1051800 }, { "epoch": 14.023276586100705, "grad_norm": 1.0439246892929077, "learning_rate": 6.78513579563399e-07, "loss": 0.2485, "step": 1051900 }, { "epoch": 14.02460972390716, "grad_norm": 1.7595933675765991, "learning_rate": 6.766791491600955e-07, "loss": 0.2325, "step": 1052000 }, { "epoch": 14.025942861713615, "grad_norm": 40.344547271728516, "learning_rate": 6.748471736127804e-07, "loss": 0.239, "step": 1052100 }, { "epoch": 14.02727599952007, "grad_norm": 4.208998680114746, "learning_rate": 6.730176530748178e-07, "loss": 0.2304, "step": 1052200 }, { "epoch": 14.028609137326525, "grad_norm": 1.182419776916504, "learning_rate": 6.711905876993752e-07, "loss": 0.2489, "step": 1052300 }, { "epoch": 14.02994227513298, "grad_norm": 4.363132953643799, "learning_rate": 6.693659776394001e-07, "loss": 0.2163, "step": 1052400 }, { "epoch": 14.031275412939436, "grad_norm": 1.2452566623687744, "learning_rate": 6.675438230476438e-07, "loss": 0.2121, "step": 1052500 }, { "epoch": 14.032608550745891, "grad_norm": 1.2766422033309937, "learning_rate": 6.657241240766609e-07, "loss": 0.2182, "step": 1052600 }, { "epoch": 14.033941688552346, "grad_norm": 0.252148300409317, "learning_rate": 6.639068808787796e-07, "loss": 0.279, "step": 1052700 }, { "epoch": 14.035274826358801, "grad_norm": 2.7219202518463135, "learning_rate": 6.620920936061414e-07, "loss": 0.2181, "step": 1052800 }, { "epoch": 14.036607964165256, "grad_norm": 0.08946900069713593, "learning_rate": 6.602797624106749e-07, "loss": 0.2538, "step": 1052900 }, { "epoch": 14.037941101971711, "grad_norm": 0.7402334213256836, "learning_rate": 6.584698874440953e-07, "loss": 0.2171, "step": 1053000 }, { "epoch": 14.039274239778166, "grad_norm": 5.549893379211426, "learning_rate": 6.566624688579248e-07, "loss": 0.2533, "step": 1053100 }, { "epoch": 14.040607377584621, "grad_norm": 2.2942209243774414, "learning_rate": 6.54857506803479e-07, "loss": 0.2607, "step": 1053200 }, { "epoch": 14.041940515391076, "grad_norm": 
2.459243059158325, "learning_rate": 6.530550014318537e-07, "loss": 0.2258, "step": 1053300 }, { "epoch": 14.04327365319753, "grad_norm": 1.5531091690063477, "learning_rate": 6.512549528939548e-07, "loss": 0.2467, "step": 1053400 }, { "epoch": 14.044606791003986, "grad_norm": 2.416015148162842, "learning_rate": 6.494573613404787e-07, "loss": 0.2934, "step": 1053500 }, { "epoch": 14.04593992881044, "grad_norm": 1.2626136541366577, "learning_rate": 6.476622269219112e-07, "loss": 0.2805, "step": 1053600 }, { "epoch": 14.047273066616896, "grad_norm": 1.807714581489563, "learning_rate": 6.458695497885359e-07, "loss": 0.2252, "step": 1053700 }, { "epoch": 14.04860620442335, "grad_norm": 3.2715561389923096, "learning_rate": 6.440793300904257e-07, "loss": 0.2299, "step": 1053800 }, { "epoch": 14.049939342229806, "grad_norm": 0.2751200497150421, "learning_rate": 6.422915679774577e-07, "loss": 0.1765, "step": 1053900 }, { "epoch": 14.05127248003626, "grad_norm": 3.3625247478485107, "learning_rate": 6.405062635992987e-07, "loss": 0.2078, "step": 1054000 }, { "epoch": 14.052605617842717, "grad_norm": 1.3523482084274292, "learning_rate": 6.387234171054024e-07, "loss": 0.2825, "step": 1054100 }, { "epoch": 14.053938755649172, "grad_norm": 3.5809059143066406, "learning_rate": 6.369430286450295e-07, "loss": 0.2902, "step": 1054200 }, { "epoch": 14.055271893455627, "grad_norm": 1.6299567222595215, "learning_rate": 6.351650983672241e-07, "loss": 0.2474, "step": 1054300 }, { "epoch": 14.056605031262082, "grad_norm": 1.0550050735473633, "learning_rate": 6.333896264208305e-07, "loss": 0.1968, "step": 1054400 }, { "epoch": 14.057938169068537, "grad_norm": 0.8732298612594604, "learning_rate": 6.316166129544898e-07, "loss": 0.2744, "step": 1054500 }, { "epoch": 14.059271306874992, "grad_norm": 3.0593388080596924, "learning_rate": 6.298460581166266e-07, "loss": 0.235, "step": 1054600 }, { "epoch": 14.060604444681447, "grad_norm": 3.2670884132385254, "learning_rate": 6.280779620554655e-07, 
"loss": 0.2797, "step": 1054700 }, { "epoch": 14.061937582487902, "grad_norm": 3.1959946155548096, "learning_rate": 6.263299691182367e-07, "loss": 0.1822, "step": 1054800 }, { "epoch": 14.063270720294357, "grad_norm": 1.625636100769043, "learning_rate": 6.245667664628829e-07, "loss": 0.2386, "step": 1054900 }, { "epoch": 14.064603858100812, "grad_norm": 2.7009942531585693, "learning_rate": 6.228060230262045e-07, "loss": 0.2034, "step": 1055000 }, { "epoch": 14.065936995907267, "grad_norm": 1.5402737855911255, "learning_rate": 6.210477389555969e-07, "loss": 0.2221, "step": 1055100 }, { "epoch": 14.067270133713722, "grad_norm": 1.1563163995742798, "learning_rate": 6.192919143982689e-07, "loss": 0.2365, "step": 1055200 }, { "epoch": 14.068603271520177, "grad_norm": 5.429937839508057, "learning_rate": 6.175385495011999e-07, "loss": 0.2453, "step": 1055300 }, { "epoch": 14.069936409326631, "grad_norm": 1.3740977048873901, "learning_rate": 6.15787644411182e-07, "loss": 0.2647, "step": 1055400 }, { "epoch": 14.071269547133086, "grad_norm": 3.8874075412750244, "learning_rate": 6.140391992747951e-07, "loss": 0.1819, "step": 1055500 }, { "epoch": 14.072602684939541, "grad_norm": 1.4664541482925415, "learning_rate": 6.122932142384152e-07, "loss": 0.2506, "step": 1055600 }, { "epoch": 14.073935822745998, "grad_norm": 4.661727428436279, "learning_rate": 6.105496894482121e-07, "loss": 0.2688, "step": 1055700 }, { "epoch": 14.075268960552453, "grad_norm": 2.114750385284424, "learning_rate": 6.088086250501457e-07, "loss": 0.2175, "step": 1055800 }, { "epoch": 14.076602098358908, "grad_norm": 1.6303373575210571, "learning_rate": 6.070700211899693e-07, "loss": 0.249, "step": 1055900 }, { "epoch": 14.077935236165363, "grad_norm": 1.0567864179611206, "learning_rate": 6.053338780132333e-07, "loss": 0.2637, "step": 1056000 }, { "epoch": 14.079268373971818, "grad_norm": 3.6476082801818848, "learning_rate": 6.036001956652882e-07, "loss": 0.2387, "step": 1056100 }, { "epoch": 
14.080601511778273, "grad_norm": 2.02285099029541, "learning_rate": 6.018689742912708e-07, "loss": 0.244, "step": 1056200 }, { "epoch": 14.081934649584728, "grad_norm": 0.6519585847854614, "learning_rate": 6.00140214036109e-07, "loss": 0.2513, "step": 1056300 }, { "epoch": 14.083267787391183, "grad_norm": 3.9102509021759033, "learning_rate": 5.984139150445333e-07, "loss": 0.2666, "step": 1056400 }, { "epoch": 14.084600925197638, "grad_norm": 1.9477348327636719, "learning_rate": 5.966900774610618e-07, "loss": 0.2181, "step": 1056500 }, { "epoch": 14.085934063004093, "grad_norm": 1.418280005455017, "learning_rate": 5.949687014300153e-07, "loss": 0.224, "step": 1056600 }, { "epoch": 14.087267200810548, "grad_norm": 1.182983160018921, "learning_rate": 5.932497870954922e-07, "loss": 0.2381, "step": 1056700 }, { "epoch": 14.088600338617002, "grad_norm": 1.4349685907363892, "learning_rate": 5.915333346013974e-07, "loss": 0.2175, "step": 1056800 }, { "epoch": 14.089933476423457, "grad_norm": 1.0067496299743652, "learning_rate": 5.89819344091429e-07, "loss": 0.2219, "step": 1056900 }, { "epoch": 14.091266614229912, "grad_norm": 0.8156697154045105, "learning_rate": 5.881078157090757e-07, "loss": 0.2035, "step": 1057000 }, { "epoch": 14.092599752036367, "grad_norm": 2.1138908863067627, "learning_rate": 5.863987495976231e-07, "loss": 0.2241, "step": 1057100 }, { "epoch": 14.093932889842822, "grad_norm": 4.835636138916016, "learning_rate": 5.846921459001497e-07, "loss": 0.2016, "step": 1057200 }, { "epoch": 14.095266027649279, "grad_norm": 0.4491064250469208, "learning_rate": 5.829880047595215e-07, "loss": 0.2308, "step": 1057300 }, { "epoch": 14.096599165455734, "grad_norm": 2.0618114471435547, "learning_rate": 5.812863263184076e-07, "loss": 0.2336, "step": 1057400 }, { "epoch": 14.097932303262189, "grad_norm": 1.9779330492019653, "learning_rate": 5.795871107192707e-07, "loss": 0.2203, "step": 1057500 }, { "epoch": 14.099265441068644, "grad_norm": 0.534290611743927, 
"learning_rate": 5.778903581043538e-07, "loss": 0.1924, "step": 1057600 }, { "epoch": 14.100598578875099, "grad_norm": 2.478257417678833, "learning_rate": 5.76196068615713e-07, "loss": 0.2507, "step": 1057700 }, { "epoch": 14.101931716681554, "grad_norm": 0.939971387386322, "learning_rate": 5.745042423951851e-07, "loss": 0.2149, "step": 1057800 }, { "epoch": 14.103264854488009, "grad_norm": 2.2773845195770264, "learning_rate": 5.728148795844035e-07, "loss": 0.3171, "step": 1057900 }, { "epoch": 14.104597992294464, "grad_norm": 1.1590226888656616, "learning_rate": 5.711279803247982e-07, "loss": 0.2646, "step": 1058000 }, { "epoch": 14.105931130100918, "grad_norm": 2.6032161712646484, "learning_rate": 5.694435447575929e-07, "loss": 0.2156, "step": 1058100 }, { "epoch": 14.107264267907373, "grad_norm": 2.831697463989258, "learning_rate": 5.677615730237951e-07, "loss": 0.2098, "step": 1058200 }, { "epoch": 14.108597405713828, "grad_norm": 3.443387031555176, "learning_rate": 5.660820652642285e-07, "loss": 0.206, "step": 1058300 }, { "epoch": 14.109930543520283, "grad_norm": 3.1999926567077637, "learning_rate": 5.644050216194807e-07, "loss": 0.2765, "step": 1058400 }, { "epoch": 14.111263681326738, "grad_norm": 2.9215939044952393, "learning_rate": 5.627304422299595e-07, "loss": 0.1882, "step": 1058500 }, { "epoch": 14.112596819133193, "grad_norm": 1.525112509727478, "learning_rate": 5.610583272358494e-07, "loss": 0.2608, "step": 1058600 }, { "epoch": 14.113929956939648, "grad_norm": 1.3325327634811401, "learning_rate": 5.59388676777135e-07, "loss": 0.2158, "step": 1058700 }, { "epoch": 14.115263094746103, "grad_norm": 4.840986728668213, "learning_rate": 5.577214909935979e-07, "loss": 0.2323, "step": 1058800 }, { "epoch": 14.11659623255256, "grad_norm": 0.3246305584907532, "learning_rate": 5.560567700248099e-07, "loss": 0.237, "step": 1058900 }, { "epoch": 14.117929370359015, "grad_norm": 3.461585760116577, "learning_rate": 5.544111243682981e-07, "loss": 0.1985, "step": 
1059000 }, { "epoch": 14.11926250816547, "grad_norm": 1.6240087747573853, "learning_rate": 5.527513087952696e-07, "loss": 0.2906, "step": 1059100 }, { "epoch": 14.120595645971925, "grad_norm": 0.7520774602890015, "learning_rate": 5.510939584530739e-07, "loss": 0.2548, "step": 1059200 }, { "epoch": 14.12192878377838, "grad_norm": 3.8498568534851074, "learning_rate": 5.494556101261528e-07, "loss": 0.2344, "step": 1059300 }, { "epoch": 14.123261921584835, "grad_norm": 3.3461265563964844, "learning_rate": 5.478196782423939e-07, "loss": 0.2396, "step": 1059400 }, { "epoch": 14.12459505939129, "grad_norm": 0.1269373744726181, "learning_rate": 5.46169675110082e-07, "loss": 0.2379, "step": 1059500 }, { "epoch": 14.125928197197744, "grad_norm": 2.2418036460876465, "learning_rate": 5.445221377595988e-07, "loss": 0.2749, "step": 1059600 }, { "epoch": 14.1272613350042, "grad_norm": 1.8416976928710938, "learning_rate": 5.42877066328874e-07, "loss": 0.2491, "step": 1059700 }, { "epoch": 14.128594472810654, "grad_norm": 2.1676957607269287, "learning_rate": 5.412344609556208e-07, "loss": 0.1865, "step": 1059800 }, { "epoch": 14.12992761061711, "grad_norm": 1.6148464679718018, "learning_rate": 5.395943217773591e-07, "loss": 0.2227, "step": 1059900 }, { "epoch": 14.131260748423564, "grad_norm": 4.801997661590576, "learning_rate": 5.379566489313892e-07, "loss": 0.1938, "step": 1060000 }, { "epoch": 14.132593886230019, "grad_norm": 2.599362850189209, "learning_rate": 5.363214425548213e-07, "loss": 0.2302, "step": 1060100 }, { "epoch": 14.133927024036474, "grad_norm": 0.527295708656311, "learning_rate": 5.346887027845426e-07, "loss": 0.1896, "step": 1060200 }, { "epoch": 14.135260161842929, "grad_norm": 1.3379473686218262, "learning_rate": 5.330584297572439e-07, "loss": 0.2, "step": 1060300 }, { "epoch": 14.136593299649384, "grad_norm": 2.6583845615386963, "learning_rate": 5.314306236094057e-07, "loss": 0.2102, "step": 1060400 }, { "epoch": 14.13792643745584, "grad_norm": 
4.766459941864014, "learning_rate": 5.298052844772994e-07, "loss": 0.2584, "step": 1060500 }, { "epoch": 14.139259575262296, "grad_norm": 1.7378342151641846, "learning_rate": 5.281824124969991e-07, "loss": 0.2736, "step": 1060600 }, { "epoch": 14.14059271306875, "grad_norm": 2.3273558616638184, "learning_rate": 5.265620078043665e-07, "loss": 0.2659, "step": 1060700 }, { "epoch": 14.141925850875205, "grad_norm": 2.6020495891571045, "learning_rate": 5.249440705350495e-07, "loss": 0.208, "step": 1060800 }, { "epoch": 14.14325898868166, "grad_norm": 1.6856106519699097, "learning_rate": 5.233286008245031e-07, "loss": 0.2712, "step": 1060900 }, { "epoch": 14.144592126488115, "grad_norm": 2.3933169841766357, "learning_rate": 5.217155988079691e-07, "loss": 0.2427, "step": 1061000 }, { "epoch": 14.14592526429457, "grad_norm": 2.3226890563964844, "learning_rate": 5.20105064620483e-07, "loss": 0.2406, "step": 1061100 }, { "epoch": 14.147258402101025, "grad_norm": 2.516571521759033, "learning_rate": 5.184969983968702e-07, "loss": 0.2299, "step": 1061200 }, { "epoch": 14.14859153990748, "grad_norm": 3.2929608821868896, "learning_rate": 5.168914002717561e-07, "loss": 0.2292, "step": 1061300 }, { "epoch": 14.149924677713935, "grad_norm": 4.3031792640686035, "learning_rate": 5.152882703795536e-07, "loss": 0.2179, "step": 1061400 }, { "epoch": 14.15125781552039, "grad_norm": 0.5185239315032959, "learning_rate": 5.136876088544751e-07, "loss": 0.2339, "step": 1061500 }, { "epoch": 14.152590953326845, "grad_norm": 3.8452460765838623, "learning_rate": 5.120894158305167e-07, "loss": 0.2272, "step": 1061600 }, { "epoch": 14.1539240911333, "grad_norm": 2.6604127883911133, "learning_rate": 5.104936914414848e-07, "loss": 0.2399, "step": 1061700 }, { "epoch": 14.155257228939755, "grad_norm": 0.6019144058227539, "learning_rate": 5.089004358209593e-07, "loss": 0.2271, "step": 1061800 }, { "epoch": 14.15659036674621, "grad_norm": 3.2410271167755127, "learning_rate": 5.073096491023232e-07, 
"loss": 0.2097, "step": 1061900 }, { "epoch": 14.157923504552665, "grad_norm": 1.6819738149642944, "learning_rate": 5.05721331418757e-07, "loss": 0.2483, "step": 1062000 }, { "epoch": 14.159256642359122, "grad_norm": 1.6584017276763916, "learning_rate": 5.041354829032274e-07, "loss": 0.2317, "step": 1062100 }, { "epoch": 14.160589780165576, "grad_norm": 3.0774738788604736, "learning_rate": 5.025521036884917e-07, "loss": 0.2341, "step": 1062200 }, { "epoch": 14.161922917972031, "grad_norm": 0.5440834164619446, "learning_rate": 5.009711939071138e-07, "loss": 0.2283, "step": 1062300 }, { "epoch": 14.163256055778486, "grad_norm": 1.7942670583724976, "learning_rate": 4.993927536914378e-07, "loss": 0.254, "step": 1062400 }, { "epoch": 14.164589193584941, "grad_norm": 3.4941587448120117, "learning_rate": 4.97816783173608e-07, "loss": 0.1895, "step": 1062500 }, { "epoch": 14.165922331391396, "grad_norm": 0.6011430621147156, "learning_rate": 4.96243282485559e-07, "loss": 0.2212, "step": 1062600 }, { "epoch": 14.167255469197851, "grad_norm": 2.390411615371704, "learning_rate": 4.946722517590152e-07, "loss": 0.2582, "step": 1062700 }, { "epoch": 14.168588607004306, "grad_norm": 1.4602457284927368, "learning_rate": 4.931036911255016e-07, "loss": 0.2478, "step": 1062800 }, { "epoch": 14.169921744810761, "grad_norm": 0.9551963210105896, "learning_rate": 4.91537600716333e-07, "loss": 0.204, "step": 1062900 }, { "epoch": 14.171254882617216, "grad_norm": 2.2769064903259277, "learning_rate": 4.899739806626213e-07, "loss": 0.2149, "step": 1063000 }, { "epoch": 14.172588020423671, "grad_norm": 0.7627294659614563, "learning_rate": 4.884128310952618e-07, "loss": 0.2196, "step": 1063100 }, { "epoch": 14.173921158230126, "grad_norm": 1.2973991632461548, "learning_rate": 4.868541521449499e-07, "loss": 0.2348, "step": 1063200 }, { "epoch": 14.17525429603658, "grad_norm": 1.552255630493164, "learning_rate": 4.852979439421778e-07, "loss": 0.2189, "step": 1063300 }, { "epoch": 
14.176587433843036, "grad_norm": 0.640201985836029, "learning_rate": 4.837442066172248e-07, "loss": 0.2292, "step": 1063400 }, { "epoch": 14.17792057164949, "grad_norm": 3.797024726867676, "learning_rate": 4.821929403001601e-07, "loss": 0.236, "step": 1063500 }, { "epoch": 14.179253709455946, "grad_norm": 1.961318016052246, "learning_rate": 4.806441451208565e-07, "loss": 0.2487, "step": 1063600 }, { "epoch": 14.180586847262402, "grad_norm": 2.362067461013794, "learning_rate": 4.790978212089703e-07, "loss": 0.2229, "step": 1063700 }, { "epoch": 14.181919985068857, "grad_norm": 2.2718632221221924, "learning_rate": 4.775539686939579e-07, "loss": 0.2262, "step": 1063800 }, { "epoch": 14.183253122875312, "grad_norm": 2.1589412689208984, "learning_rate": 4.760279892804742e-07, "loss": 0.2635, "step": 1063900 }, { "epoch": 14.184586260681767, "grad_norm": 3.7088863849639893, "learning_rate": 4.7448905522954775e-07, "loss": 0.2354, "step": 1064000 }, { "epoch": 14.185919398488222, "grad_norm": 3.076881170272827, "learning_rate": 4.7295259296132653e-07, "loss": 0.2081, "step": 1064100 }, { "epoch": 14.187252536294677, "grad_norm": 2.9585931301116943, "learning_rate": 4.714186026044376e-07, "loss": 0.255, "step": 1064200 }, { "epoch": 14.188585674101132, "grad_norm": 3.2988693714141846, "learning_rate": 4.6988708428730155e-07, "loss": 0.2385, "step": 1064300 }, { "epoch": 14.189918811907587, "grad_norm": 0.6184855699539185, "learning_rate": 4.683580381381292e-07, "loss": 0.2434, "step": 1064400 }, { "epoch": 14.191251949714042, "grad_norm": 4.524077892303467, "learning_rate": 4.668314642849314e-07, "loss": 0.2401, "step": 1064500 }, { "epoch": 14.192585087520497, "grad_norm": 1.3636441230773926, "learning_rate": 4.6530736285550935e-07, "loss": 0.2304, "step": 1064600 }, { "epoch": 14.193918225326952, "grad_norm": 1.1199928522109985, "learning_rate": 4.6378573397745096e-07, "loss": 0.1968, "step": 1064700 }, { "epoch": 14.195251363133407, "grad_norm": 1.2330434322357178, 
"learning_rate": 4.622665777781443e-07, "loss": 0.2321, "step": 1064800 }, { "epoch": 14.196584500939862, "grad_norm": 0.3125476837158203, "learning_rate": 4.6074989438476766e-07, "loss": 0.1914, "step": 1064900 }, { "epoch": 14.197917638746317, "grad_norm": 13.51365852355957, "learning_rate": 4.5923568392429615e-07, "loss": 0.2164, "step": 1065000 }, { "epoch": 14.199250776552772, "grad_norm": 1.1975526809692383, "learning_rate": 4.577239465234917e-07, "loss": 0.2319, "step": 1065100 }, { "epoch": 14.200583914359227, "grad_norm": 2.3609938621520996, "learning_rate": 4.562146823089164e-07, "loss": 0.2217, "step": 1065200 }, { "epoch": 14.201917052165683, "grad_norm": 1.5830702781677246, "learning_rate": 4.547078914069125e-07, "loss": 0.2177, "step": 1065300 }, { "epoch": 14.203250189972138, "grad_norm": 3.244964838027954, "learning_rate": 4.5320357394363245e-07, "loss": 0.2457, "step": 1065400 }, { "epoch": 14.204583327778593, "grad_norm": 3.2876977920532227, "learning_rate": 4.5170173004501216e-07, "loss": 0.2553, "step": 1065500 }, { "epoch": 14.205916465585048, "grad_norm": 3.1994879245758057, "learning_rate": 4.5021734129367607e-07, "loss": 0.2387, "step": 1065600 }, { "epoch": 14.207249603391503, "grad_norm": 1.5421342849731445, "learning_rate": 4.4872042016256944e-07, "loss": 0.2043, "step": 1065700 }, { "epoch": 14.208582741197958, "grad_norm": 0.9787657856941223, "learning_rate": 4.4722597297143897e-07, "loss": 0.2411, "step": 1065800 }, { "epoch": 14.209915879004413, "grad_norm": 3.1908557415008545, "learning_rate": 4.4573399984538797e-07, "loss": 0.2402, "step": 1065900 }, { "epoch": 14.211249016810868, "grad_norm": 3.8480303287506104, "learning_rate": 4.4424450090932325e-07, "loss": 0.2043, "step": 1066000 }, { "epoch": 14.212582154617323, "grad_norm": 2.416217803955078, "learning_rate": 4.4275747628794163e-07, "loss": 0.2147, "step": 1066100 }, { "epoch": 14.213915292423778, "grad_norm": 2.6424622535705566, "learning_rate": 4.412729261057302e-07, 
"loss": 0.2048, "step": 1066200 }, { "epoch": 14.215248430230233, "grad_norm": 3.9743118286132812, "learning_rate": 4.3979085048697296e-07, "loss": 0.2361, "step": 1066300 }, { "epoch": 14.216581568036688, "grad_norm": 4.1552863121032715, "learning_rate": 4.383112495557473e-07, "loss": 0.2028, "step": 1066400 }, { "epoch": 14.217914705843143, "grad_norm": 4.751692771911621, "learning_rate": 4.368341234359108e-07, "loss": 0.2613, "step": 1066500 }, { "epoch": 14.219247843649597, "grad_norm": 1.3091301918029785, "learning_rate": 4.3535947225113446e-07, "loss": 0.2093, "step": 1066600 }, { "epoch": 14.220580981456052, "grad_norm": 1.275841474533081, "learning_rate": 4.3388729612486966e-07, "loss": 0.228, "step": 1066700 }, { "epoch": 14.221914119262507, "grad_norm": 1.0690009593963623, "learning_rate": 4.3241759518035773e-07, "loss": 0.2091, "step": 1066800 }, { "epoch": 14.223247257068964, "grad_norm": 0.4837716817855835, "learning_rate": 4.3095036954063693e-07, "loss": 0.2505, "step": 1066900 }, { "epoch": 14.224580394875419, "grad_norm": 1.3869668245315552, "learning_rate": 4.2948561932854566e-07, "loss": 0.2038, "step": 1067000 }, { "epoch": 14.225913532681874, "grad_norm": 5.016386032104492, "learning_rate": 4.2802334466669926e-07, "loss": 0.2244, "step": 1067100 }, { "epoch": 14.227246670488329, "grad_norm": 3.072538375854492, "learning_rate": 4.265635456775263e-07, "loss": 0.2278, "step": 1067200 }, { "epoch": 14.228579808294784, "grad_norm": 1.0095576047897339, "learning_rate": 4.251062224832258e-07, "loss": 0.2556, "step": 1067300 }, { "epoch": 14.229912946101239, "grad_norm": 1.6018412113189697, "learning_rate": 4.2365137520580357e-07, "loss": 0.2348, "step": 1067400 }, { "epoch": 14.231246083907694, "grad_norm": 1.8162797689437866, "learning_rate": 4.2219900396705534e-07, "loss": 0.1897, "step": 1067500 }, { "epoch": 14.232579221714149, "grad_norm": 9.520262718200684, "learning_rate": 4.2074910888857065e-07, "loss": 0.1952, "step": 1067600 }, { "epoch": 
14.233912359520604, "grad_norm": 3.324944019317627, "learning_rate": 4.1930169009172905e-07, "loss": 0.2247, "step": 1067700 }, { "epoch": 14.235245497327059, "grad_norm": 0.7765418291091919, "learning_rate": 4.1785674769770354e-07, "loss": 0.2136, "step": 1067800 }, { "epoch": 14.236578635133514, "grad_norm": 1.7434241771697998, "learning_rate": 4.1641428182745746e-07, "loss": 0.2181, "step": 1067900 }, { "epoch": 14.237911772939968, "grad_norm": 3.540602445602417, "learning_rate": 4.1497429260175414e-07, "loss": 0.2552, "step": 1068000 }, { "epoch": 14.239244910746423, "grad_norm": 1.9044179916381836, "learning_rate": 4.135367801411438e-07, "loss": 0.2331, "step": 1068100 }, { "epoch": 14.240578048552878, "grad_norm": 17.243013381958008, "learning_rate": 4.1210174456596695e-07, "loss": 0.2478, "step": 1068200 }, { "epoch": 14.241911186359333, "grad_norm": 1.099382996559143, "learning_rate": 4.1066918599636736e-07, "loss": 0.2318, "step": 1068300 }, { "epoch": 14.243244324165788, "grad_norm": 3.5817298889160156, "learning_rate": 4.0923910455226586e-07, "loss": 0.202, "step": 1068400 }, { "epoch": 14.244577461972245, "grad_norm": 1.249175786972046, "learning_rate": 4.0781150035338997e-07, "loss": 0.2024, "step": 1068500 }, { "epoch": 14.2459105997787, "grad_norm": 2.6634604930877686, "learning_rate": 4.063863735192541e-07, "loss": 0.1903, "step": 1068600 }, { "epoch": 14.247243737585155, "grad_norm": 3.3033366203308105, "learning_rate": 4.049637241691628e-07, "loss": 0.2469, "step": 1068700 }, { "epoch": 14.24857687539161, "grad_norm": 1.4369654655456543, "learning_rate": 4.035435524222142e-07, "loss": 0.2431, "step": 1068800 }, { "epoch": 14.249910013198065, "grad_norm": 4.787734508514404, "learning_rate": 4.0212585839730644e-07, "loss": 0.2171, "step": 1068900 }, { "epoch": 14.25124315100452, "grad_norm": 2.6510469913482666, "learning_rate": 4.007247821092619e-07, "loss": 0.2477, "step": 1069000 }, { "epoch": 14.252576288810975, "grad_norm": 1.439478874206543, 
"learning_rate": 3.9931201910409666e-07, "loss": 0.1965, "step": 1069100 }, { "epoch": 14.25390942661743, "grad_norm": 1.6086617708206177, "learning_rate": 3.9790173417522224e-07, "loss": 0.2467, "step": 1069200 }, { "epoch": 14.255242564423884, "grad_norm": 1.9777209758758545, "learning_rate": 3.964939274407009e-07, "loss": 0.1812, "step": 1069300 }, { "epoch": 14.25657570223034, "grad_norm": 3.4645862579345703, "learning_rate": 3.950885990183883e-07, "loss": 0.2484, "step": 1069400 }, { "epoch": 14.257908840036794, "grad_norm": 1.3928947448730469, "learning_rate": 3.936857490259338e-07, "loss": 0.2351, "step": 1069500 }, { "epoch": 14.25924197784325, "grad_norm": 0.5417750477790833, "learning_rate": 3.922853775807833e-07, "loss": 0.2298, "step": 1069600 }, { "epoch": 14.260575115649704, "grad_norm": 1.0707982778549194, "learning_rate": 3.908874848001698e-07, "loss": 0.238, "step": 1069700 }, { "epoch": 14.26190825345616, "grad_norm": 1.846781849861145, "learning_rate": 3.894920708011196e-07, "loss": 0.216, "step": 1069800 }, { "epoch": 14.263241391262614, "grad_norm": 2.9344401359558105, "learning_rate": 3.8809913570045265e-07, "loss": 0.2196, "step": 1069900 }, { "epoch": 14.264574529069069, "grad_norm": 3.0035150051116943, "learning_rate": 3.86708679614779e-07, "loss": 0.281, "step": 1070000 }, { "epoch": 14.265907666875524, "grad_norm": 3.6640520095825195, "learning_rate": 3.8532070266051213e-07, "loss": 0.2711, "step": 1070100 }, { "epoch": 14.26724080468198, "grad_norm": 0.8083263039588928, "learning_rate": 3.839352049538358e-07, "loss": 0.2059, "step": 1070200 }, { "epoch": 14.268573942488436, "grad_norm": 2.468679428100586, "learning_rate": 3.8255218661075065e-07, "loss": 0.2273, "step": 1070300 }, { "epoch": 14.26990708029489, "grad_norm": 1.0755535364151, "learning_rate": 3.811716477470306e-07, "loss": 0.1991, "step": 1070400 }, { "epoch": 14.271240218101346, "grad_norm": 4.323497772216797, "learning_rate": 3.7979358847825665e-07, "loss": 0.2213, "step": 
1070500 }, { "epoch": 14.2725733559078, "grad_norm": 0.8071557879447937, "learning_rate": 3.7841800891978974e-07, "loss": 0.2005, "step": 1070600 }, { "epoch": 14.273906493714255, "grad_norm": 0.008665742352604866, "learning_rate": 3.7704490918679444e-07, "loss": 0.2159, "step": 1070700 }, { "epoch": 14.27523963152071, "grad_norm": 1.6896063089370728, "learning_rate": 3.756742893942189e-07, "loss": 0.1961, "step": 1070800 }, { "epoch": 14.276572769327165, "grad_norm": 2.1377110481262207, "learning_rate": 3.7430614965680785e-07, "loss": 0.2513, "step": 1070900 }, { "epoch": 14.27790590713362, "grad_norm": 7.927426815032959, "learning_rate": 3.7294049008909646e-07, "loss": 0.2381, "step": 1071000 }, { "epoch": 14.279239044940075, "grad_norm": 3.5932846069335938, "learning_rate": 3.7157731080541656e-07, "loss": 0.2203, "step": 1071100 }, { "epoch": 14.28057218274653, "grad_norm": 1.06309974193573, "learning_rate": 3.702302066303942e-07, "loss": 0.2094, "step": 1071200 }, { "epoch": 14.281905320552985, "grad_norm": 1.2288427352905273, "learning_rate": 3.688719634512461e-07, "loss": 0.2458, "step": 1071300 }, { "epoch": 14.28323845835944, "grad_norm": 2.431720018386841, "learning_rate": 3.675162008967303e-07, "loss": 0.2101, "step": 1071400 }, { "epoch": 14.284571596165895, "grad_norm": 3.5188257694244385, "learning_rate": 3.661629190803495e-07, "loss": 0.2142, "step": 1071500 }, { "epoch": 14.28590473397235, "grad_norm": 8.136746406555176, "learning_rate": 3.6481211811539294e-07, "loss": 0.2499, "step": 1071600 }, { "epoch": 14.287237871778807, "grad_norm": 2.220297336578369, "learning_rate": 3.634637981149469e-07, "loss": 0.2192, "step": 1071700 }, { "epoch": 14.288571009585262, "grad_norm": 7.135437965393066, "learning_rate": 3.6211795919189105e-07, "loss": 0.2372, "step": 1071800 }, { "epoch": 14.289904147391717, "grad_norm": 2.60679030418396, "learning_rate": 3.607746014588953e-07, "loss": 0.2392, "step": 1071900 }, { "epoch": 14.291237285198171, "grad_norm": 
2.888174295425415, "learning_rate": 3.594337250284163e-07, "loss": 0.1865, "step": 1072000 }, { "epoch": 14.292570423004626, "grad_norm": 2.627380132675171, "learning_rate": 3.58095330012711e-07, "loss": 0.1996, "step": 1072100 }, { "epoch": 14.293903560811081, "grad_norm": 1.7829020023345947, "learning_rate": 3.567594165238264e-07, "loss": 0.2022, "step": 1072200 }, { "epoch": 14.295236698617536, "grad_norm": 3.5665276050567627, "learning_rate": 3.5542598467359966e-07, "loss": 0.2289, "step": 1072300 }, { "epoch": 14.296569836423991, "grad_norm": 2.995089292526245, "learning_rate": 3.540950345736582e-07, "loss": 0.2059, "step": 1072400 }, { "epoch": 14.297902974230446, "grad_norm": 2.33872127532959, "learning_rate": 3.5276656633543626e-07, "loss": 0.226, "step": 1072500 }, { "epoch": 14.299236112036901, "grad_norm": 3.427321434020996, "learning_rate": 3.514405800701315e-07, "loss": 0.2131, "step": 1072600 }, { "epoch": 14.300569249843356, "grad_norm": 2.6470553874969482, "learning_rate": 3.501170758887651e-07, "loss": 0.2389, "step": 1072700 }, { "epoch": 14.301902387649811, "grad_norm": 0.4566672146320343, "learning_rate": 3.487960539021351e-07, "loss": 0.2538, "step": 1072800 }, { "epoch": 14.303235525456266, "grad_norm": 3.7573189735412598, "learning_rate": 3.474775142208297e-07, "loss": 0.2316, "step": 1072900 }, { "epoch": 14.304568663262721, "grad_norm": 0.5864859223365784, "learning_rate": 3.4616145695523406e-07, "loss": 0.2706, "step": 1073000 }, { "epoch": 14.305901801069176, "grad_norm": 3.5496625900268555, "learning_rate": 3.448478822155232e-07, "loss": 0.2177, "step": 1073100 }, { "epoch": 14.30723493887563, "grad_norm": 0.9740768671035767, "learning_rate": 3.4353679011166596e-07, "loss": 0.1966, "step": 1073200 }, { "epoch": 14.308568076682086, "grad_norm": 1.4810540676116943, "learning_rate": 3.422281807534278e-07, "loss": 0.2162, "step": 1073300 }, { "epoch": 14.309901214488542, "grad_norm": 2.489074945449829, "learning_rate": 
3.4093510322489196e-07, "loss": 0.2269, "step": 1073400 }, { "epoch": 14.311234352294997, "grad_norm": 4.751602649688721, "learning_rate": 3.3963143485614135e-07, "loss": 0.2238, "step": 1073500 }, { "epoch": 14.312567490101452, "grad_norm": 0.6327700614929199, "learning_rate": 3.383302495599516e-07, "loss": 0.2656, "step": 1073600 }, { "epoch": 14.313900627907907, "grad_norm": 1.694870948791504, "learning_rate": 3.370445221742935e-07, "loss": 0.1977, "step": 1073700 }, { "epoch": 14.315233765714362, "grad_norm": 2.9204790592193604, "learning_rate": 3.3574827851636905e-07, "loss": 0.2461, "step": 1073800 }, { "epoch": 14.316566903520817, "grad_norm": 2.4600989818573, "learning_rate": 3.344545182560887e-07, "loss": 0.1948, "step": 1073900 }, { "epoch": 14.317900041327272, "grad_norm": 1.9618139266967773, "learning_rate": 3.3316324150175935e-07, "loss": 0.2448, "step": 1074000 }, { "epoch": 14.319233179133727, "grad_norm": 2.5300099849700928, "learning_rate": 3.3187444836148773e-07, "loss": 0.178, "step": 1074100 }, { "epoch": 14.320566316940182, "grad_norm": 1.7631052732467651, "learning_rate": 3.305881389431642e-07, "loss": 0.205, "step": 1074200 }, { "epoch": 14.321899454746637, "grad_norm": 2.1338071823120117, "learning_rate": 3.293043133544793e-07, "loss": 0.2411, "step": 1074300 }, { "epoch": 14.323232592553092, "grad_norm": 9.926796913146973, "learning_rate": 3.2802297170290375e-07, "loss": 0.2322, "step": 1074400 }, { "epoch": 14.324565730359547, "grad_norm": 0.7929961681365967, "learning_rate": 3.2674411409571167e-07, "loss": 0.2398, "step": 1074500 }, { "epoch": 14.325898868166002, "grad_norm": 8.504914283752441, "learning_rate": 3.254677406399642e-07, "loss": 0.249, "step": 1074600 }, { "epoch": 14.327232005972457, "grad_norm": 1.2226693630218506, "learning_rate": 3.2419385144251244e-07, "loss": 0.2539, "step": 1074700 }, { "epoch": 14.328565143778912, "grad_norm": 2.0307986736297607, "learning_rate": 3.229224466100078e-07, "loss": 0.2654, "step": 1074800 
}, { "epoch": 14.329898281585368, "grad_norm": 2.280538558959961, "learning_rate": 3.2165352624888846e-07, "loss": 0.2769, "step": 1074900 }, { "epoch": 14.331231419391823, "grad_norm": 1.2502098083496094, "learning_rate": 3.2038709046537605e-07, "loss": 0.2256, "step": 1075000 }, { "epoch": 14.332564557198278, "grad_norm": 2.156388759613037, "learning_rate": 3.191231393655025e-07, "loss": 0.2137, "step": 1075100 }, { "epoch": 14.333897695004733, "grad_norm": 3.9826302528381348, "learning_rate": 3.178616730550765e-07, "loss": 0.2277, "step": 1075200 }, { "epoch": 14.335230832811188, "grad_norm": 0.8344094157218933, "learning_rate": 3.1660269163970356e-07, "loss": 0.2415, "step": 1075300 }, { "epoch": 14.336563970617643, "grad_norm": 2.283099412918091, "learning_rate": 3.15346195224786e-07, "loss": 0.2264, "step": 1075400 }, { "epoch": 14.337897108424098, "grad_norm": 4.438360214233398, "learning_rate": 3.140921839155064e-07, "loss": 0.1988, "step": 1075500 }, { "epoch": 14.339230246230553, "grad_norm": 1.8395885229110718, "learning_rate": 3.128406578168541e-07, "loss": 0.2466, "step": 1075600 }, { "epoch": 14.340563384037008, "grad_norm": 0.5313010215759277, "learning_rate": 3.115916170335986e-07, "loss": 0.2375, "step": 1075700 }, { "epoch": 14.341896521843463, "grad_norm": 2.060359477996826, "learning_rate": 3.1034506167030963e-07, "loss": 0.2337, "step": 1075800 }, { "epoch": 14.343229659649918, "grad_norm": 1.7618120908737183, "learning_rate": 3.091009918313403e-07, "loss": 0.229, "step": 1075900 }, { "epoch": 14.344562797456373, "grad_norm": 0.4105694591999054, "learning_rate": 3.078594076208441e-07, "loss": 0.1925, "step": 1076000 }, { "epoch": 14.345895935262828, "grad_norm": 1.332769751548767, "learning_rate": 3.0662030914275775e-07, "loss": 0.2211, "step": 1076100 }, { "epoch": 14.347229073069283, "grad_norm": 2.1347923278808594, "learning_rate": 3.053836965008183e-07, "loss": 0.2472, "step": 1076200 }, { "epoch": 14.348562210875738, "grad_norm": 
1.022882103919983, "learning_rate": 3.041495697985497e-07, "loss": 0.2385, "step": 1076300 }, { "epoch": 14.349895348682193, "grad_norm": 0.8016020655632019, "learning_rate": 3.029179291392725e-07, "loss": 0.2099, "step": 1076400 }, { "epoch": 14.351228486488647, "grad_norm": 5.145022869110107, "learning_rate": 3.01688774626091e-07, "loss": 0.2143, "step": 1076500 }, { "epoch": 14.352561624295104, "grad_norm": 4.8959197998046875, "learning_rate": 3.0046210636190954e-07, "loss": 0.2212, "step": 1076600 }, { "epoch": 14.35389476210156, "grad_norm": 2.2201054096221924, "learning_rate": 2.992379244494159e-07, "loss": 0.2355, "step": 1076700 }, { "epoch": 14.355227899908014, "grad_norm": 35.877803802490234, "learning_rate": 2.980162289911015e-07, "loss": 0.2319, "step": 1076800 }, { "epoch": 14.356561037714469, "grad_norm": 1.7159249782562256, "learning_rate": 2.9679702008923783e-07, "loss": 0.2503, "step": 1076900 }, { "epoch": 14.357894175520924, "grad_norm": 3.563765287399292, "learning_rate": 2.9558029784589657e-07, "loss": 0.2308, "step": 1077000 }, { "epoch": 14.359227313327379, "grad_norm": 1.5799205303192139, "learning_rate": 2.9436606236293627e-07, "loss": 0.2344, "step": 1077100 }, { "epoch": 14.360560451133834, "grad_norm": 2.7083029747009277, "learning_rate": 2.9315431374200894e-07, "loss": 0.225, "step": 1077200 }, { "epoch": 14.361893588940289, "grad_norm": 0.4472668766975403, "learning_rate": 2.9194505208455677e-07, "loss": 0.2144, "step": 1077300 }, { "epoch": 14.363226726746744, "grad_norm": 2.27547287940979, "learning_rate": 2.907382774918188e-07, "loss": 0.3025, "step": 1077400 }, { "epoch": 14.364559864553199, "grad_norm": 1.219482421875, "learning_rate": 2.8953399006482083e-07, "loss": 0.2296, "step": 1077500 }, { "epoch": 14.365893002359654, "grad_norm": 1.0385181903839111, "learning_rate": 2.883321899043789e-07, "loss": 0.2265, "step": 1077600 }, { "epoch": 14.367226140166109, "grad_norm": 2.168337106704712, "learning_rate": 
2.8713287711110924e-07, "loss": 0.2029, "step": 1077700 }, { "epoch": 14.368559277972563, "grad_norm": 5.5252790451049805, "learning_rate": 2.859360517854115e-07, "loss": 0.238, "step": 1077800 }, { "epoch": 14.369892415779018, "grad_norm": 1.9013903141021729, "learning_rate": 2.8474171402748216e-07, "loss": 0.2158, "step": 1077900 }, { "epoch": 14.371225553585473, "grad_norm": 3.75081467628479, "learning_rate": 2.8354986393730464e-07, "loss": 0.2355, "step": 1078000 }, { "epoch": 14.37255869139193, "grad_norm": 3.2709782123565674, "learning_rate": 2.823605016146591e-07, "loss": 0.2115, "step": 1078100 }, { "epoch": 14.373891829198385, "grad_norm": 1.429347038269043, "learning_rate": 2.811736271591092e-07, "loss": 0.2219, "step": 1078200 }, { "epoch": 14.37522496700484, "grad_norm": 1.6992439031600952, "learning_rate": 2.7998924067002885e-07, "loss": 0.1816, "step": 1078300 }, { "epoch": 14.376558104811295, "grad_norm": 0.17080868780612946, "learning_rate": 2.788073422465587e-07, "loss": 0.2126, "step": 1078400 }, { "epoch": 14.37789124261775, "grad_norm": 3.024038076400757, "learning_rate": 2.7762793198764957e-07, "loss": 0.208, "step": 1078500 }, { "epoch": 14.379224380424205, "grad_norm": 3.0150129795074463, "learning_rate": 2.764510099920392e-07, "loss": 0.1917, "step": 1078600 }, { "epoch": 14.38055751823066, "grad_norm": 0.14786671102046967, "learning_rate": 2.752765763582521e-07, "loss": 0.1753, "step": 1078700 }, { "epoch": 14.381890656037115, "grad_norm": 1.4717267751693726, "learning_rate": 2.741046311846096e-07, "loss": 0.2746, "step": 1078800 }, { "epoch": 14.38322379384357, "grad_norm": 3.2593789100646973, "learning_rate": 2.729351745692232e-07, "loss": 0.2135, "step": 1078900 }, { "epoch": 14.384556931650025, "grad_norm": 2.6768639087677, "learning_rate": 2.7176820660999804e-07, "loss": 0.2064, "step": 1079000 }, { "epoch": 14.38589006945648, "grad_norm": 5.410953044891357, "learning_rate": 2.7060372740462247e-07, "loss": 0.2279, "step": 1079100 }, { 
"epoch": 14.387223207262934, "grad_norm": 0.9090679883956909, "learning_rate": 2.6944173705059196e-07, "loss": 0.2471, "step": 1079200 }, { "epoch": 14.38855634506939, "grad_norm": 2.9790308475494385, "learning_rate": 2.682822356451786e-07, "loss": 0.2705, "step": 1079300 }, { "epoch": 14.389889482875844, "grad_norm": 1.6085124015808105, "learning_rate": 2.6712522328545483e-07, "loss": 0.2435, "step": 1079400 }, { "epoch": 14.3912226206823, "grad_norm": 2.0836374759674072, "learning_rate": 2.659707000682832e-07, "loss": 0.2081, "step": 1079500 }, { "epoch": 14.392555758488754, "grad_norm": 0.32875680923461914, "learning_rate": 2.648186660903096e-07, "loss": 0.2329, "step": 1079600 }, { "epoch": 14.39388889629521, "grad_norm": 0.00875641405582428, "learning_rate": 2.6368060457188203e-07, "loss": 0.2532, "step": 1079700 }, { "epoch": 14.395222034101666, "grad_norm": 1.5450595617294312, "learning_rate": 2.6253352446665044e-07, "loss": 0.2067, "step": 1079800 }, { "epoch": 14.39655517190812, "grad_norm": 2.477813959121704, "learning_rate": 2.613889338883668e-07, "loss": 0.2222, "step": 1079900 }, { "epoch": 14.397888309714576, "grad_norm": 0.32918423414230347, "learning_rate": 2.602468329328611e-07, "loss": 0.2603, "step": 1080000 }, { "epoch": 14.39922144752103, "grad_norm": 5.637506008148193, "learning_rate": 2.591072216957435e-07, "loss": 0.2326, "step": 1080100 }, { "epoch": 14.400554585327486, "grad_norm": 1.862728476524353, "learning_rate": 2.579701002724177e-07, "loss": 0.2387, "step": 1080200 }, { "epoch": 14.40188772313394, "grad_norm": 4.662835121154785, "learning_rate": 2.5683546875807764e-07, "loss": 0.2593, "step": 1080300 }, { "epoch": 14.403220860940396, "grad_norm": 1.583704948425293, "learning_rate": 2.5570332724771716e-07, "loss": 0.1861, "step": 1080400 }, { "epoch": 14.40455399874685, "grad_norm": 1.490795373916626, "learning_rate": 2.5457367583610725e-07, "loss": 0.2207, "step": 1080500 }, { "epoch": 14.405887136553305, "grad_norm": 
0.7444589734077454, "learning_rate": 2.5344651461782884e-07, "loss": 0.2715, "step": 1080600 }, { "epoch": 14.40722027435976, "grad_norm": 2.9704041481018066, "learning_rate": 2.523218436872365e-07, "loss": 0.2322, "step": 1080700 }, { "epoch": 14.408553412166215, "grad_norm": 2.67118239402771, "learning_rate": 2.51199663138485e-07, "loss": 0.2545, "step": 1080800 }, { "epoch": 14.40988654997267, "grad_norm": 0.9999749660491943, "learning_rate": 2.500799730655223e-07, "loss": 0.2777, "step": 1080900 }, { "epoch": 14.411219687779125, "grad_norm": 2.3123743534088135, "learning_rate": 2.4896277356208696e-07, "loss": 0.2482, "step": 1081000 }, { "epoch": 14.41255282558558, "grad_norm": 1.0865659713745117, "learning_rate": 2.478480647217007e-07, "loss": 0.204, "step": 1081100 }, { "epoch": 14.413885963392035, "grad_norm": 4.461703777313232, "learning_rate": 2.46735846637689e-07, "loss": 0.2262, "step": 1081200 }, { "epoch": 14.41521910119849, "grad_norm": 3.798304796218872, "learning_rate": 2.456261194031639e-07, "loss": 0.2253, "step": 1081300 }, { "epoch": 14.416552239004947, "grad_norm": 3.4843945503234863, "learning_rate": 2.4451888311102457e-07, "loss": 0.237, "step": 1081400 }, { "epoch": 14.417885376811402, "grad_norm": 4.163318157196045, "learning_rate": 2.4341413785397005e-07, "loss": 0.2437, "step": 1081500 }, { "epoch": 14.419218514617857, "grad_norm": 1.2536782026290894, "learning_rate": 2.423118837244798e-07, "loss": 0.2193, "step": 1081600 }, { "epoch": 14.420551652424312, "grad_norm": 0.818077802658081, "learning_rate": 2.4121212081483324e-07, "loss": 0.1923, "step": 1081700 }, { "epoch": 14.421884790230767, "grad_norm": 4.529926300048828, "learning_rate": 2.4011484921710017e-07, "loss": 0.2204, "step": 1081800 }, { "epoch": 14.423217928037221, "grad_norm": 1.2817684412002563, "learning_rate": 2.3902006902314366e-07, "loss": 0.2112, "step": 1081900 }, { "epoch": 14.424551065843676, "grad_norm": 2.501976251602173, "learning_rate": 2.3792778032461382e-07, 
"loss": 0.1999, "step": 1082000 }, { "epoch": 14.425884203650131, "grad_norm": 2.880394458770752, "learning_rate": 2.3683798321294415e-07, "loss": 0.2772, "step": 1082100 }, { "epoch": 14.427217341456586, "grad_norm": 2.7105660438537598, "learning_rate": 2.3575067777938164e-07, "loss": 0.2339, "step": 1082200 }, { "epoch": 14.428550479263041, "grad_norm": 1.194891095161438, "learning_rate": 2.3466586411495018e-07, "loss": 0.2221, "step": 1082300 }, { "epoch": 14.429883617069496, "grad_norm": 0.6430549621582031, "learning_rate": 2.335835423104604e-07, "loss": 0.2344, "step": 1082400 }, { "epoch": 14.431216754875951, "grad_norm": 3.169478178024292, "learning_rate": 2.3250371245652658e-07, "loss": 0.2589, "step": 1082500 }, { "epoch": 14.432549892682406, "grad_norm": 1.0083879232406616, "learning_rate": 2.3142637464354634e-07, "loss": 0.2296, "step": 1082600 }, { "epoch": 14.433883030488861, "grad_norm": 1.4366967678070068, "learning_rate": 2.3035152896171086e-07, "loss": 0.212, "step": 1082700 }, { "epoch": 14.435216168295316, "grad_norm": 1.3334300518035889, "learning_rate": 2.2927917550100154e-07, "loss": 0.2141, "step": 1082800 }, { "epoch": 14.436549306101771, "grad_norm": 1.6374568939208984, "learning_rate": 2.2820931435119985e-07, "loss": 0.2238, "step": 1082900 }, { "epoch": 14.437882443908228, "grad_norm": 5.086225509643555, "learning_rate": 2.2714194560186084e-07, "loss": 0.2582, "step": 1083000 }, { "epoch": 14.439215581714683, "grad_norm": 2.0922887325286865, "learning_rate": 2.2608770576682424e-07, "loss": 0.2394, "step": 1083100 }, { "epoch": 14.440548719521137, "grad_norm": 2.763601779937744, "learning_rate": 2.2502529716005194e-07, "loss": 0.226, "step": 1083200 }, { "epoch": 14.441881857327592, "grad_norm": 1.81222403049469, "learning_rate": 2.239653812203024e-07, "loss": 0.2801, "step": 1083300 }, { "epoch": 14.443214995134047, "grad_norm": 1.6465073823928833, "learning_rate": 2.2290795803631136e-07, "loss": 0.1932, "step": 1083400 }, { "epoch": 
14.444548132940502, "grad_norm": 0.8647502064704895, "learning_rate": 2.2185302769659798e-07, "loss": 0.1874, "step": 1083500 }, { "epoch": 14.445881270746957, "grad_norm": 3.69812273979187, "learning_rate": 2.2080059028948162e-07, "loss": 0.1711, "step": 1083600 }, { "epoch": 14.447214408553412, "grad_norm": 1.8818788528442383, "learning_rate": 2.197506459030685e-07, "loss": 0.2157, "step": 1083700 }, { "epoch": 14.448547546359867, "grad_norm": 0.5747514963150024, "learning_rate": 2.187031946252549e-07, "loss": 0.2424, "step": 1083800 }, { "epoch": 14.449880684166322, "grad_norm": 3.9575276374816895, "learning_rate": 2.1765823654373074e-07, "loss": 0.2641, "step": 1083900 }, { "epoch": 14.451213821972777, "grad_norm": 0.08903432637453079, "learning_rate": 2.1661577174597602e-07, "loss": 0.2535, "step": 1084000 }, { "epoch": 14.452546959779232, "grad_norm": 0.7502389550209045, "learning_rate": 2.155758003192676e-07, "loss": 0.1976, "step": 1084100 }, { "epoch": 14.453880097585687, "grad_norm": 0.8605971336364746, "learning_rate": 2.145383223506592e-07, "loss": 0.2558, "step": 1084200 }, { "epoch": 14.455213235392142, "grad_norm": 1.6611512899398804, "learning_rate": 2.135033379270146e-07, "loss": 0.2036, "step": 1084300 }, { "epoch": 14.456546373198597, "grad_norm": 1.5688257217407227, "learning_rate": 2.1247084713497457e-07, "loss": 0.218, "step": 1084400 }, { "epoch": 14.457879511005052, "grad_norm": 7.090421199798584, "learning_rate": 2.1144085006097658e-07, "loss": 0.2595, "step": 1084500 }, { "epoch": 14.459212648811508, "grad_norm": 1.8100091218948364, "learning_rate": 2.1041334679124834e-07, "loss": 0.2442, "step": 1084600 }, { "epoch": 14.460545786617963, "grad_norm": 0.4908294975757599, "learning_rate": 2.0938833741180775e-07, "loss": 0.2012, "step": 1084700 }, { "epoch": 14.461878924424418, "grad_norm": 2.6988942623138428, "learning_rate": 2.0836582200846942e-07, "loss": 0.2095, "step": 1084800 }, { "epoch": 14.463212062230873, "grad_norm": 
2.924128770828247, "learning_rate": 2.0734580066683494e-07, "loss": 0.2449, "step": 1084900 }, { "epoch": 14.464545200037328, "grad_norm": 0.8407785296440125, "learning_rate": 2.06328273472296e-07, "loss": 0.2049, "step": 1085000 }, { "epoch": 14.465878337843783, "grad_norm": 7.481359958648682, "learning_rate": 2.053132405100344e-07, "loss": 0.2143, "step": 1085100 }, { "epoch": 14.467211475650238, "grad_norm": 0.7972537875175476, "learning_rate": 2.043007018650256e-07, "loss": 0.2494, "step": 1085200 }, { "epoch": 14.468544613456693, "grad_norm": 3.215834140777588, "learning_rate": 2.0329065762203836e-07, "loss": 0.2206, "step": 1085300 }, { "epoch": 14.469877751263148, "grad_norm": 1.3454376459121704, "learning_rate": 2.0228310786563176e-07, "loss": 0.2366, "step": 1085400 }, { "epoch": 14.471210889069603, "grad_norm": 1.518630027770996, "learning_rate": 2.0127805268015498e-07, "loss": 0.2357, "step": 1085500 }, { "epoch": 14.472544026876058, "grad_norm": 4.1403937339782715, "learning_rate": 2.0027549214974073e-07, "loss": 0.2547, "step": 1085600 }, { "epoch": 14.473877164682513, "grad_norm": 1.6898244619369507, "learning_rate": 1.9927542635832853e-07, "loss": 0.2581, "step": 1085700 }, { "epoch": 14.475210302488968, "grad_norm": 3.2854151725769043, "learning_rate": 1.9827785538963804e-07, "loss": 0.2367, "step": 1085800 }, { "epoch": 14.476543440295423, "grad_norm": 3.4232051372528076, "learning_rate": 1.9728277932718252e-07, "loss": 0.1955, "step": 1085900 }, { "epoch": 14.477876578101878, "grad_norm": 4.996261119842529, "learning_rate": 1.962901982542653e-07, "loss": 0.2696, "step": 1086000 }, { "epoch": 14.479209715908333, "grad_norm": 4.059439659118652, "learning_rate": 1.9530011225397994e-07, "loss": 0.2427, "step": 1086100 }, { "epoch": 14.48054285371479, "grad_norm": 0.46858665347099304, "learning_rate": 1.9431252140922008e-07, "loss": 0.2688, "step": 1086200 }, { "epoch": 14.481875991521244, "grad_norm": 2.3213770389556885, "learning_rate": 
1.9332742580266293e-07, "loss": 0.217, "step": 1086300 }, { "epoch": 14.4832091293277, "grad_norm": 0.6724988222122192, "learning_rate": 1.9234482551677257e-07, "loss": 0.2141, "step": 1086400 }, { "epoch": 14.484542267134154, "grad_norm": 13.775615692138672, "learning_rate": 1.9136472063380983e-07, "loss": 0.2632, "step": 1086500 }, { "epoch": 14.485875404940609, "grad_norm": 5.102641582489014, "learning_rate": 1.903871112358291e-07, "loss": 0.2335, "step": 1086600 }, { "epoch": 14.487208542747064, "grad_norm": 1.5019807815551758, "learning_rate": 1.8941199740467152e-07, "loss": 0.2374, "step": 1086700 }, { "epoch": 14.488541680553519, "grad_norm": 1.4529519081115723, "learning_rate": 1.8843937922197185e-07, "loss": 0.201, "step": 1086800 }, { "epoch": 14.489874818359974, "grad_norm": 0.605335533618927, "learning_rate": 1.874692567691516e-07, "loss": 0.2444, "step": 1086900 }, { "epoch": 14.491207956166429, "grad_norm": 1.7578747272491455, "learning_rate": 1.8650163012742583e-07, "loss": 0.2587, "step": 1087000 }, { "epoch": 14.492541093972884, "grad_norm": 27.16965103149414, "learning_rate": 1.8553649937780638e-07, "loss": 0.2295, "step": 1087100 }, { "epoch": 14.493874231779339, "grad_norm": 1.4121026992797852, "learning_rate": 1.845738646010886e-07, "loss": 0.2455, "step": 1087200 }, { "epoch": 14.495207369585794, "grad_norm": 3.103036403656006, "learning_rate": 1.8361372587785807e-07, "loss": 0.2318, "step": 1087300 }, { "epoch": 14.496540507392249, "grad_norm": 2.301626443862915, "learning_rate": 1.8265608328849715e-07, "loss": 0.2134, "step": 1087400 }, { "epoch": 14.497873645198704, "grad_norm": 2.3698172569274902, "learning_rate": 1.8170093691317836e-07, "loss": 0.191, "step": 1087500 }, { "epoch": 14.499206783005159, "grad_norm": 0.02892233617603779, "learning_rate": 1.8074828683186106e-07, "loss": 0.2282, "step": 1087600 }, { "epoch": 14.500539920811613, "grad_norm": 2.7302567958831787, "learning_rate": 1.7979813312429815e-07, "loss": 0.194, "step": 
1087700 }, { "epoch": 14.50187305861807, "grad_norm": 4.430066108703613, "learning_rate": 1.78850475870036e-07, "loss": 0.2325, "step": 1087800 }, { "epoch": 14.503206196424525, "grad_norm": 3.551706075668335, "learning_rate": 1.7790531514840447e-07, "loss": 0.1967, "step": 1087900 }, { "epoch": 14.50453933423098, "grad_norm": 0.8843647241592407, "learning_rate": 1.769626510385336e-07, "loss": 0.2278, "step": 1088000 }, { "epoch": 14.505872472037435, "grad_norm": 4.967342376708984, "learning_rate": 1.760224836193436e-07, "loss": 0.2158, "step": 1088100 }, { "epoch": 14.50720560984389, "grad_norm": 1.9700545072555542, "learning_rate": 1.750848129695315e-07, "loss": 0.2312, "step": 1088200 }, { "epoch": 14.508538747650345, "grad_norm": 4.59177303314209, "learning_rate": 1.7415897854597186e-07, "loss": 0.2497, "step": 1088300 }, { "epoch": 14.5098718854568, "grad_norm": 2.944375514984131, "learning_rate": 1.73226276700571e-07, "loss": 0.2042, "step": 1088400 }, { "epoch": 14.511205023263255, "grad_norm": 1.9712553024291992, "learning_rate": 1.7229607185864414e-07, "loss": 0.2493, "step": 1088500 }, { "epoch": 14.51253816106971, "grad_norm": 2.7839176654815674, "learning_rate": 1.7136836409806233e-07, "loss": 0.2119, "step": 1088600 }, { "epoch": 14.513871298876165, "grad_norm": 5.114515781402588, "learning_rate": 1.7044315349649674e-07, "loss": 0.2136, "step": 1088700 }, { "epoch": 14.51520443668262, "grad_norm": 0.7135917544364929, "learning_rate": 1.6952044013139544e-07, "loss": 0.2563, "step": 1088800 }, { "epoch": 14.516537574489075, "grad_norm": 3.541088581085205, "learning_rate": 1.686002240800133e-07, "loss": 0.2135, "step": 1088900 }, { "epoch": 14.51787071229553, "grad_norm": 1.8916410207748413, "learning_rate": 1.67682505419382e-07, "loss": 0.2383, "step": 1089000 }, { "epoch": 14.519203850101984, "grad_norm": 2.751904010772705, "learning_rate": 1.6676728422633014e-07, "loss": 0.1851, "step": 1089100 }, { "epoch": 14.52053698790844, "grad_norm": 
1.3056265115737915, "learning_rate": 1.6586367545087576e-07, "loss": 0.2495, "step": 1089200 }, { "epoch": 14.521870125714894, "grad_norm": 0.9910221099853516, "learning_rate": 1.6495342444605156e-07, "loss": 0.2374, "step": 1089300 }, { "epoch": 14.52320326352135, "grad_norm": 0.6153773665428162, "learning_rate": 1.6404567113728065e-07, "loss": 0.2338, "step": 1089400 }, { "epoch": 14.524536401327806, "grad_norm": 9.402287483215332, "learning_rate": 1.6314041560055558e-07, "loss": 0.2178, "step": 1089500 }, { "epoch": 14.525869539134261, "grad_norm": 0.36956942081451416, "learning_rate": 1.6223765791166246e-07, "loss": 0.228, "step": 1089600 }, { "epoch": 14.527202676940716, "grad_norm": 2.734565258026123, "learning_rate": 1.6133739814617078e-07, "loss": 0.2226, "step": 1089700 }, { "epoch": 14.52853581474717, "grad_norm": 2.322495698928833, "learning_rate": 1.6043963637945692e-07, "loss": 0.2437, "step": 1089800 }, { "epoch": 14.529868952553626, "grad_norm": 1.5868706703186035, "learning_rate": 1.5954437268667078e-07, "loss": 0.206, "step": 1089900 }, { "epoch": 14.53120209036008, "grad_norm": 2.409102201461792, "learning_rate": 1.5865160714276572e-07, "loss": 0.2276, "step": 1090000 }, { "epoch": 14.532535228166536, "grad_norm": 2.0051450729370117, "learning_rate": 1.5776133982248198e-07, "loss": 0.2538, "step": 1090100 }, { "epoch": 14.53386836597299, "grad_norm": 1.8309484720230103, "learning_rate": 1.568735708003466e-07, "loss": 0.1976, "step": 1090200 }, { "epoch": 14.535201503779446, "grad_norm": 2.582489252090454, "learning_rate": 1.5598830015068344e-07, "loss": 0.2427, "step": 1090300 }, { "epoch": 14.5365346415859, "grad_norm": 2.2414166927337646, "learning_rate": 1.5510552794759992e-07, "loss": 0.2411, "step": 1090400 }, { "epoch": 14.537867779392355, "grad_norm": 1.4990789890289307, "learning_rate": 1.542252542650069e-07, "loss": 0.1996, "step": 1090500 }, { "epoch": 14.53920091719881, "grad_norm": 0.6122162342071533, "learning_rate": 
1.533474791765921e-07, "loss": 0.2018, "step": 1090600 }, { "epoch": 14.540534055005265, "grad_norm": 3.9726221561431885, "learning_rate": 1.5247220275584007e-07, "loss": 0.2507, "step": 1090700 }, { "epoch": 14.54186719281172, "grad_norm": 0.9714933633804321, "learning_rate": 1.5159942507602887e-07, "loss": 0.228, "step": 1090800 }, { "epoch": 14.543200330618175, "grad_norm": 1.2315493822097778, "learning_rate": 1.5072914621022339e-07, "loss": 0.2268, "step": 1090900 }, { "epoch": 14.544533468424632, "grad_norm": 2.0493171215057373, "learning_rate": 1.4986136623127867e-07, "loss": 0.2139, "step": 1091000 }, { "epoch": 14.545866606231087, "grad_norm": 2.653667688369751, "learning_rate": 1.489960852118466e-07, "loss": 0.1889, "step": 1091100 }, { "epoch": 14.547199744037542, "grad_norm": 0.970445990562439, "learning_rate": 1.4813330322436257e-07, "loss": 0.2998, "step": 1091200 }, { "epoch": 14.548532881843997, "grad_norm": 1.8147656917572021, "learning_rate": 1.4727302034106215e-07, "loss": 0.1917, "step": 1091300 }, { "epoch": 14.549866019650452, "grad_norm": 1.4604816436767578, "learning_rate": 1.4641523663395438e-07, "loss": 0.1983, "step": 1091400 }, { "epoch": 14.551199157456907, "grad_norm": 1.9364150762557983, "learning_rate": 1.4555995217485852e-07, "loss": 0.223, "step": 1091500 }, { "epoch": 14.552532295263362, "grad_norm": 1.8658620119094849, "learning_rate": 1.447071670353739e-07, "loss": 0.2428, "step": 1091600 }, { "epoch": 14.553865433069817, "grad_norm": 2.470299482345581, "learning_rate": 1.4386537177216163e-07, "loss": 0.2401, "step": 1091700 }, { "epoch": 14.555198570876271, "grad_norm": 0.7685579657554626, "learning_rate": 1.4301756049089565e-07, "loss": 0.281, "step": 1091800 }, { "epoch": 14.556531708682726, "grad_norm": 1.671728491783142, "learning_rate": 1.4217224874208068e-07, "loss": 0.1934, "step": 1091900 }, { "epoch": 14.557864846489181, "grad_norm": 1.8046187162399292, "learning_rate": 1.4132943659648346e-07, "loss": 0.2379, "step": 
1092000 }, { "epoch": 14.559197984295636, "grad_norm": 2.667191982269287, "learning_rate": 1.4048912412466753e-07, "loss": 0.2496, "step": 1092100 }, { "epoch": 14.560531122102091, "grad_norm": 2.542785167694092, "learning_rate": 1.396513113969733e-07, "loss": 0.2151, "step": 1092200 }, { "epoch": 14.561864259908546, "grad_norm": 2.839308500289917, "learning_rate": 1.3881599848354798e-07, "loss": 0.2032, "step": 1092300 }, { "epoch": 14.563197397715001, "grad_norm": 1.156233310699463, "learning_rate": 1.3798318545431232e-07, "loss": 0.2095, "step": 1092400 }, { "epoch": 14.564530535521456, "grad_norm": 4.275930881500244, "learning_rate": 1.3715287237899388e-07, "loss": 0.2067, "step": 1092500 }, { "epoch": 14.565863673327911, "grad_norm": 9.032136917114258, "learning_rate": 1.3632505932709705e-07, "loss": 0.2502, "step": 1092600 }, { "epoch": 14.567196811134368, "grad_norm": 2.265763759613037, "learning_rate": 1.3549974636793305e-07, "loss": 0.236, "step": 1092700 }, { "epoch": 14.568529948940823, "grad_norm": 0.8538075089454651, "learning_rate": 1.3467693357058662e-07, "loss": 0.1815, "step": 1092800 }, { "epoch": 14.569863086747278, "grad_norm": 2.3309688568115234, "learning_rate": 1.3385662100394268e-07, "loss": 0.2742, "step": 1092900 }, { "epoch": 14.571196224553733, "grad_norm": 0.24947354197502136, "learning_rate": 1.3303880873667628e-07, "loss": 0.2659, "step": 1093000 }, { "epoch": 14.572529362360187, "grad_norm": 2.918534517288208, "learning_rate": 1.32223496837256e-07, "loss": 0.2168, "step": 1093100 }, { "epoch": 14.573862500166642, "grad_norm": 0.8028960824012756, "learning_rate": 1.3141068537393054e-07, "loss": 0.2192, "step": 1093200 }, { "epoch": 14.575195637973097, "grad_norm": 0.918573796749115, "learning_rate": 1.306003744147488e-07, "loss": 0.2431, "step": 1093300 }, { "epoch": 14.576528775779552, "grad_norm": 2.1291816234588623, "learning_rate": 1.2979256402754658e-07, "loss": 0.2369, "step": 1093400 }, { "epoch": 14.577861913586007, 
"grad_norm": 4.419079303741455, "learning_rate": 1.2898725427994972e-07, "loss": 0.2415, "step": 1093500 }, { "epoch": 14.579195051392462, "grad_norm": 2.188053846359253, "learning_rate": 1.28184445239381e-07, "loss": 0.2624, "step": 1093600 }, { "epoch": 14.580528189198917, "grad_norm": 2.7794008255004883, "learning_rate": 1.2738413697304997e-07, "loss": 0.2269, "step": 1093700 }, { "epoch": 14.581861327005372, "grad_norm": 0.3639009892940521, "learning_rate": 1.2658632954794635e-07, "loss": 0.2247, "step": 1093800 }, { "epoch": 14.583194464811827, "grad_norm": 2.7321109771728516, "learning_rate": 1.2579102303086675e-07, "loss": 0.2562, "step": 1093900 }, { "epoch": 14.584527602618282, "grad_norm": 1.1530100107192993, "learning_rate": 1.2499821748839458e-07, "loss": 0.2095, "step": 1094000 }, { "epoch": 14.585860740424737, "grad_norm": 3.5729892253875732, "learning_rate": 1.2421580365153838e-07, "loss": 0.2415, "step": 1094100 }, { "epoch": 14.587193878231194, "grad_norm": 2.19926118850708, "learning_rate": 1.2342797524577897e-07, "loss": 0.2383, "step": 1094200 }, { "epoch": 14.588527016037649, "grad_norm": 4.013283729553223, "learning_rate": 1.2264264801245206e-07, "loss": 0.2251, "step": 1094300 }, { "epoch": 14.589860153844104, "grad_norm": 4.3036651611328125, "learning_rate": 1.218598220172984e-07, "loss": 0.2571, "step": 1094400 }, { "epoch": 14.591193291650558, "grad_norm": 2.9385225772857666, "learning_rate": 1.2107949732585556e-07, "loss": 0.2091, "step": 1094500 }, { "epoch": 14.592526429457013, "grad_norm": 3.94559383392334, "learning_rate": 1.2030167400345126e-07, "loss": 0.209, "step": 1094600 }, { "epoch": 14.593859567263468, "grad_norm": 1.7323633432388306, "learning_rate": 1.1952635211520346e-07, "loss": 0.2616, "step": 1094700 }, { "epoch": 14.595192705069923, "grad_norm": 1.759950876235962, "learning_rate": 1.1875353172601689e-07, "loss": 0.256, "step": 1094800 }, { "epoch": 14.596525842876378, "grad_norm": 0.10897655785083771, "learning_rate": 
1.1798321290059311e-07, "loss": 0.2039, "step": 1094900 }, { "epoch": 14.597858980682833, "grad_norm": 3.402097702026367, "learning_rate": 1.1721539570341389e-07, "loss": 0.2556, "step": 1095000 }, { "epoch": 14.599192118489288, "grad_norm": 3.294278383255005, "learning_rate": 1.1645008019876446e-07, "loss": 0.2285, "step": 1095100 }, { "epoch": 14.600525256295743, "grad_norm": 3.290271759033203, "learning_rate": 1.1568726645071692e-07, "loss": 0.2502, "step": 1095200 }, { "epoch": 14.601858394102198, "grad_norm": 2.869133710861206, "learning_rate": 1.1492695452312352e-07, "loss": 0.2606, "step": 1095300 }, { "epoch": 14.603191531908653, "grad_norm": 2.121361255645752, "learning_rate": 1.1416914447964333e-07, "loss": 0.2117, "step": 1095400 }, { "epoch": 14.604524669715108, "grad_norm": 24.49291229248047, "learning_rate": 1.1342137707982314e-07, "loss": 0.2526, "step": 1095500 }, { "epoch": 14.605857807521563, "grad_norm": 0.8428504467010498, "learning_rate": 1.126685459742549e-07, "loss": 0.2173, "step": 1095600 }, { "epoch": 14.607190945328018, "grad_norm": 3.0896058082580566, "learning_rate": 1.1191821694186199e-07, "loss": 0.2315, "step": 1095700 }, { "epoch": 14.608524083134473, "grad_norm": 23.68909454345703, "learning_rate": 1.1117039004546081e-07, "loss": 0.233, "step": 1095800 }, { "epoch": 14.60985722094093, "grad_norm": 19.160888671875, "learning_rate": 1.1042506534765795e-07, "loss": 0.2225, "step": 1095900 }, { "epoch": 14.611190358747384, "grad_norm": 4.9858479499816895, "learning_rate": 1.0968224291085016e-07, "loss": 0.209, "step": 1096000 }, { "epoch": 14.61252349655384, "grad_norm": 3.0150721073150635, "learning_rate": 1.0894192279722104e-07, "loss": 0.2204, "step": 1096100 }, { "epoch": 14.613856634360294, "grad_norm": 2.9415810108184814, "learning_rate": 1.0820410506874767e-07, "loss": 0.2798, "step": 1096200 }, { "epoch": 14.61518977216675, "grad_norm": 1.9843536615371704, "learning_rate": 1.0746878978720065e-07, "loss": 0.2219, "step": 1096300 
}, { "epoch": 14.616522909973204, "grad_norm": 1.2366050481796265, "learning_rate": 1.067359770141374e-07, "loss": 0.2176, "step": 1096400 }, { "epoch": 14.617856047779659, "grad_norm": 1.6527801752090454, "learning_rate": 1.060056668109055e-07, "loss": 0.2113, "step": 1096500 }, { "epoch": 14.619189185586114, "grad_norm": 2.0873217582702637, "learning_rate": 1.0527785923864608e-07, "loss": 0.2436, "step": 1096600 }, { "epoch": 14.620522323392569, "grad_norm": 0.12029558420181274, "learning_rate": 1.0455255435829036e-07, "loss": 0.2318, "step": 1096700 }, { "epoch": 14.621855461199024, "grad_norm": 0.33803457021713257, "learning_rate": 1.0382975223055313e-07, "loss": 0.2574, "step": 1096800 }, { "epoch": 14.623188599005479, "grad_norm": 1.6404632329940796, "learning_rate": 1.0310945291594931e-07, "loss": 0.2158, "step": 1096900 }, { "epoch": 14.624521736811934, "grad_norm": 1.8430471420288086, "learning_rate": 1.0239165647477732e-07, "loss": 0.2458, "step": 1097000 }, { "epoch": 14.625854874618389, "grad_norm": 3.7683827877044678, "learning_rate": 1.0167636296713246e-07, "loss": 0.2723, "step": 1097100 }, { "epoch": 14.627188012424844, "grad_norm": 3.196026563644409, "learning_rate": 1.0096357245289345e-07, "loss": 0.1922, "step": 1097200 }, { "epoch": 14.628521150231299, "grad_norm": 3.169393539428711, "learning_rate": 1.0025328499173592e-07, "loss": 0.237, "step": 1097300 }, { "epoch": 14.629854288037755, "grad_norm": 0.47343868017196655, "learning_rate": 9.954550064311896e-08, "loss": 0.2313, "step": 1097400 }, { "epoch": 14.63118742584421, "grad_norm": 0.8054661154747009, "learning_rate": 9.884021946630185e-08, "loss": 0.2463, "step": 1097500 }, { "epoch": 14.632520563650665, "grad_norm": 0.12625114619731903, "learning_rate": 9.813744152032067e-08, "loss": 0.2406, "step": 1097600 }, { "epoch": 14.63385370145712, "grad_norm": 2.340726137161255, "learning_rate": 9.743716686401505e-08, "loss": 0.2043, "step": 1097700 }, { "epoch": 14.635186839263575, "grad_norm": 
1.0401679277420044, "learning_rate": 9.673939555601141e-08, "loss": 0.2433, "step": 1097800 }, { "epoch": 14.63651997707003, "grad_norm": 2.3715662956237793, "learning_rate": 9.604412765472303e-08, "loss": 0.2287, "step": 1097900 }, { "epoch": 14.637853114876485, "grad_norm": 4.009907245635986, "learning_rate": 9.535136321835003e-08, "loss": 0.2875, "step": 1098000 }, { "epoch": 14.63918625268294, "grad_norm": 2.919999122619629, "learning_rate": 9.4661102304896e-08, "loss": 0.2273, "step": 1098100 }, { "epoch": 14.640519390489395, "grad_norm": 1.8077586889266968, "learning_rate": 9.397334497214471e-08, "loss": 0.2557, "step": 1098200 }, { "epoch": 14.64185252829585, "grad_norm": 0.855048656463623, "learning_rate": 9.328809127767346e-08, "loss": 0.2069, "step": 1098300 }, { "epoch": 14.643185666102305, "grad_norm": 4.806489944458008, "learning_rate": 9.260534127884967e-08, "loss": 0.201, "step": 1098400 }, { "epoch": 14.64451880390876, "grad_norm": 1.804535984992981, "learning_rate": 9.192509503282765e-08, "loss": 0.2144, "step": 1098500 }, { "epoch": 14.645851941715215, "grad_norm": 0.17819185554981232, "learning_rate": 9.12473525965618e-08, "loss": 0.2175, "step": 1098600 }, { "epoch": 14.64718507952167, "grad_norm": 1.9897193908691406, "learning_rate": 9.057211402678678e-08, "loss": 0.2474, "step": 1098700 }, { "epoch": 14.648518217328125, "grad_norm": 1.667493224143982, "learning_rate": 8.989937938002734e-08, "loss": 0.2153, "step": 1098800 }, { "epoch": 14.64985135513458, "grad_norm": 19.537992477416992, "learning_rate": 8.922914871261179e-08, "loss": 0.2536, "step": 1098900 }, { "epoch": 14.651184492941034, "grad_norm": 2.712280035018921, "learning_rate": 8.856142208064188e-08, "loss": 0.2402, "step": 1099000 }, { "epoch": 14.652517630747491, "grad_norm": 4.109059810638428, "learning_rate": 8.789619954002292e-08, "loss": 0.2368, "step": 1099100 }, { "epoch": 14.653850768553946, "grad_norm": 1.3237800598144531, "learning_rate": 8.72334811464437e-08, "loss": 
0.217, "step": 1099200 }, { "epoch": 14.655183906360401, "grad_norm": 2.1177690029144287, "learning_rate": 8.657326695538314e-08, "loss": 0.2433, "step": 1099300 }, { "epoch": 14.656517044166856, "grad_norm": 1.6898552179336548, "learning_rate": 8.59155570221104e-08, "loss": 0.2295, "step": 1099400 }, { "epoch": 14.657850181973311, "grad_norm": 2.132972002029419, "learning_rate": 8.526035140169475e-08, "loss": 0.2364, "step": 1099500 }, { "epoch": 14.659183319779766, "grad_norm": 1.3898018598556519, "learning_rate": 8.460765014897897e-08, "loss": 0.181, "step": 1099600 }, { "epoch": 14.66051645758622, "grad_norm": 3.3850817680358887, "learning_rate": 8.39574533186127e-08, "loss": 0.242, "step": 1099700 }, { "epoch": 14.661849595392676, "grad_norm": 1.0075639486312866, "learning_rate": 8.330976096502241e-08, "loss": 0.243, "step": 1099800 }, { "epoch": 14.66318273319913, "grad_norm": 2.656728506088257, "learning_rate": 8.26645731424347e-08, "loss": 0.2128, "step": 1099900 }, { "epoch": 14.664515871005586, "grad_norm": 3.0267879962921143, "learning_rate": 8.202188990486304e-08, "loss": 0.2606, "step": 1100000 }, { "epoch": 14.66584900881204, "grad_norm": 4.139639377593994, "learning_rate": 8.138171130610772e-08, "loss": 0.2262, "step": 1100100 }, { "epoch": 14.667182146618496, "grad_norm": 1.7901595830917358, "learning_rate": 8.074403739976255e-08, "loss": 0.2513, "step": 1100200 }, { "epoch": 14.66851528442495, "grad_norm": 2.2069602012634277, "learning_rate": 8.010886823921481e-08, "loss": 0.193, "step": 1100300 }, { "epoch": 14.669848422231405, "grad_norm": 2.1485118865966797, "learning_rate": 7.947620387763532e-08, "loss": 0.2223, "step": 1100400 }, { "epoch": 14.67118156003786, "grad_norm": 3.9703166484832764, "learning_rate": 7.885233356389576e-08, "loss": 0.2508, "step": 1100500 }, { "epoch": 14.672514697844317, "grad_norm": 1.035662293434143, "learning_rate": 7.822465390963318e-08, "loss": 0.2015, "step": 1100600 }, { "epoch": 14.673847835650772, "grad_norm": 
5.9007673263549805, "learning_rate": 7.759947921208266e-08, "loss": 0.2344, "step": 1100700 }, { "epoch": 14.675180973457227, "grad_norm": 1.3332327604293823, "learning_rate": 7.6976809523579e-08, "loss": 0.2534, "step": 1100800 }, { "epoch": 14.676514111263682, "grad_norm": 1.3515115976333618, "learning_rate": 7.635664489625049e-08, "loss": 0.2275, "step": 1100900 }, { "epoch": 14.677847249070137, "grad_norm": 1.3690574169158936, "learning_rate": 7.573898538201895e-08, "loss": 0.2254, "step": 1101000 }, { "epoch": 14.679180386876592, "grad_norm": 3.799182176589966, "learning_rate": 7.512383103258635e-08, "loss": 0.2732, "step": 1101100 }, { "epoch": 14.680513524683047, "grad_norm": 1.7066431045532227, "learning_rate": 7.451118189945816e-08, "loss": 0.2569, "step": 1101200 }, { "epoch": 14.681846662489502, "grad_norm": 0.8733155727386475, "learning_rate": 7.390103803392e-08, "loss": 0.2038, "step": 1101300 }, { "epoch": 14.683179800295957, "grad_norm": 2.554091691970825, "learning_rate": 7.329339948705105e-08, "loss": 0.242, "step": 1101400 }, { "epoch": 14.684512938102412, "grad_norm": 2.4151573181152344, "learning_rate": 7.268826630972058e-08, "loss": 0.2413, "step": 1101500 }, { "epoch": 14.685846075908866, "grad_norm": 2.7481534481048584, "learning_rate": 7.208563855259143e-08, "loss": 0.2022, "step": 1101600 }, { "epoch": 14.687179213715321, "grad_norm": 2.6785738468170166, "learning_rate": 7.148551626610988e-08, "loss": 0.2308, "step": 1101700 }, { "epoch": 14.688512351521776, "grad_norm": 23.58347511291504, "learning_rate": 7.088789950052243e-08, "loss": 0.2535, "step": 1101800 }, { "epoch": 14.689845489328231, "grad_norm": 3.0518360137939453, "learning_rate": 7.029278830585239e-08, "loss": 0.186, "step": 1101900 }, { "epoch": 14.691178627134686, "grad_norm": 3.8719520568847656, "learning_rate": 6.970018273191992e-08, "loss": 0.2399, "step": 1102000 }, { "epoch": 14.692511764941141, "grad_norm": 3.362269163131714, "learning_rate": 6.91100828283453e-08, 
"loss": 0.2331, "step": 1102100 }, { "epoch": 14.693844902747596, "grad_norm": 3.2650978565216064, "learning_rate": 6.852248864452237e-08, "loss": 0.2415, "step": 1102200 }, { "epoch": 14.695178040554053, "grad_norm": 1.2350752353668213, "learning_rate": 6.793740022964179e-08, "loss": 0.255, "step": 1102300 }, { "epoch": 14.696511178360508, "grad_norm": 2.3998379707336426, "learning_rate": 6.735481763269103e-08, "loss": 0.2724, "step": 1102400 }, { "epoch": 14.697844316166963, "grad_norm": 1.6243103742599487, "learning_rate": 6.677474090243774e-08, "loss": 0.2311, "step": 1102500 }, { "epoch": 14.699177453973418, "grad_norm": 2.3621902465820312, "learning_rate": 6.619717008744641e-08, "loss": 0.238, "step": 1102600 }, { "epoch": 14.700510591779873, "grad_norm": 0.19342051446437836, "learning_rate": 6.562210523606837e-08, "loss": 0.2314, "step": 1102700 }, { "epoch": 14.701843729586328, "grad_norm": 2.006676197052002, "learning_rate": 6.504954639644845e-08, "loss": 0.2572, "step": 1102800 }, { "epoch": 14.703176867392783, "grad_norm": 1.4492162466049194, "learning_rate": 6.447949361651495e-08, "loss": 0.2274, "step": 1102900 }, { "epoch": 14.704510005199237, "grad_norm": 1.3413459062576294, "learning_rate": 6.391194694399305e-08, "loss": 0.2496, "step": 1103000 }, { "epoch": 14.705843143005692, "grad_norm": 2.0644314289093018, "learning_rate": 6.334690642639806e-08, "loss": 0.1985, "step": 1103100 }, { "epoch": 14.707176280812147, "grad_norm": 10.90981388092041, "learning_rate": 6.278437211102883e-08, "loss": 0.2415, "step": 1103200 }, { "epoch": 14.708509418618602, "grad_norm": 4.721365928649902, "learning_rate": 6.222434404498434e-08, "loss": 0.2095, "step": 1103300 }, { "epoch": 14.709842556425057, "grad_norm": 3.7053561210632324, "learning_rate": 6.166682227514376e-08, "loss": 0.2491, "step": 1103400 }, { "epoch": 14.711175694231512, "grad_norm": 3.795802116394043, "learning_rate": 6.111180684818307e-08, "loss": 0.2816, "step": 1103500 }, { "epoch": 
14.712508832037967, "grad_norm": 3.2352452278137207, "learning_rate": 6.055929781056846e-08, "loss": 0.235, "step": 1103600 }, { "epoch": 14.713841969844422, "grad_norm": 2.6323952674865723, "learning_rate": 6.000929520854959e-08, "loss": 0.2282, "step": 1103700 }, { "epoch": 14.715175107650879, "grad_norm": 1.2587429285049438, "learning_rate": 5.946179908817628e-08, "loss": 0.2384, "step": 1103800 }, { "epoch": 14.716508245457334, "grad_norm": 6.461183547973633, "learning_rate": 5.8916809495278554e-08, "loss": 0.2388, "step": 1103900 }, { "epoch": 14.717841383263789, "grad_norm": 2.551708221435547, "learning_rate": 5.8374326475483244e-08, "loss": 0.2366, "step": 1104000 }, { "epoch": 14.719174521070244, "grad_norm": 3.302340507507324, "learning_rate": 5.783973743030857e-08, "loss": 0.2234, "step": 1104100 }, { "epoch": 14.720507658876699, "grad_norm": 2.6168806552886963, "learning_rate": 5.730224262589401e-08, "loss": 0.25, "step": 1104200 }, { "epoch": 14.721840796683153, "grad_norm": 2.9495139122009277, "learning_rate": 5.676725452974707e-08, "loss": 0.1948, "step": 1104300 }, { "epoch": 14.723173934489608, "grad_norm": 3.7018697261810303, "learning_rate": 5.623477318665526e-08, "loss": 0.1895, "step": 1104400 }, { "epoch": 14.724507072296063, "grad_norm": 1.9929382801055908, "learning_rate": 5.57047986411996e-08, "loss": 0.2631, "step": 1104500 }, { "epoch": 14.725840210102518, "grad_norm": 3.988555669784546, "learning_rate": 5.5177330937744575e-08, "loss": 0.2717, "step": 1104600 }, { "epoch": 14.727173347908973, "grad_norm": 1.1447651386260986, "learning_rate": 5.4652370120448215e-08, "loss": 0.2364, "step": 1104700 }, { "epoch": 14.728506485715428, "grad_norm": 1.546449065208435, "learning_rate": 5.412991623326202e-08, "loss": 0.2375, "step": 1104800 }, { "epoch": 14.729839623521883, "grad_norm": 2.0842716693878174, "learning_rate": 5.360996931991768e-08, "loss": 0.2177, "step": 1104900 }, { "epoch": 14.731172761328338, "grad_norm": 3.199242353439331, 
"learning_rate": 5.3092529423950376e-08, "loss": 0.199, "step": 1105000 }, { "epoch": 14.732505899134793, "grad_norm": 6.561161518096924, "learning_rate": 5.257759658867545e-08, "loss": 0.2306, "step": 1105100 }, { "epoch": 14.733839036941248, "grad_norm": 2.532771110534668, "learning_rate": 5.206517085720175e-08, "loss": 0.2536, "step": 1105200 }, { "epoch": 14.735172174747703, "grad_norm": 4.034378528594971, "learning_rate": 5.15552522724283e-08, "loss": 0.2277, "step": 1105300 }, { "epoch": 14.736505312554158, "grad_norm": 1.8615295886993408, "learning_rate": 5.104784087704428e-08, "loss": 0.2079, "step": 1105400 }, { "epoch": 14.737838450360615, "grad_norm": 0.3933381736278534, "learning_rate": 5.054293671352572e-08, "loss": 0.2134, "step": 1105500 }, { "epoch": 14.73917158816707, "grad_norm": 9.44931411743164, "learning_rate": 5.004053982414547e-08, "loss": 0.2099, "step": 1105600 }, { "epoch": 14.740504725973524, "grad_norm": 1.780083179473877, "learning_rate": 4.954065025095988e-08, "loss": 0.2752, "step": 1105700 }, { "epoch": 14.74183786377998, "grad_norm": 2.6784441471099854, "learning_rate": 4.9043268035818824e-08, "loss": 0.2488, "step": 1105800 }, { "epoch": 14.743171001586434, "grad_norm": 1.0493738651275635, "learning_rate": 4.855332955674863e-08, "loss": 0.2262, "step": 1105900 }, { "epoch": 14.74450413939289, "grad_norm": 2.459165096282959, "learning_rate": 4.8060937107791136e-08, "loss": 0.2106, "step": 1106000 }, { "epoch": 14.745837277199344, "grad_norm": 2.5652310848236084, "learning_rate": 4.7571052140755746e-08, "loss": 0.2286, "step": 1106100 }, { "epoch": 14.7471704150058, "grad_norm": 1.5453170537948608, "learning_rate": 4.708367469664965e-08, "loss": 0.2106, "step": 1106200 }, { "epoch": 14.748503552812254, "grad_norm": 1.1181327104568481, "learning_rate": 4.659880481628021e-08, "loss": 0.293, "step": 1106300 }, { "epoch": 14.749836690618709, "grad_norm": 3.4779772758483887, "learning_rate": 4.611644254023828e-08, "loss": 0.2207, "step": 
1106400 }, { "epoch": 14.751169828425164, "grad_norm": 2.5486741065979004, "learning_rate": 4.5636587908904903e-08, "loss": 0.2463, "step": 1106500 }, { "epoch": 14.752502966231619, "grad_norm": 0.553034245967865, "learning_rate": 4.5159240962451276e-08, "loss": 0.2272, "step": 1106600 }, { "epoch": 14.753836104038074, "grad_norm": 0.562800407409668, "learning_rate": 4.46844017408421e-08, "loss": 0.2257, "step": 1106700 }, { "epoch": 14.755169241844529, "grad_norm": 3.6558401584625244, "learning_rate": 4.421207028382557e-08, "loss": 0.2881, "step": 1106800 }, { "epoch": 14.756502379650984, "grad_norm": 3.1777684688568115, "learning_rate": 4.3742246630946724e-08, "loss": 0.2278, "step": 1106900 }, { "epoch": 14.75783551745744, "grad_norm": 2.306469678878784, "learning_rate": 4.327493082153744e-08, "loss": 0.2204, "step": 1107000 }, { "epoch": 14.759168655263895, "grad_norm": 0.2706179618835449, "learning_rate": 4.281012289471642e-08, "loss": 0.2347, "step": 1107100 }, { "epoch": 14.76050179307035, "grad_norm": 1.2667357921600342, "learning_rate": 4.23478228893992e-08, "loss": 0.2162, "step": 1107200 }, { "epoch": 14.761834930876805, "grad_norm": 2.0316641330718994, "learning_rate": 4.188803084429149e-08, "loss": 0.2303, "step": 1107300 }, { "epoch": 14.76316806868326, "grad_norm": 11.965141296386719, "learning_rate": 4.143074679787917e-08, "loss": 0.2925, "step": 1107400 }, { "epoch": 14.764501206489715, "grad_norm": 2.2873122692108154, "learning_rate": 4.097597078844495e-08, "loss": 0.2068, "step": 1107500 }, { "epoch": 14.76583434429617, "grad_norm": 4.3551025390625, "learning_rate": 4.052370285406504e-08, "loss": 0.2197, "step": 1107600 }, { "epoch": 14.767167482102625, "grad_norm": 2.420642375946045, "learning_rate": 4.007394303260248e-08, "loss": 0.2254, "step": 1107700 }, { "epoch": 14.76850061990908, "grad_norm": 1.6895467042922974, "learning_rate": 3.9626691361707155e-08, "loss": 0.2007, "step": 1107800 }, { "epoch": 14.769833757715535, "grad_norm": 
2.021040678024292, "learning_rate": 3.918194787881912e-08, "loss": 0.2159, "step": 1107900 }, { "epoch": 14.77116689552199, "grad_norm": 2.5123324394226074, "learning_rate": 3.873971262117526e-08, "loss": 0.2021, "step": 1108000 }, { "epoch": 14.772500033328445, "grad_norm": 4.920448303222656, "learning_rate": 3.829998562579595e-08, "loss": 0.2554, "step": 1108100 }, { "epoch": 14.7738331711349, "grad_norm": 3.712636947631836, "learning_rate": 3.786712670025949e-08, "loss": 0.1995, "step": 1108200 }, { "epoch": 14.775166308941355, "grad_norm": 1.7607699632644653, "learning_rate": 3.743239125609921e-08, "loss": 0.2662, "step": 1108300 }, { "epoch": 14.77649944674781, "grad_norm": 2.9575235843658447, "learning_rate": 3.70001641836526e-08, "loss": 0.2556, "step": 1108400 }, { "epoch": 14.777832584554265, "grad_norm": 1.606081485748291, "learning_rate": 3.6570445519100716e-08, "loss": 0.2203, "step": 1108500 }, { "epoch": 14.77916572236072, "grad_norm": 2.2406585216522217, "learning_rate": 3.614749498370928e-08, "loss": 0.2352, "step": 1108600 }, { "epoch": 14.780498860167176, "grad_norm": 3.127394914627075, "learning_rate": 3.5722768157686605e-08, "loss": 0.2236, "step": 1108700 }, { "epoch": 14.781831997973631, "grad_norm": 6.673893928527832, "learning_rate": 3.530054984649977e-08, "loss": 0.2783, "step": 1108800 }, { "epoch": 14.783165135780086, "grad_norm": 1.5289709568023682, "learning_rate": 3.488084008549053e-08, "loss": 0.2372, "step": 1108900 }, { "epoch": 14.784498273586541, "grad_norm": 4.204627990722656, "learning_rate": 3.4463638909804085e-08, "loss": 0.2228, "step": 1109000 }, { "epoch": 14.785831411392996, "grad_norm": 1.906161904335022, "learning_rate": 3.404894635435918e-08, "loss": 0.2354, "step": 1109100 }, { "epoch": 14.787164549199451, "grad_norm": 0.12106068432331085, "learning_rate": 3.3636762453874706e-08, "loss": 0.1924, "step": 1109200 }, { "epoch": 14.788497687005906, "grad_norm": 2.9744508266448975, "learning_rate": 3.322708724285972e-08, 
"loss": 0.261, "step": 1109300 }, { "epoch": 14.78983082481236, "grad_norm": 1.0635625123977661, "learning_rate": 3.281992075561013e-08, "loss": 0.1984, "step": 1109400 }, { "epoch": 14.791163962618816, "grad_norm": 0.8551486730575562, "learning_rate": 3.241526302621201e-08, "loss": 0.2351, "step": 1109500 }, { "epoch": 14.79249710042527, "grad_norm": 1.7774430513381958, "learning_rate": 3.201311408854157e-08, "loss": 0.2283, "step": 1109600 }, { "epoch": 14.793830238231726, "grad_norm": 0.8024520874023438, "learning_rate": 3.161347397626857e-08, "loss": 0.2462, "step": 1109700 }, { "epoch": 14.79516337603818, "grad_norm": 0.013751182705163956, "learning_rate": 3.1216342722842905e-08, "loss": 0.2417, "step": 1109800 }, { "epoch": 14.796496513844636, "grad_norm": 2.5804672241210938, "learning_rate": 3.082172036151798e-08, "loss": 0.2296, "step": 1109900 }, { "epoch": 14.79782965165109, "grad_norm": 0.3418811559677124, "learning_rate": 3.042960692532404e-08, "loss": 0.2297, "step": 1110000 }, { "epoch": 14.799162789457545, "grad_norm": 1.8809813261032104, "learning_rate": 3.004000244709482e-08, "loss": 0.267, "step": 1110100 }, { "epoch": 14.800495927264002, "grad_norm": 6.727523326873779, "learning_rate": 2.9652906959444226e-08, "loss": 0.2954, "step": 1110200 }, { "epoch": 14.801829065070457, "grad_norm": 0.8022236824035645, "learning_rate": 2.9268320494773015e-08, "loss": 0.292, "step": 1110300 }, { "epoch": 14.803162202876912, "grad_norm": 4.413967132568359, "learning_rate": 2.8886243085285424e-08, "loss": 0.2671, "step": 1110400 }, { "epoch": 14.804495340683367, "grad_norm": 0.2063269168138504, "learning_rate": 2.8506674762962535e-08, "loss": 0.2556, "step": 1110500 }, { "epoch": 14.805828478489822, "grad_norm": 1.771986722946167, "learning_rate": 2.8129615559585597e-08, "loss": 0.2073, "step": 1110600 }, { "epoch": 14.807161616296277, "grad_norm": 6.01920223236084, "learning_rate": 2.7755065506716027e-08, "loss": 0.203, "step": 1110700 }, { "epoch": 
14.808494754102732, "grad_norm": 2.8497190475463867, "learning_rate": 2.7383024635712073e-08, "loss": 0.209, "step": 1110800 }, { "epoch": 14.809827891909187, "grad_norm": 0.979199230670929, "learning_rate": 2.7013492977718825e-08, "loss": 0.2319, "step": 1110900 }, { "epoch": 14.811161029715642, "grad_norm": 2.187302827835083, "learning_rate": 2.664647056367153e-08, "loss": 0.189, "step": 1111000 }, { "epoch": 14.812494167522097, "grad_norm": 2.6545631885528564, "learning_rate": 2.6281957424298953e-08, "loss": 0.2286, "step": 1111100 }, { "epoch": 14.813827305328552, "grad_norm": 2.126570701599121, "learning_rate": 2.5919953590116676e-08, "loss": 0.2472, "step": 1111200 }, { "epoch": 14.815160443135007, "grad_norm": 3.940732002258301, "learning_rate": 2.5560459091427124e-08, "loss": 0.2531, "step": 1111300 }, { "epoch": 14.816493580941462, "grad_norm": 9.479852676391602, "learning_rate": 2.520347395832956e-08, "loss": 0.244, "step": 1111400 }, { "epoch": 14.817826718747916, "grad_norm": 2.230825424194336, "learning_rate": 2.4848998220710074e-08, "loss": 0.2318, "step": 1111500 }, { "epoch": 14.819159856554371, "grad_norm": 2.8809008598327637, "learning_rate": 2.4497031908241597e-08, "loss": 0.2768, "step": 1111600 }, { "epoch": 14.820492994360826, "grad_norm": 2.061112642288208, "learning_rate": 2.4147575050387226e-08, "loss": 0.2224, "step": 1111700 }, { "epoch": 14.821826132167281, "grad_norm": 3.3335580825805664, "learning_rate": 2.3800627676410226e-08, "loss": 0.2521, "step": 1111800 }, { "epoch": 14.823159269973738, "grad_norm": 2.6980907917022705, "learning_rate": 2.345618981535069e-08, "loss": 0.2751, "step": 1111900 }, { "epoch": 14.824492407780193, "grad_norm": 2.50089693069458, "learning_rate": 2.3114261496042232e-08, "loss": 0.2152, "step": 1112000 }, { "epoch": 14.825825545586648, "grad_norm": 2.7138962745666504, "learning_rate": 2.2774842747115276e-08, "loss": 0.2108, "step": 1112100 }, { "epoch": 14.827158683393103, "grad_norm": 0.37681224942207336, 
"learning_rate": 2.243793359698043e-08, "loss": 0.2561, "step": 1112200 }, { "epoch": 14.828491821199558, "grad_norm": 1.5789073705673218, "learning_rate": 2.2103534073845134e-08, "loss": 0.1964, "step": 1112300 }, { "epoch": 14.829824959006013, "grad_norm": 0.8334382176399231, "learning_rate": 2.1771644205703656e-08, "loss": 0.2583, "step": 1112400 }, { "epoch": 14.831158096812468, "grad_norm": 2.521712064743042, "learning_rate": 2.144226402034044e-08, "loss": 0.2823, "step": 1112500 }, { "epoch": 14.832491234618923, "grad_norm": 2.3155641555786133, "learning_rate": 2.11153935453301e-08, "loss": 0.2178, "step": 1112600 }, { "epoch": 14.833824372425378, "grad_norm": 3.8116300106048584, "learning_rate": 2.07910328080374e-08, "loss": 0.2355, "step": 1112700 }, { "epoch": 14.835157510231832, "grad_norm": 2.454415798187256, "learning_rate": 2.04691818356173e-08, "loss": 0.2437, "step": 1112800 }, { "epoch": 14.836490648038287, "grad_norm": 0.8835386633872986, "learning_rate": 2.0149840655014906e-08, "loss": 0.2193, "step": 1112900 }, { "epoch": 14.837823785844742, "grad_norm": 2.3758299350738525, "learning_rate": 1.9833009292962166e-08, "loss": 0.1987, "step": 1113000 }, { "epoch": 14.839156923651197, "grad_norm": 2.4309890270233154, "learning_rate": 1.9518687775981204e-08, "loss": 0.1933, "step": 1113100 }, { "epoch": 14.840490061457652, "grad_norm": 1.2545489072799683, "learning_rate": 1.920687613039096e-08, "loss": 0.2219, "step": 1113200 }, { "epoch": 14.841823199264107, "grad_norm": 4.681244373321533, "learning_rate": 1.8897574382293893e-08, "loss": 0.2172, "step": 1113300 }, { "epoch": 14.843156337070564, "grad_norm": 2.4116077423095703, "learning_rate": 1.859078255758595e-08, "loss": 0.2122, "step": 1113400 }, { "epoch": 14.844489474877019, "grad_norm": 1.2668851613998413, "learning_rate": 1.8286500681946593e-08, "loss": 0.2382, "step": 1113500 }, { "epoch": 14.845822612683474, "grad_norm": 3.641735792160034, "learning_rate": 1.7984728780848782e-08, "loss": 
0.2428, "step": 1113600 }, { "epoch": 14.847155750489929, "grad_norm": 3.5652551651000977, "learning_rate": 1.7685466879558965e-08, "loss": 0.2609, "step": 1113700 }, { "epoch": 14.848488888296384, "grad_norm": 2.7937371730804443, "learning_rate": 1.738871500313044e-08, "loss": 0.2599, "step": 1113800 }, { "epoch": 14.849822026102839, "grad_norm": 1.3483574390411377, "learning_rate": 1.7094473176403336e-08, "loss": 0.2405, "step": 1113900 }, { "epoch": 14.851155163909294, "grad_norm": 1.6710253953933716, "learning_rate": 1.680274142401461e-08, "loss": 0.1994, "step": 1114000 }, { "epoch": 14.852488301715749, "grad_norm": 2.350649356842041, "learning_rate": 1.6513519770384734e-08, "loss": 0.2286, "step": 1114100 }, { "epoch": 14.853821439522203, "grad_norm": 0.25009843707084656, "learning_rate": 1.6226808239724332e-08, "loss": 0.239, "step": 1114200 }, { "epoch": 14.855154577328658, "grad_norm": 3.1041500568389893, "learning_rate": 1.5942606856040874e-08, "loss": 0.2677, "step": 1114300 }, { "epoch": 14.856487715135113, "grad_norm": 2.0568230152130127, "learning_rate": 1.566091564312533e-08, "loss": 0.2359, "step": 1114400 }, { "epoch": 14.857820852941568, "grad_norm": 2.693599224090576, "learning_rate": 1.538173462455883e-08, "loss": 0.2275, "step": 1114500 }, { "epoch": 14.859153990748023, "grad_norm": 2.725987672805786, "learning_rate": 1.5105063823712683e-08, "loss": 0.2226, "step": 1114600 }, { "epoch": 14.860487128554478, "grad_norm": 3.520921468734741, "learning_rate": 1.4830903263751694e-08, "loss": 0.2472, "step": 1114700 }, { "epoch": 14.861820266360933, "grad_norm": 1.8874502182006836, "learning_rate": 1.455925296762417e-08, "loss": 0.2311, "step": 1114800 }, { "epoch": 14.863153404167388, "grad_norm": 1.6955515146255493, "learning_rate": 1.4290112958075253e-08, "loss": 0.2222, "step": 1114900 }, { "epoch": 14.864486541973843, "grad_norm": 1.828455924987793, "learning_rate": 1.4023483257633585e-08, "loss": 0.2071, "step": 1115000 }, { "epoch": 
14.865819679780298, "grad_norm": 4.166486740112305, "learning_rate": 1.3759363888621313e-08, "loss": 0.2216, "step": 1115100 }, { "epoch": 14.867152817586755, "grad_norm": 2.3827521800994873, "learning_rate": 1.3497754873154077e-08, "loss": 0.2029, "step": 1115200 }, { "epoch": 14.86848595539321, "grad_norm": 0.5457360744476318, "learning_rate": 1.3238656233124368e-08, "loss": 0.1897, "step": 1115300 }, { "epoch": 14.869819093199665, "grad_norm": 0.6834166646003723, "learning_rate": 1.2982067990231493e-08, "loss": 0.2401, "step": 1115400 }, { "epoch": 14.87115223100612, "grad_norm": 4.541533470153809, "learning_rate": 1.2727990165951608e-08, "loss": 0.1871, "step": 1115500 }, { "epoch": 14.872485368812574, "grad_norm": 3.6470890045166016, "learning_rate": 1.2476422781554365e-08, "loss": 0.2423, "step": 1115600 }, { "epoch": 14.87381850661903, "grad_norm": 1.339185357093811, "learning_rate": 1.2227365858106244e-08, "loss": 0.2293, "step": 1115700 }, { "epoch": 14.875151644425484, "grad_norm": 1.8547354936599731, "learning_rate": 1.1980819416450573e-08, "loss": 0.2714, "step": 1115800 }, { "epoch": 14.87648478223194, "grad_norm": 1.9706776142120361, "learning_rate": 1.173678347723084e-08, "loss": 0.2373, "step": 1115900 }, { "epoch": 14.877817920038394, "grad_norm": 3.310957670211792, "learning_rate": 1.1495258060877367e-08, "loss": 0.2092, "step": 1116000 }, { "epoch": 14.87915105784485, "grad_norm": 2.1764280796051025, "learning_rate": 1.1256243187607317e-08, "loss": 0.2313, "step": 1116100 }, { "epoch": 14.880484195651304, "grad_norm": 3.4224565029144287, "learning_rate": 1.1019738877431351e-08, "loss": 0.2725, "step": 1116200 }, { "epoch": 14.881817333457759, "grad_norm": 3.0649170875549316, "learning_rate": 1.0785745150150294e-08, "loss": 0.1872, "step": 1116300 }, { "epoch": 14.883150471264214, "grad_norm": 0.8521239757537842, "learning_rate": 1.0554262025355144e-08, "loss": 0.2404, "step": 1116400 }, { "epoch": 14.884483609070669, "grad_norm": 
1.0331872701644897, "learning_rate": 1.03252895224204e-08, "loss": 0.224, "step": 1116500 }, { "epoch": 14.885816746877124, "grad_norm": 2.114942789077759, "learning_rate": 1.0098827660514065e-08, "loss": 0.2156, "step": 1116600 }, { "epoch": 14.88714988468358, "grad_norm": 1.8230907917022705, "learning_rate": 9.874876458600968e-09, "loss": 0.2199, "step": 1116700 }, { "epoch": 14.888483022490036, "grad_norm": 1.2461490631103516, "learning_rate": 9.653435935426113e-09, "loss": 0.2146, "step": 1116800 }, { "epoch": 14.88981616029649, "grad_norm": 8.319184303283691, "learning_rate": 9.434506109528007e-09, "loss": 0.2194, "step": 1116900 }, { "epoch": 14.891149298102945, "grad_norm": 4.207222938537598, "learning_rate": 9.218086999231989e-09, "loss": 0.2557, "step": 1117000 }, { "epoch": 14.8924824359094, "grad_norm": 1.244916558265686, "learning_rate": 9.00417862266023e-09, "loss": 0.2525, "step": 1117100 }, { "epoch": 14.893815573715855, "grad_norm": 4.254659652709961, "learning_rate": 8.792780997718409e-09, "loss": 0.2466, "step": 1117200 }, { "epoch": 14.89514871152231, "grad_norm": 2.429460048675537, "learning_rate": 8.583894142105697e-09, "loss": 0.2485, "step": 1117300 }, { "epoch": 14.896481849328765, "grad_norm": 1.95992910861969, "learning_rate": 8.37751807330811e-09, "loss": 0.2118, "step": 1117400 }, { "epoch": 14.89781498713522, "grad_norm": 2.0807249546051025, "learning_rate": 8.173652808601827e-09, "loss": 0.2222, "step": 1117500 }, { "epoch": 14.899148124941675, "grad_norm": 3.02598237991333, "learning_rate": 7.974299480869584e-09, "loss": 0.24, "step": 1117600 }, { "epoch": 14.90048126274813, "grad_norm": 1.6733980178833008, "learning_rate": 7.775430766876479e-09, "loss": 0.2743, "step": 1117700 }, { "epoch": 14.901814400554585, "grad_norm": 2.2109124660491943, "learning_rate": 7.579072907381023e-09, "loss": 0.2095, "step": 1117800 }, { "epoch": 14.90314753836104, "grad_norm": 1.8142688274383545, "learning_rate": 7.385225918820071e-09, "loss": 0.2262, 
"step": 1117900 }, { "epoch": 14.904480676167495, "grad_norm": 5.43609619140625, "learning_rate": 7.193889817423971e-09, "loss": 0.2457, "step": 1118000 }, { "epoch": 14.90581381397395, "grad_norm": 2.2992775440216064, "learning_rate": 7.005064619209911e-09, "loss": 0.1956, "step": 1118100 }, { "epoch": 14.907146951780405, "grad_norm": 0.419576495885849, "learning_rate": 6.818750339988577e-09, "loss": 0.2448, "step": 1118200 }, { "epoch": 14.90848008958686, "grad_norm": 1.0851354598999023, "learning_rate": 6.634946995350832e-09, "loss": 0.1993, "step": 1118300 }, { "epoch": 14.909813227393316, "grad_norm": 3.635978937149048, "learning_rate": 6.4536546006876974e-09, "loss": 0.2429, "step": 1118400 }, { "epoch": 14.911146365199771, "grad_norm": 2.96907114982605, "learning_rate": 6.274873171180362e-09, "loss": 0.2172, "step": 1118500 }, { "epoch": 14.912479503006226, "grad_norm": 1.5484018325805664, "learning_rate": 6.100352996881719e-09, "loss": 0.1985, "step": 1118600 }, { "epoch": 14.913812640812681, "grad_norm": 2.3009748458862305, "learning_rate": 5.9265684323495105e-09, "loss": 0.2194, "step": 1118700 }, { "epoch": 14.915145778619136, "grad_norm": 2.8244268894195557, "learning_rate": 5.755294877091144e-09, "loss": 0.2312, "step": 1118800 }, { "epoch": 14.916478916425591, "grad_norm": 1.530735969543457, "learning_rate": 5.5865323454518116e-09, "loss": 0.2021, "step": 1118900 }, { "epoch": 14.917812054232046, "grad_norm": 2.1248607635498047, "learning_rate": 5.4202808515568806e-09, "loss": 0.2238, "step": 1119000 }, { "epoch": 14.919145192038501, "grad_norm": 6.261746883392334, "learning_rate": 5.258165383993374e-09, "loss": 0.2217, "step": 1119100 }, { "epoch": 14.920478329844956, "grad_norm": 2.463336944580078, "learning_rate": 5.096910896413843e-09, "loss": 0.2476, "step": 1119200 }, { "epoch": 14.92181146765141, "grad_norm": 5.880086898803711, "learning_rate": 4.938167487565926e-09, "loss": 0.2685, "step": 1119300 }, { "epoch": 14.923144605457866, "grad_norm": 
6.79551362991333, "learning_rate": 4.781935170742324e-09, "loss": 0.2332, "step": 1119400 }, { "epoch": 14.92447774326432, "grad_norm": 1.9773590564727783, "learning_rate": 4.628213959019245e-09, "loss": 0.2412, "step": 1119500 }, { "epoch": 14.925810881070776, "grad_norm": 0.7869948148727417, "learning_rate": 4.477003865273055e-09, "loss": 0.2283, "step": 1119600 }, { "epoch": 14.92714401887723, "grad_norm": 2.5226786136627197, "learning_rate": 4.328304902156966e-09, "loss": 0.2446, "step": 1119700 }, { "epoch": 14.928477156683686, "grad_norm": 5.149608612060547, "learning_rate": 4.182117082117687e-09, "loss": 0.2489, "step": 1119800 }, { "epoch": 14.929810294490142, "grad_norm": 2.7509148120880127, "learning_rate": 4.038440417398759e-09, "loss": 0.2213, "step": 1119900 }, { "epoch": 14.931143432296597, "grad_norm": 3.4597740173339844, "learning_rate": 3.897274920027227e-09, "loss": 0.2244, "step": 1120000 }, { "epoch": 14.932476570103052, "grad_norm": 0.07813716679811478, "learning_rate": 3.758620601816976e-09, "loss": 0.2548, "step": 1120100 }, { "epoch": 14.933809707909507, "grad_norm": 12.044000625610352, "learning_rate": 3.6224774743820466e-09, "loss": 0.1959, "step": 1120200 }, { "epoch": 14.935142845715962, "grad_norm": 4.100311279296875, "learning_rate": 3.4888455491166594e-09, "loss": 0.2328, "step": 1120300 }, { "epoch": 14.936475983522417, "grad_norm": 2.0183496475219727, "learning_rate": 3.3577248372052006e-09, "loss": 0.2678, "step": 1120400 }, { "epoch": 14.937809121328872, "grad_norm": 3.6058173179626465, "learning_rate": 3.229115349628886e-09, "loss": 0.2539, "step": 1120500 }, { "epoch": 14.939142259135327, "grad_norm": 1.6062281131744385, "learning_rate": 3.1030170971524384e-09, "loss": 0.2178, "step": 1120600 }, { "epoch": 14.940475396941782, "grad_norm": 2.477905750274658, "learning_rate": 2.979430090334079e-09, "loss": 0.2474, "step": 1120700 }, { "epoch": 14.941808534748237, "grad_norm": 2.3061723709106445, "learning_rate": 
2.858354339518865e-09, "loss": 0.2311, "step": 1120800 }, { "epoch": 14.943141672554692, "grad_norm": 0.6065038442611694, "learning_rate": 2.739789854842023e-09, "loss": 0.2506, "step": 1120900 }, { "epoch": 14.944474810361147, "grad_norm": 2.5224928855895996, "learning_rate": 2.623736646228947e-09, "loss": 0.2251, "step": 1121000 }, { "epoch": 14.945807948167602, "grad_norm": 1.7275978326797485, "learning_rate": 2.5101947234018596e-09, "loss": 0.2648, "step": 1121100 }, { "epoch": 14.947141085974057, "grad_norm": 2.956862688064575, "learning_rate": 2.3991640958564987e-09, "loss": 0.2841, "step": 1121200 }, { "epoch": 14.948474223780511, "grad_norm": 2.7287282943725586, "learning_rate": 2.2906447728954227e-09, "loss": 0.2199, "step": 1121300 }, { "epoch": 14.949807361586966, "grad_norm": 2.4678778648376465, "learning_rate": 2.184636763598036e-09, "loss": 0.2429, "step": 1121400 }, { "epoch": 14.951140499393421, "grad_norm": 3.233285665512085, "learning_rate": 2.0811400768439015e-09, "loss": 0.2753, "step": 1121500 }, { "epoch": 14.952473637199878, "grad_norm": 1.6657795906066895, "learning_rate": 1.9801547212927596e-09, "loss": 0.207, "step": 1121600 }, { "epoch": 14.953806775006333, "grad_norm": 1.261744499206543, "learning_rate": 1.8826530144044897e-09, "loss": 0.2574, "step": 1121700 }, { "epoch": 14.955139912812788, "grad_norm": 7.7496137619018555, "learning_rate": 1.7866652328979438e-09, "loss": 0.2202, "step": 1121800 }, { "epoch": 14.956473050619243, "grad_norm": 1.4323503971099854, "learning_rate": 1.6931888072480029e-09, "loss": 0.2319, "step": 1121900 }, { "epoch": 14.957806188425698, "grad_norm": 7.76153564453125, "learning_rate": 1.6022237452850697e-09, "loss": 0.2789, "step": 1122000 }, { "epoch": 14.959139326232153, "grad_norm": 1.402716875076294, "learning_rate": 1.5137700546163925e-09, "loss": 0.2238, "step": 1122100 }, { "epoch": 14.960472464038608, "grad_norm": 3.7063100337982178, "learning_rate": 1.4278277426560404e-09, "loss": 0.1944, "step": 
1122200 }, { "epoch": 14.961805601845063, "grad_norm": 0.655596911907196, "learning_rate": 1.3452186944706846e-09, "loss": 0.2049, "step": 1122300 }, { "epoch": 14.963138739651518, "grad_norm": 3.128188371658325, "learning_rate": 1.2642740473256532e-09, "loss": 0.2525, "step": 1122400 }, { "epoch": 14.964471877457973, "grad_norm": 4.6885809898376465, "learning_rate": 1.185840799772242e-09, "loss": 0.2293, "step": 1122500 }, { "epoch": 14.965805015264428, "grad_norm": 1.065401315689087, "learning_rate": 1.1099189583785307e-09, "loss": 0.2633, "step": 1122600 }, { "epoch": 14.967138153070882, "grad_norm": 0.11957506835460663, "learning_rate": 1.0365085294961052e-09, "loss": 0.2167, "step": 1122700 }, { "epoch": 14.968471290877337, "grad_norm": 3.2825794219970703, "learning_rate": 9.656095192733804e-10, "loss": 0.2706, "step": 1122800 }, { "epoch": 14.969804428683792, "grad_norm": 1.4825578927993774, "learning_rate": 8.972219336422782e-10, "loss": 0.2056, "step": 1122900 }, { "epoch": 14.971137566490247, "grad_norm": 1.7557429075241089, "learning_rate": 8.313457783315492e-10, "loss": 0.2801, "step": 1123000 }, { "epoch": 14.972470704296704, "grad_norm": 0.9545793533325195, "learning_rate": 7.679810588534508e-10, "loss": 0.2375, "step": 1123100 }, { "epoch": 14.973803842103159, "grad_norm": 1.7216016054153442, "learning_rate": 7.071277805170695e-10, "loss": 0.2316, "step": 1123200 }, { "epoch": 14.975136979909614, "grad_norm": 0.7296294569969177, "learning_rate": 6.487859484116676e-10, "loss": 0.2342, "step": 1123300 }, { "epoch": 14.976470117716069, "grad_norm": 1.6550626754760742, "learning_rate": 5.929555674266674e-10, "loss": 0.2535, "step": 1123400 }, { "epoch": 14.977803255522524, "grad_norm": 0.9168030619621277, "learning_rate": 5.396366422316667e-10, "loss": 0.2195, "step": 1123500 }, { "epoch": 14.979136393328979, "grad_norm": 1.8546727895736694, "learning_rate": 4.88829177293093e-10, "loss": 0.2081, "step": 1123600 }, { "epoch": 14.980469531135434, 
"grad_norm": 1.6650184392929077, "learning_rate": 4.4053317686088e-10, "loss": 0.2222, "step": 1123700 }, { "epoch": 14.981802668941889, "grad_norm": 0.9853866696357727, "learning_rate": 3.947486449817905e-10, "loss": 0.2393, "step": 1123800 }, { "epoch": 14.983135806748344, "grad_norm": 1.1653326749801636, "learning_rate": 3.51475585486094e-10, "loss": 0.2242, "step": 1123900 }, { "epoch": 14.984468944554798, "grad_norm": 2.6506571769714355, "learning_rate": 3.107140019975585e-10, "loss": 0.2687, "step": 1124000 }, { "epoch": 14.985802082361253, "grad_norm": 1.0954481363296509, "learning_rate": 2.724638979301197e-10, "loss": 0.2497, "step": 1124100 }, { "epoch": 14.987135220167708, "grad_norm": 2.601396083831787, "learning_rate": 2.3672527648455067e-10, "loss": 0.2409, "step": 1124200 }, { "epoch": 14.988468357974163, "grad_norm": 3.040102481842041, "learning_rate": 2.0349814065179217e-10, "loss": 0.2243, "step": 1124300 }, { "epoch": 14.989801495780618, "grad_norm": 2.184983968734741, "learning_rate": 1.727824932162836e-10, "loss": 0.232, "step": 1124400 }, { "epoch": 14.991134633587073, "grad_norm": 2.3814218044281006, "learning_rate": 1.4457833674930143e-10, "loss": 0.228, "step": 1124500 }, { "epoch": 14.992467771393528, "grad_norm": 2.266294002532959, "learning_rate": 1.1888567360562873e-10, "loss": 0.2153, "step": 1124600 }, { "epoch": 14.993800909199983, "grad_norm": 2.2852466106414795, "learning_rate": 9.570450594686974e-11, "loss": 0.2698, "step": 1124700 }, { "epoch": 14.99513404700644, "grad_norm": 2.8337674140930176, "learning_rate": 7.50348357048125e-11, "loss": 0.2146, "step": 1124800 }, { "epoch": 14.996467184812895, "grad_norm": 2.229300022125244, "learning_rate": 5.687666461140495e-11, "loss": 0.2025, "step": 1124900 }, { "epoch": 14.99780032261935, "grad_norm": 2.1734752655029297, "learning_rate": 4.1229994188762835e-11, "loss": 0.2621, "step": 1125000 }, { "epoch": 14.999133460425805, "grad_norm": 2.158562421798706, "learning_rate": 
2.809482574916977e-11, "loss": 0.2016, "step": 1125100 } ], "logging_steps": 100, "max_steps": 1125165, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2094420230104553e+22, "train_batch_size": 8, "trial_name": null, "trial_params": null }