diff --git "a/checkpoint-21773/trainer_state.json" "b/checkpoint-21773/trainer_state.json"
deleted file mode 100644--- "a/checkpoint-21773/trainer_state.json"
+++ /dev/null
@@ -1,6145 +0,0 @@
-{
-  "best_metric": 0.0981861874461174,
-  "best_model_checkpoint": "autotrain-ai-image-detect-20241226-0202/checkpoint-21773",
-  "epoch": 1.0,
-  "eval_steps": 500,
-  "global_step": 21773,
-  "is_hyper_param_search": false,
-  "is_local_process_zero": true,
-  "is_world_process_zero": true,
-  "log_history": [
-    {
-      "epoch": 0.0011482110871262573,
-      "grad_norm": 0.009014680050313473,
-      "learning_rate": 3.9999869880391874e-05,
-      "loss": 0.0647,
-      "step": 25
-    },
-    {
-      "epoch": 0.0022964221742525145,
-      "grad_norm": 0.1195719987154007,
-      "learning_rate": 3.999947952326062e-05,
-      "loss": 0.0634,
-      "step": 50
-    },
-    {
-      "epoch": 0.003444633261378772,
-      "grad_norm": 0.033902380615472794,
-      "learning_rate": 3.9998828933685526e-05,
-      "loss": 0.0411,
-      "step": 75
-    },
-    {
-      "epoch": 0.004592844348505029,
-      "grad_norm": 0.003960250411182642,
-      "learning_rate": 3.999791812013205e-05,
-      "loss": 0.0076,
-      "step": 100
-    },
-    {
-      "epoch": 0.005741055435631286,
-      "grad_norm": 0.06124406307935715,
-      "learning_rate": 3.9996747094451666e-05,
-      "loss": 0.0671,
-      "step": 125
-    },
-    {
-      "epoch": 0.006889266522757544,
-      "grad_norm": 0.4315168559551239,
-      "learning_rate": 3.999531587188172e-05,
-      "loss": 0.0831,
-      "step": 150
-    },
-    {
-      "epoch": 0.008037477609883801,
-      "grad_norm": 0.20916788280010223,
-      "learning_rate": 3.99936244710452e-05,
-      "loss": 0.1156,
-      "step": 175
-    },
-    {
-      "epoch": 0.009185688697010058,
-      "grad_norm": 0.03289841115474701,
-      "learning_rate": 3.999167291395058e-05,
-      "loss": 0.1006,
-      "step": 200
-    },
-    {
-      "epoch": 0.010333899784136315,
-      "grad_norm": 0.803317129611969,
-      "learning_rate": 3.998946122599141e-05,
-      "loss": 0.0838,
-      "step": 225
-    },
-    {
-      "epoch": 0.011482110871262573,
-      "grad_norm": 25.849609375,
-      "learning_rate": 3.998698943594612e-05,
-      "loss": 0.0933,
-      "step": 250
-    },
-    {
-      "epoch": 0.01263032195838883,
-      "grad_norm": 12.879656791687012,
-      "learning_rate": 3.9984257575977524e-05,
-      "loss": 0.1025,
-      "step": 275
-    },
-    {
-      "epoch": 0.013778533045515087,
-      "grad_norm": 0.27299392223358154,
-      "learning_rate": 3.998126568163247e-05,
-      "loss": 0.1247,
-      "step": 300
-    },
-    {
-      "epoch": 0.014926744132641345,
-      "grad_norm": 0.012173276394605637,
-      "learning_rate": 3.99780137918414e-05,
-      "loss": 0.0649,
-      "step": 325
-    },
-    {
-      "epoch": 0.016074955219767602,
-      "grad_norm": 0.026948055252432823,
-      "learning_rate": 3.9974501948917754e-05,
-      "loss": 0.1292,
-      "step": 350
-    },
-    {
-      "epoch": 0.01722316630689386,
-      "grad_norm": 0.39485013484954834,
-      "learning_rate": 3.9970730198557495e-05,
-      "loss": 0.0592,
-      "step": 375
-    },
-    {
-      "epoch": 0.018371377394020116,
-      "grad_norm": 20.581707000732422,
-      "learning_rate": 3.996669858983851e-05,
-      "loss": 0.0457,
-      "step": 400
-    },
-    {
-      "epoch": 0.019519588481146374,
-      "grad_norm": 0.007092855870723724,
-      "learning_rate": 3.99624071752199e-05,
-      "loss": 0.1605,
-      "step": 425
-    },
-    {
-      "epoch": 0.02066779956827263,
-      "grad_norm": 9.681135177612305,
-      "learning_rate": 3.9957856010541405e-05,
-      "loss": 0.12,
-      "step": 450
-    },
-    {
-      "epoch": 0.021816010655398888,
-      "grad_norm": 0.29390987753868103,
-      "learning_rate": 3.9953045155022606e-05,
-      "loss": 0.051,
-      "step": 475
-    },
-    {
-      "epoch": 0.022964221742525145,
-      "grad_norm": 0.12392851710319519,
-      "learning_rate": 3.9947974671262166e-05,
-      "loss": 0.154,
-      "step": 500
-    },
-    {
-      "epoch": 0.024112432829651403,
-      "grad_norm": 19.981584548950195,
-      "learning_rate": 3.9942644625237004e-05,
-      "loss": 0.1149,
-      "step": 525
-    },
-    {
-      "epoch": 0.02526064391677766,
-      "grad_norm": 0.2311665415763855,
-      "learning_rate": 3.993705508630148e-05,
-      "loss": 0.09,
-      "step": 550
-    },
-    {
-      "epoch": 0.026408855003903917,
-      "grad_norm": 0.10829292237758636,
-      "learning_rate": 3.993120612718646e-05,
-      "loss": 0.109,
-      "step": 575
-    },
-    {
-      "epoch": 0.027557066091030175,
-      "grad_norm": 97.97807312011719,
-      "learning_rate": 3.992509782399837e-05,
-      "loss": 0.0767,
-      "step": 600
-    },
-    {
-      "epoch": 0.028705277178156432,
-      "grad_norm": 0.16646654903888702,
-      "learning_rate": 3.991873025621821e-05,
-      "loss": 0.1011,
-      "step": 625
-    },
-    {
-      "epoch": 0.02985348826528269,
-      "grad_norm": 0.0468452163040638,
-      "learning_rate": 3.991210350670052e-05,
-      "loss": 0.0476,
-      "step": 650
-    },
-    {
-      "epoch": 0.031001699352408946,
-      "grad_norm": 0.061356183141469955,
-      "learning_rate": 3.9905217661672294e-05,
-      "loss": 0.1146,
-      "step": 675
-    },
-    {
-      "epoch": 0.032149910439535204,
-      "grad_norm": 15.284680366516113,
-      "learning_rate": 3.989807281073191e-05,
-      "loss": 0.0839,
-      "step": 700
-    },
-    {
-      "epoch": 0.033298121526661464,
-      "grad_norm": 0.3123476803302765,
-      "learning_rate": 3.989066904684786e-05,
-      "loss": 0.154,
-      "step": 725
-    },
-    {
-      "epoch": 0.03444633261378772,
-      "grad_norm": 85.98719787597656,
-      "learning_rate": 3.988300646635763e-05,
-      "loss": 0.1015,
-      "step": 750
-    },
-    {
-      "epoch": 0.03559454370091398,
-      "grad_norm": 5.17075252532959,
-      "learning_rate": 3.987508516896643e-05,
-      "loss": 0.1069,
-      "step": 775
-    },
-    {
-      "epoch": 0.03674275478804023,
-      "grad_norm": 0.2559378445148468,
-      "learning_rate": 3.9866905257745875e-05,
-      "loss": 0.1191,
-      "step": 800
-    },
-    {
-      "epoch": 0.037890965875166494,
-      "grad_norm": 0.17535167932510376,
-      "learning_rate": 3.985846683913263e-05,
-      "loss": 0.104,
-      "step": 825
-    },
-    {
-      "epoch": 0.03903917696229275,
-      "grad_norm": 0.6827572584152222,
-      "learning_rate": 3.9849770022927085e-05,
-      "loss": 0.0745,
-      "step": 850
-    },
-    {
-      "epoch": 0.04018738804941901,
-      "grad_norm": 38.4011344909668,
-      "learning_rate": 3.9840814922291857e-05,
-      "loss": 0.0954,
-      "step": 875
-    },
-    {
-      "epoch": 0.04133559913654526,
-      "grad_norm": 25.91033172607422,
-      "learning_rate": 3.983160165375038e-05,
-      "loss": 0.1061,
-      "step": 900
-    },
-    {
-      "epoch": 0.04248381022367152,
-      "grad_norm": 17.619667053222656,
-      "learning_rate": 3.982213033718533e-05,
-      "loss": 0.0914,
-      "step": 925
-    },
-    {
-      "epoch": 0.043632021310797776,
-      "grad_norm": 0.0705079659819603,
-      "learning_rate": 3.981240109583711e-05,
-      "loss": 0.118,
-      "step": 950
-    },
-    {
-      "epoch": 0.04478023239792404,
-      "grad_norm": 0.1458115577697754,
-      "learning_rate": 3.9802414056302235e-05,
-      "loss": 0.0843,
-      "step": 975
-    },
-    {
-      "epoch": 0.04592844348505029,
-      "grad_norm": 0.156606525182724,
-      "learning_rate": 3.9792169348531666e-05,
-      "loss": 0.1119,
-      "step": 1000
-    },
-    {
-      "epoch": 0.04707665457217655,
-      "grad_norm": 1.229665994644165,
-      "learning_rate": 3.978166710582914e-05,
-      "loss": 0.1472,
-      "step": 1025
-    },
-    {
-      "epoch": 0.048224865659302805,
-      "grad_norm": 0.1480337530374527,
-      "learning_rate": 3.977090746484942e-05,
-      "loss": 0.1022,
-      "step": 1050
-    },
-    {
-      "epoch": 0.049373076746429066,
-      "grad_norm": 17.64451789855957,
-      "learning_rate": 3.975989056559655e-05,
-      "loss": 0.0852,
-      "step": 1075
-    },
-    {
-      "epoch": 0.05052128783355532,
-      "grad_norm": 72.8317642211914,
-      "learning_rate": 3.974861655142198e-05,
-      "loss": 0.0509,
-      "step": 1100
-    },
-    {
-      "epoch": 0.05166949892068158,
-      "grad_norm": 0.027190979570150375,
-      "learning_rate": 3.9737085569022736e-05,
-      "loss": 0.0603,
-      "step": 1125
-    },
-    {
-      "epoch": 0.052817710007807835,
-      "grad_norm": 0.06858149915933609,
-      "learning_rate": 3.9725297768439514e-05,
-      "loss": 0.0894,
-      "step": 1150
-    },
-    {
-      "epoch": 0.053965921094934095,
-      "grad_norm": 1.3908005952835083,
-      "learning_rate": 3.971325330305472e-05,
-      "loss": 0.1071,
-      "step": 1175
-    },
-    {
-      "epoch": 0.05511413218206035,
-      "grad_norm": 23.995895385742188,
-      "learning_rate": 3.9700952329590454e-05,
-      "loss": 0.1354,
-      "step": 1200
-    },
-    {
-      "epoch": 0.05626234326918661,
-      "grad_norm": 42.93232727050781,
-      "learning_rate": 3.968839500810651e-05,
-      "loss": 0.0654,
-      "step": 1225
-    },
-    {
-      "epoch": 0.057410554356312864,
-      "grad_norm": 0.07380808889865875,
-      "learning_rate": 3.9675581501998255e-05,
-      "loss": 0.1138,
-      "step": 1250
-    },
-    {
-      "epoch": 0.058558765443439124,
-      "grad_norm": 0.1862788200378418,
-      "learning_rate": 3.966251197799454e-05,
-      "loss": 0.0774,
-      "step": 1275
-    },
-    {
-      "epoch": 0.05970697653056538,
-      "grad_norm": 0.6021133661270142,
-      "learning_rate": 3.964918660615549e-05,
-      "loss": 0.0541,
-      "step": 1300
-    },
-    {
-      "epoch": 0.06085518761769164,
-      "grad_norm": 0.012276153080165386,
-      "learning_rate": 3.9635605559870324e-05,
-      "loss": 0.079,
-      "step": 1325
-    },
-    {
-      "epoch": 0.06200339870481789,
-      "grad_norm": 0.12071600556373596,
-      "learning_rate": 3.962176901585508e-05,
-      "loss": 0.0939,
-      "step": 1350
-    },
-    {
-      "epoch": 0.06315160979194415,
-      "grad_norm": 193.4734344482422,
-      "learning_rate": 3.960767715415033e-05,
-      "loss": 0.1279,
-      "step": 1375
-    },
-    {
-      "epoch": 0.06429982087907041,
-      "grad_norm": 20.70905876159668,
-      "learning_rate": 3.9593330158118826e-05,
-      "loss": 0.1717,
-      "step": 1400
-    },
-    {
-      "epoch": 0.06544803196619667,
-      "grad_norm": 0.20234434306621552,
-      "learning_rate": 3.957872821444312e-05,
-      "loss": 0.0846,
-      "step": 1425
-    },
-    {
-      "epoch": 0.06659624305332293,
-      "grad_norm": 0.10155820101499557,
-      "learning_rate": 3.956387151312312e-05,
-      "loss": 0.0694,
-      "step": 1450
-    },
-    {
-      "epoch": 0.06774445414044918,
-      "grad_norm": 0.027118023484945297,
-      "learning_rate": 3.9548760247473666e-05,
-      "loss": 0.1274,
-      "step": 1475
-    },
-    {
-      "epoch": 0.06889266522757544,
-      "grad_norm": 0.05700485408306122,
-      "learning_rate": 3.9533394614121926e-05,
-      "loss": 0.0701,
-      "step": 1500
-    },
-    {
-      "epoch": 0.0700408763147017,
-      "grad_norm": 64.16590881347656,
-      "learning_rate": 3.951777481300494e-05,
-      "loss": 0.1467,
-      "step": 1525
-    },
-    {
-      "epoch": 0.07118908740182796,
-      "grad_norm": 10.680575370788574,
-      "learning_rate": 3.950190104736694e-05,
-      "loss": 0.142,
-      "step": 1550
-    },
-    {
-      "epoch": 0.0723372984889542,
-      "grad_norm": 14.797567367553711,
-      "learning_rate": 3.948577352375674e-05,
-      "loss": 0.143,
-      "step": 1575
-    },
-    {
-      "epoch": 0.07348550957608047,
-      "grad_norm": 1.4608783721923828,
-      "learning_rate": 3.946939245202505e-05,
-      "loss": 0.1086,
-      "step": 1600
-    },
-    {
-      "epoch": 0.07463372066320673,
-      "grad_norm": 7.491002082824707,
-      "learning_rate": 3.9452758045321726e-05,
-      "loss": 0.1096,
-      "step": 1625
-    },
-    {
-      "epoch": 0.07578193175033299,
-      "grad_norm": 0.8704826235771179,
-      "learning_rate": 3.9435870520093027e-05,
-      "loss": 0.0745,
-      "step": 1650
-    },
-    {
-      "epoch": 0.07693014283745923,
-      "grad_norm": 18.54655647277832,
-      "learning_rate": 3.941873009607876e-05,
-      "loss": 0.1094,
-      "step": 1675
-    },
-    {
-      "epoch": 0.0780783539245855,
-      "grad_norm": 0.26919639110565186,
-      "learning_rate": 3.940133699630945e-05,
-      "loss": 0.1378,
-      "step": 1700
-    },
-    {
-      "epoch": 0.07922656501171176,
-      "grad_norm": 0.07600715756416321,
-      "learning_rate": 3.9383691447103443e-05,
-      "loss": 0.0277,
-      "step": 1725
-    },
-    {
-      "epoch": 0.08037477609883802,
-      "grad_norm": 0.03989580273628235,
-      "learning_rate": 3.936579367806392e-05,
-      "loss": 0.1106,
-      "step": 1750
-    },
-    {
-      "epoch": 0.08152298718596426,
-      "grad_norm": 0.060093723237514496,
-      "learning_rate": 3.934764392207595e-05,
-      "loss": 0.0976,
-      "step": 1775
-    },
-    {
-      "epoch": 0.08267119827309052,
-      "grad_norm": 16.212749481201172,
-      "learning_rate": 3.9329242415303464e-05,
-      "loss": 0.128,
-      "step": 1800
-    },
-    {
-      "epoch": 0.08381940936021678,
-      "grad_norm": 10.21111011505127,
-      "learning_rate": 3.931058939718613e-05,
-      "loss": 0.0758,
-      "step": 1825
-    },
-    {
-      "epoch": 0.08496762044734305,
-      "grad_norm": 0.7069344520568848,
-      "learning_rate": 3.9291685110436285e-05,
-      "loss": 0.1639,
-      "step": 1850
-    },
-    {
-      "epoch": 0.08611583153446929,
-      "grad_norm": 1.7670834064483643,
-      "learning_rate": 3.9272529801035785e-05,
-      "loss": 0.1184,
-      "step": 1875
-    },
-    {
-      "epoch": 0.08726404262159555,
-      "grad_norm": 0.5139174461364746,
-      "learning_rate": 3.925312371823275e-05,
-      "loss": 0.1004,
-      "step": 1900
-    },
-    {
-      "epoch": 0.08841225370872181,
-      "grad_norm": 0.5114403963088989,
-      "learning_rate": 3.9233467114538376e-05,
-      "loss": 0.0819,
-      "step": 1925
-    },
-    {
-      "epoch": 0.08956046479584807,
-      "grad_norm": 0.01510544028133154,
-      "learning_rate": 3.9213560245723625e-05,
-      "loss": 0.1955,
-      "step": 1950
-    },
-    {
-      "epoch": 0.09070867588297432,
-      "grad_norm": 0.10435652732849121,
-      "learning_rate": 3.919340337081589e-05,
-      "loss": 0.1491,
-      "step": 1975
-    },
-    {
-      "epoch": 0.09185688697010058,
-      "grad_norm": 22.64883804321289,
-      "learning_rate": 3.917299675209563e-05,
-      "loss": 0.142,
-      "step": 2000
-    },
-    {
-      "epoch": 0.09300509805722684,
-      "grad_norm": 9.807494163513184,
-      "learning_rate": 3.9152340655092975e-05,
-      "loss": 0.0557,
-      "step": 2025
-    },
-    {
-      "epoch": 0.0941533091443531,
-      "grad_norm": 0.6470879316329956,
-      "learning_rate": 3.9131435348584245e-05,
-      "loss": 0.0908,
-      "step": 2050
-    },
-    {
-      "epoch": 0.09530152023147935,
-      "grad_norm": 28.449705123901367,
-      "learning_rate": 3.9110281104588476e-05,
-      "loss": 0.074,
-      "step": 2075
-    },
-    {
-      "epoch": 0.09644973131860561,
-      "grad_norm": 0.19204096496105194,
-      "learning_rate": 3.908887819836386e-05,
-      "loss": 0.0709,
-      "step": 2100
-    },
-    {
-      "epoch": 0.09759794240573187,
-      "grad_norm": 15.106844902038574,
-      "learning_rate": 3.9067226908404166e-05,
-      "loss": 0.1325,
-      "step": 2125
-    },
-    {
-      "epoch": 0.09874615349285813,
-      "grad_norm": 0.47706907987594604,
-      "learning_rate": 3.904532751643514e-05,
-      "loss": 0.1235,
-      "step": 2150
-    },
-    {
-      "epoch": 0.09989436457998438,
-      "grad_norm": 11.67285442352295,
-      "learning_rate": 3.902318030741081e-05,
-      "loss": 0.0946,
-      "step": 2175
-    },
-    {
-      "epoch": 0.10104257566711064,
-      "grad_norm": 16.763837814331055,
-      "learning_rate": 3.9000785569509785e-05,
-      "loss": 0.1192,
-      "step": 2200
-    },
-    {
-      "epoch": 0.1021907867542369,
-      "grad_norm": 0.05309774726629257,
-      "learning_rate": 3.897814359413153e-05,
-      "loss": 0.0778,
-      "step": 2225
-    },
-    {
-      "epoch": 0.10333899784136316,
-      "grad_norm": 0.08014708012342453,
-      "learning_rate": 3.895525467589253e-05,
-      "loss": 0.0665,
-      "step": 2250
-    },
-    {
-      "epoch": 0.10448720892848941,
-      "grad_norm": 0.309901624917984,
-      "learning_rate": 3.89321191126225e-05,
-      "loss": 0.1596,
-      "step": 2275
-    },
-    {
-      "epoch": 0.10563542001561567,
-      "grad_norm": 11.939780235290527,
-      "learning_rate": 3.890873720536048e-05,
-      "loss": 0.1026,
-      "step": 2300
-    },
-    {
-      "epoch": 0.10678363110274193,
-      "grad_norm": 4.285236835479736,
-      "learning_rate": 3.888510925835093e-05,
-      "loss": 0.1026,
-      "step": 2325
-    },
-    {
-      "epoch": 0.10793184218986819,
-      "grad_norm": 0.06218241900205612,
-      "learning_rate": 3.886123557903976e-05,
-      "loss": 0.079,
-      "step": 2350
-    },
-    {
-      "epoch": 0.10908005327699444,
-      "grad_norm": 19.595443725585938,
-      "learning_rate": 3.883711647807037e-05,
-      "loss": 0.0781,
-      "step": 2375
-    },
-    {
-      "epoch": 0.1102282643641207,
-      "grad_norm": 58.581581115722656,
-      "learning_rate": 3.8812752269279544e-05,
-      "loss": 0.1329,
-      "step": 2400
-    },
-    {
-      "epoch": 0.11137647545124696,
-      "grad_norm": 21.686433792114258,
-      "learning_rate": 3.878814326969341e-05,
-      "loss": 0.1016,
-      "step": 2425
-    },
-    {
-      "epoch": 0.11252468653837322,
-      "grad_norm": 1.1680649518966675,
-      "learning_rate": 3.876328979952332e-05,
-      "loss": 0.1263,
-      "step": 2450
-    },
-    {
-      "epoch": 0.11367289762549947,
-      "grad_norm": 0.2271161824464798,
-      "learning_rate": 3.8738192182161645e-05,
-      "loss": 0.0411,
-      "step": 2475
-    },
-    {
-      "epoch": 0.11482110871262573,
-      "grad_norm": 18.177799224853516,
-      "learning_rate": 3.871285074417759e-05,
-      "loss": 0.1228,
-      "step": 2500
-    },
-    {
-      "epoch": 0.11596931979975199,
-      "grad_norm": 0.03746473789215088,
-      "learning_rate": 3.868726581531297e-05,
-      "loss": 0.0828,
-      "step": 2525
-    },
-    {
-      "epoch": 0.11711753088687825,
-      "grad_norm": 0.31662654876708984,
-      "learning_rate": 3.866143772847786e-05,
-      "loss": 0.0739,
-      "step": 2550
-    },
-    {
-      "epoch": 0.1182657419740045,
-      "grad_norm": 0.0405048243701458,
-      "learning_rate": 3.8635366819746336e-05,
-      "loss": 0.0636,
-      "step": 2575
-    },
-    {
-      "epoch": 0.11941395306113076,
-      "grad_norm": 0.07984010130167007,
-      "learning_rate": 3.860905342835201e-05,
-      "loss": 0.0898,
-      "step": 2600
-    },
-    {
-      "epoch": 0.12056216414825702,
-      "grad_norm": 0.061471085995435715,
-      "learning_rate": 3.8582497896683725e-05,
-      "loss": 0.1553,
-      "step": 2625
-    },
-    {
-      "epoch": 0.12171037523538328,
-      "grad_norm": 11.33873462677002,
-      "learning_rate": 3.855570057028101e-05,
-      "loss": 0.0815,
-      "step": 2650
-    },
-    {
-      "epoch": 0.12285858632250952,
-      "grad_norm": 0.12285740673542023,
-      "learning_rate": 3.8528661797829626e-05,
-      "loss": 0.0393,
-      "step": 2675
-    },
-    {
-      "epoch": 0.12400679740963579,
-      "grad_norm": 0.009187364019453526,
-      "learning_rate": 3.8501381931157026e-05,
-      "loss": 0.1046,
-      "step": 2700
-    },
-    {
-      "epoch": 0.12515500849676203,
-      "grad_norm": 10.243695259094238,
-      "learning_rate": 3.847386132522776e-05,
-      "loss": 0.1128,
-      "step": 2725
-    },
-    {
-      "epoch": 0.1263032195838883,
-      "grad_norm": 0.03348785266280174,
-      "learning_rate": 3.8446100338138864e-05,
-      "loss": 0.0571,
-      "step": 2750
-    },
-    {
-      "epoch": 0.12745143067101455,
-      "grad_norm": 0.054208606481552124,
-      "learning_rate": 3.841809933111523e-05,
-      "loss": 0.1265,
-      "step": 2775
-    },
-    {
-      "epoch": 0.12859964175814081,
-      "grad_norm": 0.04985649138689041,
-      "learning_rate": 3.838985866850486e-05,
-      "loss": 0.0614,
-      "step": 2800
-    },
-    {
-      "epoch": 0.12974785284526708,
-      "grad_norm": 0.009153477847576141,
-      "learning_rate": 3.836137871777414e-05,
-      "loss": 0.0833,
-      "step": 2825
-    },
-    {
-      "epoch": 0.13089606393239334,
-      "grad_norm": 0.103277787566185,
-      "learning_rate": 3.833265984950309e-05,
-      "loss": 0.1913,
-      "step": 2850
-    },
-    {
-      "epoch": 0.1320442750195196,
-      "grad_norm": 32.65317916870117,
-      "learning_rate": 3.830370243738049e-05,
-      "loss": 0.1379,
-      "step": 2875
-    },
-    {
-      "epoch": 0.13319248610664586,
-      "grad_norm": 0.17990216612815857,
-      "learning_rate": 3.827450685819905e-05,
-      "loss": 0.1472,
-      "step": 2900
-    },
-    {
-      "epoch": 0.1343406971937721,
-      "grad_norm": 0.43453171849250793,
-      "learning_rate": 3.8245073491850494e-05,
-      "loss": 0.0601,
-      "step": 2925
-    },
-    {
-      "epoch": 0.13548890828089835,
-      "grad_norm": 30.274080276489258,
-      "learning_rate": 3.821540272132065e-05,
-      "loss": 0.1168,
-      "step": 2950
-    },
-    {
-      "epoch": 0.1366371193680246,
-      "grad_norm": 0.02154465764760971,
-      "learning_rate": 3.8185494932684417e-05,
-      "loss": 0.0858,
-      "step": 2975
-    },
-    {
-      "epoch": 0.13778533045515087,
-      "grad_norm": 0.8407604694366455,
-      "learning_rate": 3.815535051510076e-05,
-      "loss": 0.1562,
-      "step": 3000
-    },
-    {
-      "epoch": 0.13893354154227713,
-      "grad_norm": 0.06254715472459793,
-      "learning_rate": 3.8124969860807655e-05,
-      "loss": 0.0887,
-      "step": 3025
-    },
-    {
-      "epoch": 0.1400817526294034,
-      "grad_norm": 0.47905296087265015,
-      "learning_rate": 3.8094353365117005e-05,
-      "loss": 0.1225,
-      "step": 3050
-    },
-    {
-      "epoch": 0.14122996371652966,
-      "grad_norm": 0.09489297866821289,
-      "learning_rate": 3.806350142640943e-05,
-      "loss": 0.0628,
-      "step": 3075
-    },
-    {
-      "epoch": 0.14237817480365592,
-      "grad_norm": 9.106457710266113,
-      "learning_rate": 3.803241444612917e-05,
-      "loss": 0.1389,
-      "step": 3100
-    },
-    {
-      "epoch": 0.14352638589078215,
-      "grad_norm": 0.08327560126781464,
-      "learning_rate": 3.8001092828778766e-05,
-      "loss": 0.0686,
-      "step": 3125
-    },
-    {
-      "epoch": 0.1446745969779084,
-      "grad_norm": 0.2027389407157898,
-      "learning_rate": 3.7969536981913906e-05,
-      "loss": 0.1172,
-      "step": 3150
-    },
-    {
-      "epoch": 0.14582280806503467,
-      "grad_norm": 0.024477185681462288,
-      "learning_rate": 3.7937747316138015e-05,
-      "loss": 0.0676,
-      "step": 3175
-    },
-    {
-      "epoch": 0.14697101915216093,
-      "grad_norm": 223.8543701171875,
-      "learning_rate": 3.790572424509698e-05,
-      "loss": 0.1085,
-      "step": 3200
-    },
-    {
-      "epoch": 0.1481192302392872,
-      "grad_norm": 0.2227756232023239,
-      "learning_rate": 3.787346818547375e-05,
-      "loss": 0.0865,
-      "step": 3225
-    },
-    {
-      "epoch": 0.14926744132641345,
-      "grad_norm": 0.060894425958395004,
-      "learning_rate": 3.784097955698291e-05,
-      "loss": 0.1782,
-      "step": 3250
-    },
-    {
-      "epoch": 0.1504156524135397,
-      "grad_norm": 0.5547670125961304,
-      "learning_rate": 3.780825878236521e-05,
-      "loss": 0.0733,
-      "step": 3275
-    },
-    {
-      "epoch": 0.15156386350066597,
-      "grad_norm": 6.312192916870117,
-      "learning_rate": 3.77753062873821e-05,
-      "loss": 0.1209,
-      "step": 3300
-    },
-    {
-      "epoch": 0.1527120745877922,
-      "grad_norm": 12.162946701049805,
-      "learning_rate": 3.774212250081014e-05,
-      "loss": 0.1259,
-      "step": 3325
-    },
-    {
-      "epoch": 0.15386028567491847,
-      "grad_norm": 0.04875678941607475,
-      "learning_rate": 3.770870785443548e-05,
-      "loss": 0.0683,
-      "step": 3350
-    },
-    {
-      "epoch": 0.15500849676204473,
-      "grad_norm": 40.281455993652344,
-      "learning_rate": 3.767506278304818e-05,
-      "loss": 0.1197,
-      "step": 3375
-    },
-    {
-      "epoch": 0.156156707849171,
-      "grad_norm": 0.008391838520765305,
-      "learning_rate": 3.7641187724436576e-05,
-      "loss": 0.0589,
-      "step": 3400
-    },
-    {
-      "epoch": 0.15730491893629725,
-      "grad_norm": 0.00909591093659401,
-      "learning_rate": 3.760708311938163e-05,
-      "loss": 0.0528,
-      "step": 3425
-    },
-    {
-      "epoch": 0.1584531300234235,
-      "grad_norm": 0.004507018718868494,
-      "learning_rate": 3.75727494116511e-05,
-      "loss": 0.0541,
-      "step": 3450
-    },
-    {
-      "epoch": 0.15960134111054977,
-      "grad_norm": 15.795205116271973,
-      "learning_rate": 3.753818704799386e-05,
-      "loss": 0.1272,
-      "step": 3475
-    },
-    {
-      "epoch": 0.16074955219767603,
-      "grad_norm": 0.024282341822981834,
-      "learning_rate": 3.750339647813403e-05,
-      "loss": 0.0625,
-      "step": 3500
-    },
-    {
-      "epoch": 0.16189776328480227,
-      "grad_norm": 0.060861699283123016,
-      "learning_rate": 3.7468378154765146e-05,
-      "loss": 0.0647,
-      "step": 3525
-    },
-    {
-      "epoch": 0.16304597437192853,
-      "grad_norm": 0.021951772272586823,
-      "learning_rate": 3.743313253354425e-05,
-      "loss": 0.1148,
-      "step": 3550
-    },
-    {
-      "epoch": 0.1641941854590548,
-      "grad_norm": 1.0026686191558838,
-      "learning_rate": 3.7397660073085994e-05,
-      "loss": 0.1176,
-      "step": 3575
-    },
-    {
-      "epoch": 0.16534239654618105,
-      "grad_norm": 0.014557062648236752,
-      "learning_rate": 3.736196123495663e-05,
-      "loss": 0.0518,
-      "step": 3600
-    },
-    {
-      "epoch": 0.1664906076333073,
-      "grad_norm": 0.09844854474067688,
-      "learning_rate": 3.732603648366805e-05,
-      "loss": 0.1042,
-      "step": 3625
-    },
-    {
-      "epoch": 0.16763881872043357,
-      "grad_norm": 426.1970520019531,
-      "learning_rate": 3.728988628667171e-05,
-      "loss": 0.1162,
-      "step": 3650
-    },
-    {
-      "epoch": 0.16878702980755983,
-      "grad_norm": 43.39368438720703,
-      "learning_rate": 3.725351111435256e-05,
-      "loss": 0.1072,
-      "step": 3675
-    },
-    {
-      "epoch": 0.1699352408946861,
-      "grad_norm": 0.1358594000339508,
-      "learning_rate": 3.7216911440022906e-05,
-      "loss": 0.1128,
-      "step": 3700
-    },
-    {
-      "epoch": 0.17108345198181232,
-      "grad_norm": 0.05883224308490753,
-      "learning_rate": 3.7180087739916284e-05,
-      "loss": 0.0222,
-      "step": 3725
-    },
-    {
-      "epoch": 0.17223166306893858,
-      "grad_norm": 3.3152406215667725,
-      "learning_rate": 3.7143040493181236e-05,
-      "loss": 0.1888,
-      "step": 3750
-    },
-    {
-      "epoch": 0.17337987415606484,
-      "grad_norm": 0.13006258010864258,
-      "learning_rate": 3.710577018187508e-05,
-      "loss": 0.1396,
-      "step": 3775
-    },
-    {
-      "epoch": 0.1745280852431911,
-      "grad_norm": 0.487556129693985,
-      "learning_rate": 3.706827729095765e-05,
-      "loss": 0.0768,
-      "step": 3800
-    },
-    {
-      "epoch": 0.17567629633031737,
-      "grad_norm": 0.18899738788604736,
-      "learning_rate": 3.703056230828497e-05,
-      "loss": 0.0797,
-      "step": 3825
-    },
-    {
-      "epoch": 0.17682450741744363,
-      "grad_norm": 0.32145893573760986,
-      "learning_rate": 3.699262572460293e-05,
-      "loss": 0.132,
-      "step": 3850
-    },
-    {
-      "epoch": 0.1779727185045699,
-      "grad_norm": 0.08297406136989594,
-      "learning_rate": 3.695446803354086e-05,
-      "loss": 0.0802,
-      "step": 3875
-    },
-    {
-      "epoch": 0.17912092959169615,
-      "grad_norm": 0.5447966456413269,
-      "learning_rate": 3.691608973160513e-05,
-      "loss": 0.0227,
-      "step": 3900
-    },
-    {
-      "epoch": 0.18026914067882238,
-      "grad_norm": 18.947834014892578,
-      "learning_rate": 3.687749131817272e-05,
-      "loss": 0.1073,
-      "step": 3925
-    },
-    {
-      "epoch": 0.18141735176594864,
-      "grad_norm": 0.3541482388973236,
-      "learning_rate": 3.683867329548466e-05,
-      "loss": 0.1116,
-      "step": 3950
-    },
-    {
-      "epoch": 0.1825655628530749,
-      "grad_norm": 0.40254440903663635,
-      "learning_rate": 3.679963616863955e-05,
-      "loss": 0.0876,
-      "step": 3975
-    },
-    {
-      "epoch": 0.18371377394020116,
-      "grad_norm": 1.2953015565872192,
-      "learning_rate": 3.676038044558694e-05,
-      "loss": 0.0962,
-      "step": 4000
-    },
-    {
-      "epoch": 0.18486198502732742,
-      "grad_norm": 0.8362603783607483,
-      "learning_rate": 3.672090663712078e-05,
-      "loss": 0.0926,
-      "step": 4025
-    },
-    {
-      "epoch": 0.18601019611445369,
-      "grad_norm": 10.473535537719727,
-      "learning_rate": 3.66812152568727e-05,
-      "loss": 0.0758,
-      "step": 4050
-    },
-    {
-      "epoch": 0.18715840720157995,
-      "grad_norm": 0.20654985308647156,
-      "learning_rate": 3.66413068213054e-05,
-      "loss": 0.1323,
-      "step": 4075
-    },
-    {
-      "epoch": 0.1883066182887062,
-      "grad_norm": 0.19674348831176758,
-      "learning_rate": 3.6601181849705864e-05,
-      "loss": 0.1177,
-      "step": 4100
-    },
-    {
-      "epoch": 0.18945482937583244,
-      "grad_norm": 46.79829025268555,
-      "learning_rate": 3.656084086417867e-05,
-      "loss": 0.1193,
-      "step": 4125
-    },
-    {
-      "epoch": 0.1906030404629587,
-      "grad_norm": 0.13020487129688263,
-      "learning_rate": 3.652028438963912e-05,
-      "loss": 0.0877,
-      "step": 4150
-    },
-    {
-      "epoch": 0.19175125155008496,
-      "grad_norm": 0.07615011185407639,
-      "learning_rate": 3.647951295380648e-05,
-      "loss": 0.0912,
-      "step": 4175
-    },
-    {
-      "epoch": 0.19289946263721122,
-      "grad_norm": 0.33390146493911743,
-      "learning_rate": 3.643852708719708e-05,
-      "loss": 0.0703,
-      "step": 4200
-    },
-    {
-      "epoch": 0.19404767372433748,
-      "grad_norm": 0.8612595200538635,
-      "learning_rate": 3.6397327323117406e-05,
-      "loss": 0.1447,
-      "step": 4225
-    },
-    {
-      "epoch": 0.19519588481146374,
-      "grad_norm": 0.028214924037456512,
-      "learning_rate": 3.635591419765717e-05,
-      "loss": 0.0586,
-      "step": 4250
-    },
-    {
-      "epoch": 0.19634409589859,
-      "grad_norm": 0.050730813294649124,
-      "learning_rate": 3.631428824968235e-05,
-      "loss": 0.0726,
-      "step": 4275
-    },
-    {
-      "epoch": 0.19749230698571626,
-      "grad_norm": 12.573225021362305,
-      "learning_rate": 3.627245002082814e-05,
-      "loss": 0.116,
-      "step": 4300
-    },
-    {
-      "epoch": 0.1986405180728425,
-      "grad_norm": 0.6728110313415527,
-      "learning_rate": 3.623040005549193e-05,
-      "loss": 0.1846,
-      "step": 4325
-    },
-    {
-      "epoch": 0.19978872915996876,
-      "grad_norm": 34.582183837890625,
-      "learning_rate": 3.6188138900826225e-05,
-      "loss": 0.1518,
-      "step": 4350
-    },
-    {
-      "epoch": 0.20093694024709502,
-      "grad_norm": 22.49669647216797,
-      "learning_rate": 3.6145667106731516e-05,
-      "loss": 0.0471,
-      "step": 4375
-    },
-    {
-      "epoch": 0.20208515133422128,
-      "grad_norm": 0.12985405325889587,
-      "learning_rate": 3.610298522584913e-05,
-      "loss": 0.1173,
-      "step": 4400
-    },
-    {
-      "epoch": 0.20323336242134754,
-      "grad_norm": 0.30833348631858826,
-      "learning_rate": 3.606009381355401e-05,
-      "loss": 0.1189,
-      "step": 4425
-    },
-    {
-      "epoch": 0.2043815735084738,
-      "grad_norm": 0.16521891951560974,
-      "learning_rate": 3.601699342794755e-05,
-      "loss": 0.0812,
-      "step": 4450
-    },
-    {
-      "epoch": 0.20552978459560006,
-      "grad_norm": 0.01110359001904726,
-      "learning_rate": 3.597368462985027e-05,
-      "loss": 0.0567,
-      "step": 4475
-    },
-    {
-      "epoch": 0.20667799568272632,
-      "grad_norm": 0.005069703795015812,
-      "learning_rate": 3.5930167982794555e-05,
-      "loss": 0.1032,
-      "step": 4500
-    },
-    {
-      "epoch": 0.20782620676985256,
-      "grad_norm": 0.046138398349285126,
-      "learning_rate": 3.588644405301731e-05,
-      "loss": 0.0537,
-      "step": 4525
-    },
-    {
-      "epoch": 0.20897441785697882,
-      "grad_norm": 0.9147820472717285,
-      "learning_rate": 3.5842513409452606e-05,
-      "loss": 0.0653,
-      "step": 4550
-    },
-    {
-      "epoch": 0.21012262894410508,
-      "grad_norm": 7.11830997467041,
-      "learning_rate": 3.579837662372424e-05,
-      "loss": 0.1436,
-      "step": 4575
-    },
-    {
-      "epoch": 0.21127084003123134,
-      "grad_norm": 0.07249364256858826,
-      "learning_rate": 3.575403427013834e-05,
-      "loss": 0.1258,
-      "step": 4600
-    },
-    {
-      "epoch": 0.2124190511183576,
-      "grad_norm": 9.790180206298828,
-      "learning_rate": 3.5709486925675887e-05,
-      "loss": 0.082,
-      "step": 4625
-    },
-    {
-      "epoch": 0.21356726220548386,
-      "grad_norm": 0.03522699326276779,
-      "learning_rate": 3.566473516998517e-05,
-      "loss": 0.1248,
-      "step": 4650
-    },
-    {
-      "epoch": 0.21471547329261012,
-      "grad_norm": 0.570968508720398,
-      "learning_rate": 3.5619779585374275e-05,
-      "loss": 0.0791,
-      "step": 4675
-    },
-    {
-      "epoch": 0.21586368437973638,
-      "grad_norm": 0.13998058438301086,
-      "learning_rate": 3.557462075680352e-05,
-      "loss": 0.0686,
-      "step": 4700
-    },
-    {
-      "epoch": 0.21701189546686261,
-      "grad_norm": 14.644072532653809,
-      "learning_rate": 3.552925927187781e-05,
-      "loss": 0.1849,
-      "step": 4725
-    },
-    {
-      "epoch": 0.21816010655398888,
-      "grad_norm": 0.02759512886404991,
-      "learning_rate": 3.548369572083901e-05,
-      "loss": 0.0851,
-      "step": 4750
-    },
-    {
-      "epoch": 0.21930831764111514,
-      "grad_norm": 0.30943742394447327,
-      "learning_rate": 3.543793069655825e-05,
-      "loss": 0.0682,
-      "step": 4775
-    },
-    {
-      "epoch": 0.2204565287282414,
-      "grad_norm": 0.032128553837537766,
-      "learning_rate": 3.5391964794528245e-05,
-      "loss": 0.0817,
-      "step": 4800
-    },
-    {
-      "epoch": 0.22160473981536766,
-      "grad_norm": 0.35530638694763184,
-      "learning_rate": 3.534579861285551e-05,
-      "loss": 0.1607,
-      "step": 4825
-    },
-    {
-      "epoch": 0.22275295090249392,
-      "grad_norm": 0.5857437252998352,
-      "learning_rate": 3.529943275225258e-05,
-      "loss": 0.0937,
-      "step": 4850
-    },
-    {
-      "epoch": 0.22390116198962018,
-      "grad_norm": 0.026839064434170723,
-      "learning_rate": 3.525286781603023e-05,
-      "loss": 0.1423,
-      "step": 4875
-    },
-    {
-      "epoch": 0.22504937307674644,
-      "grad_norm": 0.08431849628686905,
-      "learning_rate": 3.5206104410089584e-05,
-      "loss": 0.0854,
-      "step": 4900
-    },
-    {
-      "epoch": 0.22619758416387267,
-      "grad_norm": 0.15257743000984192,
-      "learning_rate": 3.5159143142914236e-05,
-      "loss": 0.0698,
-      "step": 4925
-    },
-    {
-      "epoch": 0.22734579525099893,
-      "grad_norm": 26.322877883911133,
-      "learning_rate": 3.511198462556236e-05,
-      "loss": 0.1412,
-      "step": 4950
-    },
-    {
-      "epoch": 0.2284940063381252,
-      "grad_norm": 0.6526215076446533,
-      "learning_rate": 3.506462947165874e-05,
-      "loss": 0.1362,
-      "step": 4975
-    },
-    {
-      "epoch": 0.22964221742525145,
-      "grad_norm": 0.3663279712200165,
-      "learning_rate": 3.5017078297386776e-05,
-      "loss": 0.1357,
-      "step": 5000
-    },
-    {
-      "epoch": 0.23079042851237772,
-      "grad_norm": 1.1023207902908325,
-      "learning_rate": 3.4969331721480495e-05,
-      "loss": 0.0967,
-      "step": 5025
-    },
-    {
-      "epoch": 0.23193863959950398,
-      "grad_norm": 64.01104736328125,
-      "learning_rate": 3.492139036521646e-05,
-      "loss": 0.0745,
-      "step": 5050
-    },
-    {
-      "epoch": 0.23308685068663024,
-      "grad_norm": 12.65407943725586,
-      "learning_rate": 3.487325485240573e-05,
-      "loss": 0.0891,
-      "step": 5075
-    },
-    {
-      "epoch": 0.2342350617737565,
-      "grad_norm": 0.8625269532203674,
-      "learning_rate": 3.48249258093857e-05,
-      "loss": 0.1101,
-      "step": 5100
-    },
-    {
-      "epoch": 0.23538327286088276,
-      "grad_norm": 0.026748131960630417,
-      "learning_rate": 3.477640386501199e-05,
-      "loss": 0.0314,
-      "step": 5125
-    },
-    {
-      "epoch": 0.236531483948009,
-      "grad_norm": 0.11494199931621552,
-      "learning_rate": 3.472768965065024e-05,
-      "loss": 0.1041,
-      "step": 5150
-    },
-    {
-      "epoch": 0.23767969503513525,
-      "grad_norm": 0.3611622154712677,
-      "learning_rate": 3.46787838001679e-05,
-      "loss": 0.1058,
-      "step": 5175
-    },
-    {
-      "epoch": 0.2388279061222615,
-      "grad_norm": 0.06698184460401535,
-      "learning_rate": 3.4629686949925976e-05,
-      "loss": 0.0987,
-      "step": 5200
-    },
-    {
-      "epoch": 0.23997611720938777,
-      "grad_norm": 0.0825871005654335,
-      "learning_rate": 3.458039973877076e-05,
-      "loss": 0.0437,
-      "step": 5225
-    },
-    {
-      "epoch": 0.24112432829651403,
-      "grad_norm": 8.870837211608887,
-      "learning_rate": 3.453092280802551e-05,
-      "loss": 0.0329,
-      "step": 5250
-    },
-    {
-      "epoch": 0.2422725393836403,
-      "grad_norm": 0.07859117537736893,
-      "learning_rate": 3.448125680148212e-05,
-      "loss": 0.1058,
-      "step": 5275
-    },
-    {
-      "epoch": 0.24342075047076656,
-      "grad_norm": 0.0069983587600290775,
-      "learning_rate": 3.44314023653927e-05,
-      "loss": 0.091,
-      "step": 5300
-    },
-    {
-      "epoch": 0.24456896155789282,
-      "grad_norm": 78.82186126708984,
-      "learning_rate": 3.438136014846124e-05,
-      "loss": 0.0884,
-      "step": 5325
-    },
-    {
-      "epoch": 0.24571717264501905,
-      "grad_norm": 0.035996224731206894,
-      "learning_rate": 3.43311308018351e-05,
-      "loss": 0.1089,
-      "step": 5350
-    },
-    {
-      "epoch": 0.2468653837321453,
-      "grad_norm": 0.06088200584053993,
-      "learning_rate": 3.428071497909657e-05,
-      "loss": 0.0349,
-      "step": 5375
-    },
-    {
-      "epoch": 0.24801359481927157,
-      "grad_norm": 0.19518066942691803,
-      "learning_rate": 3.423011333625435e-05,
-      "loss": 0.1557,
-      "step": 5400
-    },
-    {
-      "epoch": 0.24916180590639783,
-      "grad_norm": 0.14072376489639282,
-      "learning_rate": 3.417932653173505e-05,
-      "loss": 0.1763,
-      "step": 5425
-    },
-    {
-      "epoch": 0.25031001699352406,
-      "grad_norm": 0.13947640359401703,
-      "learning_rate": 3.412835522637456e-05,
-      "loss": 0.044,
-      "step": 5450
-    },
-    {
-      "epoch": 0.2514582280806503,
-      "grad_norm": 0.019208233803510666,
-      "learning_rate": 3.407720008340952e-05,
-      "loss": 0.0769,
-      "step": 5475
-    },
-    {
-      "epoch": 0.2526064391677766,
-      "grad_norm": 1.0778671503067017,
-      "learning_rate": 3.402586176846866e-05,
-      "loss": 0.0479,
-      "step": 5500
-    },
-    {
-      "epoch": 0.25375465025490285,
-      "grad_norm": 53.735904693603516,
-      "learning_rate": 3.397434094956409e-05,
-      "loss": 0.1,
-      "step": 5525
-    },
-    {
-      "epoch": 0.2549028613420291,
-      "grad_norm": 0.013616573065519333,
-      "learning_rate": 3.3922638297082715e-05,
-      "loss": 0.0678,
-      "step": 5550
-    },
-    {
-      "epoch": 0.25605107242915537,
-      "grad_norm": 0.19283778965473175,
-      "learning_rate": 3.387075448377741e-05,
-      "loss": 0.0992,
-      "step": 5575
-    },
-    {
-      "epoch": 0.25719928351628163,
-      "grad_norm": 0.1097509115934372,
-      "learning_rate": 3.381869018475832e-05,
-      "loss": 0.0624,
-      "step": 5600
-    },
-    {
-      "epoch": 0.2583474946034079,
-      "grad_norm": 3.6526105403900146,
-      "learning_rate": 3.376644607748406e-05,
-      "loss": 0.1127,
-      "step": 5625
-    },
-    {
-      "epoch": 0.25949570569053415,
-      "grad_norm": 11.323371887207031,
-      "learning_rate": 3.371402284175292e-05,
-      "loss": 0.125,
-      "step": 5650
-    },
-    {
-      "epoch": 0.2606439167776604,
-      "grad_norm": 0.08170542865991592,
-      "learning_rate": 3.366142115969398e-05,
-      "loss": 0.0569,
-      "step": 5675
-    },
-    {
-      "epoch": 0.2617921278647867,
-      "grad_norm": 0.30748364329338074,
-      "learning_rate": 3.3608641715758264e-05,
-      "loss": 0.0745,
-      "step": 5700
-    },
-    {
-      "epoch": 0.26294033895191293,
-      "grad_norm": 12.165470123291016,
-      "learning_rate": 3.3555685196709836e-05,
-      "loss": 0.1015,
-      "step": 5725
-    },
-    {
-      "epoch": 0.2640885500390392,
-      "grad_norm": 13.167491912841797,
-      "learning_rate": 3.350255229161684e-05,
-      "loss": 0.0937,
-      "step": 5750
-    },
-    {
-      "epoch": 0.26523676112616545,
-      "grad_norm": 0.2379087507724762,
-      "learning_rate": 3.3449243691842555e-05,
-      "loss": 0.1279,
-      "step": 5775
-    },
-    {
-      "epoch": 0.2663849722132917,
-      "grad_norm": 0.28682318329811096,
-      "learning_rate": 3.33957600910364e-05,
-      "loss": 0.0699,
-      "step": 5800
-    },
-    {
-      "epoch": 0.267533183300418,
-      "grad_norm": 10.061598777770996,
-      "learning_rate": 3.334210218512488e-05,
-      "loss": 0.0877,
-      "step": 5825
-    },
-    {
-      "epoch": 0.2686813943875442,
-      "grad_norm": 0.11539531499147415,
-      "learning_rate": 3.3288270672302575e-05,
-      "loss": 0.1336,
-      "step": 5850
-    },
-    {
-      "epoch": 0.26982960547467044,
-      "grad_norm": 0.02353590354323387,
-      "learning_rate": 3.3234266253023014e-05,
-      "loss": 0.1055,
-      "step": 5875
-    },
-    {
-      "epoch": 0.2709778165617967,
-      "grad_norm": 159.0912322998047,
-      "learning_rate": 3.3180089629989585e-05,
-      "loss": 0.1191,
-      "step": 5900
-    },
-    {
-      "epoch": 0.27212602764892296,
-      "grad_norm": 0.15841203927993774,
-      "learning_rate": 3.312574150814639e-05,
-      "loss": 0.1149,
-      "step": 5925
-    },
-    {
-      "epoch": 0.2732742387360492,
-      "grad_norm": 0.10681680589914322,
-      "learning_rate": 3.3071222594669045e-05,
-      "loss": 0.1223,
-      "step": 5950
-    },
-    {
-      "epoch": 0.2744224498231755,
-      "grad_norm": 0.1897619068622589,
-      "learning_rate": 3.301653359895554e-05,
-      "loss": 0.0769,
-      "step": 5975
-    },
-    {
-      "epoch": 0.27557066091030175,
-      "grad_norm": 0.33233270049095154,
-      "learning_rate": 3.296167523261692e-05,
-      "loss": 0.0216,
-      "step": 6000
-    },
-    {
-      "epoch": 0.276718871997428,
-      "grad_norm": 0.11045894026756287,
-      "learning_rate": 3.2906648209468116e-05,
-      "loss": 0.0968,
-      "step": 6025
-    },
-    {
-      "epoch": 0.27786708308455427,
-      "grad_norm": 0.17217367887496948,
-      "learning_rate": 3.2851453245518585e-05,
-      "loss": 0.0703,
-      "step": 6050
-    },
-    {
-      "epoch": 0.27901529417168053,
-      "grad_norm": 0.09274031966924667,
-      "learning_rate": 3.279609105896304e-05,
-      "loss": 0.0921,
-      "step": 6075
-    },
-    {
-      "epoch": 0.2801635052588068,
-      "grad_norm": 0.11065138876438141,
-      "learning_rate": 3.274056237017209e-05,
-      "loss": 0.0489,
-      "step": 6100
-    },
-    {
-      "epoch": 0.28131171634593305,
-      "grad_norm": 0.32041868567466736,
-      "learning_rate": 3.268486790168285e-05,
-      "loss": 0.1396,
-      "step": 6125
-    },
-    {
-      "epoch": 0.2824599274330593,
-      "grad_norm": 190.3155975341797,
-      "learning_rate": 3.262900837818955e-05,
-      "loss": 0.0857,
-      "step": 6150
-    },
-    {
-      "epoch": 0.28360813852018557,
-      "grad_norm": 0.05174362286925316,
-      "learning_rate": 3.257298452653414e-05,
-      "loss": 0.099,
-      "step": 6175
-    },
-    {
-      "epoch": 0.28475634960731183,
-      "grad_norm": 0.04005354642868042,
-      "learning_rate": 3.251679707569677e-05,
-      "loss": 0.0774,
-      "step": 6200
-    },
-    {
-      "epoch": 0.2859045606944381,
-      "grad_norm": 2.6924703121185303,
-      "learning_rate": 3.246044675678636e-05,
-      "loss": 0.1248,
-      "step": 6225
-    },
-    {
-      "epoch": 0.2870527717815643,
-      "grad_norm": 1.217990756034851,
-      "learning_rate": 3.240393430303105e-05,
-      "loss": 0.1095,
-      "step": 6250
-    },
-    {
-      "epoch": 0.28820098286869056,
-      "grad_norm": 10.461615562438965,
-      "learning_rate": 3.234726044976865e-05,
-      "loss": 0.0696,
-      "step": 6275
-    },
-    {
-      "epoch": 0.2893491939558168,
-      "grad_norm": 0.1737290918827057,
-      "learning_rate": 3.2290425934437146e-05,
-      "loss": 0.0865,
-      "step": 6300
-    },
-    {
-      "epoch": 0.2904974050429431,
-      "grad_norm": 0.6627403497695923,
-      "learning_rate": 3.2233431496565015e-05,
-      "loss": 0.0659,
-      "step": 6325
-    },
-    {
-      "epoch": 0.29164561613006934,
-      "grad_norm": 20.512529373168945,
-      "learning_rate": 3.2176277877761645e-05,
-      "loss": 0.1103,
-      "step": 6350
-    },
-    {
-      "epoch": 0.2927938272171956,
-      "grad_norm": 1.0080277919769287,
-      "learning_rate": 3.211896582170769e-05,
-      "loss": 0.1816,
-      "step": 6375
-    },
-    {
-      "epoch": 0.29394203830432186,
-      "grad_norm": 11.729516983032227,
-      "learning_rate": 3.2061496074145375e-05,
-      "loss": 0.0864,
-      "step": 6400
-    },
-    {
-      "epoch": 0.2950902493914481,
-      "grad_norm": 141.841064453125,
-      "learning_rate": 3.20038693828688e-05,
-      "loss": 0.1042,
-      "step": 6425
-    },
-    {
-      "epoch": 0.2962384604785744,
-      "grad_norm": 0.052077438682317734,
-      "learning_rate": 3.194608649771421e-05,
-      "loss": 0.0248,
-      "step": 6450
-    },
-    {
-      "epoch": 0.29738667156570064,
-      "grad_norm": 0.021261312067508698,
-      "learning_rate": 3.188814817055026e-05,
-      "loss": 0.0778,
-      "step": 6475
-    },
-    {
-      "epoch": 0.2985348826528269,
-      "grad_norm": 0.026634275913238525,
-      "learning_rate": 3.183005515526818e-05,
-      "loss": 0.0372,
-      "step": 6500
-    },
-    {
-      "epoch": 0.29968309373995317,
-      "grad_norm": 0.02382843941450119,
-      "learning_rate": 3.177180820777201e-05,
-      "loss": 0.0579,
-      "step": 6525
-    },
-    {
-      "epoch": 0.3008313048270794,
-      "grad_norm": 0.18128037452697754,
-      "learning_rate": 3.171340808596875e-05,
-      "loss": 0.1122,
-      "step": 6550
-    },
-    {
-      "epoch": 0.3019795159142057,
-      "grad_norm": 0.0806470662355423,
-      "learning_rate": 3.165485554975849e-05,
-      "loss": 0.1145,
-      "step": 6575
-    },
-    {
-      "epoch": 0.30312772700133195,
-      "grad_norm": 0.20076870918273926,
-      "learning_rate": 3.1596151361024545e-05,
-      "loss": 0.087,
-      "step": 6600
-    },
-    {
-      "epoch": 0.3042759380884582,
-      "grad_norm": 1.3655481338500977,
-      "learning_rate": 3.153729628362351e-05,
-      "loss": 0.1082,
-      "step": 6625
-    },
-    {
-      "epoch": 0.3054241491755844,
-      "grad_norm": 0.5538046956062317,
-      "learning_rate": 3.147829108337536e-05,
-      "loss": 0.0857,
-      "step": 6650
-    },
-    {
-      "epoch": 0.3065723602627107,
-      "grad_norm": 13.633685111999512,
-      "learning_rate": 3.141913652805343e-05,
-      "loss": 0.0875,
-      "step": 6675
-    },
-    {
-      "epoch": 0.30772057134983694,
-      "grad_norm": 0.4091302156448364,
-      "learning_rate": 3.135983338737449e-05,
-      "loss": 0.0719,
-      "step": 6700
-    },
-    {
-      "epoch": 0.3088687824369632,
-      "grad_norm": 0.15042781829833984,
-      "learning_rate": 3.130038243298867e-05,
-      "loss": 0.0981,
-      "step": 6725
-    },
-    {
-      "epoch": 0.31001699352408946,
-      "grad_norm": 0.1054098829627037,
-      "learning_rate": 3.124078443846947e-05,
-      "loss": 0.0626,
-      "step": 6750
-    },
-    {
-      "epoch": 0.3111652046112157,
-      "grad_norm": 0.033151715993881226,
-      "learning_rate": 3.118104017930365e-05,
-      "loss": 0.059,
-      "step": 6775
-    },
-    {
-      "epoch": 0.312313415698342,
-      "grad_norm": 0.18374769389629364,
-      "learning_rate": 3.1121150432881174e-05,
-      "loss": 0.0972,
-      "step": 6800
-    },
-    {
-      "epoch": 0.31346162678546824,
-      "grad_norm": 0.016897082328796387,
-      "learning_rate": 3.106111597848508e-05,
-      "loss": 0.0453,
-      "step": 6825
-    },
-    {
-      "epoch": 0.3146098378725945,
-      "grad_norm": 0.054736267775297165,
-      "learning_rate": 3.100093759728133e-05,
-      "loss": 0.0442,
-      "step": 6850
-    },
-    {
-      "epoch": 0.31575804895972076,
-      "grad_norm": 15.296791076660156,
-      "learning_rate": 3.0940616072308665e-05,
-      "loss": 0.13,
-      "step": 6875
-    },
-    {
-      "epoch": 0.316906260046847,
-      "grad_norm": 0.01988917961716652,
-      "learning_rate": 3.088015218846841e-05,
-      "loss": 0.036,
-      "step": 6900
-    },
-    {
-      "epoch": 0.3180544711339733,
-      "grad_norm": 0.025909580290317535,
-      "learning_rate": 3.081954673251423e-05,
-      "loss": 0.1197,
-      "step": 6925
-    },
-    {
-      "epoch": 0.31920268222109954,
-      "grad_norm": 0.13559691607952118,
-      "learning_rate": 3.075880049304196e-05,
-      "loss": 0.0492,
-      "step": 6950
-    },
-    {
-      "epoch": 0.3203508933082258,
-      "grad_norm": 0.3738120198249817,
-      "learning_rate": 3.069791426047929e-05,
-      "loss": 0.1177,
-      "step": 6975
-    },
-    {
-      "epoch": 0.32149910439535206,
-      "grad_norm": 0.032735977321863174,
-      "learning_rate": 3.063688882707549e-05,
-      "loss": 0.1113,
-      "step": 7000
-    },
-    {
-      "epoch": 0.3226473154824783,
-      "grad_norm": 16.048425674438477,
-      "learning_rate": 3.0575724986891096e-05,
-      "loss": 0.0528,
-      "step": 7025
-    },
-    {
-      "epoch": 0.32379552656960453,
-      "grad_norm": 11.062355041503906,
-      "learning_rate": 3.0514423535787618e-05,
-      "loss": 0.1168,
-      "step": 7050
-    },
-    {
-      "epoch": 0.3249437376567308,
-      "grad_norm": 0.09487222135066986,
-      "learning_rate": 3.0452985271417116e-05,
-      "loss": 0.0712,
-      "step": 7075
-    },
-    {
-      "epoch": 0.32609194874385705,
-      "grad_norm": 0.012298893183469772,
-      "learning_rate": 3.0391410993211897e-05,
-      "loss": 0.071,
-      "step": 7100
-    },
-    {
-      "epoch": 0.3272401598309833,
-      "grad_norm": 0.11313242465257645,
-      "learning_rate": 3.0329701502374046e-05,
-      "loss": 0.0509,
-      "step": 7125
-    },
-    {
-      "epoch": 0.3283883709181096,
-      "grad_norm": 0.025708986446261406,
-      "learning_rate": 3.0267857601865042e-05,
-      "loss": 0.0877,
-      "step": 7150
-    },
-    {
-      "epoch": 0.32953658200523583,
-      "grad_norm": 0.06797124445438385,
-      "learning_rate": 3.0205880096395294e-05,
-      "loss": 0.0589,
-      "step": 7175
-    },
-    {
-      "epoch": 0.3306847930923621,
-      "grad_norm": 0.6260665059089661,
-      "learning_rate": 3.0143769792413667e-05,
-      "loss": 0.1141,
-      "step": 7200
-    },
-    {
-      "epoch": 0.33183300417948836,
-      "grad_norm": 0.044946711510419846,
-      "learning_rate": 3.008152749809702e-05,
-      "loss": 0.0956,
-      "step": 7225
-    },
-    {
-      "epoch": 0.3329812152666146,
-      "grad_norm": 0.12308789044618607,
-      "learning_rate": 3.0019154023339633e-05,
-      "loss": 0.0621,
-      "step": 7250
-    },
-    {
-      "epoch": 0.3341294263537409,
-      "grad_norm": 0.25819188356399536,
-      "learning_rate": 2.9956650179742723e-05,
-      "loss": 0.0631,
-      "step": 7275
-    },
-    {
-      "epoch": 0.33527763744086714,
-      "grad_norm": 9.435999870300293,
-      "learning_rate": 2.9894016780603845e-05,
-      "loss": 0.1329,
-      "step": 7300
-    },
-    {
-      "epoch": 0.3364258485279934,
-      "grad_norm": 0.24013052880764008,
-      "learning_rate": 2.9831254640906346e-05,
-      "loss": 0.121,
-      "step": 7325
-    },
-    {
-      "epoch": 0.33757405961511966,
-      "grad_norm": 20.42167854309082,
-      "learning_rate": 2.9768364577308718e-05,
-      "loss": 0.0869,
-      "step": 7350
-    },
-    {
-      "epoch": 0.3387222707022459,
-      "grad_norm": 0.2154628038406372,
-      "learning_rate": 2.970534740813401e-05,
-      "loss": 0.1218,
-      "step": 7375
-    },
-    {
-      "epoch": 0.3398704817893722,
-      "grad_norm": 0.3771252930164337,
-      "learning_rate": 2.9642203953359154e-05,
-      "loss": 0.1596,
-      "step": 7400
-    },
-    {
-      "epoch": 0.34101869287649844,
-      "grad_norm": 0.2281711995601654,
-      "learning_rate": 2.957893503460431e-05,
-      "loss": 0.1132,
-      "step": 7425
-    },
-    {
-      "epoch": 0.34216690396362465,
-      "grad_norm": 0.0789218544960022,
-      "learning_rate": 2.9515541475122177e-05,
-      "loss": 0.0721,
-      "step": 7450
-    },
-    {
-      "epoch": 0.3433151150507509,
-      "grad_norm": 0.15669232606887817,
-      "learning_rate": 2.945202409978725e-05,
-      "loss": 0.1045,
-      "step": 7475
-    },
-    {
-      "epoch": 0.34446332613787717,
-      "grad_norm": 0.27901673316955566,
-      "learning_rate": 2.938838373508514e-05,
-      "loss": 0.081,
-      "step": 7500
-    },
-    {
-      "epoch": 0.34561153722500343,
-      "grad_norm": 0.14721404016017914,
-      "learning_rate": 2.9324621209101777e-05,
-      "loss": 0.0968,
-      "step": 7525
-    },
-    {
-      "epoch": 0.3467597483121297,
-      "grad_norm": 0.5337648987770081,
-      "learning_rate": 2.9260737351512653e-05,
-      "loss": 0.1014,
-      "step": 7550
-    },
-    {
-      "epoch": 0.34790795939925595,
-      "grad_norm": 0.0496150366961956,
-      "learning_rate": 2.9196732993572014e-05,
-      "loss": 0.0825,
-      "step": 7575
-    },
-    {
-      "epoch": 0.3490561704863822,
-      "grad_norm": 0.1201993003487587,
-      "learning_rate": 2.913260896810206e-05,
-      "loss": 0.0834,
-      "step": 7600
-    },
-    {
-      "epoch": 0.35020438157350847,
-      "grad_norm": 0.041360825300216675,
-      "learning_rate": 2.9068366109482096e-05,
-      "loss": 0.1034,
-      "step": 7625
-    },
-    {
-      "epoch": 0.35135259266063473,
-      "grad_norm": 0.031238993629813194,
-      "learning_rate": 2.9004005253637683e-05,
-      "loss": 0.0454,
-      "step": 7650
-    },
-    {
-      "epoch": 0.352500803747761,
-      "grad_norm": 0.20502029359340668,
-      "learning_rate": 2.8939527238029757e-05,
-      "loss": 0.0591,
-      "step": 7675
-    },
-    {
-      "epoch": 0.35364901483488725,
-      "grad_norm": 0.028439056128263474,
-      "learning_rate": 2.8874932901643724e-05,
-      "loss": 0.1009,
-      "step": 7700
-    },
-    {
-      "epoch": 0.3547972259220135,
-      "grad_norm": 0.04592956230044365,
-      "learning_rate": 2.881022308497856e-05,
-      "loss": 0.0688,
-      "step": 7725
-    },
-    {
-      "epoch": 0.3559454370091398,
-      "grad_norm": 0.26893487572669983,
-      "learning_rate": 2.874539863003587e-05,
-      "loss": 0.1086,
-      "step": 7750
-    },
-    {
-      "epoch": 0.35709364809626604,
-      "grad_norm": 0.029499026015400887,
-      "learning_rate": 2.868046038030891e-05,
-      "loss": 0.0941,
-      "step": 7775
-    },
-    {
-      "epoch": 0.3582418591833923,
-      "grad_norm": 0.42461779713630676,
-      "learning_rate": 2.8615409180771652e-05,
-      "loss": 0.0788,
-      "step": 7800
-    },
-    {
-      "epoch": 0.35939007027051856,
-      "grad_norm": 0.015663934871554375,
-      "learning_rate": 2.8550245877867745e-05,
-      "loss": 0.0633,
-      "step": 7825
-    },
-    {
-      "epoch": 0.36053828135764476,
-      "grad_norm": 25.207908630371094,
-      "learning_rate": 2.8484971319499547e-05,
-      "loss": 0.0354,
-      "step": 7850
-    },
-    {
-      "epoch": 0.361686492444771,
-      "grad_norm": 0.037977807223796844,
-      "learning_rate": 2.8419586355017034e-05,
-      "loss": 0.0868,
-      "step": 7875
-    },
-    {
-      "epoch": 0.3628347035318973,
-      "grad_norm": 0.06867849826812744,
-      "learning_rate": 2.8354091835206818e-05,
-      "loss": 0.0357,
-      "step": 7900
-    },
-    {
-      "epoch": 0.36398291461902355,
-      "grad_norm": 0.119336798787117,
-      "learning_rate": 2.828848861228102e-05,
-      "loss": 0.0761,
-      "step": 7925
-    },
-    {
-      "epoch": 0.3651311257061498,
-      "grad_norm": 20.873859405517578,
-      "learning_rate": 2.8222777539866197e-05,
-      "loss": 0.1192,
-      "step": 7950
-    },
-    {
-      "epoch": 0.36627933679327607,
-      "grad_norm": 8.662124633789062,
-      "learning_rate": 2.8156959472992264e-05,
-      "loss": 0.054,
-      "step": 7975
-    },
-    {
-      "epoch": 0.3674275478804023,
-      "grad_norm": 0.2797779440879822,
-      "learning_rate": 2.809103526808131e-05,
-      "loss": 0.0914,
-      "step": 8000
-    },
-    {
-      "epoch": 0.3685757589675286,
-      "grad_norm": 0.22510646283626556,
-      "learning_rate": 2.8025005782936525e-05,
-      "loss": 0.1265,
-      "step": 8025
-    },
-    {
-      "epoch": 0.36972397005465485,
-      "grad_norm": 0.7231925129890442,
-      "learning_rate": 2.7958871876730964e-05,
-      "loss": 0.1283,
-      "step": 8050
-    },
-    {
-      "epoch": 0.3708721811417811,
-      "grad_norm": 0.06375906616449356,
-      "learning_rate": 2.7892634409996433e-05,
-      "loss": 0.0671,
-      "step": 8075
-    },
-    {
-      "epoch": 0.37202039222890737,
-      "grad_norm": 0.02470921352505684,
-      "learning_rate": 2.7826294244612255e-05,
-      "loss": 0.0949,
-      "step": 8100
-    },
-    {
-      "epoch": 0.37316860331603363,
-      "grad_norm": 8.095888137817383,
-      "learning_rate": 2.775985224379406e-05,
-      "loss": 0.1367,
-      "step": 8125
-    },
-    {
-      "epoch": 0.3743168144031599,
-      "grad_norm": 10.58785343170166,
-      "learning_rate": 2.7693309272082554e-05,
-      "loss": 0.1,
-      "step": 8150
-    },
-    {
-      "epoch": 0.37546502549028615,
-      "grad_norm": 0.03961142152547836,
-      "learning_rate": 2.762666619533228e-05,
-      "loss": 0.0529,
-      "step": 8175
-    },
-    {
-      "epoch": 0.3766132365774124,
-      "grad_norm": 0.20349906384944916,
-      "learning_rate": 2.7559923880700345e-05,
-      "loss": 0.1317,
-      "step": 8200
-    },
-    {
-      "epoch": 0.3777614476645387,
-      "grad_norm": 0.05352284759283066,
-      "learning_rate": 2.7493083196635127e-05,
-      "loss": 0.173,
-      "step": 8225
-    },
-    {
-      "epoch": 0.3789096587516649,
-      "grad_norm": 47.993160247802734,
-      "learning_rate": 2.7426145012865e-05,
-      "loss": 0.0912,
-      "step": 8250
-    },
-    {
-      "epoch": 0.38005786983879114,
-      "grad_norm": 0.7617914080619812,
-      "learning_rate": 2.7359110200386966e-05,
-      "loss": 0.057,
-      "step": 8275
-    },
-    {
-      "epoch": 0.3812060809259174,
-      "grad_norm": 16.62209701538086,
-      "learning_rate": 2.7291979631455393e-05,
-      "loss": 0.1096,
-      "step": 8300
-    },
-    {
-      "epoch": 0.38235429201304366,
-      "grad_norm": 0.02364448457956314,
-      "learning_rate": 2.722475417957061e-05,
-      "loss": 0.1241,
-      "step": 8325
-    },
-    {
-      "epoch": 0.3835025031001699,
-      "grad_norm": 78.79348754882812,
-      "learning_rate": 2.7157434719467558e-05,
-      "loss": 0.1111,
-      "step": 8350
-    },
-    {
-      "epoch": 0.3846507141872962,
-      "grad_norm": 29.081613540649414,
-      "learning_rate": 2.7090022127104426e-05,
-      "loss": 0.0971,
-      "step": 8375
-    },
-    {
-      "epoch": 0.38579892527442244,
-      "grad_norm": 1.15839684009552,
-      "learning_rate": 2.7022517279651208e-05,
-      "loss": 0.0933,
-      "step": 8400
-    },
-    {
-      "epoch": 0.3869471363615487,
-      "grad_norm": 0.3460037410259247,
-      "learning_rate": 2.695492105547835e-05,
-      "loss": 0.1333,
-      "step": 8425
-    },
-    {
-      "epoch": 0.38809534744867497,
-      "grad_norm": 0.03244093060493469,
-      "learning_rate": 2.6887234334145257e-05,
-      "loss": 0.07,
-      "step": 8450
-    },
-    {
-      "epoch": 0.3892435585358012,
-      "grad_norm": 0.11166483163833618,
-      "learning_rate": 2.6819457996388907e-05,
-      "loss": 0.1051,
-      "step": 8475
-    },
-    {
-      "epoch": 0.3903917696229275,
-      "grad_norm": 0.8538084626197815,
-      "learning_rate": 2.6751592924112347e-05,
-      "loss": 0.1099,
-      "step": 8500
-    },
-    {
-      "epoch": 0.39153998071005375,
-      "grad_norm": 1.3156309127807617,
-      "learning_rate": 2.6683640000373232e-05,
-      "loss": 0.1429,
-      "step": 8525
-    },
-    {
-      "epoch": 0.39268819179718,
-      "grad_norm": 0.0183534175157547,
-      "learning_rate": 2.661560010937235e-05,
-      "loss": 0.1151,
-      "step": 8550
-    },
-    {
-      "epoch": 0.39383640288430627,
-      "grad_norm": 10.313261032104492,
-      "learning_rate": 2.6547474136442088e-05,
-      "loss": 0.108,
-      "step": 8575
-    },
-    {
-      "epoch": 0.39498461397143253,
-      "grad_norm": 0.022917896509170532,
-      "learning_rate": 2.647926296803495e-05,
-      "loss": 0.0776,
-      "step": 8600
-    },
-    {
-      "epoch": 0.3961328250585588,
-      "grad_norm": 0.6006048917770386,
-      "learning_rate": 2.6410967491711975e-05,
-      "loss": 0.04,
-      "step": 8625
-    },
-    {
-      "epoch": 0.397281036145685,
-      "grad_norm": 0.8469564914703369,
-      "learning_rate": 2.6342588596131225e-05,
-      "loss": 0.0361,
-      "step": 8650
-    },
-    {
-      "epoch": 0.39842924723281126,
-      "grad_norm": 0.8163219094276428,
-      "learning_rate": 2.6274127171036217e-05,
-      "loss": 0.0943,
-      "step": 8675
-    },
-    {
-      "epoch": 0.3995774583199375,
-      "grad_norm": 10.35794734954834,
-      "learning_rate": 2.6205584107244324e-05,
-      "loss": 0.0826,
-      "step": 8700
-    },
-    {
-      "epoch": 0.4007256694070638,
-      "grad_norm": 137.8352813720703,
-      "learning_rate": 2.613696029663521e-05,
-      "loss": 0.1485,
-      "step": 8725
-    },
-    {
-      "epoch": 0.40187388049419004,
-      "grad_norm": 0.1350816935300827,
-      "learning_rate": 2.6068256632139203e-05,
-      "loss": 0.0875,
-      "step": 8750
-    },
-    {
-      "epoch": 0.4030220915813163,
-      "grad_norm": 11.143074035644531,
-      "learning_rate": 2.5999474007725702e-05,
-      "loss": 0.1005,
-      "step": 8775
-    },
-    {
-      "epoch": 0.40417030266844256,
-      "grad_norm": 0.28397971391677856,
-      "learning_rate": 2.5930613318391517e-05,
-      "loss": 0.0564,
-      "step": 8800
-    },
-    {
-      "epoch": 0.4053185137555688,
-      "grad_norm": 0.023485900834202766,
-      "learning_rate": 2.5861675460149244e-05,
-      "loss": 0.0957,
-      "step": 8825
-    },
-    {
-      "epoch": 0.4064667248426951,
-      "grad_norm": 0.2290368527173996,
-      "learning_rate": 2.579266133001558e-05,
-      "loss": 0.0567,
-      "step": 8850
-    },
-    {
-      "epoch": 0.40761493592982134,
-      "grad_norm": 0.20042946934700012,
-      "learning_rate": 2.5723571825999692e-05,
-      "loss": 0.0697,
-      "step": 8875
-    },
-    {
-      "epoch": 0.4087631470169476,
-      "grad_norm": 24.241487503051758,
-      "learning_rate": 2.56544078470915e-05,
-      "loss": 0.098,
-      "step": 8900
-    },
-    {
-      "epoch": 0.40991135810407386,
-      "grad_norm": 31.435606002807617,
-      "learning_rate": 2.558517029324998e-05,
-      "loss": 0.0716,
-      "step": 8925
-    },
-    {
-      "epoch": 0.4110595691912001,
-      "grad_norm": 0.030203837901353836,
-      "learning_rate": 2.5515860065391477e-05,
-      "loss": 0.0695,
-      "step": 8950
-    },
-    {
-      "epoch": 0.4122077802783264,
-      "grad_norm": 0.09278186410665512,
-      "learning_rate": 2.5446478065377948e-05,
-      "loss": 0.1542,
-      "step": 8975
-    },
-    {
-      "epoch": 0.41335599136545265,
-      "grad_norm": 0.015812937170267105,
-      "learning_rate": 2.5377025196005277e-05,
-      "loss": 0.1059,
-      "step": 9000
-    },
-    {
-      "epoch": 0.4145042024525789,
-      "grad_norm": 120.72908020019531,
-      "learning_rate": 2.530750236099146e-05,
-      "loss": 0.0975,
-      "step": 9025
-    },
-    {
-      "epoch": 0.4156524135397051,
-      "grad_norm": 12.126846313476562,
-      "learning_rate": 2.5237910464964915e-05,
-      "loss": 0.0327,
-      "step": 9050
-    },
-    {
-      "epoch": 0.4168006246268314,
-      "grad_norm": 0.022481214255094528,
-      "learning_rate": 2.516825041345266e-05,
-      "loss": 0.0526,
-      "step": 9075
-    },
-    {
-      "epoch": 0.41794883571395763,
-      "grad_norm": 22.173906326293945,
-      "learning_rate": 2.5098523112868553e-05,
-      "loss": 0.0843,
-      "step": 9100
-    },
-    {
-      "epoch": 0.4190970468010839,
-      "grad_norm": 14.182297706604004,
-      "learning_rate": 2.5028729470501495e-05,
-      "loss": 0.0936,
-      "step": 9125
-    },
-    {
-      "epoch": 0.42024525788821016,
-      "grad_norm": 20.062166213989258,
-      "learning_rate": 2.4958870394503637e-05,
-      "loss": 0.0503,
-      "step": 9150
-    },
-    {
-      "epoch": 0.4213934689753364,
-      "grad_norm": 1.9899086952209473,
-      "learning_rate": 2.488894679387853e-05,
-      "loss": 0.1136,
-      "step": 9175
-    },
-    {
-      "epoch": 0.4225416800624627,
-      "grad_norm": 0.30002573132514954,
-      "learning_rate": 2.4818959578469325e-05,
-      "loss": 0.0558,
-      "step": 9200
-    },
-    {
-      "epoch": 0.42368989114958894,
-      "grad_norm": 0.0924188494682312,
-      "learning_rate": 2.474890965894693e-05,
-      "loss": 0.0939,
-      "step": 9225
-    },
-    {
-      "epoch": 0.4248381022367152,
-      "grad_norm": 0.11107456684112549,
-      "learning_rate": 2.467879794679815e-05,
-      "loss": 0.1044,
-      "step": 9250
-    },
-    {
-      "epoch": 0.42598631332384146,
-      "grad_norm": 0.28399938344955444,
-      "learning_rate": 2.4608625354313836e-05,
-      "loss": 0.0749,
-      "step": 9275
-    },
-    {
-      "epoch": 0.4271345244109677,
-      "grad_norm": 0.035467736423015594,
-      "learning_rate": 2.4538392794577014e-05,
-      "loss": 0.0737,
-      "step": 9300
-    },
-    {
-      "epoch": 0.428282735498094,
-      "grad_norm": 112.69757080078125,
-      "learning_rate": 2.4468101181450995e-05,
-      "loss": 0.1343,
-      "step": 9325
-    },
-    {
-      "epoch": 0.42943094658522024,
-      "grad_norm": 224.90357971191406,
-      "learning_rate": 2.43977514295675e-05,
-      "loss": 0.0376,
-      "step": 9350
-    },
-    {
-      "epoch": 0.4305791576723465,
-      "grad_norm": 0.23775708675384521,
-      "learning_rate": 2.4327344454314738e-05,
-      "loss": 0.0685,
-      "step": 9375
-    },
-    {
-      "epoch": 0.43172736875947276,
-      "grad_norm": 0.705800473690033,
-      "learning_rate": 2.4256881171825512e-05,
-      "loss": 0.0655,
-      "step": 9400
-    },
-    {
-      "epoch": 0.432875579846599,
-      "grad_norm": 0.2821938991546631,
-      "learning_rate": 2.4186362498965295e-05,
-      "loss": 0.1061,
-      "step": 9425
-    },
-    {
-      "epoch": 0.43402379093372523,
-      "grad_norm": 0.0656893253326416,
-      "learning_rate": 2.4115789353320302e-05,
-      "loss": 0.0865,
-      "step": 9450
-    },
-    {
-      "epoch": 0.4351720020208515,
-      "grad_norm": 2.238302707672119,
-      "learning_rate": 2.4045162653185528e-05,
-      "loss": 0.127,
-      "step": 9475
-    },
-    {
-      "epoch": 0.43632021310797775,
-      "grad_norm": 0.44611087441444397,
-      "learning_rate": 2.3974483317552824e-05,
-      "loss": 0.037,
-      "step": 9500
-    },
-    {
-      "epoch": 0.437468424195104,
-      "grad_norm": 0.26402464509010315,
-      "learning_rate": 2.3903752266098946e-05,
-      "loss": 0.0218,
-      "step": 9525
-    },
-    {
-      "epoch": 0.43861663528223027,
-      "grad_norm": 0.018698792904615402,
-      "learning_rate": 2.3832970419173558e-05,
-      "loss": 0.0601,
-      "step": 9550
-    },
-    {
-      "epoch": 0.43976484636935653,
-      "grad_norm": 44.78874206542969,
-      "learning_rate": 2.376213869778728e-05,
-      "loss": 0.1243,
-      "step": 9575
-    },
-    {
-      "epoch": 0.4409130574564828,
-      "grad_norm": 0.03857385739684105,
-      "learning_rate": 2.3691258023599706e-05,
-      "loss": 0.058,
-      "step": 9600
-    },
-    {
-      "epoch": 0.44206126854360905,
-      "grad_norm": 0.03389425948262215,
-      "learning_rate": 2.3620329318907363e-05,
-      "loss": 0.0755,
-      "step": 9625
-    },
-    {
-      "epoch": 0.4432094796307353,
-      "grad_norm": 15.255989074707031,
-      "learning_rate": 2.3549353506631805e-05,
-      "loss": 0.1009,
-      "step": 9650
-    },
-    {
-      "epoch": 0.4443576907178616,
-      "grad_norm": 0.014160329475998878,
-      "learning_rate": 2.3478331510307508e-05,
-      "loss": 0.1342,
-      "step": 9675
-    },
-    {
-      "epoch": 0.44550590180498784,
-      "grad_norm": 14.596662521362305,
-      "learning_rate": 2.3407264254069908e-05,
-      "loss": 0.0558,
-      "step": 9700
-    },
-    {
-      "epoch": 0.4466541128921141,
-      "grad_norm": 1.6787420511245728,
-      "learning_rate": 2.333615266264335e-05,
-      "loss": 0.0342,
-      "step": 9725
-    },
-    {
-      "epoch": 0.44780232397924036,
-      "grad_norm": 0.008284349925816059,
-      "learning_rate": 2.3264997661329085e-05,
-      "loss": 0.1212,
-      "step": 9750
-    },
-    {
-      "epoch": 0.4489505350663666,
-      "grad_norm": 0.06463667005300522,
-      "learning_rate": 2.3193800175993197e-05,
-      "loss": 0.119,
-      "step": 9775
-    },
-    {
-      "epoch": 0.4500987461534929,
-      "grad_norm": 0.08876292407512665,
-      "learning_rate": 2.3122561133054572e-05,
-      "loss": 0.0863,
-      "step": 9800
-    },
-    {
-      "epoch": 0.45124695724061914,
-      "grad_norm": 21.290327072143555,
-      "learning_rate": 2.3051281459472855e-05,
-      "loss": 0.0651,
-      "step": 9825
-    },
-    {
-      "epoch": 0.45239516832774535,
-      "grad_norm": 19.711048126220703,
-      "learning_rate": 2.2979962082736362e-05,
-      "loss": 0.1419,
-      "step": 9850
-    },
-    {
-      "epoch": 0.4535433794148716,
-      "grad_norm": 11.61158561706543,
-      "learning_rate": 2.290860393085002e-05,
-      "loss": 0.0398,
-      "step": 9875
-    },
-    {
-      "epoch": 0.45469159050199787,
-      "grad_norm": 0.06531768292188644,
-      "learning_rate": 2.2837207932323308e-05,
-      "loss": 0.1047,
-      "step": 9900
-    },
-    {
-      "epoch": 0.4558398015891241,
-      "grad_norm": 0.004605613183230162,
-      "learning_rate": 2.2765775016158173e-05,
-      "loss": 0.0674,
-      "step": 9925
-    },
-    {
-      "epoch": 0.4569880126762504,
-      "grad_norm": 0.27768808603286743,
-      "learning_rate": 2.2694306111836905e-05,
-      "loss": 0.1106,
-      "step": 9950
-    },
-    {
-      "epoch": 0.45813622376337665,
-      "grad_norm": 0.04515165835618973,
-      "learning_rate": 2.262280214931009e-05,
-      "loss": 0.1219,
-      "step": 9975
-    },
-    {
-      "epoch": 0.4592844348505029,
-      "grad_norm": 7.108341217041016,
-      "learning_rate": 2.2551264058984498e-05,
-      "loss": 0.1299,
-      "step": 10000
-    },
-    {
-      "epoch": 0.46043264593762917,
-      "grad_norm": 0.2922165095806122,
-      "learning_rate": 2.247969277171094e-05,
-      "loss": 0.0849,
-      "step": 10025
-    },
-    {
-      "epoch": 0.46158085702475543,
-      "grad_norm": 0.11757965385913849,
-      "learning_rate": 2.2408089218772215e-05,
-      "loss": 0.1056,
-      "step": 10050
-    },
-    {
-      "epoch": 0.4627290681118817,
-      "grad_norm": 13.596343994140625,
-      "learning_rate": 2.2336454331870937e-05,
-      "loss": 0.1195,
-      "step": 10075
-    },
-    {
-      "epoch": 0.46387727919900795,
-      "grad_norm": 0.041957736015319824,
-      "learning_rate": 2.2264789043117457e-05,
-      "loss": 0.0368,
-      "step": 10100
-    },
-    {
-      "epoch": 0.4650254902861342,
-      "grad_norm": 0.09741153568029404,
-      "learning_rate": 2.2193094285017692e-05,
-      "loss": 0.0597,
-      "step": 10125
-    },
-    {
-      "epoch": 0.4661737013732605,
-      "grad_norm": 0.41714927554130554,
-      "learning_rate": 2.2121370990461042e-05,
-      "loss": 0.0787,
-      "step": 10150
-    },
-    {
-      "epoch": 0.46732191246038673,
-      "grad_norm": 0.02291625551879406,
-      "learning_rate": 2.2049620092708194e-05,
-      "loss": 0.0905,
-      "step": 10175
-    },
-    {
-      "epoch": 0.468470123547513,
-      "grad_norm": 2.7025392055511475,
-      "learning_rate": 2.1977842525379012e-05,
-      "loss": 0.0952,
-      "step": 10200
-    },
-    {
-      "epoch": 0.46961833463463926,
-      "grad_norm": 0.047728363424539566,
-      "learning_rate": 2.1906039222440406e-05,
-      "loss": 0.0662,
-      "step": 10225
-    },
-    {
-      "epoch": 0.4707665457217655,
-      "grad_norm": 2.8927001953125,
-      "learning_rate": 2.1834211118194122e-05,
-      "loss": 0.091,
-      "step": 10250
-    },
-    {
-      "epoch": 0.4719147568088917,
-      "grad_norm": 0.3148484528064728,
-      "learning_rate": 2.1762359147264655e-05,
-      "loss": 0.0741,
-      "step": 10275
-    },
-    {
-      "epoch": 0.473062967896018,
-      "grad_norm": 20.764738082885742,
-      "learning_rate": 2.1690484244587023e-05,
-      "loss": 0.0896,
-      "step": 10300
-    },
-    {
-      "epoch": 0.47421117898314424,
-      "grad_norm": 37.63784408569336,
-      "learning_rate": 2.1618587345394643e-05,
-      "loss": 0.079,
-      "step": 10325
-    },
-    {
-      "epoch": 0.4753593900702705,
-      "grad_norm": 0.14948450028896332,
-      "learning_rate": 2.1546669385207152e-05,
-      "loss": 0.0781,
-      "step": 10350
-    },
-    {
-      "epoch": 0.47650760115739677,
-      "grad_norm": 0.020155781880021095,
-      "learning_rate": 2.1474731299818236e-05,
-      "loss": 0.0772,
-      "step": 10375
-    },
-    {
-      "epoch": 0.477655812244523,
-      "grad_norm": 25.559816360473633,
-      "learning_rate": 2.1402774025283435e-05,
-      "loss": 0.1737,
-      "step": 10400
-    },
-    {
-      "epoch": 0.4788040233316493,
-      "grad_norm": 10.59823989868164,
-      "learning_rate": 2.1330798497907986e-05,
-      "loss": 0.0766,
-      "step": 10425
-    },
-    {
-      "epoch": 0.47995223441877555,
-      "grad_norm": 0.12846903502941132,
-      "learning_rate": 2.125880565423464e-05,
-      "loss": 0.0181,
-      "step": 10450
-    },
-    {
-      "epoch": 0.4811004455059018,
-      "grad_norm": 0.02507212944328785,
-      "learning_rate": 2.118679643103144e-05,
-      "loss": 0.0862,
-      "step": 10475
-    },
-    {
-      "epoch": 0.48224865659302807,
-      "grad_norm": 0.03787585720419884,
-      "learning_rate": 2.1114771765279594e-05,
-      "loss": 0.0594,
-      "step": 10500
-    },
-    {
-      "epoch": 0.48339686768015433,
-      "grad_norm": 0.031070342287421227,
-      "learning_rate": 2.1042732594161227e-05,
-      "loss": 0.0266,
-      "step": 10525
-    },
-    {
-      "epoch": 0.4845450787672806,
-      "grad_norm": 0.5428506731987,
-      "learning_rate": 2.09706798550472e-05,
-      "loss": 0.1439,
-      "step": 10550
-    },
-    {
-      "epoch": 0.48569328985440685,
-      "grad_norm": 0.06007479876279831,
-      "learning_rate": 2.089861448548494e-05,
-      "loss": 0.0368,
-      "step": 10575
-    },
-    {
-      "epoch": 0.4868415009415331,
-      "grad_norm": 0.4172542095184326,
-      "learning_rate": 2.0826537423186204e-05,
-      "loss": 0.0624,
-      "step": 10600
-    },
-    {
-      "epoch": 0.4879897120286594,
-      "grad_norm": 0.019977344200015068,
-      "learning_rate": 2.0754449606014916e-05,
-      "loss": 0.0457,
-      "step": 10625
-    },
-    {
-      "epoch": 0.48913792311578563,
-      "grad_norm": 8.558683395385742,
-      "learning_rate": 2.0682351971974915e-05,
-      "loss": 0.0674,
-      "step": 10650
-    },
-    {
-      "epoch": 0.49028613420291184,
-      "grad_norm": 49.44523239135742,
-      "learning_rate": 2.06102454591978e-05,
-      "loss": 0.1491,
-      "step": 10675
-    },
-    {
-      "epoch": 0.4914343452900381,
-      "grad_norm": 6.603743076324463,
-      "learning_rate": 2.0538131005930678e-05,
-      "loss": 0.078,
-      "step": 10700
-    },
-    {
-      "epoch": 0.49258255637716436,
-      "grad_norm": 0.7394055724143982,
-      "learning_rate": 2.0466009550523997e-05,
-      "loss": 0.0763,
-      "step": 10725
-    },
-    {
-      "epoch": 0.4937307674642906,
-      "grad_norm": 0.5767204761505127,
-      "learning_rate": 2.0393882031419307e-05,
-      "loss": 0.1439,
-      "step": 10750
-    },
-    {
-      "epoch": 0.4948789785514169,
-      "grad_norm": 0.05295969545841217,
-      "learning_rate": 2.0321749387137055e-05,
-      "loss": 0.139,
-      "step": 10775
-    },
-    {
-      "epoch": 0.49602718963854314,
-      "grad_norm": 0.20006586611270905,
-      "learning_rate": 2.024961255626439e-05,
-      "loss": 0.0691,
-      "step": 10800
-    },
-    {
-      "epoch": 0.4971754007256694,
-      "grad_norm": 24.380508422851562,
-      "learning_rate": 2.017747247744292e-05,
-      "loss": 0.069,
-      "step": 10825
-    },
-    {
-      "epoch": 0.49832361181279566,
-      "grad_norm": 0.635622501373291,
-      "learning_rate": 2.0105330089356535e-05,
-      "loss": 0.0681,
-      "step": 10850
-    },
-    {
-      "epoch": 0.4994718228999219,
-      "grad_norm": 0.33230355381965637,
-      "learning_rate": 2.0033186330719147e-05,
-      "loss": 0.0754,
-      "step": 10875
-    },
-    {
-      "epoch": 0.5006200339870481,
-      "grad_norm": 0.046828582882881165,
-      "learning_rate": 1.9961042140262533e-05,
-      "loss": 0.0581,
-      "step": 10900
-    },
-    {
-      "epoch": 0.5017682450741744,
-      "grad_norm": 0.0318719819188118,
-      "learning_rate": 1.9888898456724058e-05,
-      "loss": 0.0746,
-      "step": 10925
-    },
-    {
-      "epoch": 0.5029164561613007,
-      "grad_norm": 0.019355185329914093,
-      "learning_rate": 1.9816756218834515e-05,
-      "loss": 0.0664,
-      "step": 10950
-    },
-    {
-      "epoch": 0.5040646672484269,
-      "grad_norm": 427.9779052734375,
-      "learning_rate": 1.974461636530587e-05,
-      "loss": 0.0777,
-      "step": 10975
-    },
-    {
-      "epoch": 0.5052128783355532,
-      "grad_norm": 0.22881121933460236,
-      "learning_rate": 1.9672479834819065e-05,
-      "loss": 0.1357,
-      "step": 11000
-    },
-    {
-      "epoch": 0.5063610894226794,
-      "grad_norm": 15.586023330688477,
-      "learning_rate": 1.960034756601182e-05,
-      "loss": 0.1463,
-      "step": 11025
-    },
-    {
-      "epoch": 0.5075093005098057,
-      "grad_norm": 0.1670362949371338,
-      "learning_rate": 1.9528220497466382e-05,
-      "loss": 0.2057,
-      "step": 11050
-    },
-    {
-      "epoch": 0.508657511596932,
-      "grad_norm": 0.052390482276678085,
-      "learning_rate": 1.945609956769735e-05,
-      "loss": 0.0423,
-      "step": 11075
-    },
-    {
-      "epoch": 0.5098057226840582,
-      "grad_norm": 3.1492886543273926,
-      "learning_rate": 1.938398571513942e-05,
-      "loss": 0.0344,
-      "step": 11100
-    },
-    {
-      "epoch": 0.5109539337711845,
-      "grad_norm": 0.30143025517463684,
-      "learning_rate": 1.9311879878135228e-05,
-      "loss": 0.0601,
-      "step": 11125
-    },
-    {
-      "epoch": 0.5121021448583107,
-      "grad_norm": 0.17168006300926208,
-      "learning_rate": 1.92397829949231e-05,
-      "loss": 0.0636,
-      "step": 11150
-    },
-    {
-      "epoch": 0.513250355945437,
-      "grad_norm": 0.601152777671814,
-      "learning_rate": 1.9167696003624846e-05,
-      "loss": 0.0369,
-      "step": 11175
-    },
-    {
-      "epoch": 0.5143985670325633,
-      "grad_norm": 0.04143074154853821,
-      "learning_rate": 1.909561984223358e-05,
-      "loss": 0.1208,
-      "step": 11200
-    },
-    {
-      "epoch": 0.5155467781196895,
-      "grad_norm": 0.0267263762652874,
-      "learning_rate": 1.9023555448601482e-05,
-      "loss": 0.0431,
-      "step": 11225
-    },
-    {
-      "epoch": 0.5166949892068158,
-      "grad_norm": 0.03883841261267662,
-      "learning_rate": 1.8951503760427628e-05,
-      "loss": 0.0247,
-      "step": 11250
-    },
-    {
-      "epoch": 0.517843200293942,
-      "grad_norm": 17.886707305908203,
-      "learning_rate": 1.8879465715245756e-05,
-      "loss": 0.129,
-      "step": 11275
-    },
-    {
-      "epoch": 0.5189914113810683,
-      "grad_norm": 0.12370496988296509,
-      "learning_rate": 1.8807442250412078e-05,
-      "loss": 0.0849,
-      "step": 11300
-    },
-    {
-      "epoch": 0.5201396224681946,
-      "grad_norm": 0.08131173998117447,
-      "learning_rate": 1.873543430309311e-05,
-      "loss": 0.112,
-      "step": 11325
-    },
-    {
-      "epoch": 0.5212878335553208,
-      "grad_norm": 0.0747842863202095,
-      "learning_rate": 1.8663442810253435e-05,
-      "loss": 0.0306,
-      "step": 11350
-    },
-    {
-      "epoch": 0.5224360446424471,
-      "grad_norm": 15.537251472473145,
-      "learning_rate": 1.8591468708643538e-05,
-      "loss": 0.0562,
-      "step": 11375
-    },
-    {
-      "epoch": 0.5235842557295733,
-      "grad_norm": 0.011435016058385372,
-      "learning_rate": 1.85195129347876e-05,
-      "loss": 0.0637,
-      "step": 11400
-    },
-    {
-      "epoch": 0.5247324668166996,
-      "grad_norm": 0.21264196932315826,
-      "learning_rate": 1.8447576424971348e-05,
-      "loss": 0.0993,
-      "step": 11425
-    },
-    {
-      "epoch": 0.5258806779038259,
-      "grad_norm": 15.959806442260742,
-      "learning_rate": 1.8375660115229815e-05,
-      "loss": 0.0553,
-      "step": 11450
-    },
-    {
-      "epoch": 0.5270288889909521,
-      "grad_norm": 16.946958541870117,
-      "learning_rate": 1.8303764941335206e-05,
-      "loss": 0.0264,
-      "step": 11475
-    },
-    {
-      "epoch": 0.5281771000780784,
-      "grad_norm": 0.8537108302116394,
-      "learning_rate": 1.8231891838784713e-05,
-      "loss": 0.097,
-      "step": 11500
-    },
-    {
-      "epoch": 0.5293253111652046,
-      "grad_norm": 0.011382007971405983,
-      "learning_rate": 1.816004174278832e-05,
-      "loss": 0.0714,
-      "step": 11525
-    },
-    {
-      "epoch": 0.5304735222523309,
-      "grad_norm": 9.222038269042969,
-      "learning_rate": 1.8088215588256672e-05,
-      "loss": 0.0796,
-      "step": 11550
-    },
-    {
-      "epoch": 0.5316217333394572,
-      "grad_norm": 0.031983211636543274,
-      "learning_rate": 1.8016414309788867e-05,
-      "loss": 0.0703,
-      "step": 11575
-    },
-    {
-      "epoch": 0.5327699444265834,
-      "grad_norm": 0.11978611350059509,
-      "learning_rate": 1.7944638841660334e-05,
-      "loss": 0.0489,
-      "step": 11600
-    },
-    {
-      "epoch": 0.5339181555137097,
-      "grad_norm": 0.2896723449230194,
-      "learning_rate": 1.7872890117810654e-05,
-      "loss": 0.1119,
-      "step": 11625
-    },
-    {
-      "epoch": 0.535066366600836,
-      "grad_norm": 213.61729431152344,
-      "learning_rate": 1.7801169071831396e-05,
-      "loss": 0.1102,
-      "step": 11650
-    },
-    {
-      "epoch": 0.5362145776879621,
-      "grad_norm": 15.954391479492188,
-      "learning_rate": 1.772947663695402e-05,
-      "loss": 0.0812,
-      "step": 11675
-    },
-    {
-      "epoch": 0.5373627887750884,
-      "grad_norm": 0.16466745734214783,
-      "learning_rate": 1.7657813746037663e-05,
-      "loss": 0.0633,
-      "step": 11700
-    },
-    {
-      "epoch": 0.5385109998622146,
-      "grad_norm": 0.5646315217018127,
-      "learning_rate": 1.7586181331557057e-05,
-      "loss": 0.111,
-      "step": 11725
-    },
-    {
-      "epoch": 0.5396592109493409,
-      "grad_norm": 0.21161770820617676,
-      "learning_rate": 1.751458032559037e-05,
-      "loss": 0.0424,
-      "step": 11750
-    },
-    {
-      "epoch": 0.5408074220364671,
-      "grad_norm": 0.1260633021593094,
-      "learning_rate": 1.744301165980709e-05,
-      "loss": 0.0511,
-      "step": 11775
-    },
-    {
-      "epoch": 0.5419556331235934,
-      "grad_norm": 0.11227844655513763,
-      "learning_rate": 1.737147626545589e-05,
-      "loss": 0.1005,
-      "step": 11800
-    },
-    {
-      "epoch": 0.5431038442107197,
-      "grad_norm": 0.010617699474096298,
-      "learning_rate": 1.7299975073352523e-05,
-      "loss": 0.0544,
-      "step": 11825
-    },
-    {
-      "epoch": 0.5442520552978459,
-      "grad_norm": 0.055700164288282394,
-      "learning_rate": 1.722850901386769e-05,
-      "loss": 0.0757,
-      "step": 11850
-    },
-    {
-      "epoch": 0.5454002663849722,
-      "grad_norm": 0.08620750904083252,
-      "learning_rate": 1.715707901691496e-05,
-      "loss": 0.0407,
-      "step": 11875
-    },
-    {
-      "epoch": 0.5465484774720984,
-      "grad_norm": 27.830577850341797,
-      "learning_rate": 1.708568601193866e-05,
-      "loss": 0.0594,
-      "step": 11900
-    },
-    {
-      "epoch": 0.5476966885592247,
-      "grad_norm": 0.10323012620210648,
-      "learning_rate": 1.7014330927901764e-05,
-      "loss": 0.1063,
-      "step": 11925
-    },
-    {
-      "epoch": 0.548844899646351,
-      "grad_norm": 0.05482901260256767,
-      "learning_rate": 1.6943014693273837e-05,
-      "loss": 0.0711,
-      "step": 11950
-    },
-    {
-      "epoch": 0.5499931107334772,
-      "grad_norm": 34.74000930786133,
-      "learning_rate": 1.6871738236018918e-05,
-      "loss": 0.044,
-      "step": 11975
-    },
-    {
-      "epoch": 0.5511413218206035,
-      "grad_norm": 0.025118809193372726,
-      "learning_rate": 1.680050248358349e-05,
-      "loss": 0.0378,
-      "step": 12000
-    },
-    {
-      "epoch": 0.5522895329077298,
-      "grad_norm": 4.1571149826049805,
-      "learning_rate": 1.672930836288436e-05,
-      "loss": 0.0656,
-      "step": 12025
-    },
-    {
-      "epoch": 0.553437743994856,
-      "grad_norm": 0.053131286054849625,
-      "learning_rate": 1.6658156800296646e-05,
-      "loss": 0.1149,
-      "step": 12050
-    },
-    {
-      "epoch": 0.5545859550819823,
-      "grad_norm": 0.09221284091472626,
-      "learning_rate": 1.6587048721641693e-05,
-      "loss": 0.1209,
-      "step": 12075
-    },
-    {
-      "epoch": 0.5557341661691085,
-      "grad_norm": 0.021747712045907974,
-      "learning_rate": 1.651598505217502e-05,
-      "loss": 0.0404,
-      "step": 12100
-    },
-    {
-      "epoch": 0.5568823772562348,
-      "grad_norm": 0.19503289461135864,
-      "learning_rate": 1.644496671657432e-05,
-      "loss": 0.1202,
-      "step": 12125
-    },
-    {
-      "epoch": 0.5580305883433611,
-      "grad_norm": 11.685429573059082,
-      "learning_rate": 1.6373994638927394e-05,
-      "loss": 0.0749,
-      "step": 12150
-    },
-    {
-      "epoch": 0.5591787994304873,
-      "grad_norm": 0.0062902239151299,
-      "learning_rate": 1.630306974272013e-05,
-      "loss": 0.0774,
-      "step": 12175
-    },
-    {
-      "epoch": 0.5603270105176136,
-      "grad_norm": 125.52642059326172,
-      "learning_rate": 1.6232192950824504e-05,
-      "loss": 0.0919,
-      "step": 12200
-    },
-    {
-      "epoch": 0.5614752216047398,
-      "grad_norm": 1.1520588397979736,
-      "learning_rate": 1.6161365185486546e-05,
-      "loss": 0.0294,
-      "step": 12225
-    },
-    {
-      "epoch": 0.5626234326918661,
-      "grad_norm": 0.2502564787864685,
-      "learning_rate": 1.609058736831437e-05,
-      "loss": 0.0986,
-      "step": 12250
-    },
-    {
-      "epoch": 0.5637716437789924,
-      "grad_norm": 0.015697428956627846,
-      "learning_rate": 1.6019860420266157e-05,
-      "loss": 0.0206,
-      "step": 12275
-    },
-    {
-      "epoch": 0.5649198548661186,
-      "grad_norm": 0.07955513894557953,
-      "learning_rate": 1.5949185261638186e-05,
-      "loss": 0.0877,
-      "step": 12300
-    },
-    {
-      "epoch": 0.5660680659532449,
-      "grad_norm": 0.09653720259666443,
-      "learning_rate": 1.5878562812052845e-05,
-      "loss": 0.0854,
-      "step": 12325
-    },
-    {
-      "epoch": 0.5672162770403711,
-      "grad_norm": 64.97950744628906,
-      "learning_rate": 1.5807993990446687e-05,
-      "loss": 0.0222,
-      "step": 12350
-    },
-    {
-      "epoch": 0.5683644881274974,
-      "grad_norm": 0.025900257751345634,
-      "learning_rate": 1.5737479715058454e-05,
-      "loss": 0.0203,
-      "step": 12375
-    },
-    {
-      "epoch": 0.5695126992146237,
-      "grad_norm": 0.007998030632734299,
-      "learning_rate": 1.5667020903417124e-05,
-      "loss": 0.064,
-      "step": 12400
-    },
-    {
-      "epoch": 0.5706609103017499,
-      "grad_norm": 0.07901493459939957,
-      "learning_rate": 1.5596618472330008e-05,
-      "loss": 0.0952,
-      "step": 12425
-    },
-    {
-      "epoch": 0.5718091213888762,
-      "grad_norm": 0.338176429271698,
-      "learning_rate": 1.5526273337870767e-05,
-      "loss": 0.0647,
-      "step": 12450
-    },
-    {
-      "epoch": 0.5729573324760023,
-      "grad_norm": 0.10939247161149979,
-      "learning_rate": 1.5455986415367547e-05,
-      "loss": 0.0651,
-      "step": 12475
-    },
-    {
-      "epoch": 0.5741055435631286,
-      "grad_norm": 0.003566980129107833,
-      "learning_rate": 1.538575861939102e-05,
-      "loss": 0.0155,
-      "step": 12500
-    },
-    {
-      "epoch": 0.5752537546502549,
-      "grad_norm": 0.6434283256530762,
-      "learning_rate": 1.531559086374251e-05,
-      "loss": 0.0527,
-      "step": 12525
-    },
-    {
-      "epoch": 0.5764019657373811,
-      "grad_norm": 0.05047542229294777,
-      "learning_rate": 1.5245484061442113e-05,
-      "loss": 0.0715,
-      "step": 12550
-    },
-    {
-      "epoch": 0.5775501768245074,
-      "grad_norm": 0.1261250227689743,
-      "learning_rate": 1.5175439124716793e-05,
-      "loss": 0.134,
-      "step": 12575
-    },
-    {
-      "epoch": 0.5786983879116336,
-      "grad_norm": 9.76130199432373,
-      "learning_rate": 1.5105456964988517e-05,
-      "loss": 0.0846,
-      "step": 12600
-    },
-    {
-      "epoch": 0.5798465989987599,
-      "grad_norm": 0.0062255095690488815,
-      "learning_rate": 1.5035538492862411e-05,
-      "loss": 0.1306,
-      "step": 12625
-    },
-    {
-      "epoch": 0.5809948100858862,
-      "grad_norm": 0.03955512493848801,
-      "learning_rate": 1.4965684618114891e-05,
-      "loss": 0.0427,
-      "step": 12650
-    },
-    {
-      "epoch": 0.5821430211730124,
-      "grad_norm": 0.12320832908153534,
-      "learning_rate": 1.4895896249681831e-05,
-      "loss": 0.0926,
-      "step": 12675
-    },
-    {
-      "epoch": 0.5832912322601387,
-      "grad_norm": 5.945989608764648,
-      "learning_rate": 1.4826174295646763e-05,
-      "loss": 0.0327,
-      "step": 12700
-    },
-    {
-      "epoch": 0.5844394433472649,
-      "grad_norm": 0.010435862466692924,
-      "learning_rate": 1.4756519663229e-05,
-      "loss": 0.0856,
-      "step": 12725
-    },
-    {
-      "epoch": 0.5855876544343912,
-      "grad_norm": 0.029769467189908028,
-      "learning_rate": 1.468693325877191e-05,
-      "loss": 0.0804,
-      "step": 12750
-    },
-    {
-      "epoch": 0.5867358655215175,
-      "grad_norm": 11.208464622497559,
-      "learning_rate": 1.4617415987731045e-05,
-      "loss": 0.0947,
-      "step": 12775
-    },
-    {
-      "epoch": 0.5878840766086437,
-      "grad_norm": 101.77286529541016,
-      "learning_rate": 1.454796875466242e-05,
-      "loss": 0.0808,
-      "step": 12800
-    },
-    {
-      "epoch": 0.58903228769577,
-      "grad_norm": 0.1369004100561142,
-      "learning_rate": 1.447859246321071e-05,
-      "loss": 0.0784,
-      "step": 12825
-    },
-    {
-      "epoch": 0.5901804987828962,
-      "grad_norm": 0.04609095677733421,
-      "learning_rate": 1.4409288016097493e-05,
-      "loss": 0.0839,
-      "step": 12850
-    },
-    {
-      "epoch": 0.5913287098700225,
-      "grad_norm": 0.10736856609582901,
-      "learning_rate": 1.434005631510953e-05,
-      "loss": 0.0442,
-      "step": 12875
-    },
-    {
-      "epoch": 0.5924769209571488,
-      "grad_norm": 0.07319389283657074,
-      "learning_rate": 1.427089826108699e-05,
-      "loss": 0.0816,
-      "step": 12900
-    },
-    {
-      "epoch": 0.593625132044275,
-      "grad_norm": 13.238898277282715,
-      "learning_rate": 1.4201814753911771e-05,
-      "loss": 0.0961,
-      "step": 12925
-    },
-    {
-      "epoch": 0.5947733431314013,
-      "grad_norm": 0.26180291175842285,
-      "learning_rate": 1.4132806692495761e-05,
-      "loss": 0.0415,
-      "step": 12950
-    },
-    {
-      "epoch": 0.5959215542185275,
-      "grad_norm": 0.839544951915741,
-      "learning_rate": 1.4063874974769141e-05,
-      "loss": 0.0837,
-      "step": 12975
-    },
-    {
-      "epoch": 0.5970697653056538,
-      "grad_norm": 1.000243902206421,
-      "learning_rate": 1.3995020497668735e-05,
-      "loss": 0.053,
-      "step": 13000
-    },
-    {
-      "epoch": 0.5982179763927801,
-      "grad_norm": 0.024470528587698936,
-      "learning_rate": 1.3926244157126285e-05,
-      "loss": 0.1149,
-      "step": 13025
-    },
-    {
-      "epoch": 0.5993661874799063,
-      "grad_norm": 274.2747497558594,
-      "learning_rate": 1.385754684805685e-05,
-      "loss": 0.1034,
-      "step": 13050
-    },
-    {
-      "epoch": 0.6005143985670326,
-      "grad_norm": 0.02918631210923195,
-      "learning_rate": 1.3788929464347121e-05,
-      "loss": 0.0702,
-      "step": 13075
-    },
-    {
-      "epoch": 0.6016626096541589,
-      "grad_norm": 0.040033821016550064,
-      "learning_rate": 1.3720392898843808e-05,
-      "loss": 0.0546,
-      "step": 13100
-    },
-    {
-      "epoch": 0.6028108207412851,
-      "grad_norm": 9.150288581848145,
-      "learning_rate": 1.3651938043342013e-05,
-      "loss": 0.1065,
-      "step": 13125
-    },
-    {
-      "epoch": 0.6039590318284114,
-      "grad_norm": 0.039467647671699524,
-      "learning_rate": 1.358356578857363e-05,
-      "loss": 0.0938,
-      "step": 13150
-    },
-    {
-      "epoch": 0.6051072429155376,
-      "grad_norm": 0.035682786256074905,
-      "learning_rate": 1.3515277024195765e-05,
-      "loss": 0.0543,
-      "step": 13175
-    },
-    {
-      "epoch": 0.6062554540026639,
-      "grad_norm": 0.22070759534835815,
-      "learning_rate": 1.3447072638779137e-05,
-      "loss": 0.0657,
-      "step": 13200
-    },
-    {
-      "epoch": 0.6074036650897902,
-      "grad_norm": 0.009781109169125557,
-      "learning_rate": 1.3378953519796545e-05,
-      "loss": 0.0457,
-      "step": 13225
-    },
-    {
-      "epoch": 0.6085518761769164,
-      "grad_norm": 21.625680923461914,
-      "learning_rate": 1.3310920553611286e-05,
-      "loss": 0.0978,
-      "step": 13250
-    },
-    {
-      "epoch": 0.6097000872640426,
-      "grad_norm": 1.1382207870483398,
-      "learning_rate": 1.324297462546567e-05,
-      "loss": 0.0682,
-      "step": 13275
-    },
-    {
-      "epoch": 0.6108482983511688,
-      "grad_norm": 24.955183029174805,
-      "learning_rate": 1.3175116619469424e-05,
-      "loss": 0.0923,
-      "step": 13300
-    },
-    {
-      "epoch": 0.6119965094382951,
-      "grad_norm": 0.0540093258023262,
-      "learning_rate": 1.3107347418588276e-05,
-      "loss": 0.0339,
-      "step": 13325
-    },
-    {
-      "epoch": 0.6131447205254213,
-      "grad_norm": 28.082834243774414,
-      "learning_rate": 1.3039667904632412e-05,
-      "loss": 0.0556,
-      "step": 13350
-    },
-    {
-      "epoch": 0.6142929316125476,
-      "grad_norm": 0.023283861577510834,
-      "learning_rate": 1.2972078958245016e-05,
-      "loss": 0.0866,
-      "step": 13375
-    },
-    {
-      "epoch": 0.6154411426996739,
-      "grad_norm": 22.21122932434082,
-      "learning_rate": 1.2904581458890809e-05,
-      "loss": 0.1098,
-      "step": 13400
-    },
-    {
-      "epoch": 0.6165893537868001,
-      "grad_norm": 0.04448782652616501,
-      "learning_rate": 1.2837176284844604e-05,
-      "loss": 0.0954,
-      "step": 13425
-    },
-    {
-      "epoch": 0.6177375648739264,
-      "grad_norm": 0.056919727474451065,
-      "learning_rate": 1.276986431317989e-05,
-      "loss": 0.1402,
-      "step": 13450
-    },
-    {
-      "epoch": 0.6188857759610527,
-      "grad_norm": 0.10666215419769287,
-      "learning_rate": 1.27026464197574e-05,
-      "loss": 0.087,
-      "step": 13475
-    },
-    {
-      "epoch": 0.6200339870481789,
-      "grad_norm": 0.20836232602596283,
-      "learning_rate": 1.2635523479213732e-05,
-      "loss": 0.056,
-      "step": 13500
-    },
-    {
-      "epoch": 0.6211821981353052,
-      "grad_norm": 12.055030822753906,
-      "learning_rate": 1.2568496364949953e-05,
-      "loss": 0.1186,
-      "step": 13525
-    },
-    {
-      "epoch": 0.6223304092224314,
-      "grad_norm": 0.016045846045017242,
-      "learning_rate": 1.2501565949120258e-05,
-      "loss": 0.1067,
-      "step": 13550
-    },
-    {
-      "epoch": 0.6234786203095577,
-      "grad_norm": 0.24520662426948547,
-      "learning_rate": 1.2434733102620586e-05,
-      "loss": 0.0778,
-      "step": 13575
-    },
-    {
-      "epoch": 0.624626831396684,
-      "grad_norm": 0.03329060226678848,
-      "learning_rate": 1.2367998695077317e-05,
-      "loss": 0.0169,
-      "step": 13600
-    },
-    {
-      "epoch": 0.6257750424838102,
-      "grad_norm": 0.04618681222200394,
-      "learning_rate": 1.2301363594835954e-05,
-      "loss": 0.0935,
-      "step": 13625
-    },
-    {
-      "epoch": 0.6269232535709365,
-      "grad_norm": 0.06207937374711037,
-      "learning_rate": 1.2234828668949796e-05,
-      "loss": 0.0924,
-      "step": 13650
-    },
-    {
-      "epoch": 0.6280714646580627,
-      "grad_norm": 0.05461873859167099,
-      "learning_rate": 1.2168394783168707e-05,
-      "loss": 0.0716,
-      "step": 13675
-    },
-    {
-      "epoch": 0.629219675745189,
-      "grad_norm": 0.06397520750761032,
-      "learning_rate": 1.2102062801927792e-05,
-      "loss": 0.0773,
-      "step": 13700
-    },
-    {
-      "epoch": 0.6303678868323153,
-      "grad_norm": 35.83101272583008,
-      "learning_rate": 1.2035833588336205e-05,
-      "loss": 0.059,
-      "step": 13725
-    },
-    {
-      "epoch": 0.6315160979194415,
-      "grad_norm": 0.06737085431814194,
-      "learning_rate": 1.1969708004165869e-05,
-      "loss": 0.0339,
-      "step": 13750
-    },
-    {
-      "epoch": 0.6326643090065678,
-      "grad_norm": 0.18903277814388275,
-      "learning_rate": 1.190368690984029e-05,
-      "loss": 0.049,
-      "step": 13775
-    },
-    {
-      "epoch": 0.633812520093694,
-      "grad_norm": 30.07693862915039,
-      "learning_rate": 1.1837771164423372e-05,
-      "loss": 0.0811,
-      "step": 13800
-    },
-    {
-      "epoch": 0.6349607311808203,
-      "grad_norm": 0.031337298452854156,
-      "learning_rate": 1.1771961625608203e-05,
-      "loss": 0.0716,
-      "step": 13825
-    },
-    {
-      "epoch": 0.6361089422679466,
-      "grad_norm": 0.1882849633693695,
-      "learning_rate": 1.1706259149705927e-05,
-      "loss": 0.0561,
-      "step": 13850
-    },
-    {
-      "epoch": 0.6372571533550728,
-      "grad_norm": 0.00982393603771925,
-      "learning_rate": 1.1640664591634585e-05,
-      "loss": 0.0454,
-      "step": 13875
-    },
-    {
-      "epoch": 0.6384053644421991,
-      "grad_norm": 0.00783027894794941,
-      "learning_rate": 1.1575178804907993e-05,
-      "loss": 0.1095,
-      "step": 13900
-    },
-    {
-      "epoch": 0.6395535755293253,
-      "grad_norm": 0.06899901479482651,
-      "learning_rate": 1.1509802641624642e-05,
-      "loss": 0.0606,
-      "step": 13925
-    },
-    {
-      "epoch": 0.6407017866164516,
-      "grad_norm": 0.05791301280260086,
-      "learning_rate": 1.1444536952456611e-05,
-      "loss": 0.0876,
-      "step": 13950
-    },
-    {
-      "epoch": 0.6418499977035779,
-      "grad_norm": 0.0551036112010479,
-      "learning_rate": 1.1379382586638487e-05,
-      "loss": 0.0195,
-      "step": 13975
-    },
-    {
-      "epoch": 0.6429982087907041,
-      "grad_norm": 0.3207205832004547,
-      "learning_rate": 1.1314340391956326e-05,
-      "loss": 0.0235,
-      "step": 14000
-    },
-    {
-      "epoch": 0.6441464198778304,
-      "grad_norm": 0.5788611173629761,
-      "learning_rate": 1.1249411214736616e-05,
-      "loss": 0.0834,
-      "step": 14025
-    },
-    {
-      "epoch": 0.6452946309649567,
-      "grad_norm": 0.015164912678301334,
-      "learning_rate": 1.118459589983526e-05,
-      "loss": 0.0862,
-      "step": 14050
-    },
-    {
-      "epoch": 0.6464428420520828,
-      "grad_norm": 0.19156979024410248,
-      "learning_rate": 1.1119895290626616e-05,
-      "loss": 0.1002,
-      "step": 14075
-    },
-    {
-      "epoch": 0.6475910531392091,
-      "grad_norm": 0.8341970443725586,
-      "learning_rate": 1.1055310228992453e-05,
-      "loss": 0.0336,
-      "step": 14100
-    },
-    {
-      "epoch": 0.6487392642263353,
-      "grad_norm": 0.015737101435661316,
-      "learning_rate": 1.0990841555311062e-05,
-      "loss": 0.0473,
-      "step": 14125
-    },
-    {
-      "epoch": 0.6498874753134616,
-      "grad_norm": 0.09364493191242218,
-      "learning_rate": 1.0926490108446317e-05,
-      "loss": 0.0548,
-      "step": 14150
-    },
-    {
-      "epoch": 0.6510356864005878,
-      "grad_norm": 0.07271132618188858,
-      "learning_rate": 1.0862256725736713e-05,
-      "loss": 0.0133,
-      "step": 14175
-    },
-    {
-      "epoch": 0.6521838974877141,
-      "grad_norm": 0.035903144627809525,
-      "learning_rate": 1.0798142242984507e-05,
-      "loss": 0.0337,
-      "step": 14200
-    },
-    {
-      "epoch": 0.6533321085748404,
-      "grad_norm": 19.09912109375,
-      "learning_rate": 1.0734147494444835e-05,
-      "loss": 0.0449,
-      "step": 14225
-    },
-    {
-      "epoch": 0.6544803196619666,
-      "grad_norm": 0.03382350504398346,
-      "learning_rate": 1.0670273312814854e-05,
-      "loss": 0.0619,
-      "step": 14250
-    },
-    {
-      "epoch": 0.6556285307490929,
-      "grad_norm": 0.2958403527736664,
-      "learning_rate": 1.0606520529222928e-05,
-      "loss": 0.0788,
-      "step": 14275
-    },
-    {
-      "epoch": 0.6567767418362191,
-      "grad_norm": 0.008743496611714363,
-      "learning_rate": 1.0542889973217765e-05,
-      "loss": 0.0784,
-      "step": 14300
-    },
-    {
-      "epoch": 0.6579249529233454,
-      "grad_norm": 12.422863960266113,
-      "learning_rate": 1.0479382472757673e-05,
-      "loss": 0.0671,
-      "step": 14325
-    },
-    {
-      "epoch": 0.6590731640104717,
-      "grad_norm": 0.12558233737945557,
-      "learning_rate": 1.0415998854199753e-05,
-      "loss": 0.108,
-      "step": 14350
-    },
-    {
-      "epoch": 0.6602213750975979,
-      "grad_norm": 0.021612035110592842,
-      "learning_rate": 1.0352739942289165e-05,
-      "loss": 0.0598,
-      "step": 14375
-    },
-    {
-      "epoch": 0.6613695861847242,
-      "grad_norm": 0.2242107391357422,
-      "learning_rate": 1.0289606560148402e-05,
-      "loss": 0.0619,
-      "step": 14400
-    },
-    {
-      "epoch": 0.6625177972718505,
-      "grad_norm": 11.117570877075195,
-      "learning_rate": 1.0226599529266554e-05,
-      "loss": 0.0695,
-      "step": 14425
-    },
-    {
-      "epoch": 0.6636660083589767,
-      "grad_norm": 0.20262782275676727,
-      "learning_rate": 1.0163719669488632e-05,
-      "loss": 0.0254,
-      "step": 14450
-    },
-    {
-      "epoch": 0.664814219446103,
-      "grad_norm": 0.039894625544548035,
-      "learning_rate": 1.0100967799004915e-05,
-      "loss": 0.0877,
-      "step": 14475
-    },
-    {
-      "epoch": 0.6659624305332292,
-      "grad_norm": 0.005091145634651184,
-      "learning_rate": 1.0038344734340271e-05,
-      "loss": 0.0592,
-      "step": 14500
-    },
-    {
-      "epoch": 0.6671106416203555,
-      "grad_norm": 0.004997065290808678,
-      "learning_rate": 9.975851290343577e-06,
-      "loss": 0.0867,
-      "step": 14525
-    },
-    {
-      "epoch": 0.6682588527074818,
-      "grad_norm": 1.4442912340164185,
-      "learning_rate": 9.913488280177072e-06,
-      "loss": 0.0725,
-      "step": 14550
-    },
-    {
-      "epoch": 0.669407063794608,
-      "grad_norm": 94.27518463134766,
-      "learning_rate": 9.851256515305803e-06,
-      "loss": 0.0307,
-      "step": 14575
-    },
-    {
-      "epoch": 0.6705552748817343,
-      "grad_norm": 0.0411621555685997,
-      "learning_rate": 9.789156805487044e-06,
-      "loss": 0.0963,
-      "step": 14600
-    },
-    {
-      "epoch": 0.6717034859688605,
-      "grad_norm": 10.251852035522461,
-      "learning_rate": 9.727189958759799e-06,
-      "loss": 0.1885,
-      "step": 14625
-    },
-    {
-      "epoch": 0.6728516970559868,
-      "grad_norm": 23.81732940673828,
-      "learning_rate": 9.665356781434249e-06,
-      "loss": 0.0569,
-      "step": 14650
-    },
-    {
-      "epoch": 0.6739999081431131,
-      "grad_norm": 0.010315894149243832,
-      "learning_rate": 9.603658078081268e-06,
-      "loss": 0.1003,
-      "step": 14675
-    },
-    {
-      "epoch": 0.6751481192302393,
-      "grad_norm": 26.99117088317871,
-      "learning_rate": 9.54209465152197e-06,
-      "loss": 0.0429,
-      "step": 14700
-    },
-    {
-      "epoch": 0.6762963303173656,
-      "grad_norm": 0.15148140490055084,
-      "learning_rate": 9.480667302817238e-06,
-      "loss": 0.097,
-      "step": 14725
-    },
-    {
-      "epoch": 0.6774445414044918,
-      "grad_norm": 15.545239448547363,
-      "learning_rate": 9.419376831257342e-06,
-      "loss": 0.1203,
-      "step": 14750
-    },
-    {
-      "epoch": 0.6785927524916181,
-      "grad_norm": 0.01652432791888714,
-      "learning_rate": 9.358224034351493e-06,
-      "loss": 0.0323,
-      "step": 14775
-    },
-    {
-      "epoch": 0.6797409635787444,
-      "grad_norm": 0.1552172303199768,
-      "learning_rate": 9.297209707817483e-06,
-      "loss": 0.052,
-      "step": 14800
-    },
-    {
-      "epoch": 0.6808891746658706,
-      "grad_norm": 0.020266342908143997,
-      "learning_rate": 9.23633464557134e-06,
-      "loss": 0.0756,
-      "step": 14825
-    },
-    {
-      "epoch": 0.6820373857529969,
-      "grad_norm": 0.062302496284246445,
-      "learning_rate": 9.175599639716976e-06,
-      "loss": 0.011,
-      "step": 14850
-    },
-    {
-      "epoch": 0.683185596840123,
-      "grad_norm": 0.14944753050804138,
-      "learning_rate": 9.115005480535938e-06,
-      "loss": 0.0373,
-      "step": 14875
-    },
-    {
-      "epoch": 0.6843338079272493,
-      "grad_norm": 0.363838255405426,
-      "learning_rate": 9.054552956477022e-06,
-      "loss": 0.074,
-      "step": 14900
-    },
-    {
-      "epoch": 0.6854820190143756,
-      "grad_norm": 0.018838651478290558,
-      "learning_rate": 8.994242854146114e-06,
-      "loss": 0.0707,
-      "step": 14925
-    },
-    {
-      "epoch": 0.6866302301015018,
-      "grad_norm": 0.015431873500347137,
-      "learning_rate": 8.93407595829589e-06,
-      "loss": 0.0222,
-      "step": 14950
-    },
-    {
-      "epoch": 0.6877784411886281,
-      "grad_norm": 10.010194778442383,
-      "learning_rate": 8.874053051815658e-06,
-      "loss": 0.0714,
-      "step": 14975
-    },
-    {
-      "epoch": 0.6889266522757543,
-      "grad_norm": 0.50653076171875,
-      "learning_rate": 8.81417491572112e-06,
-      "loss": 0.0292,
-      "step": 15000
-    },
-    {
-      "epoch": 0.6900748633628806,
-      "grad_norm": 0.018615659326314926,
-      "learning_rate": 8.754442329144232e-06,
-      "loss": 0.0173,
-      "step": 15025
-    },
-    {
-      "epoch": 0.6912230744500069,
-      "grad_norm": 0.0025054675061255693,
-      "learning_rate": 8.694856069323065e-06,
-      "loss": 0.0108,
-      "step": 15050
-    },
-    {
-      "epoch": 0.6923712855371331,
-      "grad_norm": 0.05472448095679283,
-      "learning_rate": 8.635416911591712e-06,
-      "loss": 0.0198,
-      "step": 15075
-    },
-    {
-      "epoch": 0.6935194966242594,
-      "grad_norm": 0.0030740767251700163,
-      "learning_rate": 8.576125629370156e-06,
-      "loss": 0.0536,
-      "step": 15100
-    },
-    {
-      "epoch": 0.6946677077113856,
-      "grad_norm": 0.011597322300076485,
-      "learning_rate": 8.516982994154238e-06,
-      "loss": 0.0311,
-      "step": 15125
-    },
-    {
-      "epoch": 0.6958159187985119,
-      "grad_norm": 9.239104270935059,
-      "learning_rate": 8.457989775505607e-06,
-      "loss": 0.16,
-      "step": 15150
-    },
-    {
-      "epoch": 0.6969641298856382,
-      "grad_norm": 0.03688933327794075,
-      "learning_rate": 8.399146741041709e-06,
-      "loss": 0.036,
-      "step": 15175
-    },
-    {
-      "epoch": 0.6981123409727644,
-      "grad_norm": 0.010223845951259136,
-      "learning_rate": 8.340454656425814e-06,
-      "loss": 0.001,
-      "step": 15200
-    },
-    {
-      "epoch": 0.6992605520598907,
-      "grad_norm": 17.244403839111328,
-      "learning_rate": 8.28191428535702e-06,
-      "loss": 0.1266,
-      "step": 15225
-    },
-    {
-      "epoch": 0.7004087631470169,
-      "grad_norm": 0.015315833501517773,
-      "learning_rate": 8.223526389560345e-06,
-      "loss": 0.0009,
-      "step": 15250
-    },
-    {
-      "epoch": 0.7015569742341432,
-      "grad_norm": 0.007865412160754204,
-      "learning_rate": 8.165291728776799e-06,
-      "loss": 0.0632,
-      "step": 15275
-    },
-    {
-      "epoch": 0.7027051853212695,
-      "grad_norm": 0.1337803304195404,
-      "learning_rate": 8.107211060753497e-06,
-      "loss": 0.0481,
-      "step": 15300
-    },
-    {
-      "epoch": 0.7038533964083957,
-      "grad_norm": 0.21445252001285553,
-      "learning_rate": 8.049285141233831e-06,
-      "loss": 0.0528,
-      "step": 15325
-    },
-    {
-      "epoch": 0.705001607495522,
-      "grad_norm": 0.2052716165781021,
-      "learning_rate": 7.991514723947589e-06,
-      "loss": 0.1004,
-      "step": 15350
-    },
-    {
-      "epoch": 0.7061498185826482,
-      "grad_norm": 6.991987705230713,
-      "learning_rate": 7.933900560601176e-06,
-      "loss": 0.0474,
-      "step": 15375
-    },
-    {
-      "epoch": 0.7072980296697745,
-      "grad_norm": 0.10487792640924454,
-      "learning_rate": 7.876443400867828e-06,
-      "loss": 0.111,
-      "step": 15400
-    },
-    {
-      "epoch": 0.7084462407569008,
-      "grad_norm": 2.286684989929199,
-      "learning_rate": 7.819143992377848e-06,
-      "loss": 0.0326,
-      "step": 15425
-    },
-    {
-      "epoch": 0.709594451844027,
-      "grad_norm": 0.013362064026296139,
-      "learning_rate": 7.76200308070891e-06,
-      "loss": 0.0688,
-      "step": 15450
-    },
-    {
-      "epoch": 0.7107426629311533,
-      "grad_norm": 0.2844740152359009,
-      "learning_rate": 7.70502140937631e-06,
-      "loss": 0.045,
-      "step": 15475
-    },
-    {
-      "epoch": 0.7118908740182796,
-      "grad_norm": 0.06062127277255058,
-      "learning_rate": 7.648199719823321e-06,
-      "loss": 0.0972,
-      "step": 15500
-    },
-    {
-      "epoch": 0.7130390851054058,
-      "grad_norm": 34.67185974121094,
-      "learning_rate": 7.591538751411536e-06,
-      "loss": 0.0903,
-      "step": 15525
-    },
-    {
-      "epoch": 0.7141872961925321,
-      "grad_norm": 0.020777028053998947,
-      "learning_rate": 7.535039241411266e-06,
-      "loss": 0.0811,
-      "step": 15550
-    },
-    {
-      "epoch": 0.7153355072796583,
-      "grad_norm": 0.023871062323451042,
-      "learning_rate": 7.478701924991918e-06,
-      "loss": 0.0868,
-      "step": 15575
-    },
-    {
-      "epoch": 0.7164837183667846,
-      "grad_norm": 0.0139063261449337,
-      "learning_rate": 7.422527535212443e-06,
-      "loss": 0.0627,
-      "step": 15600
-    },
-    {
-      "epoch": 0.7176319294539109,
-      "grad_norm": 0.12506511807441711,
-      "learning_rate": 7.366516803011798e-06,
-      "loss": 0.1005,
-      "step": 15625
-    },
-    {
-      "epoch": 0.7187801405410371,
-      "grad_norm": 0.0652109682559967,
-      "learning_rate": 7.310670457199434e-06,
-      "loss": 0.0836,
-      "step": 15650
-    },
-    {
-      "epoch": 0.7199283516281634,
-      "grad_norm": 0.21352873742580414,
-      "learning_rate": 7.254989224445823e-06,
-      "loss": 0.0903,
-      "step": 15675
-    },
-    {
-      "epoch": 0.7210765627152895,
-      "grad_norm": 0.2726813554763794,
-      "learning_rate": 7.199473829272985e-06,
-      "loss": 0.0385,
-      "step": 15700
-    },
-    {
-      "epoch": 0.7222247738024158,
-      "grad_norm": 0.3017083406448364,
-      "learning_rate": 7.144124994045054e-06,
-      "loss": 0.0429,
-      "step": 15725
-    },
-    {
-      "epoch": 0.723372984889542,
-      "grad_norm": 0.9701851010322571,
-      "learning_rate": 7.088943438958904e-06,
-      "loss": 0.0713,
-      "step": 15750
-    },
-    {
-      "epoch": 0.7245211959766683,
-      "grad_norm": 0.0777645856142044,
-      "learning_rate": 7.03392988203478e-06,
-      "loss": 0.0527,
-      "step": 15775
-    },
-    {
-      "epoch": 0.7256694070637946,
-      "grad_norm": 0.09907884895801544,
-      "learning_rate": 6.979085039106923e-06,
-      "loss": 0.0649,
-      "step": 15800
-    },
-    {
-      "epoch": 0.7268176181509208,
-      "grad_norm": 0.02636418305337429,
-      "learning_rate": 6.924409623814281e-06,
-      "loss": 0.0302,
-      "step": 15825
-    },
-    {
-      "epoch": 0.7279658292380471,
-      "grad_norm": 0.020427890121936798,
-      "learning_rate": 6.8699043475912145e-06,
-      "loss": 0.0163,
-      "step": 15850
-    },
-    {
-      "epoch": 0.7291140403251734,
-      "grad_norm": 15.661312103271484,
-      "learning_rate": 6.815569919658234e-06,
-      "loss": 0.0487,
-      "step": 15875
-    },
-    {
-      "epoch": 0.7302622514122996,
-      "grad_norm": 0.07850444316864014,
-      "learning_rate": 6.7614070470128e-06,
-      "loss": 0.0355,
-      "step": 15900
-    },
-    {
-      "epoch": 0.7314104624994259,
-      "grad_norm": 0.011141127906739712,
-      "learning_rate": 6.707416434420084e-06,
-      "loss": 0.0007,
-      "step": 15925
-    },
-    {
-      "epoch": 0.7325586735865521,
-      "grad_norm": 0.08933714032173157,
-      "learning_rate": 6.65359878440382e-06,
-      "loss": 0.1007,
-      "step": 15950
-    },
-    {
-      "epoch": 0.7337068846736784,
-      "grad_norm": 0.011219929903745651,
-      "learning_rate": 6.599954797237154e-06,
-      "loss": 0.0691,
-      "step": 15975
-    },
-    {
-      "epoch": 0.7348550957608047,
-      "grad_norm": 0.1323518455028534,
-      "learning_rate": 6.546485170933561e-06,
-      "loss": 0.0721,
-      "step": 16000
-    },
-    {
-      "epoch": 0.7360033068479309,
-      "grad_norm": 0.16300442814826965,
-      "learning_rate": 6.493190601237711e-06,
-      "loss": 0.1,
-      "step": 16025
-    },
-    {
-      "epoch": 0.7371515179350572,
-      "grad_norm": 0.08380606770515442,
-      "learning_rate": 6.440071781616462e-06,
-      "loss": 0.0276,
-      "step": 16050
-    },
-    {
-      "epoch": 0.7382997290221834,
-      "grad_norm": 13.4380521774292,
-      "learning_rate": 6.38712940324981e-06,
-      "loss": 0.0711,
-      "step": 16075
-    },
-    {
-      "epoch": 0.7394479401093097,
-      "grad_norm": 0.01914297230541706,
-      "learning_rate": 6.334364155021901e-06,
-      "loss": 0.0429,
-      "step": 16100
-    },
-    {
-      "epoch": 0.740596151196436,
-      "grad_norm": 0.007009518798440695,
-      "learning_rate": 6.281776723512094e-06,
-      "loss": 0.1372,
-      "step": 16125
-    },
-    {
-      "epoch": 0.7417443622835622,
-      "grad_norm": 0.09678731113672256,
-      "learning_rate": 6.229367792985976e-06,
-      "loss": 0.1245,
-      "step": 16150
-    },
-    {
-      "epoch": 0.7428925733706885,
-      "grad_norm": 0.007255534175783396,
-      "learning_rate": 6.177138045386499e-06,
-      "loss": 0.0326,
-      "step": 16175
-    },
-    {
-      "epoch": 0.7440407844578147,
-      "grad_norm": 0.1381562054157257,
-      "learning_rate": 6.125088160325092e-06,
-      "loss": 0.0234,
-      "step": 16200
-    },
-    {
-      "epoch": 0.745188995544941,
-      "grad_norm": 25.754289627075195,
-      "learning_rate": 6.0732188150728125e-06,
-      "loss": 0.0781,
-      "step": 16225
-    },
-    {
-      "epoch": 0.7463372066320673,
-      "grad_norm": 0.053420692682266235,
-      "learning_rate": 6.021530684551564e-06,
-      "loss": 0.0976,
-      "step": 16250
-    },
-    {
-      "epoch": 0.7474854177191935,
-      "grad_norm": 0.01614423282444477,
-      "learning_rate": 5.970024441325266e-06,
-      "loss": 0.06,
-      "step": 16275
-    },
-    {
-      "epoch": 0.7486336288063198,
-      "grad_norm": 0.2804288864135742,
-      "learning_rate": 5.918700755591138e-06,
-      "loss": 0.0173,
-      "step": 16300
-    },
-    {
-      "epoch": 0.749781839893446,
-      "grad_norm": 45.07802200317383,
-      "learning_rate": 5.867560295170967e-06,
-      "loss": 0.0699,
-      "step": 16325
-    },
-    {
-      "epoch": 0.7509300509805723,
-      "grad_norm": 2.6006088256835938,
-      "learning_rate": 5.816603725502412e-06,
-      "loss": 0.0218,
-      "step": 16350
-    },
-    {
-      "epoch": 0.7520782620676986,
-      "grad_norm": 0.07584603875875473,
-      "learning_rate": 5.7658317096303785e-06,
-      "loss": 0.0445,
-      "step": 16375
-    },
-    {
-      "epoch": 0.7532264731548248,
-      "grad_norm": 0.1880454570055008,
-      "learning_rate": 5.715244908198336e-06,
-      "loss": 0.0648,
-      "step": 16400
-    },
-    {
-      "epoch": 0.7543746842419511,
-      "grad_norm": 16.69906234741211,
-      "learning_rate": 5.664843979439765e-06,
-      "loss": 0.1212,
-      "step": 16425
-    },
-    {
-      "epoch": 0.7555228953290773,
-      "grad_norm": 0.2480192333459854,
-      "learning_rate": 5.614629579169568e-06,
-      "loss": 0.0396,
-      "step": 16450
-    },
-    {
-      "epoch": 0.7566711064162036,
-      "grad_norm": 234.59837341308594,
-      "learning_rate": 5.564602360775566e-06,
-      "loss": 0.1436,
-      "step": 16475
-    },
-    {
-      "epoch": 0.7578193175033298,
-      "grad_norm": 0.18311983346939087,
-      "learning_rate": 5.514762975209964e-06,
-      "loss": 0.0444,
-      "step": 16500
-    },
-    {
-      "epoch": 0.758967528590456,
-      "grad_norm": 0.024939807131886482,
-      "learning_rate": 5.465112070980885e-06,
-      "loss": 0.0401,
-      "step": 16525
-    },
-    {
-      "epoch": 0.7601157396775823,
-      "grad_norm": 0.010250646620988846,
-      "learning_rate": 5.415650294143944e-06,
-      "loss": 0.0773,
-      "step": 16550
-    },
-    {
-      "epoch": 0.7612639507647085,
-      "grad_norm": 0.34293168783187866,
-      "learning_rate": 5.366378288293856e-06,
-      "loss": 0.0606,
-      "step": 16575
-    },
-    {
-      "epoch": 0.7624121618518348,
-      "grad_norm": 1.0682902336120605,
-      "learning_rate": 5.317296694556029e-06,
-      "loss": 0.1323,
-      "step": 16600
-    },
-    {
-      "epoch": 0.7635603729389611,
-      "grad_norm": 0.12277599424123764,
-      "learning_rate": 5.268406151578234e-06,
-      "loss": 0.001,
-      "step": 16625
-    },
-    {
-      "epoch": 0.7647085840260873,
-      "grad_norm": 0.6210665702819824,
-      "learning_rate": 5.219707295522298e-06,
-      "loss": 0.046,
-      "step": 16650
-    },
-    {
-      "epoch": 0.7658567951132136,
-      "grad_norm": 19.144742965698242,
-      "learning_rate": 5.171200760055825e-06,
-      "loss": 0.0858,
-      "step": 16675
-    },
-    {
-      "epoch": 0.7670050062003398,
-      "grad_norm": 0.1801862269639969,
-      "learning_rate": 5.122887176343965e-06,
-      "loss": 0.0109,
-      "step": 16700
-    },
-    {
-      "epoch": 0.7681532172874661,
-      "grad_norm": 39.14018630981445,
-      "learning_rate": 5.074767173041169e-06,
-      "loss": 0.1078,
-      "step": 16725
-    },
-    {
-      "epoch": 0.7693014283745924,
-      "grad_norm": 0.11534450948238373,
-      "learning_rate": 5.0268413762830336e-06,
-      "loss": 0.0664,
-      "step": 16750
-    },
-    {
-      "epoch": 0.7704496394617186,
-      "grad_norm": 0.6854662299156189,
-      "learning_rate": 4.979110409678152e-06,
-      "loss": 0.0854,
-      "step": 16775
-    },
-    {
-      "epoch": 0.7715978505488449,
-      "grad_norm": 0.11855833977460861,
-      "learning_rate": 4.931574894299979e-06,
-      "loss": 0.0618,
-      "step": 16800
-    },
-    {
-      "epoch": 0.7727460616359711,
-      "grad_norm": 0.013820292428135872,
-      "learning_rate": 4.884235448678796e-06,
-      "loss": 0.0381,
-      "step": 16825
-    },
-    {
-      "epoch": 0.7738942727230974,
-      "grad_norm": 0.004285596311092377,
-      "learning_rate": 4.837092688793605e-06,
-      "loss": 0.0526,
-      "step": 16850
-    },
-    {
-      "epoch": 0.7750424838102237,
-      "grad_norm": 13.307071685791016,
-      "learning_rate": 4.7901472280641525e-06,
-      "loss": 0.0462,
-      "step": 16875
-    },
-    {
-      "epoch": 0.7761906948973499,
-      "grad_norm": 0.019855046644806862,
-      "learning_rate": 4.743399677342926e-06,
-      "loss": 0.0933,
-      "step": 16900
-    },
-    {
-      "epoch": 0.7773389059844762,
-      "grad_norm": 0.12389481067657471,
-      "learning_rate": 4.696850644907234e-06,
-      "loss": 0.0464,
-      "step": 16925
-    },
-    {
-      "epoch": 0.7784871170716025,
-      "grad_norm": 0.09542281925678253,
-      "learning_rate": 4.65050073645126e-06,
-      "loss": 0.0691,
-      "step": 16950
-    },
-    {
-      "epoch": 0.7796353281587287,
-      "grad_norm": 0.2983253598213196,
-      "learning_rate": 4.6043505550781945e-06,
-      "loss": 0.0359,
-      "step": 16975
-    },
-    {
-      "epoch": 0.780783539245855,
-      "grad_norm": 14.008248329162598,
-      "learning_rate": 4.558400701292389e-06,
-      "loss": 0.0641,
-      "step": 17000
-    },
-    {
-      "epoch": 0.7819317503329812,
-      "grad_norm": 0.024128809571266174,
-      "learning_rate": 4.512651772991534e-06,
-      "loss": 0.0224,
-      "step": 17025
-    },
-    {
-      "epoch": 0.7830799614201075,
-      "grad_norm": 0.02002117410302162,
-      "learning_rate": 4.467104365458905e-06,
-      "loss": 0.0216,
-      "step": 17050
-    },
-    {
-      "epoch": 0.7842281725072338,
-      "grad_norm": 0.004471524618566036,
-      "learning_rate": 4.421759071355578e-06,
-      "loss": 0.0698,
-      "step": 17075
-    },
-    {
-      "epoch": 0.78537638359436,
-      "grad_norm": 14.2806978225708,
-      "learning_rate": 4.376616480712741e-06,
-      "loss": 0.0849,
-      "step": 17100
-    },
-    {
-      "epoch": 0.7865245946814863,
-      "grad_norm": 0.05184144154191017,
-      "learning_rate": 4.331677180924017e-06,
-      "loss": 0.1411,
-      "step": 17125
-    },
-    {
-      "epoch": 0.7876728057686125,
-      "grad_norm": 0.0295930914580822,
-      "learning_rate": 4.286941756737806e-06,
-      "loss": 0.0095,
-      "step": 17150
-    },
-    {
-      "epoch": 0.7888210168557388,
-      "grad_norm": 32.65333938598633,
-      "learning_rate": 4.242410790249705e-06,
-      "loss": 0.1065,
-      "step": 17175
-    },
-    {
-      "epoch": 0.7899692279428651,
-      "grad_norm": 0.07840365171432495,
-      "learning_rate": 4.198084860894902e-06,
-      "loss": 0.0555,
-      "step": 17200
-    },
-    {
-      "epoch": 0.7911174390299913,
-      "grad_norm": 0.10743328183889389,
-      "learning_rate": 4.153964545440652e-06,
-      "loss": 0.0224,
-      "step": 17225
-    },
-    {
-      "epoch": 0.7922656501171176,
-      "grad_norm": 0.038462039083242416,
-      "learning_rate": 4.11005041797877e-06,
-      "loss": 0.0474,
-      "step": 17250
-    },
-    {
-      "epoch": 0.7934138612042438,
-      "grad_norm": 0.08056436479091644,
-      "learning_rate": 4.066343049918156e-06,
-      "loss": 0.049,
-      "step": 17275
-    },
-    {
-      "epoch": 0.79456207229137,
-      "grad_norm": 0.2512977421283722,
-      "learning_rate": 4.022843009977388e-06,
-      "loss": 0.0407,
-      "step": 17300
-    },
-    {
-      "epoch": 0.7957102833784963,
-      "grad_norm": 47.47632598876953,
-      "learning_rate": 3.979550864177262e-06,
-      "loss": 0.0447,
-      "step": 17325
-    },
-    {
-      "epoch": 0.7968584944656225,
-      "grad_norm": 0.1907694935798645,
-      "learning_rate": 3.936467175833487e-06,
-      "loss": 0.1266,
-      "step": 17350
-    },
-    {
-      "epoch": 0.7980067055527488,
-      "grad_norm": 118.2800064086914,
-      "learning_rate": 3.893592505549335e-06,
-      "loss": 0.0792,
-      "step": 17375
-    },
-    {
-      "epoch": 0.799154916639875,
-      "grad_norm": 0.35802847146987915,
-      "learning_rate": 3.85092741120833e-06,
-      "loss": 0.0686,
-      "step": 17400
-    },
-    {
-      "epoch": 0.8003031277270013,
-      "grad_norm": 99.87713623046875,
-      "learning_rate": 3.808472447967009e-06,
-      "loss": 0.0879,
-      "step": 17425
-    },
-    {
-      "epoch": 0.8014513388141276,
-      "grad_norm": 0.14348523318767548,
-      "learning_rate": 3.76622816824769e-06,
-      "loss": 0.0545,
-      "step": 17450
-    },
-    {
-      "epoch": 0.8025995499012538,
-      "grad_norm": 0.20051589608192444,
-      "learning_rate": 3.7241951217312777e-06,
-      "loss": 0.0261,
-      "step": 17475
-    },
-    {
-      "epoch": 0.8037477609883801,
-      "grad_norm": 11.250834465026855,
-      "learning_rate": 3.6823738553501408e-06,
-      "loss": 0.0782,
-      "step": 17500
-    },
-    {
-      "epoch": 0.8048959720755063,
-      "grad_norm": 0.04791349917650223,
-      "learning_rate": 3.64076491328095e-06,
-      "loss": 0.0687,
-      "step": 17525
-    },
-    {
-      "epoch": 0.8060441831626326,
-      "grad_norm": 126.63636016845703,
-      "learning_rate": 3.599368836937631e-06,
-      "loss": 0.033,
-      "step": 17550
-    },
-    {
-      "epoch": 0.8071923942497589,
-      "grad_norm": 0.020389311015605927,
-      "learning_rate": 3.558186164964306e-06,
-      "loss": 0.0724,
-      "step": 17575
-    },
-    {
-      "epoch": 0.8083406053368851,
-      "grad_norm": 0.026229048147797585,
-      "learning_rate": 3.517217433228284e-06,
-      "loss": 0.0687,
-      "step": 17600
-    },
-    {
-      "epoch": 0.8094888164240114,
-      "grad_norm": 0.02965112403035164,
-      "learning_rate": 3.476463174813105e-06,
-      "loss": 0.0642,
-      "step": 17625
-    },
-    {
-      "epoch": 0.8106370275111376,
-      "grad_norm": 0.09377150237560272,
-      "learning_rate": 3.4359239200115814e-06,
-      "loss": 0.0336,
-      "step": 17650
-    },
-    {
-      "epoch": 0.8117852385982639,
-      "grad_norm": 0.8769047260284424,
-      "learning_rate": 3.3956001963189045e-06,
-      "loss": 0.0494,
-      "step": 17675
-    },
-    {
-      "epoch": 0.8129334496853902,
-      "grad_norm": 0.2824127972126007,
-      "learning_rate": 3.3554925284257877e-06,
-      "loss": 0.0969,
-      "step": 17700
-    },
-    {
-      "epoch": 0.8140816607725164,
-      "grad_norm": 7.552093505859375,
-      "learning_rate": 3.3156014382116308e-06,
-      "loss": 0.052,
-      "step": 17725
-    },
-    {
-      "epoch": 0.8152298718596427,
-      "grad_norm": 0.0463847815990448,
-      "learning_rate": 3.2759274447377452e-06,
-      "loss": 0.0794,
-      "step": 17750
-    },
-    {
-      "epoch": 0.816378082946769,
-      "grad_norm": 0.02667197957634926,
-      "learning_rate": 3.2364710642405717e-06,
-      "loss": 0.0484,
-      "step": 17775
-    },
-    {
-      "epoch": 0.8175262940338952,
-      "grad_norm": 0.21280555427074432,
-      "learning_rate": 3.1972328101249927e-06,
-      "loss": 0.08,
-      "step": 17800
-    },
-    {
-      "epoch": 0.8186745051210215,
-      "grad_norm": 0.01187364012002945,
-      "learning_rate": 3.1582131929576263e-06,
-      "loss": 0.0336,
-      "step": 17825
-    },
-    {
-      "epoch": 0.8198227162081477,
-      "grad_norm": 0.022009817883372307,
-      "learning_rate": 3.119412720460204e-06,
-      "loss": 0.0624,
-      "step": 17850
-    },
-    {
-      "epoch": 0.820970927295274,
-      "grad_norm": 72.19200134277344,
-      "learning_rate": 3.080831897502958e-06,
-      "loss": 0.1004,
-      "step": 17875
-    },
-    {
-      "epoch": 0.8221191383824002,
-      "grad_norm": 0.12891149520874023,
-      "learning_rate": 3.0424712260980425e-06,
-      "loss": 0.0342,
-      "step": 17900
-    },
-    {
-      "epoch": 0.8232673494695265,
-      "grad_norm": 0.015281450003385544,
-      "learning_rate": 3.0043312053930095e-06,
-      "loss": 0.0711,
-      "step": 17925
-    },
-    {
-      "epoch": 0.8244155605566528,
-      "grad_norm": 0.06948760896921158,
-      "learning_rate": 2.96641233166431e-06,
-      "loss": 0.0971,
-      "step": 17950
-    },
-    {
-      "epoch": 0.825563771643779,
-      "grad_norm": 0.022813860327005386,
-      "learning_rate": 2.9287150983108526e-06,
-      "loss": 0.1079,
-      "step": 17975
-    },
-    {
-      "epoch": 0.8267119827309053,
-      "grad_norm": 0.027087997645139694,
-      "learning_rate": 2.8912399958475546e-06,
-      "loss": 0.0543,
-      "step": 18000
-    },
-    {
-      "epoch": 0.8278601938180316,
-      "grad_norm": 0.012122283689677715,
-      "learning_rate": 2.8539875118989813e-06,
-      "loss": 0.0429,
-      "step": 18025
-    },
-    {
-      "epoch": 0.8290084049051578,
-      "grad_norm": 0.1680118888616562,
-      "learning_rate": 2.816958131192993e-06,
-      "loss": 0.0524,
-      "step": 18050
-    },
-    {
-      "epoch": 0.8301566159922841,
-      "grad_norm": 0.015649737790226936,
-      "learning_rate": 2.7801523355544357e-06,
-      "loss": 0.0395,
-      "step": 18075
-    },
-    {
-      "epoch": 0.8313048270794102,
-      "grad_norm": 0.02703564055263996,
-      "learning_rate": 2.743570603898895e-06,
-      "loss": 0.0796,
-      "step": 18100
-    },
-    {
-      "epoch": 0.8324530381665365,
-      "grad_norm": 7.242969989776611,
-      "learning_rate": 2.707213412226417e-06,
-      "loss": 0.1004,
-      "step": 18125
-    },
-    {
-      "epoch": 0.8336012492536627,
-      "grad_norm": 0.03490910306572914,
-      "learning_rate": 2.6710812336153556e-06,
-      "loss": 0.0751,
-      "step": 18150
-    },
-    {
-      "epoch": 0.834749460340789,
-      "grad_norm": 0.08122912794351578,
-      "learning_rate": 2.635174538216203e-06,
-      "loss": 0.017,
-      "step": 18175
-    },
-    {
-      "epoch": 0.8358976714279153,
-      "grad_norm": 0.31337007880210876,
-      "learning_rate": 2.599493793245478e-06,
-      "loss": 0.044,
-      "step": 18200
-    },
-    {
-      "epoch": 0.8370458825150415,
-      "grad_norm": 0.022868860512971878,
-      "learning_rate": 2.564039462979635e-06,
-      "loss": 0.1036,
-      "step": 18225
-    },
-    {
-      "epoch": 0.8381940936021678,
-      "grad_norm": 0.1141880452632904,
-      "learning_rate": 2.5288120087490263e-06,
-      "loss": 0.0555,
-      "step": 18250
-    },
-    {
-      "epoch": 0.839342304689294,
-      "grad_norm": 0.021731365472078323,
-      "learning_rate": 2.4938118889319074e-06,
-      "loss": 0.0358,
-      "step": 18275
-    },
-    {
-      "epoch": 0.8404905157764203,
-      "grad_norm": 0.03330765664577484,
-      "learning_rate": 2.459039558948464e-06,
-      "loss": 0.0635,
-      "step": 18300
-    },
-    {
-      "epoch": 0.8416387268635466,
-      "grad_norm": 0.01710633747279644,
-      "learning_rate": 2.424495471254895e-06,
-      "loss": 0.1074,
-      "step": 18325
-    },
-    {
-      "epoch": 0.8427869379506728,
-      "grad_norm": 0.21491554379463196,
-      "learning_rate": 2.3901800753375137e-06,
-      "loss": 0.0803,
-      "step": 18350
-    },
-    {
-      "epoch": 0.8439351490377991,
-      "grad_norm": 0.03334735333919525,
-      "learning_rate": 2.356093817706908e-06,
-      "loss": 0.1059,
-      "step": 18375
-    },
-    {
-      "epoch": 0.8450833601249254,
-      "grad_norm": 10.869914054870605,
-      "learning_rate": 2.322237141892123e-06,
-      "loss": 0.0149,
-      "step": 18400
-    },
-    {
-      "epoch": 0.8462315712120516,
-      "grad_norm": 0.2710411250591278,
-      "learning_rate": 2.2886104884349035e-06,
-      "loss": 0.0454,
-      "step": 18425
-    },
-    {
-      "epoch": 0.8473797822991779,
-      "grad_norm": 0.3168097734451294,
-      "learning_rate": 2.255214294883943e-06,
-      "loss": 0.0519,
-      "step": 18450
-    },
-    {
-      "epoch": 0.8485279933863041,
-      "grad_norm": 0.020407404750585556,
-      "learning_rate": 2.2220489957892035e-06,
-      "loss": 0.0243,
-      "step": 18475
-    },
-    {
-      "epoch": 0.8496762044734304,
-      "grad_norm": 0.03963831812143326,
-      "learning_rate": 2.1891150226962577e-06,
-      "loss": 0.0435,
-      "step": 18500
-    },
-    {
-      "epoch": 0.8508244155605567,
-      "grad_norm": 0.031030111014842987,
-      "learning_rate": 2.1564128041406685e-06,
-      "loss": 0.0417,
-      "step": 18525
-    },
-    {
-      "epoch": 0.8519726266476829,
-      "grad_norm": 7.520127773284912,
-      "learning_rate": 2.1239427656424306e-06,
-      "loss": 0.0576,
-      "step": 18550
-    },
-    {
-      "epoch": 0.8531208377348092,
-      "grad_norm": 0.009268928319215775,
-      "learning_rate": 2.0917053297004086e-06,
-      "loss": 0.0296,
-      "step": 18575
-    },
-    {
-      "epoch": 0.8542690488219354,
-      "grad_norm": 15.918971061706543,
-      "learning_rate": 2.059700915786853e-06,
-      "loss": 0.0514,
-      "step": 18600
-    },
-    {
-      "epoch": 0.8554172599090617,
-      "grad_norm": 0.021697254851460457,
-      "learning_rate": 2.0279299403419483e-06,
-      "loss": 0.0315,
-      "step": 18625
-    },
-    {
-      "epoch": 0.856565470996188,
-      "grad_norm": 0.029788566753268242,
-      "learning_rate": 1.9963928167683756e-06,
-      "loss": 0.002,
-      "step": 18650
-    },
-    {
-      "epoch": 0.8577136820833142,
-      "grad_norm": 0.03993965685367584,
-      "learning_rate": 1.965089955425956e-06,
-      "loss": 0.0918,
-      "step": 18675
-    },
-    {
-      "epoch": 0.8588618931704405,
-      "grad_norm": 0.01322225946933031,
-      "learning_rate": 1.9340217636262948e-06,
-      "loss": 0.0726,
-      "step": 18700
-    },
-    {
-      "epoch": 0.8600101042575667,
-      "grad_norm": 8.532318115234375,
-      "learning_rate": 1.903188645627485e-06,
-      "loss": 0.121,
-      "step": 18725
-    },
-    {
-      "epoch": 0.861158315344693,
-      "grad_norm": 0.8324846625328064,
-      "learning_rate": 1.8725910026288496e-06,
-      "loss": 0.0393,
-      "step": 18750
-    },
-    {
-      "epoch": 0.8623065264318193,
-      "grad_norm": 7.0772576332092285,
-      "learning_rate": 1.8422292327657153e-06,
-      "loss": 0.0515,
-      "step": 18775
-    },
-    {
-      "epoch": 0.8634547375189455,
-      "grad_norm": 0.027621662244200706,
-      "learning_rate": 1.8121037311042512e-06,
-      "loss": 0.0582,
-      "step": 18800
-    },
-    {
-      "epoch": 0.8646029486060718,
-      "grad_norm": 0.23945468664169312,
-      "learning_rate": 1.7822148896363e-06,
-      "loss": 0.0701,
-      "step": 18825
-    },
-    {
-      "epoch": 0.865751159693198,
-      "grad_norm": 46.322044372558594,
-      "learning_rate": 1.7525630972742958e-06,
-      "loss": 0.0458,
-      "step": 18850
-    },
-    {
-      "epoch": 0.8668993707803243,
-      "grad_norm": 0.22386154532432556,
-      "learning_rate": 1.7231487398461943e-06,
-      "loss": 0.0535,
-      "step": 18875
-    },
-    {
-      "epoch": 0.8680475818674505,
-      "grad_norm": 0.15120260417461395,
-      "learning_rate": 1.6939722000904724e-06,
-      "loss": 0.0679,
-      "step": 18900
-    },
-    {
-      "epoch": 0.8691957929545767,
-      "grad_norm": 0.014234176836907864,
-      "learning_rate": 1.6650338576511128e-06,
-      "loss": 0.0751,
-      "step": 18925
-    },
-    {
-      "epoch": 0.870344004041703,
-      "grad_norm": 0.3317568600177765,
-      "learning_rate": 1.6363340890726953e-06,
-      "loss": 0.0044,
-      "step": 18950
-    },
-    {
-      "epoch": 0.8714922151288292,
-      "grad_norm": 0.04376505687832832,
-      "learning_rate": 1.607873267795481e-06,
-      "loss": 0.0367,
-      "step": 18975
-    },
-    {
-      "epoch": 0.8726404262159555,
-      "grad_norm": 150.4222869873047,
-      "learning_rate": 1.5796517641505692e-06,
-      "loss": 0.1018,
-      "step": 19000
-    },
-    {
-      "epoch": 0.8737886373030818,
-      "grad_norm": 0.057014692574739456,
-      "learning_rate": 1.5516699453550542e-06,
-      "loss": 0.0369,
-      "step": 19025
-    },
-    {
-      "epoch": 0.874936848390208,
-      "grad_norm": 15.234986305236816,
-      "learning_rate": 1.5239281755072655e-06,
-      "loss": 0.0832,
-      "step": 19050
-    },
-    {
-      "epoch": 0.8760850594773343,
-      "grad_norm": 15.890921592712402,
-      "learning_rate": 1.4964268155820261e-06,
-      "loss": 0.0393,
-      "step": 19075
-    },
-    {
-      "epoch": 0.8772332705644605,
-      "grad_norm": 0.3105641305446625,
-      "learning_rate": 1.4691662234259507e-06,
-      "loss": 0.0443,
-      "step": 19100
-    },
-    {
-      "epoch": 0.8783814816515868,
-      "grad_norm": 0.4761696457862854,
-      "learning_rate": 1.4421467537528022e-06,
-      "loss": 0.0697,
-      "step": 19125
-    },
-    {
-      "epoch": 0.8795296927387131,
-      "grad_norm": 0.06688586622476578,
-      "learning_rate": 1.4153687581388598e-06,
-      "loss": 0.0267,
-      "step": 19150
-    },
-    {
-      "epoch": 0.8806779038258393,
-      "grad_norm": 0.0743216946721077,
-      "learning_rate": 1.3888325850183494e-06,
-      "loss": 0.0311,
-      "step": 19175
-    },
-    {
-      "epoch": 0.8818261149129656,
-      "grad_norm": 26.711551666259766,
-      "learning_rate": 1.362538579678918e-06,
-      "loss": 0.0641,
-      "step": 19200
-    },
-    {
-      "epoch": 0.8829743260000918,
-      "grad_norm": 0.05731752887368202,
-      "learning_rate": 1.3364870842571298e-06,
-      "loss": 0.0191,
-      "step": 19225
-    },
-    {
-      "epoch": 0.8841225370872181,
-      "grad_norm": 0.12287302315235138,
-      "learning_rate": 1.3106784377340276e-06,
-      "loss": 0.0728,
-      "step": 19250
-    },
-    {
-      "epoch": 0.8852707481743444,
-      "grad_norm": 0.14047569036483765,
-      "learning_rate": 1.2851129759307047e-06,
-      "loss": 0.062,
-      "step": 19275
-    },
-    {
-      "epoch": 0.8864189592614706,
-      "grad_norm": 0.01031449530273676,
-      "learning_rate": 1.2597910315039496e-06,
-      "loss": 0.0776,
-      "step": 19300
-    },
-    {
-      "epoch": 0.8875671703485969,
-      "grad_norm": 0.013590280897915363,
-      "learning_rate": 1.2347129339419083e-06,
-      "loss": 0.0013,
-      "step": 19325
-    },
-    {
-      "epoch": 0.8887153814357232,
-      "grad_norm": 0.09587027877569199,
-      "learning_rate": 1.209879009559809e-06,
-      "loss": 0.0635,
-      "step": 19350
-    },
-    {
-      "epoch": 0.8898635925228494,
-      "grad_norm": 8.050373077392578,
-      "learning_rate": 1.1852895814956988e-06,
-      "loss": 0.0532,
-      "step": 19375
-    },
-    {
-      "epoch": 0.8910118036099757,
-      "grad_norm": 29.888036727905273,
-      "learning_rate": 1.1609449697062547e-06,
-      "loss": 0.1204,
-      "step": 19400
-    },
-    {
-      "epoch": 0.8921600146971019,
-      "grad_norm": 0.007251910865306854,
-      "learning_rate": 1.1368454909626058e-06,
-      "loss": 0.0317,
-      "step": 19425
-    },
-    {
-      "epoch": 0.8933082257842282,
-      "grad_norm": 0.022124579176306725,
-      "learning_rate": 1.1129914588462288e-06,
-      "loss": 0.0285,
-      "step": 19450
-    },
-    {
-      "epoch": 0.8944564368713545,
-      "grad_norm": 0.20724959671497345,
-      "learning_rate": 1.0893831837448566e-06,
-      "loss": 0.0535,
-      "step": 19475
-    },
-    {
-      "epoch": 0.8956046479584807,
-      "grad_norm": 0.03264782205224037,
-      "learning_rate": 1.0660209728484383e-06,
-      "loss": 0.0531,
-      "step": 19500
-    },
-    {
-      "epoch": 0.896752859045607,
-      "grad_norm": 0.30090564489364624,
-      "learning_rate": 1.0429051301451444e-06,
-      "loss": 0.0339,
-      "step": 19525
-    },
-    {
-      "epoch": 0.8979010701327332,
-      "grad_norm": 0.01747279241681099,
-      "learning_rate": 1.0200359564174157e-06,
-      "loss": 0.0655,
-      "step": 19550
-    },
-    {
-      "epoch": 0.8990492812198595,
-      "grad_norm": 0.43753546476364136,
-      "learning_rate": 9.974137492380431e-07,
-      "loss": 0.0686,
-      "step": 19575
-    },
-    {
-      "epoch": 0.9001974923069858,
-      "grad_norm": 0.16258420050144196,
-      "learning_rate": 9.750388029663061e-07,
-      "loss": 0.078,
-      "step": 19600
-    },
-    {
-      "epoch": 0.901345703394112,
-      "grad_norm": 0.1479305922985077,
-      "learning_rate": 9.529114087441216e-07,
-      "loss": 0.018,
-      "step": 19625
-    },
-    {
-      "epoch": 0.9024939144812383,
-      "grad_norm": 0.008134293369948864,
-      "learning_rate": 9.310318544922791e-07,
-      "loss": 0.0675,
-      "step": 19650
-    },
-    {
-      "epoch": 0.9036421255683645,
-      "grad_norm": 0.42792314291000366,
-      "learning_rate": 9.094004249066812e-07,
-      "loss": 0.0833,
-      "step": 19675
-    },
-    {
-      "epoch": 0.9047903366554907,
-      "grad_norm": 21.225181579589844,
-      "learning_rate": 8.880174014546417e-07,
-      "loss": 0.0778,
-      "step": 19700
-    },
-    {
-      "epoch": 0.905938547742617,
-      "grad_norm": 1.8646355867385864,
-      "learning_rate": 8.668830623712243e-07,
-      "loss": 0.05,
-      "step": 19725
-    },
-    {
-      "epoch": 0.9070867588297432,
-      "grad_norm": 0.005776954814791679,
-      "learning_rate": 8.459976826556194e-07,
-      "loss": 0.0939,
-      "step": 19750
-    },
-    {
-      "epoch": 0.9082349699168695,
-      "grad_norm": 0.02450496144592762,
-      "learning_rate": 8.253615340675658e-07,
-      "loss": 0.0309,
-      "step": 19775
-    },
-    {
-      "epoch": 0.9093831810039957,
-      "grad_norm": 0.02992299385368824,
-      "learning_rate": 8.049748851238304e-07,
-      "loss": 0.035,
-      "step": 19800
-    },
-    {
-      "epoch": 0.910531392091122,
-      "grad_norm": 108.27339172363281,
-      "learning_rate": 7.848380010946832e-07,
-      "loss": 0.0834,
-      "step": 19825
-    },
-    {
-      "epoch": 0.9116796031782483,
-      "grad_norm": 0.3087676167488098,
-      "learning_rate": 7.64951144000472e-07,
-      "loss": 0.1033,
-      "step": 19850
-    },
-    {
-      "epoch": 0.9128278142653745,
-      "grad_norm": 15.906920433044434,
-      "learning_rate": 7.453145726082023e-07,
-      "loss": 0.0817,
-      "step": 19875
-    },
-    {
-      "epoch": 0.9139760253525008,
-      "grad_norm": 0.0587725006043911,
-      "learning_rate": 7.25928542428167e-07,
-      "loss": 0.0329,
-      "step": 19900
-    },
-    {
-      "epoch": 0.915124236439627,
-      "grad_norm": 0.01723266765475273,
-      "learning_rate": 7.06793305710638e-07,
-      "loss": 0.0183,
-      "step": 19925
-    },
-    {
-      "epoch": 0.9162724475267533,
-      "grad_norm": 0.01341981254518032,
-      "learning_rate": 6.87909111442564e-07,
-      "loss": 0.0285,
-      "step": 19950
-    },
-    {
-      "epoch": 0.9174206586138796,
-      "grad_norm": 0.079460009932518,
-      "learning_rate": 6.69276205344338e-07,
-      "loss": 0.0697,
-      "step": 19975
-    },
-    {
-      "epoch": 0.9185688697010058,
-      "grad_norm": 0.03450249508023262,
-      "learning_rate": 6.50894829866604e-07,
-      "loss": 0.0491,
-      "step": 20000
-    },
-    {
-      "epoch": 0.9197170807881321,
-      "grad_norm": 0.05472569912672043,
-      "learning_rate": 6.327652241870996e-07,
-      "loss": 0.0937,
-      "step": 20025
-    },
-    {
-      "epoch": 0.9208652918752583,
-      "grad_norm": 0.27753645181655884,
-      "learning_rate": 6.148876242075475e-07,
-      "loss": 0.0456,
-      "step": 20050
-    },
-    {
-      "epoch": 0.9220135029623846,
-      "grad_norm": 0.0179009847342968,
-      "learning_rate": 5.972622625505753e-07,
-      "loss": 0.0364,
-      "step": 20075
-    },
-    {
-      "epoch": 0.9231617140495109,
-      "grad_norm": 0.07981289178133011,
-      "learning_rate": 5.798893685566964e-07,
-      "loss": 0.061,
-      "step": 20100
-    },
-    {
-      "epoch": 0.9243099251366371,
-      "grad_norm": 0.020030954852700233,
-      "learning_rate": 5.627691682813296e-07,
-      "loss": 0.0784,
-      "step": 20125
-    },
-    {
-      "epoch": 0.9254581362237634,
-      "grad_norm": 0.01465025544166565,
-      "learning_rate": 5.459018844918462e-07,
-      "loss": 0.0563,
-      "step": 20150
-    },
-    {
-      "epoch": 0.9266063473108896,
-      "grad_norm": 0.14181044697761536,
-      "learning_rate": 5.292877366646853e-07,
-      "loss": 0.0499,
-      "step": 20175
-    },
-    {
-      "epoch": 0.9277545583980159,
-      "grad_norm": 0.02160894311964512,
-      "learning_rate": 5.129269409824877e-07,
-      "loss": 0.0837,
-      "step": 20200
-    },
-    {
-      "epoch": 0.9289027694851422,
-      "grad_norm": 0.0065475874580442905,
-      "learning_rate": 4.968197103312844e-07,
-      "loss": 0.043,
-      "step": 20225
-    },
-    {
-      "epoch": 0.9300509805722684,
-      "grad_norm": 0.27221518754959106,
-      "learning_rate": 4.809662542977279e-07,
-      "loss": 0.0252,
-      "step": 20250
-    },
-    {
-      "epoch": 0.9311991916593947,
-      "grad_norm": 0.10816125571727753,
-      "learning_rate": 4.653667791663696e-07,
-      "loss": 0.0313,
-      "step": 20275
-    },
-    {
-      "epoch": 0.932347402746521,
-      "grad_norm": 0.011103514581918716,
-      "learning_rate": 4.500214879169651e-07,
-      "loss": 0.0285,
-      "step": 20300
-    },
-    {
-      "epoch": 0.9334956138336472,
-      "grad_norm": 0.12883403897285461,
-      "learning_rate": 4.3493058022184844e-07,
-      "loss": 0.0435,
-      "step": 20325
-    },
-    {
-      "epoch": 0.9346438249207735,
-      "grad_norm": 0.6828334927558899,
-      "learning_rate": 4.2009425244331493e-07,
-      "loss": 0.0606,
-      "step": 20350
-    },
-    {
-      "epoch": 0.9357920360078997,
-      "grad_norm": 0.06785145401954651,
-      "learning_rate": 4.055126976310786e-07,
-      "loss": 0.0012,
-      "step": 20375
-    },
-    {
-      "epoch": 0.936940247095026,
-      "grad_norm": 0.00857201311737299,
-      "learning_rate": 3.91186105519763e-07,
-      "loss": 0.0316,
-      "step": 20400
-    },
-    {
-      "epoch": 0.9380884581821523,
-      "grad_norm": 0.025530261918902397,
-      "learning_rate": 3.7711466252642324e-07,
-      "loss": 0.1559,
-      "step": 20425
-    },
-    {
-      "epoch": 0.9392366692692785,
-      "grad_norm": 0.021817797794938087,
-      "learning_rate": 3.632985517481213e-07,
-      "loss": 0.0298,
-      "step": 20450
-    },
-    {
-      "epoch": 0.9403848803564048,
-      "grad_norm": 0.006989351473748684,
-      "learning_rate": 3.4973795295955237e-07,
-      "loss": 0.0314,
-      "step": 20475
-    },
-    {
-      "epoch": 0.941533091443531,
-      "grad_norm": 0.01993320696055889,
-      "learning_rate": 3.364330426106932e-07,
-      "loss": 0.0323,
-      "step": 20500
-    },
-    {
-      "epoch": 0.9426813025306572,
-      "grad_norm": 7.905085563659668,
-      "learning_rate": 3.2338399382451977e-07,
-      "loss": 0.0802,
-      "step": 20525
-    },
-    {
-      "epoch": 0.9438295136177834,
-      "grad_norm": 0.06555715948343277,
-      "learning_rate": 3.1059097639474014e-07,
-      "loss": 0.0775,
-      "step": 20550
-    },
-    {
-      "epoch": 0.9449777247049097,
-      "grad_norm": 14.530689239501953,
-      "learning_rate": 2.9805415678359816e-07,
-      "loss": 0.0629,
-      "step": 20575
-    },
-    {
-      "epoch": 0.946125935792036,
-      "grad_norm": 0.01627318374812603,
-      "learning_rate": 2.8577369811969345e-07,
-      "loss": 0.053,
-      "step": 20600
-    },
-    {
-      "epoch": 0.9472741468791622,
-      "grad_norm": 0.0996640995144844,
-      "learning_rate": 2.737497601958827e-07,
-      "loss": 0.0729,
-      "step": 20625
-    },
-    {
-      "epoch": 0.9484223579662885,
-      "grad_norm": 0.1124410629272461,
-      "learning_rate": 2.6198249946716824e-07,
-      "loss": 0.0693,
-      "step": 20650
-    },
-    {
-      "epoch": 0.9495705690534147,
-      "grad_norm": 0.039213843643665314,
-      "learning_rate": 2.5047206904868616e-07,
-      "loss": 0.0742,
-      "step": 20675
-    },
-    {
-      "epoch": 0.950718780140541,
-      "grad_norm": 0.014431199990212917,
-      "learning_rate": 2.392186187137058e-07,
-      "loss": 0.0113,
-      "step": 20700
-    },
-    {
-      "epoch": 0.9518669912276673,
-      "grad_norm": 0.02886173129081726,
-      "learning_rate": 2.282222948916868e-07,
-      "loss": 0.0144,
-      "step": 20725
-    },
-    {
-      "epoch": 0.9530152023147935,
-      "grad_norm": 0.0110493628308177,
-      "learning_rate": 2.174832406663585e-07,
-      "loss": 0.046,
-      "step": 20750
-    },
-    {
-      "epoch": 0.9541634134019198,
-      "grad_norm": 0.015410786494612694,
-      "learning_rate": 2.070015957738747e-07,
-      "loss": 0.0199,
-      "step": 20775
-    },
-    {
-      "epoch": 0.955311624489046,
-      "grad_norm": 0.06066295877099037,
-      "learning_rate": 1.9677749660098831e-07,
-      "loss": 0.068,
-      "step": 20800
-    },
-    {
-      "epoch": 0.9564598355761723,
-      "grad_norm": 0.8917893767356873,
-      "learning_rate": 1.8681107618327755e-07,
-      "loss": 0.062,
-      "step": 20825
-    },
-    {
-      "epoch": 0.9576080466632986,
-      "grad_norm": 0.018326930701732635,
-      "learning_rate": 1.7710246420341582e-07,
-      "loss": 0.0741,
-      "step": 20850
-    },
-    {
-      "epoch": 0.9587562577504248,
-      "grad_norm": 0.0710015818476677,
-      "learning_rate": 1.676517869894778e-07,
-      "loss": 0.081,
-      "step": 20875
-    },
-    {
-      "epoch": 0.9599044688375511,
-      "grad_norm": 0.04719538986682892,
-      "learning_rate": 1.584591675133096e-07,
-      "loss": 0.029,
-      "step": 20900
-    },
-    {
-      "epoch": 0.9610526799246774,
-      "grad_norm": 0.012458267621695995,
-      "learning_rate": 1.4952472538891205e-07,
-      "loss": 0.0572,
-      "step": 20925
-    },
-    {
-      "epoch": 0.9622008910118036,
-      "grad_norm": 0.11031362414360046,
-      "learning_rate": 1.408485768708956e-07,
-      "loss": 0.0264,
-      "step": 20950
-    },
-    {
-      "epoch": 0.9633491020989299,
-      "grad_norm": 0.36117517948150635,
-      "learning_rate": 1.3243083485296793e-07,
-      "loss": 0.0468,
-      "step": 20975
-    },
-    {
-      "epoch": 0.9644973131860561,
-      "grad_norm": 0.02414921671152115,
-      "learning_rate": 1.242716088664575e-07,
-      "loss": 0.0524,
-      "step": 21000
-    },
-    {
-      "epoch": 0.9656455242731824,
-      "grad_norm": 0.15573085844516754,
-      "learning_rate": 1.1637100507889243e-07,
-      "loss": 0.0261,
-      "step": 21025
-    },
-    {
-      "epoch": 0.9667937353603087,
-      "grad_norm": 17.496049880981445,
-      "learning_rate": 1.0872912629261934e-07,
-      "loss": 0.1225,
-      "step": 21050
-    },
-    {
-      "epoch": 0.9679419464474349,
-      "grad_norm": 0.3148431181907654,
-      "learning_rate": 1.0134607194346446e-07,
-      "loss": 0.1067,
-      "step": 21075
-    },
-    {
-      "epoch": 0.9690901575345612,
-      "grad_norm": 0.05796843022108078,
-      "learning_rate": 9.422193809944358e-08,
-      "loss": 0.0683,
-      "step": 21100
-    },
-    {
-      "epoch": 0.9702383686216874,
-      "grad_norm": 0.009837535209953785,
-      "learning_rate": 8.735681745950741e-08,
-      "loss": 0.0158,
-      "step": 21125
-    },
-    {
-      "epoch": 0.9713865797088137,
-      "grad_norm": 0.031075546517968178,
-      "learning_rate": 8.075079935233599e-08,
-      "loss": 0.0333,
-      "step": 21150
-    },
-    {
-      "epoch": 0.97253479079594,
-      "grad_norm": 0.11271132528781891,
-      "learning_rate": 7.440396973517727e-08,
-      "loss": 0.0506,
-      "step": 21175
-    },
-    {
-      "epoch": 0.9736830018830662,
-      "grad_norm": 0.03392348811030388,
-      "learning_rate": 6.831641119272814e-08,
-      "loss": 0.0162,
-      "step": 21200
-    },
-    {
-      "epoch": 0.9748312129701925,
-      "grad_norm": 0.02314956858754158,
-      "learning_rate": 6.248820293606628e-08,
-      "loss": 0.0234,
-      "step": 21225
-    },
-    {
-      "epoch": 0.9759794240573187,
-      "grad_norm": 0.08869185298681259,
-      "learning_rate": 5.691942080160662e-08,
-      "loss": 0.0305,
-      "step": 21250
-    },
-    {
-      "epoch": 0.977127635144445,
-      "grad_norm": 0.08665268123149872,
-      "learning_rate": 5.16101372501221e-08,
-      "loss": 0.0254,
-      "step": 21275
-    },
-    {
-      "epoch": 0.9782758462315713,
-      "grad_norm": 0.1377086043357849,
-      "learning_rate": 4.6560421365804406e-08,
-      "loss": 0.0301,
-      "step": 21300
-    },
-    {
-      "epoch": 0.9794240573186974,
-      "grad_norm": 112.7161865234375,
-      "learning_rate": 4.1770338855360305e-08,
-      "loss": 0.0663,
-      "step": 21325
-    },
-    {
-      "epoch": 0.9805722684058237,
-      "grad_norm": 0.02316017635166645,
-      "learning_rate": 3.723995204715225e-08,
-      "loss": 0.0475,
-      "step": 21350
-    },
-    {
-      "epoch": 0.9817204794929499,
-      "grad_norm": 0.011443709954619408,
-      "learning_rate": 3.296931989039909e-08,
-      "loss": 0.0341,
-      "step": 21375
-    },
-    {
-      "epoch": 0.9828686905800762,
-      "grad_norm": 0.22276480495929718,
-      "learning_rate": 2.8958497954396646e-08,
-      "loss": 0.1059,
-      "step": 21400
-    },
-    {
-      "epoch": 0.9840169016672025,
-      "grad_norm": 0.2308289110660553,
-      "learning_rate": 2.520753842780277e-08,
-      "loss": 0.0883,
-      "step": 21425
-    },
-    {
-      "epoch": 0.9851651127543287,
-      "grad_norm": 0.011477758176624775,
-      "learning_rate": 2.1716490117957846e-08,
-      "loss": 0.062,
-      "step": 21450
-    },
-    {
-      "epoch": 0.986313323841455,
-      "grad_norm": 0.005774139892309904,
-      "learning_rate": 1.8485398450243107e-08,
-      "loss": 0.0689,
-      "step": 21475
-    },
-    {
-      "epoch": 0.9874615349285812,
-      "grad_norm": 0.10775581747293472,
-      "learning_rate": 1.551430546749888e-08,
-      "loss": 0.0417,
-      "step": 21500
-    },
-    {
-      "epoch": 0.9886097460157075,
-      "grad_norm": 0.013670174404978752,
-      "learning_rate": 1.2803249829471675e-08,
-      "loss": 0.0569,
-      "step": 21525
-    },
-    {
-      "epoch": 0.9897579571028338,
-      "grad_norm": 0.2979724705219269,
-      "learning_rate": 1.0352266812307943e-08,
-      "loss": 0.0224,
-      "step": 21550
-    },
-    {
-      "epoch": 0.99090616818996,
-      "grad_norm": 0.1788400113582611,
-      "learning_rate": 8.161388308103314e-09,
-      "loss": 0.0926,
-      "step": 21575
-    },
-    {
-      "epoch": 0.9920543792770863,
-      "grad_norm": 0.06912702322006226,
-      "learning_rate": 6.230642824485156e-09,
-      "loss": 0.0808,
-      "step": 21600
-    },
-    {
-      "epoch": 0.9932025903642125,
-      "grad_norm": 0.05865752696990967,
-      "learning_rate": 4.560055484235104e-09,
-      "loss": 0.0343,
-      "step": 21625
-    },
-    {
-      "epoch": 0.9943508014513388,
-      "grad_norm": 0.08185602724552155,
-      "learning_rate": 3.1496480249737506e-09,
-      "loss": 0.0328,
-      "step": 21650
-    },
-    {
-      "epoch": 0.9954990125384651,
-      "grad_norm": 0.564564049243927,
-      "learning_rate": 1.999438798863107e-09,
-      "loss": 0.0388,
-      "step": 21675
-    },
-    {
-      "epoch": 0.9966472236255913,
-      "grad_norm": 0.5557194948196411,
-      "learning_rate": 1.1094427723845613e-09,
-      "loss": 0.0493,
-      "step": 21700
-    },
-    {
-      "epoch": 0.9977954347127176,
-      "grad_norm": 0.0258924663066864,
-      "learning_rate": 4.796715261323748e-10,
-      "loss": 0.0509,
-      "step": 21725
-    },
-    {
-      "epoch": 0.9989436457998438,
-      "grad_norm": 0.021062856540083885,
-      "learning_rate": 1.1013325466047164e-10,
-      "loss": 0.0594,
-      "step": 21750
-    },
-    {
-      "epoch": 1.0,
-      "eval_accuracy": 0.9813904967021133,
-      "eval_auc": 0.9926618557097038,
-      "eval_f1": 0.98761561373256,
-      "eval_loss": 0.0981861874461174,
-      "eval_precision": 0.982620320855615,
-      "eval_recall": 0.9926619547112051,
-      "eval_runtime": 4593.0656,
-      "eval_samples_per_second": 6.47,
-      "eval_steps_per_second": 0.27,
-      "step": 21773
-    }
-  ],
-  "logging_steps": 25,
-  "max_steps": 21773,
-  "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
-  "save_steps": 500,
-  "stateful_callbacks": {
-    "EarlyStoppingCallback": {
-      "args": {
-        "early_stopping_patience": 5,
-        "early_stopping_threshold": 0.01
-      },
-      "attributes": {
-        "early_stopping_patience_counter": 0
-      }
-    },
-    "TrainerControl": {
-      "args": {
-        "should_epoch_stop": false,
-        "should_evaluate": false,
-        "should_log": false,
-        "should_save": true,
-        "should_training_stop": true
-      },
-      "attributes": {}
-    }
-  },
-  "total_flos": 6.0162864745331294e+19,
-  "train_batch_size": 12,
-  "trial_name": null,
-  "trial_params": null
-}