{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500.0, "global_step": 6237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02405002405002405, "grad_norm": 0.5065411329269409, "learning_rate": 0.00019996828714700116, "loss": 1.7914, "step": 50 }, { "epoch": 0.0481000481000481, "grad_norm": 0.34842124581336975, "learning_rate": 0.00019987316870210547, "loss": 1.2136, "step": 100 }, { "epoch": 0.07215007215007214, "grad_norm": 0.3516596257686615, "learning_rate": 0.0001997147049948582, "loss": 1.1937, "step": 150 }, { "epoch": 0.0962000962000962, "grad_norm": 0.4468790888786316, "learning_rate": 0.0001994929965319844, "loss": 1.181, "step": 200 }, { "epoch": 0.12025012025012025, "grad_norm": 0.32536762952804565, "learning_rate": 0.0001992081839336419, "loss": 1.1738, "step": 250 }, { "epoch": 0.1443001443001443, "grad_norm": 0.2973495125770569, "learning_rate": 0.00019886044784423197, "loss": 1.1665, "step": 300 }, { "epoch": 0.16835016835016836, "grad_norm": 0.35850659012794495, "learning_rate": 0.00019845000881782432, "loss": 1.1676, "step": 350 }, { "epoch": 0.1924001924001924, "grad_norm": 6.042001724243164, "learning_rate": 0.00019797712717826914, "loss": 1.1572, "step": 400 }, { "epoch": 0.21645021645021645, "grad_norm": 0.2577451765537262, "learning_rate": 0.00019744210285408488, "loss": 1.1686, "step": 450 }, { "epoch": 0.2405002405002405, "grad_norm": 0.32454270124435425, "learning_rate": 0.0001968452751882264, "loss": 1.1575, "step": 500 }, { "epoch": 0.26455026455026454, "grad_norm": 0.30101215839385986, "learning_rate": 0.00019618702272285434, "loss": 1.159, "step": 550 }, { "epoch": 0.2886002886002886, "grad_norm": 0.6339617967605591, "learning_rate": 0.00019546776295924212, "loss": 1.1571, "step": 600 }, { "epoch": 0.3126503126503126, "grad_norm": 0.2634935975074768, "learning_rate": 0.0001946879520929728, "loss": 1.1475, "step": 650 }, { "epoch": 0.3367003367003367, "grad_norm": 0.292190819978714, "learning_rate": 0.00019384808472459368, "loss": 1.1545, "step": 700 }, { "epoch": 0.36075036075036077, "grad_norm": 0.20567256212234497, "learning_rate": 0.0001929486935459127, "loss": 1.1391, "step": 750 }, { "epoch": 0.3848003848003848, "grad_norm": 0.2804057002067566, "learning_rate": 0.00019199034900213452, "loss": 1.1427, "step": 800 }, { "epoch": 0.40885040885040885, "grad_norm": 0.3674640357494354, "learning_rate": 0.000190973658930052, "loss": 1.1448, "step": 850 }, { "epoch": 0.4329004329004329, "grad_norm": 0.9717519283294678, "learning_rate": 0.00018989926817252113, "loss": 1.1418, "step": 900 }, { "epoch": 0.45695045695045694, "grad_norm": 0.30908727645874023, "learning_rate": 0.00018876785816946505, "loss": 1.1368, "step": 950 }, { "epoch": 0.481000481000481, "grad_norm": 0.23766139149665833, "learning_rate": 0.00018758014652566597, "loss": 1.1464, "step": 1000 }, { "epoch": 0.5050505050505051, "grad_norm": 0.23821455240249634, "learning_rate": 0.0001863368865556191, "loss": 1.1366, "step": 1050 }, { "epoch": 0.5291005291005291, "grad_norm": 0.26834988594055176, "learning_rate": 0.0001850388668057379, "loss": 1.1345, "step": 1100 }, { "epoch": 0.5531505531505532, "grad_norm": 0.24402070045471191, "learning_rate": 0.0001836869105542127, "loss": 1.1354, "step": 1150 }, { "epoch": 0.5772005772005772, "grad_norm": 0.257290244102478, "learning_rate": 0.0001822818752888408, "loss": 1.1351, "step": 1200 }, { "epoch": 0.6012506012506013, "grad_norm": 
0.22287152707576752, "learning_rate": 0.00018082465216315882, "loss": 1.1273, "step": 1250 }, { "epoch": 0.6253006253006252, "grad_norm": 0.2477453052997589, "learning_rate": 0.00017931616543122214, "loss": 1.1369, "step": 1300 }, { "epoch": 0.6493506493506493, "grad_norm": 0.34627246856689453, "learning_rate": 0.00017775737186139038, "loss": 1.122, "step": 1350 }, { "epoch": 0.6734006734006734, "grad_norm": 0.2759104073047638, "learning_rate": 0.00017614926012949028, "loss": 1.1216, "step": 1400 }, { "epoch": 0.6974506974506974, "grad_norm": 0.2591134011745453, "learning_rate": 0.00017449285019174098, "loss": 1.13, "step": 1450 }, { "epoch": 0.7215007215007215, "grad_norm": 0.20448699593544006, "learning_rate": 0.00017278919263783978, "loss": 1.1289, "step": 1500 }, { "epoch": 0.7455507455507455, "grad_norm": 0.2126525342464447, "learning_rate": 0.00017103936802461797, "loss": 1.1256, "step": 1550 }, { "epoch": 0.7696007696007696, "grad_norm": 0.18976248800754547, "learning_rate": 0.00016924448619069023, "loss": 1.1172, "step": 1600 }, { "epoch": 0.7936507936507936, "grad_norm": 0.22607876360416412, "learning_rate": 0.00016740568555253155, "loss": 1.1245, "step": 1650 }, { "epoch": 0.8177008177008177, "grad_norm": 0.24248099327087402, "learning_rate": 0.00016552413238242857, "loss": 1.1228, "step": 1700 }, { "epoch": 0.8417508417508418, "grad_norm": 0.20902663469314575, "learning_rate": 0.00016360102006876317, "loss": 1.1176, "step": 1750 }, { "epoch": 0.8658008658008658, "grad_norm": 0.2559059262275696, "learning_rate": 0.0001616375683590974, "loss": 1.1187, "step": 1800 }, { "epoch": 0.8898508898508899, "grad_norm": 0.2621828317642212, "learning_rate": 0.00015963502258654005, "loss": 1.1103, "step": 1850 }, { "epoch": 0.9139009139009139, "grad_norm": 0.17843465507030487, "learning_rate": 0.0001575946528798853, "loss": 1.1129, "step": 1900 }, { "epoch": 0.937950937950938, "grad_norm": 0.22196488082408905, "learning_rate": 0.0001555177533580245, "loss": 1.1121, "step": 1950 }, { "epoch": 0.962000962000962, "grad_norm": 0.20080924034118652, "learning_rate": 0.00015340564130914233, "loss": 1.1104, "step": 2000 }, { "epoch": 0.9860509860509861, "grad_norm": 0.17231349647045135, "learning_rate": 0.00015125965635521724, "loss": 1.1171, "step": 2050 }, { "epoch": 1.0101010101010102, "grad_norm": 0.17805251479148865, "learning_rate": 0.00014908115960235682, "loss": 1.0872, "step": 2100 }, { "epoch": 1.034151034151034, "grad_norm": 0.18555937707424164, "learning_rate": 0.00014687153277750676, "loss": 1.0656, "step": 2150 }, { "epoch": 1.0582010582010581, "grad_norm": 0.18195118010044098, "learning_rate": 0.00014463217735208062, "loss": 1.0599, "step": 2200 }, { "epoch": 1.0822510822510822, "grad_norm": 0.23314547538757324, "learning_rate": 0.00014236451365306674, "loss": 1.0696, "step": 2250 }, { "epoch": 1.1063011063011063, "grad_norm": 0.22400447726249695, "learning_rate": 0.00014006997996217593, "loss": 1.0733, "step": 2300 }, { "epoch": 1.1303511303511304, "grad_norm": 0.1384362429380417, "learning_rate": 0.00013775003160360096, "loss": 1.0567, "step": 2350 }, { "epoch": 1.1544011544011543, "grad_norm": 0.15328723192214966, "learning_rate": 0.00013540614002096701, "loss": 1.071, "step": 2400 }, { "epoch": 1.1784511784511784, "grad_norm": 0.15236607193946838, "learning_rate": 0.00013303979184405826, "loss": 1.0684, "step": 2450 }, { "epoch": 1.2025012025012025, "grad_norm": 0.5659245252609253, "learning_rate": 0.00013065248794591223, "loss": 1.0666, "step": 2500 }, { "epoch": 
1.2265512265512266, "grad_norm": 0.18339212238788605, "learning_rate": 0.00012824574249088063, "loss": 1.0691, "step": 2550 }, { "epoch": 1.2506012506012505, "grad_norm": 0.15872247517108917, "learning_rate": 0.0001258210819742599, "loss": 1.0642, "step": 2600 }, { "epoch": 1.2746512746512746, "grad_norm": 0.17531836032867432, "learning_rate": 0.00012338004425410074, "loss": 1.061, "step": 2650 }, { "epoch": 1.2987012987012987, "grad_norm": 0.17102229595184326, "learning_rate": 0.00012092417757581085, "loss": 1.0651, "step": 2700 }, { "epoch": 1.3227513227513228, "grad_norm": 0.19855375587940216, "learning_rate": 0.00011845503959016928, "loss": 1.0641, "step": 2750 }, { "epoch": 1.3468013468013469, "grad_norm": 0.1624690294265747, "learning_rate": 0.0001159741963653755, "loss": 1.0575, "step": 2800 }, { "epoch": 1.370851370851371, "grad_norm": 0.15423128008842468, "learning_rate": 0.00011348322139375948, "loss": 1.0695, "step": 2850 }, { "epoch": 1.3949013949013949, "grad_norm": 0.12848949432373047, "learning_rate": 0.00011098369459378328, "loss": 1.0655, "step": 2900 }, { "epoch": 1.418951418951419, "grad_norm": 0.1720573604106903, "learning_rate": 0.00010847720130796631, "loss": 1.0641, "step": 2950 }, { "epoch": 1.443001443001443, "grad_norm": 0.17134524881839752, "learning_rate": 0.00010596533129737092, "loss": 1.0651, "step": 3000 }, { "epoch": 1.467051467051467, "grad_norm": 0.13950586318969727, "learning_rate": 0.00010344967773328507, "loss": 1.0579, "step": 3050 }, { "epoch": 1.491101491101491, "grad_norm": 0.19055236876010895, "learning_rate": 0.00010093183618674224, "loss": 1.0528, "step": 3100 }, { "epoch": 1.5151515151515151, "grad_norm": 0.16765938699245453, "learning_rate": 9.84134036165192e-05, "loss": 1.0547, "step": 3150 }, { "epoch": 1.5392015392015392, "grad_norm": 0.15793581306934357, "learning_rate": 9.589597735625377e-05, "loss": 1.0615, "step": 3200 }, { "epoch": 1.5632515632515633, "grad_norm": 0.16215017437934875, "learning_rate": 9.338115410132441e-05, "loss": 1.0586, "step": 3250 }, { "epoch": 1.5873015873015874, "grad_norm": 0.21864274144172668, "learning_rate": 9.087052889613518e-05, "loss": 1.0609, "step": 3300 }, { "epoch": 1.6113516113516113, "grad_norm": 0.1665191650390625, "learning_rate": 8.836569412244745e-05, "loss": 1.0595, "step": 3350 }, { "epoch": 1.6354016354016354, "grad_norm": 0.15962587296962738, "learning_rate": 8.586823848940047e-05, "loss": 1.0515, "step": 3400 }, { "epoch": 1.6594516594516593, "grad_norm": 0.18745359778404236, "learning_rate": 8.337974602586152e-05, "loss": 1.0592, "step": 3450 }, { "epoch": 1.6835016835016834, "grad_norm": 0.19344636797904968, "learning_rate": 8.090179507574427e-05, "loss": 1.0478, "step": 3500 }, { "epoch": 1.7075517075517075, "grad_norm": 0.13223913311958313, "learning_rate": 7.843595729693316e-05, "loss": 1.045, "step": 3550 }, { "epoch": 1.7316017316017316, "grad_norm": 0.17938382923603058, "learning_rate": 7.598379666444808e-05, "loss": 1.0537, "step": 3600 }, { "epoch": 1.7556517556517557, "grad_norm": 0.15650674700737, "learning_rate": 7.354686847848242e-05, "loss": 1.0498, "step": 3650 }, { "epoch": 1.7797017797017798, "grad_norm": 0.2204657793045044, "learning_rate": 7.11267183779428e-05, "loss": 1.0495, "step": 3700 }, { "epoch": 1.8037518037518039, "grad_norm": 0.1426030993461609, "learning_rate": 6.872488136011667e-05, "loss": 1.0538, "step": 3750 }, { "epoch": 1.8278018278018278, "grad_norm": 0.1690046787261963, "learning_rate": 6.634288080708952e-05, "loss": 1.0509, "step": 3800 }, { "epoch": 
1.8518518518518519, "grad_norm": 0.1793077439069748, "learning_rate": 6.398222751952899e-05, "loss": 1.0457, "step": 3850 }, { "epoch": 1.8759018759018757, "grad_norm": 0.13664492964744568, "learning_rate": 6.164441875844882e-05, "loss": 1.0516, "step": 3900 }, { "epoch": 1.8999518999518998, "grad_norm": 0.1282956898212433, "learning_rate": 5.933093729556062e-05, "loss": 1.0486, "step": 3950 }, { "epoch": 1.924001924001924, "grad_norm": 0.13937096297740936, "learning_rate": 5.7043250472815356e-05, "loss": 1.0411, "step": 4000 }, { "epoch": 1.948051948051948, "grad_norm": 0.1659151166677475, "learning_rate": 5.478280927173145e-05, "loss": 1.0378, "step": 4050 }, { "epoch": 1.9721019721019721, "grad_norm": 0.1273048371076584, "learning_rate": 5.255104739309924e-05, "loss": 1.0444, "step": 4100 }, { "epoch": 1.9961519961519962, "grad_norm": 0.17163003981113434, "learning_rate": 5.0349380347646494e-05, "loss": 1.0399, "step": 4150 }, { "epoch": 2.0202020202020203, "grad_norm": 0.12823982536792755, "learning_rate": 4.8179204558240444e-05, "loss": 1.0092, "step": 4200 }, { "epoch": 2.0442520442520444, "grad_norm": 0.1164403036236763, "learning_rate": 4.6041896474197e-05, "loss": 0.9951, "step": 4250 }, { "epoch": 2.068302068302068, "grad_norm": 0.1204572319984436, "learning_rate": 4.393881169825779e-05, "loss": 0.9998, "step": 4300 }, { "epoch": 2.092352092352092, "grad_norm": 0.11660825461149216, "learning_rate": 4.187128412678969e-05, "loss": 0.9983, "step": 4350 }, { "epoch": 2.1164021164021163, "grad_norm": 0.11894522607326508, "learning_rate": 3.984062510375155e-05, "loss": 0.9967, "step": 4400 }, { "epoch": 2.1404521404521404, "grad_norm": 0.13249389827251434, "learning_rate": 3.7848122588965144e-05, "loss": 0.9974, "step": 4450 }, { "epoch": 2.1645021645021645, "grad_norm": 0.1497354954481125, "learning_rate": 3.5895040341217543e-05, "loss": 0.999, "step": 4500 }, { "epoch": 2.1885521885521886, "grad_norm": 0.11724729835987091, "learning_rate": 3.398261711671309e-05, "loss": 1.0033, "step": 4550 }, { "epoch": 2.2126022126022127, "grad_norm": 0.11575422435998917, "learning_rate": 3.211206588338358e-05, "loss": 0.9988, "step": 4600 }, { "epoch": 2.236652236652237, "grad_norm": 0.11643755435943604, "learning_rate": 3.028457305155483e-05, "loss": 0.9971, "step": 4650 }, { "epoch": 2.260702260702261, "grad_norm": 0.10507030785083771, "learning_rate": 2.8501297721457422e-05, "loss": 0.9906, "step": 4700 }, { "epoch": 2.284752284752285, "grad_norm": 0.10402993112802505, "learning_rate": 2.6763370948059353e-05, "loss": 1.0007, "step": 4750 }, { "epoch": 2.3088023088023086, "grad_norm": 0.10819538682699203, "learning_rate": 2.5071895023686442e-05, "loss": 0.9969, "step": 4800 }, { "epoch": 2.3328523328523327, "grad_norm": 0.1739010065793991, "learning_rate": 2.342794277888547e-05, "loss": 0.9976, "step": 4850 }, { "epoch": 2.356902356902357, "grad_norm": 0.10811810195446014, "learning_rate": 2.1832556901973965e-05, "loss": 0.9946, "step": 4900 }, { "epoch": 2.380952380952381, "grad_norm": 0.1108924001455307, "learning_rate": 2.0286749277707782e-05, "loss": 0.9906, "step": 4950 }, { "epoch": 2.405002405002405, "grad_norm": 0.10033036023378372, "learning_rate": 1.879150034548588e-05, "loss": 0.9997, "step": 5000 }, { "epoch": 2.429052429052429, "grad_norm": 0.12525063753128052, "learning_rate": 1.7347758477500044e-05, "loss": 0.9885, "step": 5050 }, { "epoch": 2.4531024531024532, "grad_norm": 0.10124919563531876, "learning_rate": 1.5956439377222798e-05, "loss": 0.9964, "step": 5100 }, { "epoch": 
2.4771524771524773, "grad_norm": 0.10390116274356842, "learning_rate": 1.4618425498616162e-05, "loss": 0.9893, "step": 5150 }, { "epoch": 2.501202501202501, "grad_norm": 0.1200655922293663, "learning_rate": 1.3334565486428996e-05, "loss": 0.989, "step": 5200 }, { "epoch": 2.525252525252525, "grad_norm": 0.11598347127437592, "learning_rate": 1.2105673637938053e-05, "loss": 0.994, "step": 5250 }, { "epoch": 2.549302549302549, "grad_norm": 0.10302892327308655, "learning_rate": 1.0932529386474188e-05, "loss": 0.9911, "step": 5300 }, { "epoch": 2.5733525733525733, "grad_norm": 0.10021837800741196, "learning_rate": 9.815876807061264e-06, "loss": 1.0002, "step": 5350 }, { "epoch": 2.5974025974025974, "grad_norm": 0.09885919839143753, "learning_rate": 8.756424144481312e-06, "loss": 0.9882, "step": 5400 }, { "epoch": 2.6214526214526215, "grad_norm": 0.10271639376878738, "learning_rate": 7.75484336406529e-06, "loss": 0.9991, "step": 5450 }, { "epoch": 2.6455026455026456, "grad_norm": 0.10511433333158493, "learning_rate": 6.8117697254943106e-06, "loss": 0.9908, "step": 5500 }, { "epoch": 2.6695526695526697, "grad_norm": 0.1004793718457222, "learning_rate": 5.927801379881714e-06, "loss": 0.9977, "step": 5550 }, { "epoch": 2.6936026936026938, "grad_norm": 0.0923864096403122, "learning_rate": 5.103498990391509e-06, "loss": 0.989, "step": 5600 }, { "epoch": 2.717652717652718, "grad_norm": 0.09591204673051834, "learning_rate": 4.339385376633775e-06, "loss": 0.9917, "step": 5650 }, { "epoch": 2.741702741702742, "grad_norm": 0.09710726141929626, "learning_rate": 3.6359451830626723e-06, "loss": 0.9871, "step": 5700 }, { "epoch": 2.7657527657527656, "grad_norm": 0.0920204296708107, "learning_rate": 2.993624571587239e-06, "loss": 0.9878, "step": 5750 }, { "epoch": 2.7898027898027897, "grad_norm": 0.09445653855800629, "learning_rate": 2.4128309385900717e-06, "loss": 0.9886, "step": 5800 }, { "epoch": 2.813852813852814, "grad_norm": 0.0918457955121994, "learning_rate": 1.8939326565333037e-06, "loss": 0.9872, "step": 5850 }, { "epoch": 2.837902837902838, "grad_norm": 0.08985795080661774, "learning_rate": 1.437258840315714e-06, "loss": 0.9942, "step": 5900 }, { "epoch": 2.861952861952862, "grad_norm": 0.08798079937696457, "learning_rate": 1.0430991385293575e-06, "loss": 0.9911, "step": 5950 }, { "epoch": 2.886002886002886, "grad_norm": 0.08569388091564178, "learning_rate": 7.117035497478553e-07, "loss": 0.9932, "step": 6000 }, { "epoch": 2.91005291005291, "grad_norm": 0.08316464722156525, "learning_rate": 4.432822639630407e-07, "loss": 0.9895, "step": 6050 }, { "epoch": 2.934102934102934, "grad_norm": 0.08785533905029297, "learning_rate": 2.380055292704575e-07, "loss": 0.9937, "step": 6100 }, { "epoch": 2.958152958152958, "grad_norm": 0.08768357336521149, "learning_rate": 9.600354388833443e-08, "loss": 0.9922, "step": 6150 }, { "epoch": 2.982202982202982, "grad_norm": 0.08772250264883041, "learning_rate": 1.7366373578442397e-08, "loss": 0.9919, "step": 6200 } ], "logging_steps": 50, "max_steps": 6237, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.355263840216364e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }