{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9990416866315286, "eval_steps": 200, "global_step": 3336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011978917105893628, "grad_norm": 476.6510925292969, "learning_rate": 8e-07, "loss": 11.6475, "step": 10 }, { "epoch": 0.023957834211787255, "grad_norm": 74.57940673828125, "learning_rate": 1.9999928625229307e-06, "loss": 2.3869, "step": 20 }, { "epoch": 0.035936751317680884, "grad_norm": 125.54178619384766, "learning_rate": 1.999912567076008e-06, "loss": 7.1899, "step": 30 }, { "epoch": 0.04791566842357451, "grad_norm": 14.804201126098633, "learning_rate": 1.999743061523497e-06, "loss": 5.0722, "step": 40 }, { "epoch": 0.059894585529468136, "grad_norm": 9.310958862304688, "learning_rate": 1.999484360988329e-06, "loss": 2.9189, "step": 50 }, { "epoch": 0.07187350263536177, "grad_norm": 306.2783508300781, "learning_rate": 1.999136488551224e-06, "loss": 2.8403, "step": 60 }, { "epoch": 0.08385241974125539, "grad_norm": 134.76495361328125, "learning_rate": 1.9986994752486316e-06, "loss": 4.2047, "step": 70 }, { "epoch": 0.09583133684714902, "grad_norm": 78.88103485107422, "learning_rate": 1.998173360069964e-06, "loss": 5.1269, "step": 80 }, { "epoch": 0.10781025395304264, "grad_norm": 60.88028335571289, "learning_rate": 1.997558189954117e-06, "loss": 4.8787, "step": 90 }, { "epoch": 0.11978917105893627, "grad_norm": 48.85028076171875, "learning_rate": 1.9968540197852784e-06, "loss": 2.6971, "step": 100 }, { "epoch": 0.1317680881648299, "grad_norm": 82.52726745605469, "learning_rate": 1.9960609123880376e-06, "loss": 6.6349, "step": 110 }, { "epoch": 0.14374700527072354, "grad_norm": 18.934968948364258, "learning_rate": 1.9951789385217753e-06, "loss": 3.6926, "step": 120 }, { "epoch": 0.15572592237661714, "grad_norm": 95.94075012207031, "learning_rate": 1.9942081768743535e-06, "loss": 5.221, "step": 130 
}, { "epoch": 0.16770483948251078, "grad_norm": 69.58992767333984, "learning_rate": 1.9931487140550935e-06, "loss": 5.8621, "step": 140 }, { "epoch": 0.1796837565884044, "grad_norm": 18.388065338134766, "learning_rate": 1.9920006445870497e-06, "loss": 5.2103, "step": 150 }, { "epoch": 0.19166267369429804, "grad_norm": 117.37078857421875, "learning_rate": 1.9907640708985766e-06, "loss": 5.8106, "step": 160 }, { "epoch": 0.20364159080019167, "grad_norm": 118.88228607177734, "learning_rate": 1.9894391033141887e-06, "loss": 4.0891, "step": 170 }, { "epoch": 0.21562050790608528, "grad_norm": 17.98489761352539, "learning_rate": 1.9880258600447204e-06, "loss": 5.7061, "step": 180 }, { "epoch": 0.2275994250119789, "grad_norm": 170.75030517578125, "learning_rate": 1.986524467176777e-06, "loss": 4.2787, "step": 190 }, { "epoch": 0.23957834211787254, "grad_norm": 74.85968017578125, "learning_rate": 1.9849350586614863e-06, "loss": 7.8201, "step": 200 }, { "epoch": 0.23957834211787254, "eval_loss": 1.2861307859420776, "eval_runtime": 238.5238, "eval_samples_per_second": 6.247, "eval_steps_per_second": 3.123, "step": 200 }, { "epoch": 0.25155725922376615, "grad_norm": 254.58087158203125, "learning_rate": 1.983257776302548e-06, "loss": 5.8449, "step": 210 }, { "epoch": 0.2635361763296598, "grad_norm": 12.087563514709473, "learning_rate": 1.9814927697435826e-06, "loss": 5.7451, "step": 220 }, { "epoch": 0.2755150934355534, "grad_norm": 11.869245529174805, "learning_rate": 1.9796401964547794e-06, "loss": 6.4206, "step": 230 }, { "epoch": 0.2874940105414471, "grad_norm": 18.04955291748047, "learning_rate": 1.977700221718848e-06, "loss": 3.3466, "step": 240 }, { "epoch": 0.2994729276473407, "grad_norm": 40.88437271118164, "learning_rate": 1.975673018616273e-06, "loss": 4.5986, "step": 250 }, { "epoch": 0.3114518447532343, "grad_norm": 17.213830947875977, "learning_rate": 1.97355876800987e-06, "loss": 3.6809, "step": 260 }, { "epoch": 0.32343076185912795, "grad_norm": 
31.727243423461914, "learning_rate": 1.9713576585286513e-06, "loss": 4.692, "step": 270 }, { "epoch": 0.33540967896502155, "grad_norm": 95.36638641357422, "learning_rate": 1.9690698865509964e-06, "loss": 6.1814, "step": 280 }, { "epoch": 0.3473885960709152, "grad_norm": 25.670534133911133, "learning_rate": 1.966695656187131e-06, "loss": 2.8556, "step": 290 }, { "epoch": 0.3593675131768088, "grad_norm": 137.57431030273438, "learning_rate": 1.9642351792609162e-06, "loss": 3.3607, "step": 300 }, { "epoch": 0.3713464302827024, "grad_norm": 46.084354400634766, "learning_rate": 1.9616886752909523e-06, "loss": 6.1352, "step": 310 }, { "epoch": 0.3833253473885961, "grad_norm": 110.49552154541016, "learning_rate": 1.9590563714709916e-06, "loss": 5.8323, "step": 320 }, { "epoch": 0.3953042644944897, "grad_norm": 77.74449157714844, "learning_rate": 1.9563385026496687e-06, "loss": 5.7407, "step": 330 }, { "epoch": 0.40728318160038335, "grad_norm": 26.705305099487305, "learning_rate": 1.9535353113095493e-06, "loss": 5.3508, "step": 340 }, { "epoch": 0.41926209870627695, "grad_norm": 146.0390167236328, "learning_rate": 1.9506470475454957e-06, "loss": 2.9407, "step": 350 }, { "epoch": 0.43124101581217056, "grad_norm": 25.178991317749023, "learning_rate": 1.947673969042353e-06, "loss": 3.0089, "step": 360 }, { "epoch": 0.4432199329180642, "grad_norm": 8.458102226257324, "learning_rate": 1.9446163410519603e-06, "loss": 2.885, "step": 370 }, { "epoch": 0.4551988500239578, "grad_norm": 94.5208740234375, "learning_rate": 1.9414744363694842e-06, "loss": 3.8878, "step": 380 }, { "epoch": 0.4671777671298515, "grad_norm": 138.86561584472656, "learning_rate": 1.938248535309083e-06, "loss": 5.5948, "step": 390 }, { "epoch": 0.4791566842357451, "grad_norm": 10.55215072631836, "learning_rate": 1.9349389256788943e-06, "loss": 2.9242, "step": 400 }, { "epoch": 0.4791566842357451, "eval_loss": 1.1180063486099243, "eval_runtime": 237.218, "eval_samples_per_second": 6.281, "eval_steps_per_second": 
3.141, "step": 400 }, { "epoch": 0.4911356013416387, "grad_norm": 11.329914093017578, "learning_rate": 1.931545902755359e-06, "loss": 5.7209, "step": 410 }, { "epoch": 0.5031145184475323, "grad_norm": 121.2057113647461, "learning_rate": 1.928069769256879e-06, "loss": 4.2294, "step": 420 }, { "epoch": 0.515093435553426, "grad_norm": 70.16046905517578, "learning_rate": 1.9245108353168055e-06, "loss": 5.1172, "step": 430 }, { "epoch": 0.5270723526593196, "grad_norm": 55.029964447021484, "learning_rate": 1.9208694184557735e-06, "loss": 3.8455, "step": 440 }, { "epoch": 0.5390512697652132, "grad_norm": 16.75533103942871, "learning_rate": 1.9171458435533706e-06, "loss": 2.1762, "step": 450 }, { "epoch": 0.5510301868711068, "grad_norm": 37.192169189453125, "learning_rate": 1.913340442819153e-06, "loss": 4.6994, "step": 460 }, { "epoch": 0.5630091039770004, "grad_norm": 190.2852020263672, "learning_rate": 1.9094535557630067e-06, "loss": 8.188, "step": 470 }, { "epoch": 0.5749880210828942, "grad_norm": 14.840240478515625, "learning_rate": 1.905485529164856e-06, "loss": 2.4346, "step": 480 }, { "epoch": 0.5869669381887878, "grad_norm": 17.85882568359375, "learning_rate": 1.9014367170437255e-06, "loss": 5.1088, "step": 490 }, { "epoch": 0.5989458552946814, "grad_norm": 98.2167739868164, "learning_rate": 1.8973074806261558e-06, "loss": 4.4192, "step": 500 }, { "epoch": 0.610924772400575, "grad_norm": 72.538330078125, "learning_rate": 1.8930981883139734e-06, "loss": 4.2753, "step": 510 }, { "epoch": 0.6229036895064686, "grad_norm": 121.5967788696289, "learning_rate": 1.8888092156514252e-06, "loss": 5.0462, "step": 520 }, { "epoch": 0.6348826066123623, "grad_norm": 161.8177947998047, "learning_rate": 1.8844409452916719e-06, "loss": 3.2489, "step": 530 }, { "epoch": 0.6468615237182559, "grad_norm": 149.68197631835938, "learning_rate": 1.8799937669626481e-06, "loss": 4.8399, "step": 540 }, { "epoch": 0.6588404408241495, "grad_norm": 87.29440307617188, "learning_rate": 
1.8754680774322934e-06, "loss": 5.3579, "step": 550 }, { "epoch": 0.6708193579300431, "grad_norm": 70.05744171142578, "learning_rate": 1.8708642804731513e-06, "loss": 1.967, "step": 560 }, { "epoch": 0.6827982750359367, "grad_norm": 49.84896469116211, "learning_rate": 1.866182786826347e-06, "loss": 4.1978, "step": 570 }, { "epoch": 0.6947771921418304, "grad_norm": 29.61354637145996, "learning_rate": 1.861424014164941e-06, "loss": 4.025, "step": 580 }, { "epoch": 0.706756109247724, "grad_norm": 99.13072204589844, "learning_rate": 1.8565883870566666e-06, "loss": 4.1162, "step": 590 }, { "epoch": 0.7187350263536176, "grad_norm": 130.23606872558594, "learning_rate": 1.8516763369260492e-06, "loss": 3.0065, "step": 600 }, { "epoch": 0.7187350263536176, "eval_loss": 1.104053258895874, "eval_runtime": 238.4518, "eval_samples_per_second": 6.249, "eval_steps_per_second": 3.124, "step": 600 }, { "epoch": 0.7307139434595112, "grad_norm": 50.47702407836914, "learning_rate": 1.8466883020159161e-06, "loss": 4.3503, "step": 610 }, { "epoch": 0.7426928605654048, "grad_norm": 17.21928596496582, "learning_rate": 1.8416247273482988e-06, "loss": 4.4346, "step": 620 }, { "epoch": 0.7546717776712986, "grad_norm": 49.47705841064453, "learning_rate": 1.8364860646847262e-06, "loss": 3.9906, "step": 630 }, { "epoch": 0.7666506947771922, "grad_norm": 14.143331527709961, "learning_rate": 1.831272772485922e-06, "loss": 3.3026, "step": 640 }, { "epoch": 0.7786296118830858, "grad_norm": 17.33100128173828, "learning_rate": 1.8259853158708997e-06, "loss": 6.0244, "step": 650 }, { "epoch": 0.7906085289889794, "grad_norm": 10.411093711853027, "learning_rate": 1.8206241665754687e-06, "loss": 2.8721, "step": 660 }, { "epoch": 0.802587446094873, "grad_norm": 181.8240966796875, "learning_rate": 1.815189802910143e-06, "loss": 5.1721, "step": 670 }, { "epoch": 0.8145663632007667, "grad_norm": 39.83287048339844, "learning_rate": 1.80968270971747e-06, "loss": 4.9115, "step": 680 }, { "epoch": 
0.8265452803066603, "grad_norm": 39.86928176879883, "learning_rate": 1.8041033783287737e-06, "loss": 3.8957, "step": 690 }, { "epoch": 0.8385241974125539, "grad_norm": 57.7371711730957, "learning_rate": 1.7984523065203188e-06, "loss": 3.1863, "step": 700 }, { "epoch": 0.8505031145184475, "grad_norm": 14.148628234863281, "learning_rate": 1.792729998468899e-06, "loss": 4.26, "step": 710 }, { "epoch": 0.8624820316243411, "grad_norm": 71.03279113769531, "learning_rate": 1.7869369647068577e-06, "loss": 4.9559, "step": 720 }, { "epoch": 0.8744609487302348, "grad_norm": 17.670730590820312, "learning_rate": 1.7810737220765372e-06, "loss": 3.9867, "step": 730 }, { "epoch": 0.8864398658361284, "grad_norm": 14.698404312133789, "learning_rate": 1.7751407936841684e-06, "loss": 2.7134, "step": 740 }, { "epoch": 0.898418782942022, "grad_norm": 66.42393493652344, "learning_rate": 1.7691387088532001e-06, "loss": 3.2121, "step": 750 }, { "epoch": 0.9103977000479156, "grad_norm": 76.34748077392578, "learning_rate": 1.7630680030770732e-06, "loss": 4.7613, "step": 760 }, { "epoch": 0.9223766171538093, "grad_norm": 46.27962112426758, "learning_rate": 1.7569292179714465e-06, "loss": 3.2976, "step": 770 }, { "epoch": 0.934355534259703, "grad_norm": 14.20971965789795, "learning_rate": 1.750722901225873e-06, "loss": 1.9176, "step": 780 }, { "epoch": 0.9463344513655966, "grad_norm": 56.962379455566406, "learning_rate": 1.7444496065549384e-06, "loss": 1.9859, "step": 790 }, { "epoch": 0.9583133684714902, "grad_norm": 32.20167541503906, "learning_rate": 1.7381098936488574e-06, "loss": 6.9549, "step": 800 }, { "epoch": 0.9583133684714902, "eval_loss": 1.0978227853775024, "eval_runtime": 238.5511, "eval_samples_per_second": 6.246, "eval_steps_per_second": 3.123, "step": 800 }, { "epoch": 0.9702922855773838, "grad_norm": 82.94165802001953, "learning_rate": 1.7317043281235418e-06, "loss": 4.1317, "step": 810 }, { "epoch": 0.9822712026832774, "grad_norm": 110.44422912597656, "learning_rate": 
1.725233481470135e-06, "loss": 3.2924, "step": 820 }, { "epoch": 0.9942501197891711, "grad_norm": 88.77394104003906, "learning_rate": 1.7186979310040268e-06, "loss": 5.5422, "step": 830 }, { "epoch": 1.0071873502635362, "grad_norm": 167.41412353515625, "learning_rate": 1.7120982598133456e-06, "loss": 3.5133, "step": 840 }, { "epoch": 1.0191662673694297, "grad_norm": 16.926607131958008, "learning_rate": 1.7054350567069364e-06, "loss": 4.2376, "step": 850 }, { "epoch": 1.0311451844753234, "grad_norm": 96.12760925292969, "learning_rate": 1.698708916161829e-06, "loss": 3.6823, "step": 860 }, { "epoch": 1.0431241015812172, "grad_norm": 103.28370666503906, "learning_rate": 1.6919204382701987e-06, "loss": 2.5705, "step": 870 }, { "epoch": 1.0551030186871106, "grad_norm": 166.0828857421875, "learning_rate": 1.6850702286858298e-06, "loss": 2.9061, "step": 880 }, { "epoch": 1.0670819357930044, "grad_norm": 136.93392944335938, "learning_rate": 1.678158898570078e-06, "loss": 2.8635, "step": 890 }, { "epoch": 1.0790608528988979, "grad_norm": 14.542271614074707, "learning_rate": 1.6711870645373449e-06, "loss": 4.2555, "step": 900 }, { "epoch": 1.0910397700047916, "grad_norm": 53.567359924316406, "learning_rate": 1.6641553486000651e-06, "loss": 3.1885, "step": 910 }, { "epoch": 1.1030186871106853, "grad_norm": 100.29656982421875, "learning_rate": 1.6570643781132118e-06, "loss": 4.953, "step": 920 }, { "epoch": 1.1149976042165788, "grad_norm": 89.06425476074219, "learning_rate": 1.649914785718324e-06, "loss": 4.9896, "step": 930 }, { "epoch": 1.1269765213224725, "grad_norm": 17.858898162841797, "learning_rate": 1.6427072092870651e-06, "loss": 1.5295, "step": 940 }, { "epoch": 1.138955438428366, "grad_norm": 15.720714569091797, "learning_rate": 1.6354422918643133e-06, "loss": 3.0117, "step": 950 }, { "epoch": 1.1509343555342597, "grad_norm": 13.404827117919922, "learning_rate": 1.628120681610789e-06, "loss": 2.1361, "step": 960 }, { "epoch": 1.1629132726401532, "grad_norm": 
130.946044921875, "learning_rate": 1.6207430317452297e-06, "loss": 3.941, "step": 970 }, { "epoch": 1.174892189746047, "grad_norm": 14.730375289916992, "learning_rate": 1.613310000486108e-06, "loss": 3.2318, "step": 980 }, { "epoch": 1.1868711068519406, "grad_norm": 82.65552520751953, "learning_rate": 1.6058222509929096e-06, "loss": 3.9045, "step": 990 }, { "epoch": 1.1988500239578341, "grad_norm": null, "learning_rate": 1.5982804513069664e-06, "loss": 5.5404, "step": 1000 }, { "epoch": 1.1988500239578341, "eval_loss": 1.1037527322769165, "eval_runtime": 238.5531, "eval_samples_per_second": 6.246, "eval_steps_per_second": 3.123, "step": 1000 }, { "epoch": 1.2108289410637278, "grad_norm": 67.75409698486328, "learning_rate": 1.5914471746978935e-06, "loss": 2.6392, "step": 1010 }, { "epoch": 1.2228078581696216, "grad_norm": 49.21822738647461, "learning_rate": 1.5838045373221053e-06, "loss": 4.0259, "step": 1020 }, { "epoch": 1.234786775275515, "grad_norm": 228.187744140625, "learning_rate": 1.5761098141278849e-06, "loss": 5.8343, "step": 1030 }, { "epoch": 1.2467656923814088, "grad_norm": 33.68708419799805, "learning_rate": 1.5683636916223236e-06, "loss": 3.9807, "step": 1040 }, { "epoch": 1.2587446094873023, "grad_norm": 140.39187622070312, "learning_rate": 1.5605668608982526e-06, "loss": 3.9716, "step": 1050 }, { "epoch": 1.270723526593196, "grad_norm": 11.902155876159668, "learning_rate": 1.5527200175725842e-06, "loss": 3.2315, "step": 1060 }, { "epoch": 1.2827024436990895, "grad_norm": 144.3499298095703, "learning_rate": 1.5448238617242488e-06, "loss": 2.6336, "step": 1070 }, { "epoch": 1.2946813608049832, "grad_norm": 59.747928619384766, "learning_rate": 1.5368790978317395e-06, "loss": 3.206, "step": 1080 }, { "epoch": 1.306660277910877, "grad_norm": 53.534950256347656, "learning_rate": 1.5288864347102545e-06, "loss": 4.3036, "step": 1090 }, { "epoch": 1.3186391950167704, "grad_norm": 48.62434387207031, "learning_rate": 1.520846585448463e-06, "loss": 2.4486, 
"step": 1100 }, { "epoch": 1.3306181121226641, "grad_norm": 18.836971282958984, "learning_rate": 1.512760267344882e-06, "loss": 4.0121, "step": 1110 }, { "epoch": 1.3425970292285578, "grad_norm": 224.24917602539062, "learning_rate": 1.5046282018438814e-06, "loss": 2.8545, "step": 1120 }, { "epoch": 1.3545759463344513, "grad_norm": 18.120187759399414, "learning_rate": 1.4964511144713174e-06, "loss": 3.1619, "step": 1130 }, { "epoch": 1.366554863440345, "grad_norm": 39.76359939575195, "learning_rate": 1.4882297347698048e-06, "loss": 3.0413, "step": 1140 }, { "epoch": 1.3785337805462385, "grad_norm": 64.61255645751953, "learning_rate": 1.4799647962336255e-06, "loss": 3.8001, "step": 1150 }, { "epoch": 1.3905126976521323, "grad_norm": 12.653026580810547, "learning_rate": 1.471657036243291e-06, "loss": 5.532, "step": 1160 }, { "epoch": 1.4024916147580258, "grad_norm": 53.37699508666992, "learning_rate": 1.4633071959997525e-06, "loss": 3.4156, "step": 1170 }, { "epoch": 1.4144705318639195, "grad_norm": 37.08938217163086, "learning_rate": 1.4549160204582731e-06, "loss": 2.5073, "step": 1180 }, { "epoch": 1.4264494489698132, "grad_norm": 139.479736328125, "learning_rate": 1.4464842582619652e-06, "loss": 3.36, "step": 1190 }, { "epoch": 1.4384283660757067, "grad_norm": 88.59599304199219, "learning_rate": 1.4380126616749975e-06, "loss": 5.2213, "step": 1200 }, { "epoch": 1.4384283660757067, "eval_loss": 1.1036500930786133, "eval_runtime": 238.313, "eval_samples_per_second": 6.252, "eval_steps_per_second": 3.126, "step": 1200 }, { "epoch": 1.4504072831816004, "grad_norm": 129.61988830566406, "learning_rate": 1.4295019865154785e-06, "loss": 7.1682, "step": 1210 }, { "epoch": 1.462386200287494, "grad_norm": 19.823545455932617, "learning_rate": 1.4209529920880272e-06, "loss": 4.6843, "step": 1220 }, { "epoch": 1.4743651173933876, "grad_norm": 107.53630065917969, "learning_rate": 1.4123664411160252e-06, "loss": 2.4525, "step": 1230 }, { "epoch": 1.4863440344992813, "grad_norm": 
54.906280517578125, "learning_rate": 1.4037430996735722e-06, "loss": 5.9388, "step": 1240 }, { "epoch": 1.4983229516051748, "grad_norm": 77.29488372802734, "learning_rate": 1.3950837371171355e-06, "loss": 5.3705, "step": 1250 }, { "epoch": 1.5103018687110685, "grad_norm": 115.9428482055664, "learning_rate": 1.3863891260169114e-06, "loss": 4.0317, "step": 1260 }, { "epoch": 1.522280785816962, "grad_norm": 19.207189559936523, "learning_rate": 1.3776600420878973e-06, "loss": 3.8767, "step": 1270 }, { "epoch": 1.5342597029228557, "grad_norm": 83.13814544677734, "learning_rate": 1.3688972641206837e-06, "loss": 4.5835, "step": 1280 }, { "epoch": 1.5462386200287495, "grad_norm": 222.7005157470703, "learning_rate": 1.3601015739119733e-06, "loss": 3.3379, "step": 1290 }, { "epoch": 1.558217537134643, "grad_norm": 51.51054382324219, "learning_rate": 1.35127375619483e-06, "loss": 5.4397, "step": 1300 }, { "epoch": 1.5701964542405367, "grad_norm": 109.09092712402344, "learning_rate": 1.3424145985686662e-06, "loss": 3.1896, "step": 1310 }, { "epoch": 1.5821753713464304, "grad_norm": 11.890337944030762, "learning_rate": 1.333524891428976e-06, "loss": 4.4828, "step": 1320 }, { "epoch": 1.5941542884523239, "grad_norm": 18.87173843383789, "learning_rate": 1.324605427896817e-06, "loss": 2.4719, "step": 1330 }, { "epoch": 1.6061332055582176, "grad_norm": 110.26778411865234, "learning_rate": 1.3156570037480497e-06, "loss": 3.4721, "step": 1340 }, { "epoch": 1.6181121226641113, "grad_norm": 19.194913864135742, "learning_rate": 1.3066804173423397e-06, "loss": 4.3532, "step": 1350 }, { "epoch": 1.6300910397700048, "grad_norm": 89.65567016601562, "learning_rate": 1.297676469551931e-06, "loss": 4.1742, "step": 1360 }, { "epoch": 1.6420699568758983, "grad_norm": 15.179718017578125, "learning_rate": 1.2886459636901927e-06, "loss": 4.2612, "step": 1370 }, { "epoch": 1.654048873981792, "grad_norm": 14.557687759399414, "learning_rate": 1.2795897054399498e-06, "loss": 5.2594, "step": 1380 }, { 
"epoch": 1.6660277910876857, "grad_norm": 20.148624420166016, "learning_rate": 1.2705085027816008e-06, "loss": 3.3919, "step": 1390 }, { "epoch": 1.6780067081935792, "grad_norm": 9.0452241897583, "learning_rate": 1.261403165921032e-06, "loss": 4.3208, "step": 1400 }, { "epoch": 1.6780067081935792, "eval_loss": 1.0974289178848267, "eval_runtime": 238.4129, "eval_samples_per_second": 6.25, "eval_steps_per_second": 3.125, "step": 1400 }, { "epoch": 1.689985625299473, "grad_norm": 18.7681941986084, "learning_rate": 1.2522745072173336e-06, "loss": 2.5784, "step": 1410 }, { "epoch": 1.7019645424053667, "grad_norm": 112.4189224243164, "learning_rate": 1.243123341110321e-06, "loss": 4.0173, "step": 1420 }, { "epoch": 1.7139434595112601, "grad_norm": 13.051095008850098, "learning_rate": 1.2339504840478738e-06, "loss": 3.1098, "step": 1430 }, { "epoch": 1.7259223766171539, "grad_norm": 16.918392181396484, "learning_rate": 1.224756754413092e-06, "loss": 3.1983, "step": 1440 }, { "epoch": 1.7379012937230476, "grad_norm": 167.8545379638672, "learning_rate": 1.2155429724512838e-06, "loss": 4.8368, "step": 1450 }, { "epoch": 1.749880210828941, "grad_norm": 19.237834930419922, "learning_rate": 1.206309960196784e-06, "loss": 3.1809, "step": 1460 }, { "epoch": 1.7618591279348346, "grad_norm": 95.0063247680664, "learning_rate": 1.1970585413996132e-06, "loss": 3.9006, "step": 1470 }, { "epoch": 1.7738380450407283, "grad_norm": 55.70530319213867, "learning_rate": 1.1877895414519858e-06, "loss": 3.3394, "step": 1480 }, { "epoch": 1.785816962146622, "grad_norm": 23.173871994018555, "learning_rate": 1.1785037873146697e-06, "loss": 2.4079, "step": 1490 }, { "epoch": 1.7977958792525155, "grad_norm": 120.7456283569336, "learning_rate": 1.1692021074432054e-06, "loss": 4.2111, "step": 1500 }, { "epoch": 1.8097747963584092, "grad_norm": 89.36892700195312, "learning_rate": 1.1598853317139958e-06, "loss": 1.8205, "step": 1510 }, { "epoch": 1.821753713464303, "grad_norm": 66.96819305419922, 
"learning_rate": 1.150554291350263e-06, "loss": 4.6707, "step": 1520 }, { "epoch": 1.8337326305701964, "grad_norm": 52.6048583984375, "learning_rate": 1.1412098188478914e-06, "loss": 2.2611, "step": 1530 }, { "epoch": 1.8457115476760901, "grad_norm": 100.39757537841797, "learning_rate": 1.1318527479011513e-06, "loss": 3.3554, "step": 1540 }, { "epoch": 1.8576904647819839, "grad_norm": 12.364606857299805, "learning_rate": 1.1224839133283208e-06, "loss": 2.7868, "step": 1550 }, { "epoch": 1.8696693818878773, "grad_norm": 93.31403350830078, "learning_rate": 1.1131041509972032e-06, "loss": 3.7607, "step": 1560 }, { "epoch": 1.8816482989937708, "grad_norm": 77.30158233642578, "learning_rate": 1.1037142977505548e-06, "loss": 3.28, "step": 1570 }, { "epoch": 1.8936272160996646, "grad_norm": 13.82420825958252, "learning_rate": 1.0943151913314211e-06, "loss": 3.3544, "step": 1580 }, { "epoch": 1.9056061332055583, "grad_norm": 16.398128509521484, "learning_rate": 1.084907670308397e-06, "loss": 2.7871, "step": 1590 }, { "epoch": 1.9175850503114518, "grad_norm": 19.750925064086914, "learning_rate": 1.0754925740008098e-06, "loss": 4.1985, "step": 1600 }, { "epoch": 1.9175850503114518, "eval_loss": 1.1042989492416382, "eval_runtime": 238.4482, "eval_samples_per_second": 6.249, "eval_steps_per_second": 3.124, "step": 1600 }, { "epoch": 1.9295639674173455, "grad_norm": 16.436159133911133, "learning_rate": 1.066070742403839e-06, "loss": 3.9566, "step": 1610 }, { "epoch": 1.9415428845232392, "grad_norm": 67.2239761352539, "learning_rate": 1.056643016113572e-06, "loss": 4.0604, "step": 1620 }, { "epoch": 1.9535218016291327, "grad_norm": 52.419456481933594, "learning_rate": 1.047210236252008e-06, "loss": 4.4566, "step": 1630 }, { "epoch": 1.9655007187350264, "grad_norm": 102.50648498535156, "learning_rate": 1.0377732443920155e-06, "loss": 2.5929, "step": 1640 }, { "epoch": 1.9774796358409201, "grad_norm": 85.2761459350586, "learning_rate": 1.0283328824822498e-06, "loss": 3.278, 
"step": 1650 }, { "epoch": 1.9894585529468136, "grad_norm": 111.09942626953125, "learning_rate": 1.0188899927720324e-06, "loss": 2.1727, "step": 1660 }, { "epoch": 2.0023957834211785, "grad_norm": 88.2173080444336, "learning_rate": 1.009445417736213e-06, "loss": 4.7098, "step": 1670 }, { "epoch": 2.0143747005270725, "grad_norm": 15.37248420715332, "learning_rate": 1e-06, "loss": 3.3523, "step": 1680 }, { "epoch": 2.026353617632966, "grad_norm": 428.4352111816406, "learning_rate": 9.905545822637871e-07, "loss": 4.4776, "step": 1690 }, { "epoch": 2.0383325347388594, "grad_norm": 16.143062591552734, "learning_rate": 9.811100072279673e-07, "loss": 3.2249, "step": 1700 }, { "epoch": 2.0503114518447534, "grad_norm": 142.73899841308594, "learning_rate": 9.716671175177506e-07, "loss": 3.6488, "step": 1710 }, { "epoch": 2.062290368950647, "grad_norm": 86.692626953125, "learning_rate": 9.622267556079844e-07, "loss": 2.3491, "step": 1720 }, { "epoch": 2.0742692860565404, "grad_norm": 127.98802947998047, "learning_rate": 9.527897637479921e-07, "loss": 4.828, "step": 1730 }, { "epoch": 2.0862482031624343, "grad_norm": 14.027091026306152, "learning_rate": 9.433569838864282e-07, "loss": 4.176, "step": 1740 }, { "epoch": 2.098227120268328, "grad_norm": 16.68279266357422, "learning_rate": 9.33929257596161e-07, "loss": 2.8037, "step": 1750 }, { "epoch": 2.1102060373742213, "grad_norm": 68.17903900146484, "learning_rate": 9.245074259991904e-07, "loss": 3.448, "step": 1760 }, { "epoch": 2.122184954480115, "grad_norm": 90.32327270507812, "learning_rate": 9.150923296916032e-07, "loss": 3.3154, "step": 1770 }, { "epoch": 2.1341638715860087, "grad_norm": 218.0877227783203, "learning_rate": 9.056848086685789e-07, "loss": 3.8818, "step": 1780 }, { "epoch": 2.1461427886919022, "grad_norm": 117.9521484375, "learning_rate": 8.96285702249445e-07, "loss": 3.7345, "step": 1790 }, { "epoch": 2.1581217057977957, "grad_norm": 14.993733406066895, "learning_rate": 8.868958490027966e-07, "loss": 
3.3709, "step": 1800 }, { "epoch": 2.1581217057977957, "eval_loss": 1.1172066926956177, "eval_runtime": 238.6675, "eval_samples_per_second": 6.243, "eval_steps_per_second": 3.121, "step": 1800 }, { "epoch": 2.1701006229036897, "grad_norm": 273.74365234375, "learning_rate": 8.775160866716791e-07, "loss": 4.06, "step": 1810 }, { "epoch": 2.182079540009583, "grad_norm": 11.733575820922852, "learning_rate": 8.681472520988488e-07, "loss": 2.4028, "step": 1820 }, { "epoch": 2.1940584571154766, "grad_norm": 270.543212890625, "learning_rate": 8.587901811521087e-07, "loss": 5.1853, "step": 1830 }, { "epoch": 2.2060373742213706, "grad_norm": 18.38782501220703, "learning_rate": 8.494457086497368e-07, "loss": 1.9458, "step": 1840 }, { "epoch": 2.218016291327264, "grad_norm": 127.68826293945312, "learning_rate": 8.401146682860041e-07, "loss": 3.2338, "step": 1850 }, { "epoch": 2.2299952084331576, "grad_norm": 186.4503173828125, "learning_rate": 8.307978925567945e-07, "loss": 2.5094, "step": 1860 }, { "epoch": 2.241974125539051, "grad_norm": 15.038443565368652, "learning_rate": 8.214962126853307e-07, "loss": 2.6388, "step": 1870 }, { "epoch": 2.253953042644945, "grad_norm": 74.26438903808594, "learning_rate": 8.122104585480143e-07, "loss": 2.24, "step": 1880 }, { "epoch": 2.2659319597508385, "grad_norm": 57.38971710205078, "learning_rate": 8.029414586003866e-07, "loss": 4.3915, "step": 1890 }, { "epoch": 2.277910876856732, "grad_norm": 221.86866760253906, "learning_rate": 7.93690039803216e-07, "loss": 3.0979, "step": 1900 }, { "epoch": 2.289889793962626, "grad_norm": 66.43515014648438, "learning_rate": 7.844570275487159e-07, "loss": 2.0459, "step": 1910 }, { "epoch": 2.3018687110685194, "grad_norm": 174.6913604736328, "learning_rate": 7.752432455869081e-07, "loss": 4.3113, "step": 1920 }, { "epoch": 2.313847628174413, "grad_norm": 12.269379615783691, "learning_rate": 7.660495159521264e-07, "loss": 2.6802, "step": 1930 }, { "epoch": 2.3258265452803064, "grad_norm": 
14.982316970825195, "learning_rate": 7.56876658889679e-07, "loss": 3.2325, "step": 1940 }, { "epoch": 2.3378054623862004, "grad_norm": 75.86939239501953, "learning_rate": 7.477254927826664e-07, "loss": 1.0064, "step": 1950 }, { "epoch": 2.349784379492094, "grad_norm": 12.59216594696045, "learning_rate": 7.38596834078968e-07, "loss": 2.3464, "step": 1960 }, { "epoch": 2.361763296597988, "grad_norm": 44.80192184448242, "learning_rate": 7.294914972183992e-07, "loss": 3.9336, "step": 1970 }, { "epoch": 2.3737422137038813, "grad_norm": 227.7502899169922, "learning_rate": 7.204102945600502e-07, "loss": 3.3652, "step": 1980 }, { "epoch": 2.3857211308097748, "grad_norm": 202.1083526611328, "learning_rate": 7.113540363098072e-07, "loss": 3.0293, "step": 1990 }, { "epoch": 2.3977000479156683, "grad_norm": 148.12112426757812, "learning_rate": 7.02323530448069e-07, "loss": 3.7548, "step": 2000 }, { "epoch": 2.3977000479156683, "eval_loss": 1.1212018728256226, "eval_runtime": 237.7587, "eval_samples_per_second": 6.267, "eval_steps_per_second": 3.133, "step": 2000 }, { "epoch": 2.409678965021562, "grad_norm": 107.33174133300781, "learning_rate": 6.933195826576603e-07, "loss": 3.9499, "step": 2010 }, { "epoch": 2.4216578821274557, "grad_norm": 15.232194900512695, "learning_rate": 6.843429962519504e-07, "loss": 4.3203, "step": 2020 }, { "epoch": 2.433636799233349, "grad_norm": 14.64511489868164, "learning_rate": 6.75394572103183e-07, "loss": 4.5243, "step": 2030 }, { "epoch": 2.445615716339243, "grad_norm": 16.1613712310791, "learning_rate": 6.664751085710239e-07, "loss": 3.5644, "step": 2040 }, { "epoch": 2.4575946334451366, "grad_norm": 61.8858642578125, "learning_rate": 6.575854014313338e-07, "loss": 3.7972, "step": 2050 }, { "epoch": 2.46957355055103, "grad_norm": 52.300514221191406, "learning_rate": 6.487262438051701e-07, "loss": 3.6956, "step": 2060 }, { "epoch": 2.4815524676569236, "grad_norm": 208.3380126953125, "learning_rate": 6.398984260880266e-07, "loss": 3.6895, 
"step": 2070 }, { "epoch": 2.4935313847628175, "grad_norm": 18.773216247558594, "learning_rate": 6.311027358793166e-07, "loss": 3.0383, "step": 2080 }, { "epoch": 2.505510301868711, "grad_norm": 19.703941345214844, "learning_rate": 6.223399579121029e-07, "loss": 2.5712, "step": 2090 }, { "epoch": 2.5174892189746045, "grad_norm": 63.63347625732422, "learning_rate": 6.136108739830886e-07, "loss": 2.2939, "step": 2100 }, { "epoch": 2.5294681360804985, "grad_norm": 23.118276596069336, "learning_rate": 6.049162628828644e-07, "loss": 3.329, "step": 2110 }, { "epoch": 2.541447053186392, "grad_norm": 16.502140045166016, "learning_rate": 5.962569003264276e-07, "loss": 4.0458, "step": 2120 }, { "epoch": 2.5534259702922855, "grad_norm": 11.94331169128418, "learning_rate": 5.876335588839746e-07, "loss": 3.7107, "step": 2130 }, { "epoch": 2.565404887398179, "grad_norm": 20.692808151245117, "learning_rate": 5.79047007911973e-07, "loss": 2.3799, "step": 2140 }, { "epoch": 2.577383804504073, "grad_norm": 170.5503692626953, "learning_rate": 5.704980134845213e-07, "loss": 2.808, "step": 2150 }, { "epoch": 2.5893627216099664, "grad_norm": 17.668119430541992, "learning_rate": 5.619873383250029e-07, "loss": 2.4657, "step": 2160 }, { "epoch": 2.6013416387158603, "grad_norm": 12.804652214050293, "learning_rate": 5.535157417380346e-07, "loss": 4.2857, "step": 2170 }, { "epoch": 2.613320555821754, "grad_norm": 231.0024871826172, "learning_rate": 5.450839795417266e-07, "loss": 5.443, "step": 2180 }, { "epoch": 2.6252994729276473, "grad_norm": 156.8734893798828, "learning_rate": 5.366928040002476e-07, "loss": 4.17, "step": 2190 }, { "epoch": 2.637278390033541, "grad_norm": 49.93342590332031, "learning_rate": 5.283429637567091e-07, "loss": 3.2694, "step": 2200 }, { "epoch": 2.637278390033541, "eval_loss": 1.1175537109375, "eval_runtime": 238.0084, "eval_samples_per_second": 6.26, "eval_steps_per_second": 3.13, "step": 2200 }, { "epoch": 2.6492573071394347, "grad_norm": 11.659183502197266, 
"learning_rate": 5.200352037663745e-07, "loss": 1.2186, "step": 2210 }, { "epoch": 2.6612362242453282, "grad_norm": 16.97669219970703, "learning_rate": 5.117702652301952e-07, "loss": 3.9984, "step": 2220 }, { "epoch": 2.6732151413512217, "grad_norm": 49.165199279785156, "learning_rate": 5.035488855286823e-07, "loss": 2.916, "step": 2230 }, { "epoch": 2.6851940584571157, "grad_norm": 14.636496543884277, "learning_rate": 4.953717981561186e-07, "loss": 3.1995, "step": 2240 }, { "epoch": 2.697172975563009, "grad_norm": 13.993462562561035, "learning_rate": 4.872397326551179e-07, "loss": 3.1531, "step": 2250 }, { "epoch": 2.7091518926689027, "grad_norm": 14.866923332214355, "learning_rate": 4.791534145515368e-07, "loss": 2.7951, "step": 2260 }, { "epoch": 2.721130809774796, "grad_norm": 12.874122619628906, "learning_rate": 4.711135652897452e-07, "loss": 4.0197, "step": 2270 }, { "epoch": 2.73310972688069, "grad_norm": 166.74929809570312, "learning_rate": 4.6312090216826074e-07, "loss": 4.324, "step": 2280 }, { "epoch": 2.7450886439865836, "grad_norm": 169.29539489746094, "learning_rate": 4.551761382757513e-07, "loss": 4.1737, "step": 2290 }, { "epoch": 2.757067561092477, "grad_norm": 12.698741912841797, "learning_rate": 4.4727998242741627e-07, "loss": 3.2982, "step": 2300 }, { "epoch": 2.769046478198371, "grad_norm": 97.41565704345703, "learning_rate": 4.394331391017474e-07, "loss": 3.0522, "step": 2310 }, { "epoch": 2.7810253953042645, "grad_norm": 13.397387504577637, "learning_rate": 4.316363083776766e-07, "loss": 5.0599, "step": 2320 }, { "epoch": 2.793004312410158, "grad_norm": 14.772340774536133, "learning_rate": 4.2389018587211524e-07, "loss": 2.43, "step": 2330 }, { "epoch": 2.8049832295160515, "grad_norm": 218.20303344726562, "learning_rate": 4.1619546267789453e-07, "loss": 5.5137, "step": 2340 }, { "epoch": 2.8169621466219454, "grad_norm": 312.639892578125, "learning_rate": 4.0855282530210676e-07, "loss": 4.4751, "step": 2350 }, { "epoch": 2.828941063727839, 
"grad_norm": 293.28387451171875, "learning_rate": 4.0096295560485547e-07, "loss": 4.1398, "step": 2360 }, { "epoch": 2.840919980833733, "grad_norm": 84.95050811767578, "learning_rate": 3.934265307384239e-07, "loss": 3.7418, "step": 2370 }, { "epoch": 2.8528988979396264, "grad_norm": 108.16950988769531, "learning_rate": 3.8594422308685793e-07, "loss": 3.392, "step": 2380 }, { "epoch": 2.86487781504552, "grad_norm": 258.593505859375, "learning_rate": 3.785167002059799e-07, "loss": 4.247, "step": 2390 }, { "epoch": 2.8768567321514134, "grad_norm": 247.61813354492188, "learning_rate": 3.7114462476382966e-07, "loss": 3.9058, "step": 2400 }, { "epoch": 2.8768567321514134, "eval_loss": 1.125927209854126, "eval_runtime": 237.8939, "eval_samples_per_second": 6.263, "eval_steps_per_second": 3.132, "step": 2400 }, { "epoch": 2.8888356492573073, "grad_norm": 82.83909606933594, "learning_rate": 3.6382865448154187e-07, "loss": 3.9744, "step": 2410 }, { "epoch": 2.900814566363201, "grad_norm": 289.5834655761719, "learning_rate": 3.5656944207466633e-07, "loss": 4.8423, "step": 2420 }, { "epoch": 2.9127934834690943, "grad_norm": 11.12870979309082, "learning_rate": 3.4936763519493495e-07, "loss": 1.8868, "step": 2430 }, { "epoch": 2.924772400574988, "grad_norm": 259.54205322265625, "learning_rate": 3.4222387637247806e-07, "loss": 5.073, "step": 2440 }, { "epoch": 2.9367513176808817, "grad_norm": 260.9941711425781, "learning_rate": 3.351388029585007e-07, "loss": 3.9769, "step": 2450 }, { "epoch": 2.948730234786775, "grad_norm": 138.75767517089844, "learning_rate": 3.281130470684166e-07, "loss": 1.8905, "step": 2460 }, { "epoch": 2.9607091518926687, "grad_norm": 212.35589599609375, "learning_rate": 3.2114723552545606e-07, "loss": 5.6336, "step": 2470 }, { "epoch": 2.9726880689985626, "grad_norm": 77.375, "learning_rate": 3.142419898047399e-07, "loss": 2.5325, "step": 2480 }, { "epoch": 2.984666986104456, "grad_norm": 301.5721740722656, "learning_rate": 3.073979259778332e-07, "loss": 
2.8451, "step": 2490 }, { "epoch": 2.9966459032103496, "grad_norm": 23.281728744506836, "learning_rate": 3.006156546577796e-07, "loss": 4.271, "step": 2500 }, { "epoch": 3.009583133684715, "grad_norm": 60.59037399291992, "learning_rate": 2.9389578094462607e-07, "loss": 4.768, "step": 2510 }, { "epoch": 3.0215620507906085, "grad_norm": 12.599560737609863, "learning_rate": 2.872389043714343e-07, "loss": 3.4648, "step": 2520 }, { "epoch": 3.033540967896502, "grad_norm": 60.7338981628418, "learning_rate": 2.806456188507943e-07, "loss": 4.1664, "step": 2530 }, { "epoch": 3.045519885002396, "grad_norm": 124.4323959350586, "learning_rate": 2.7411651262183465e-07, "loss": 3.2584, "step": 2540 }, { "epoch": 3.0574988021082894, "grad_norm": 10.696157455444336, "learning_rate": 2.676521681977425e-07, "loss": 1.6209, "step": 2550 }, { "epoch": 3.069477719214183, "grad_norm": 78.30728149414062, "learning_rate": 2.612531623137922e-07, "loss": 2.9967, "step": 2560 }, { "epoch": 3.081456636320077, "grad_norm": 12.110499382019043, "learning_rate": 2.5492006587589033e-07, "loss": 1.8501, "step": 2570 }, { "epoch": 3.0934355534259703, "grad_norm": 142.67459106445312, "learning_rate": 2.4865344390964016e-07, "loss": 6.7426, "step": 2580 }, { "epoch": 3.105414470531864, "grad_norm": 84.46195220947266, "learning_rate": 2.424538555099326e-07, "loss": 3.011, "step": 2590 }, { "epoch": 3.1173933876377578, "grad_norm": 287.4181213378906, "learning_rate": 2.3632185379106383e-07, "loss": 4.2906, "step": 2600 }, { "epoch": 3.1173933876377578, "eval_loss": 1.1396493911743164, "eval_runtime": 240.643, "eval_samples_per_second": 6.192, "eval_steps_per_second": 3.096, "step": 2600 }, { "epoch": 3.1293723047436512, "grad_norm": 17.018083572387695, "learning_rate": 2.302579858373881e-07, "loss": 2.1286, "step": 2610 }, { "epoch": 3.1413512218495447, "grad_norm": 13.265679359436035, "learning_rate": 2.2426279265450708e-07, "loss": 2.9965, "step": 2620 }, { "epoch": 3.1533301389554382, "grad_norm": 
156.8545684814453, "learning_rate": 2.183368091210037e-07, "loss": 3.7899, "step": 2630 }, { "epoch": 3.165309056061332, "grad_norm": 17.818618774414062, "learning_rate": 2.1248056394072078e-07, "loss": 4.1165, "step": 2640 }, { "epoch": 3.1772879731672257, "grad_norm": 15.270909309387207, "learning_rate": 2.0669457959559177e-07, "loss": 3.1192, "step": 2650 }, { "epoch": 3.189266890273119, "grad_norm": 78.96019744873047, "learning_rate": 2.0097937229902485e-07, "loss": 5.3403, "step": 2660 }, { "epoch": 3.201245807379013, "grad_norm": 376.2461242675781, "learning_rate": 1.9533545194984791e-07, "loss": 3.9551, "step": 2670 }, { "epoch": 3.2132247244849066, "grad_norm": 16.849010467529297, "learning_rate": 1.8976332208681744e-07, "loss": 5.6715, "step": 2680 }, { "epoch": 3.2252036415908, "grad_norm": 171.6584930419922, "learning_rate": 1.8426347984369273e-07, "loss": 5.2323, "step": 2690 }, { "epoch": 3.237182558696694, "grad_norm": 120.8387680053711, "learning_rate": 1.788364159048833e-07, "loss": 4.2853, "step": 2700 }, { "epoch": 3.2491614758025875, "grad_norm": 13.419800758361816, "learning_rate": 1.734826144616698e-07, "loss": 2.9811, "step": 2710 }, { "epoch": 3.261140392908481, "grad_norm": 196.2430877685547, "learning_rate": 1.6820255316900756e-07, "loss": 4.3565, "step": 2720 }, { "epoch": 3.2731193100143745, "grad_norm": 15.63242244720459, "learning_rate": 1.6299670310290915e-07, "loss": 2.4933, "step": 2730 }, { "epoch": 3.2850982271202684, "grad_norm": 60.21677017211914, "learning_rate": 1.5786552871841774e-07, "loss": 2.6513, "step": 2740 }, { "epoch": 3.297077144226162, "grad_norm": 223.22097778320312, "learning_rate": 1.528094878081677e-07, "loss": 3.3477, "step": 2750 }, { "epoch": 3.3090560613320554, "grad_norm": 95.59901428222656, "learning_rate": 1.478290314615427e-07, "loss": 3.0678, "step": 2760 }, { "epoch": 3.3210349784379494, "grad_norm": 36.64326095581055, "learning_rate": 1.4292460402442995e-07, "loss": 3.4483, "step": 2770 }, { "epoch": 
3.333013895543843, "grad_norm": 233.20370483398438, "learning_rate": 1.3809664305957625e-07, "loss": 2.9447, "step": 2780 }, { "epoch": 3.3449928126497364, "grad_norm": 14.824277877807617, "learning_rate": 1.3334557930754963e-07, "loss": 2.547, "step": 2790 }, { "epoch": 3.35697172975563, "grad_norm": 208.10154724121094, "learning_rate": 1.2867183664831038e-07, "loss": 6.0572, "step": 2800 }, { "epoch": 3.35697172975563, "eval_loss": 1.147083044052124, "eval_runtime": 239.357, "eval_samples_per_second": 6.225, "eval_steps_per_second": 3.113, "step": 2800 }, { "epoch": 3.368950646861524, "grad_norm": 167.9786834716797, "learning_rate": 1.2407583206339256e-07, "loss": 2.6004, "step": 2810 }, { "epoch": 3.3809295639674173, "grad_norm": 236.07484436035156, "learning_rate": 1.195579755987024e-07, "loss": 2.3534, "step": 2820 }, { "epoch": 3.3929084810733112, "grad_norm": 38.18111801147461, "learning_rate": 1.1511867032793321e-07, "loss": 2.5498, "step": 2830 }, { "epoch": 3.4048873981792047, "grad_norm": 136.2451934814453, "learning_rate": 1.107583123166066e-07, "loss": 5.3208, "step": 2840 }, { "epoch": 3.416866315285098, "grad_norm": 316.2095947265625, "learning_rate": 1.0647729058673427e-07, "loss": 4.3772, "step": 2850 }, { "epoch": 3.4288452323909917, "grad_norm": 14.228015899658203, "learning_rate": 1.0227598708211172e-07, "loss": 2.2948, "step": 2860 }, { "epoch": 3.4408241494968856, "grad_norm": 52.19302749633789, "learning_rate": 9.81547766342401e-08, "loss": 2.223, "step": 2870 }, { "epoch": 3.452803066602779, "grad_norm": 18.04366683959961, "learning_rate": 9.411402692888715e-08, "loss": 3.6671, "step": 2880 }, { "epoch": 3.4647819837086726, "grad_norm": 12.00094985961914, "learning_rate": 9.015409847328037e-08, "loss": 2.3488, "step": 2890 }, { "epoch": 3.4767609008145666, "grad_norm": 88.41423797607422, "learning_rate": 8.62753445639457e-08, "loss": 3.2758, "step": 2900 }, { "epoch": 3.48873981792046, "grad_norm": 56.944828033447266, "learning_rate": 
8.247811125518489e-08, "loss": 1.576, "step": 2910 }, { "epoch": 3.5007187350263536, "grad_norm": 323.6489562988281, "learning_rate": 7.876273732820327e-08, "loss": 4.9528, "step": 2920 }, { "epoch": 3.512697652132247, "grad_norm": 10.932809829711914, "learning_rate": 7.51295542608834e-08, "loss": 3.6918, "step": 2930 }, { "epoch": 3.524676569238141, "grad_norm": 12.945392608642578, "learning_rate": 7.157888619821106e-08, "loss": 1.7161, "step": 2940 }, { "epoch": 3.5366554863440345, "grad_norm": 358.42498779296875, "learning_rate": 6.811104992335648e-08, "loss": 4.4565, "step": 2950 }, { "epoch": 3.548634403449928, "grad_norm": 231.78411865234375, "learning_rate": 6.47263548294108e-08, "loss": 3.5586, "step": 2960 }, { "epoch": 3.560613320555822, "grad_norm": 231.84341430664062, "learning_rate": 6.142510289178337e-08, "loss": 3.4724, "step": 2970 }, { "epoch": 3.5725922376617154, "grad_norm": 198.78810119628906, "learning_rate": 5.8207588641260185e-08, "loss": 2.5415, "step": 2980 }, { "epoch": 3.584571154767609, "grad_norm": 76.49454498291016, "learning_rate": 5.507409913772543e-08, "loss": 3.7494, "step": 2990 }, { "epoch": 3.5965500718735024, "grad_norm": 262.359130859375, "learning_rate": 5.202491394455155e-08, "loss": 4.0544, "step": 3000 }, { "epoch": 3.5965500718735024, "eval_loss": 1.1526151895523071, "eval_runtime": 238.4822, "eval_samples_per_second": 6.248, "eval_steps_per_second": 3.124, "step": 3000 }, { "epoch": 3.6085289889793963, "grad_norm": 14.773038864135742, "learning_rate": 4.9060305103657e-08, "loss": 3.8126, "step": 3010 }, { "epoch": 3.62050790608529, "grad_norm": 288.3175354003906, "learning_rate": 4.61805371112356e-08, "loss": 2.2371, "step": 3020 }, { "epoch": 3.6324868231911838, "grad_norm": 15.71839427947998, "learning_rate": 4.3661497350331423e-08, "loss": 2.8255, "step": 3030 }, { "epoch": 3.6444657402970773, "grad_norm": 47.34138870239258, "learning_rate": 4.094362852900846e-08, "loss": 2.4564, "step": 3040 }, { "epoch": 
3.6564446574029708, "grad_norm": 133.03443908691406, "learning_rate": 3.8311324709047524e-08, "loss": 3.9076, "step": 3050 }, { "epoch": 3.6684235745088642, "grad_norm": 85.79124450683594, "learning_rate": 3.57648207390836e-08, "loss": 1.6264, "step": 3060 }, { "epoch": 3.680402491614758, "grad_norm": 11.73284912109375, "learning_rate": 3.3304343812869175e-08, "loss": 2.2377, "step": 3070 }, { "epoch": 3.6923814087206517, "grad_norm": 103.52379608154297, "learning_rate": 3.0930113449003536e-08, "loss": 2.5226, "step": 3080 }, { "epoch": 3.704360325826545, "grad_norm": 121.46673583984375, "learning_rate": 2.8642341471348585e-08, "loss": 5.8129, "step": 3090 }, { "epoch": 3.716339242932439, "grad_norm": 39.363887786865234, "learning_rate": 2.644123199013004e-08, "loss": 1.8684, "step": 3100 }, { "epoch": 3.7283181600383326, "grad_norm": 22.17119598388672, "learning_rate": 2.432698138372713e-08, "loss": 4.5753, "step": 3110 }, { "epoch": 3.740297077144226, "grad_norm": 123.43873596191406, "learning_rate": 2.2299778281151927e-08, "loss": 3.4706, "step": 3120 }, { "epoch": 3.7522759942501196, "grad_norm": 62.01347351074219, "learning_rate": 2.03598035452206e-08, "loss": 1.0962, "step": 3130 }, { "epoch": 3.7642549113560135, "grad_norm": 106.05194091796875, "learning_rate": 1.8507230256417316e-08, "loss": 4.0847, "step": 3140 }, { "epoch": 3.776233828461907, "grad_norm": 14.505044937133789, "learning_rate": 1.674222369745182e-08, "loss": 2.7005, "step": 3150 }, { "epoch": 3.7882127455678005, "grad_norm": 74.14771270751953, "learning_rate": 1.5064941338513548e-08, "loss": 4.5833, "step": 3160 }, { "epoch": 3.8001916626736945, "grad_norm": 109.61576080322266, "learning_rate": 1.3475532823222779e-08, "loss": 3.3511, "step": 3170 }, { "epoch": 3.812170579779588, "grad_norm": 142.658447265625, "learning_rate": 1.1974139955279294e-08, "loss": 4.0569, "step": 3180 }, { "epoch": 3.8241494968854814, "grad_norm": 374.00701904296875, "learning_rate": 1.0560896685811061e-08, "loss": 
3.657, "step": 3190 }, { "epoch": 3.836128413991375, "grad_norm": 191.76698303222656, "learning_rate": 9.235929101423457e-09, "loss": 3.2204, "step": 3200 }, { "epoch": 3.836128413991375, "eval_loss": 1.1512279510498047, "eval_runtime": 239.5597, "eval_samples_per_second": 6.22, "eval_steps_per_second": 3.11, "step": 3200 }, { "epoch": 3.848107331097269, "grad_norm": 89.6385726928711, "learning_rate": 7.99935541295016e-09, "loss": 4.2172, "step": 3210 }, { "epoch": 3.8600862482031624, "grad_norm": 52.579227447509766, "learning_rate": 6.8512859449064705e-09, "loss": 3.1792, "step": 3220 }, { "epoch": 3.8720651653090563, "grad_norm": 159.96087646484375, "learning_rate": 5.791823125646522e-09, "loss": 4.6078, "step": 3230 }, { "epoch": 3.88404408241495, "grad_norm": 248.06581115722656, "learning_rate": 4.8210614782245866e-09, "loss": 4.6227, "step": 3240 }, { "epoch": 3.8960229995208433, "grad_norm": 10.945176124572754, "learning_rate": 3.939087611962377e-09, "loss": 1.8884, "step": 3250 }, { "epoch": 3.908001916626737, "grad_norm": 279.2950439453125, "learning_rate": 3.1459802147214554e-09, "loss": 3.5358, "step": 3260 }, { "epoch": 3.9199808337326307, "grad_norm": 278.5760192871094, "learning_rate": 2.441810045883175e-09, "loss": 4.0377, "step": 3270 }, { "epoch": 3.9319597508385242, "grad_norm": 20.70584487915039, "learning_rate": 1.8266399300355118e-09, "loss": 4.9587, "step": 3280 }, { "epoch": 3.9439386679444177, "grad_norm": 238.9534912109375, "learning_rate": 1.300524751368326e-09, "loss": 3.8602, "step": 3290 }, { "epoch": 3.9559175850503117, "grad_norm": 16.01015281677246, "learning_rate": 8.635114487760553e-10, "loss": 1.4575, "step": 3300 }, { "epoch": 3.967896502156205, "grad_norm": 14.110264778137207, "learning_rate": 5.156390116707321e-10, "loss": 1.1673, "step": 3310 }, { "epoch": 3.9798754192620986, "grad_norm": 43.93592834472656, "learning_rate": 2.56938476502655e-10, "loss": 3.8769, "step": 3320 }, { "epoch": 3.991854336367992, "grad_norm": 
9.19588851928711, "learning_rate": 8.743292399204793e-11, "loss": 2.4475, "step": 3330 } ], "logging_steps": 10, "max_steps": 3336, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.982695446856335e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }