{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3957, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.08993126249565843, "learning_rate": 5.050505050505052e-07, "loss": 1.9018, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.07331289037109069, "learning_rate": 2.5252525252525253e-06, "loss": 1.764, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.07765988729484478, "learning_rate": 5.050505050505051e-06, "loss": 1.6755, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.09328963425207142, "learning_rate": 7.5757575757575764e-06, "loss": 1.7942, "step": 15 }, { "epoch": 0.02, "grad_norm": 0.08439300726187475, "learning_rate": 1.0101010101010101e-05, "loss": 1.9255, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.09969799609843567, "learning_rate": 1.2626262626262628e-05, "loss": 1.6785, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.08101062126150285, "learning_rate": 1.5151515151515153e-05, "loss": 1.6021, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.09324937558599246, "learning_rate": 1.7676767676767676e-05, "loss": 1.8021, "step": 35 }, { "epoch": 0.03, "grad_norm": 0.08809158633106223, "learning_rate": 2.0202020202020203e-05, "loss": 1.8128, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.12040981734567617, "learning_rate": 2.272727272727273e-05, "loss": 1.9518, "step": 45 }, { "epoch": 0.04, "grad_norm": 0.10564260792758735, "learning_rate": 2.5252525252525256e-05, "loss": 1.844, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.11067902372614258, "learning_rate": 2.777777777777778e-05, "loss": 1.7309, "step": 55 }, { "epoch": 0.05, "grad_norm": 0.17208150270588693, "learning_rate": 3.0303030303030306e-05, "loss": 1.83, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.17753444313487116, "learning_rate": 3.282828282828283e-05, "loss": 1.7143, "step": 65 }, { "epoch": 0.05, "grad_norm": 0.11795959596262973, "learning_rate": 3.535353535353535e-05, "loss": 1.6863, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.1604849588266011, "learning_rate": 3.787878787878788e-05, "loss": 1.7895, "step": 75 }, { "epoch": 0.06, "grad_norm": 0.14614002357667696, "learning_rate": 4.0404040404040405e-05, "loss": 1.7037, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.17612584653207902, "learning_rate": 4.292929292929293e-05, "loss": 1.7624, "step": 85 }, { "epoch": 0.07, "grad_norm": 0.1725623148760258, "learning_rate": 4.545454545454546e-05, "loss": 1.7826, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.20023707797673052, "learning_rate": 4.797979797979798e-05, "loss": 1.7551, "step": 95 }, { "epoch": 0.08, "grad_norm": 0.19340080776803978, "learning_rate": 5.050505050505051e-05, "loss": 1.8434, "step": 100 }, { "epoch": 0.08, "grad_norm": 0.17540911026085398, "learning_rate": 5.303030303030303e-05, "loss": 1.8444, "step": 105 }, { "epoch": 0.08, "grad_norm": 0.17607693067428137, "learning_rate": 5.555555555555556e-05, "loss": 1.7179, "step": 110 }, { "epoch": 0.09, "grad_norm": 0.18623446289553894, "learning_rate": 5.808080808080808e-05, "loss": 1.8005, "step": 115 }, { "epoch": 0.09, "grad_norm": 0.22653423501586226, "learning_rate": 6.060606060606061e-05, "loss": 1.8171, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.19917898944232487, "learning_rate": 6.313131313131313e-05, "loss": 1.7935, "step": 125 }, { "epoch": 0.1, "grad_norm": 0.17977021019465064, "learning_rate": 6.565656565656566e-05, "loss": 1.847, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.21882213186080465, "learning_rate": 6.818181818181818e-05, "loss": 1.7249, "step": 135 }, { "epoch": 0.11, "grad_norm": 0.19872378885041136, "learning_rate": 7.07070707070707e-05, "loss": 1.8317, "step": 140 }, { "epoch": 0.11, "grad_norm": 0.18503126257452687, "learning_rate": 7.323232323232324e-05, "loss": 1.7036, "step": 145 }, { "epoch": 0.11, "grad_norm": 0.19374257378242796, "learning_rate": 7.575757575757576e-05, "loss": 1.7408, "step": 150 }, { "epoch": 0.12, "grad_norm": 0.20435751977610797, "learning_rate": 7.828282828282829e-05, "loss": 1.7453, "step": 155 }, { "epoch": 0.12, "grad_norm": 0.18626365580812038, "learning_rate": 8.080808080808081e-05, "loss": 1.7538, "step": 160 }, { "epoch": 0.13, "grad_norm": 0.22638414276196805, "learning_rate": 8.333333333333334e-05, "loss": 1.7755, "step": 165 }, { "epoch": 0.13, "grad_norm": 0.19644895370384188, "learning_rate": 8.585858585858586e-05, "loss": 1.8446, "step": 170 }, { "epoch": 0.13, "grad_norm": 0.19159413735540007, "learning_rate": 8.83838383838384e-05, "loss": 1.6323, "step": 175 }, { "epoch": 0.14, "grad_norm": 0.17020103839194523, "learning_rate": 9.090909090909092e-05, "loss": 1.7957, "step": 180 }, { "epoch": 0.14, "grad_norm": 0.19164694691999767, "learning_rate": 9.343434343434344e-05, "loss": 1.9204, "step": 185 }, { "epoch": 0.14, "grad_norm": 0.19378174604020243, "learning_rate": 9.595959595959596e-05, "loss": 1.6792, "step": 190 }, { "epoch": 0.15, "grad_norm": 0.24199163008867994, "learning_rate": 9.848484848484849e-05, "loss": 1.7818, "step": 195 }, { "epoch": 0.15, "grad_norm": 0.1791702851062047, "learning_rate": 0.00010101010101010102, "loss": 1.6407, "step": 200 }, { "epoch": 0.16, "grad_norm": 0.2127448005277486, "learning_rate": 0.00010353535353535353, "loss": 1.8173, "step": 205 }, { "epoch": 0.16, "grad_norm": 0.18625979651987537, "learning_rate": 0.00010606060606060606, "loss": 1.7401, "step": 210 }, { "epoch": 0.16, "grad_norm": 0.2602576963144457, "learning_rate": 0.0001085858585858586, "loss": 1.8104, "step": 215 }, { "epoch": 0.17, "grad_norm": 0.19387518149584881, "learning_rate": 0.00011111111111111112, "loss": 1.8442, "step": 220 }, { "epoch": 0.17, "grad_norm": 0.22413096563678928, "learning_rate": 0.00011363636363636365, "loss": 1.6172, "step": 225 }, { "epoch": 0.17, "grad_norm": 0.21913536165908545, "learning_rate": 0.00011616161616161616, "loss": 1.6973, "step": 230 }, { "epoch": 0.18, "grad_norm": 0.2083524734994055, "learning_rate": 0.00011868686868686869, "loss": 1.7593, "step": 235 }, { "epoch": 0.18, "grad_norm": 0.22803196006710846, "learning_rate": 0.00012121212121212122, "loss": 1.731, "step": 240 }, { "epoch": 0.19, "grad_norm": 0.16039461658504198, "learning_rate": 0.00012373737373737374, "loss": 1.5913, "step": 245 }, { "epoch": 0.19, "grad_norm": 0.2185859764067758, "learning_rate": 0.00012626262626262626, "loss": 1.637, "step": 250 }, { "epoch": 0.19, "grad_norm": 0.19449925000530618, "learning_rate": 0.00012878787878787878, "loss": 1.5634, "step": 255 }, { "epoch": 0.2, "grad_norm": 0.18094727231062543, "learning_rate": 0.00013131313131313133, "loss": 1.6769, "step": 260 }, { "epoch": 0.2, "grad_norm": 0.2362383395641708, "learning_rate": 0.00013383838383838385, "loss": 1.7723, "step": 265 }, { "epoch": 0.2, "grad_norm": 0.1756303905738309, "learning_rate": 0.00013636363636363637, "loss": 1.7622, "step": 270 }, { "epoch": 0.21, "grad_norm": 0.18784556886056825, "learning_rate": 0.0001388888888888889, "loss": 1.648, "step": 275 }, { "epoch": 0.21, "grad_norm": 0.23195176017229427, "learning_rate": 0.0001414141414141414, "loss": 1.846, "step": 280 }, { "epoch": 0.22, "grad_norm": 0.22241261139284105, "learning_rate": 0.00014393939393939396, "loss": 1.6282, "step": 285 }, { "epoch": 0.22, "grad_norm": 0.1959378752266171, "learning_rate": 0.00014646464646464648, "loss": 1.7298, "step": 290 }, { "epoch": 0.22, "grad_norm": 0.18110574265575713, "learning_rate": 0.000148989898989899, "loss": 1.6463, "step": 295 }, { "epoch": 0.23, "grad_norm": 0.19727075597861782, "learning_rate": 0.00015151515151515152, "loss": 1.7925, "step": 300 }, { "epoch": 0.23, "grad_norm": 0.20574697015902954, "learning_rate": 0.00015404040404040406, "loss": 1.6835, "step": 305 }, { "epoch": 0.24, "grad_norm": 0.18178501884804188, "learning_rate": 0.00015656565656565658, "loss": 1.8534, "step": 310 }, { "epoch": 0.24, "grad_norm": 0.20396286221213047, "learning_rate": 0.0001590909090909091, "loss": 1.9553, "step": 315 }, { "epoch": 0.24, "grad_norm": 0.19731656072570272, "learning_rate": 0.00016161616161616162, "loss": 1.7907, "step": 320 }, { "epoch": 0.25, "grad_norm": 0.15745281662564334, "learning_rate": 0.00016414141414141414, "loss": 1.7516, "step": 325 }, { "epoch": 0.25, "grad_norm": 0.17389045576146547, "learning_rate": 0.0001666666666666667, "loss": 1.6362, "step": 330 }, { "epoch": 0.25, "grad_norm": 0.2055099842458337, "learning_rate": 0.00016919191919191918, "loss": 1.711, "step": 335 }, { "epoch": 0.26, "grad_norm": 0.16967943859036833, "learning_rate": 0.00017171717171717173, "loss": 1.7327, "step": 340 }, { "epoch": 0.26, "grad_norm": 0.20493364192749108, "learning_rate": 0.00017424242424242425, "loss": 1.7575, "step": 345 }, { "epoch": 0.27, "grad_norm": 0.22713873700263487, "learning_rate": 0.0001767676767676768, "loss": 1.6266, "step": 350 }, { "epoch": 0.27, "grad_norm": 0.22661135493794904, "learning_rate": 0.00017929292929292931, "loss": 1.5738, "step": 355 }, { "epoch": 0.27, "grad_norm": 0.2181059846275241, "learning_rate": 0.00018181818181818183, "loss": 1.6742, "step": 360 }, { "epoch": 0.28, "grad_norm": 0.17088148508773793, "learning_rate": 0.00018434343434343435, "loss": 1.7483, "step": 365 }, { "epoch": 0.28, "grad_norm": 0.2533679574468662, "learning_rate": 0.00018686868686868687, "loss": 1.8377, "step": 370 }, { "epoch": 0.28, "grad_norm": 0.19728510430536142, "learning_rate": 0.00018939393939393942, "loss": 1.6412, "step": 375 }, { "epoch": 0.29, "grad_norm": 0.1846496893995934, "learning_rate": 0.00019191919191919191, "loss": 1.6605, "step": 380 }, { "epoch": 0.29, "grad_norm": 0.20677111282109845, "learning_rate": 0.00019444444444444446, "loss": 1.9151, "step": 385 }, { "epoch": 0.3, "grad_norm": 0.1843158891748435, "learning_rate": 0.00019696969696969698, "loss": 1.6697, "step": 390 }, { "epoch": 0.3, "grad_norm": 0.19889363775332344, "learning_rate": 0.0001994949494949495, "loss": 1.7293, "step": 395 }, { "epoch": 0.3, "grad_norm": 0.18003384908917786, "learning_rate": 0.00019999937734807612, "loss": 1.8024, "step": 400 }, { "epoch": 0.31, "grad_norm": 0.18260287569380637, "learning_rate": 0.00019999684783792443, "loss": 1.6779, "step": 405 }, { "epoch": 0.31, "grad_norm": 0.21622290040357123, "learning_rate": 0.00019999237260298072, "loss": 1.6577, "step": 410 }, { "epoch": 0.31, "grad_norm": 0.19728853094941184, "learning_rate": 0.00019998595173032347, "loss": 1.6211, "step": 415 }, { "epoch": 0.32, "grad_norm": 0.16427481018358323, "learning_rate": 0.00019997758534488915, "loss": 1.6793, "step": 420 }, { "epoch": 0.32, "grad_norm": 0.1464512551401983, "learning_rate": 0.00019996727360946972, "loss": 1.731, "step": 425 }, { "epoch": 0.33, "grad_norm": 0.1895744669006413, "learning_rate": 0.00019995501672470951, "loss": 1.7024, "step": 430 }, { "epoch": 0.33, "grad_norm": 0.13688692966034832, "learning_rate": 0.00019994081492910124, "loss": 1.8371, "step": 435 }, { "epoch": 0.33, "grad_norm": 0.20101726127225358, "learning_rate": 0.0001999246684989815, "loss": 1.7473, "step": 440 }, { "epoch": 0.34, "grad_norm": 0.20241522090213954, "learning_rate": 0.00019990657774852534, "loss": 1.7423, "step": 445 }, { "epoch": 0.34, "grad_norm": 0.1767592377256186, "learning_rate": 0.00019988654302974, "loss": 1.8304, "step": 450 }, { "epoch": 0.34, "grad_norm": 0.2373965969657545, "learning_rate": 0.00019986456473245826, "loss": 1.8509, "step": 455 }, { "epoch": 0.35, "grad_norm": 0.21300866974991087, "learning_rate": 0.00019984064328433084, "loss": 1.7339, "step": 460 }, { "epoch": 0.35, "grad_norm": 0.185425069119908, "learning_rate": 0.00019981477915081793, "loss": 1.7523, "step": 465 }, { "epoch": 0.36, "grad_norm": 0.1955299660793198, "learning_rate": 0.00019978697283518023, "loss": 1.804, "step": 470 }, { "epoch": 0.36, "grad_norm": 0.19829821726437152, "learning_rate": 0.00019975722487846918, "loss": 1.8378, "step": 475 }, { "epoch": 0.36, "grad_norm": 0.13816451624075418, "learning_rate": 0.0001997255358595164, "loss": 1.791, "step": 480 }, { "epoch": 0.37, "grad_norm": 0.17416550139224937, "learning_rate": 0.00019969190639492244, "loss": 1.6882, "step": 485 }, { "epoch": 0.37, "grad_norm": 0.19361009276270708, "learning_rate": 0.00019965633713904472, "loss": 1.7448, "step": 490 }, { "epoch": 0.38, "grad_norm": 0.20655351119978135, "learning_rate": 0.00019961882878398492, "loss": 1.7804, "step": 495 }, { "epoch": 0.38, "grad_norm": 0.2104318907698028, "learning_rate": 0.0001995793820595754, "loss": 1.7399, "step": 500 }, { "epoch": 0.38, "grad_norm": 0.1970506865196183, "learning_rate": 0.00019953799773336507, "loss": 1.662, "step": 505 }, { "epoch": 0.39, "grad_norm": 0.12911497323739385, "learning_rate": 0.00019949467661060433, "loss": 1.6589, "step": 510 }, { "epoch": 0.39, "grad_norm": 0.18727055645023982, "learning_rate": 0.00019944941953422968, "loss": 1.7437, "step": 515 }, { "epoch": 0.39, "grad_norm": 0.21063285499774953, "learning_rate": 0.000199402227384847, "loss": 1.837, "step": 520 }, { "epoch": 0.4, "grad_norm": 0.16895692207829008, "learning_rate": 0.00019935310108071453, "loss": 1.7406, "step": 525 }, { "epoch": 0.4, "grad_norm": 0.2316031917603028, "learning_rate": 0.00019930204157772515, "loss": 1.8237, "step": 530 }, { "epoch": 0.41, "grad_norm": 0.14077975973845075, "learning_rate": 0.00019924904986938754, "loss": 1.8804, "step": 535 }, { "epoch": 0.41, "grad_norm": 0.25152961069767266, "learning_rate": 0.000199194126986807, "loss": 1.7984, "step": 540 }, { "epoch": 0.41, "grad_norm": 0.18475665649785333, "learning_rate": 0.00019913727399866545, "loss": 1.7, "step": 545 }, { "epoch": 0.42, "grad_norm": 0.15993162566307856, "learning_rate": 0.00019907849201120033, "loss": 1.8694, "step": 550 }, { "epoch": 0.42, "grad_norm": 0.21887423989587396, "learning_rate": 0.00019901778216818345, "loss": 1.699, "step": 555 }, { "epoch": 0.42, "grad_norm": 0.18385555657977046, "learning_rate": 0.00019895514565089855, "loss": 1.7936, "step": 560 }, { "epoch": 0.43, "grad_norm": 0.15762946661816535, "learning_rate": 0.00019889058367811822, "loss": 1.6613, "step": 565 }, { "epoch": 0.43, "grad_norm": 0.19204775302628793, "learning_rate": 0.0001988240975060804, "loss": 1.5856, "step": 570 }, { "epoch": 0.44, "grad_norm": 0.1697199863146858, "learning_rate": 0.00019875568842846382, "loss": 1.672, "step": 575 }, { "epoch": 0.44, "grad_norm": 0.1410887592852674, "learning_rate": 0.0001986853577763628, "loss": 1.6269, "step": 580 }, { "epoch": 0.44, "grad_norm": 0.1783222763204088, "learning_rate": 0.00019861310691826143, "loss": 1.8029, "step": 585 }, { "epoch": 0.45, "grad_norm": 0.20484278901882244, "learning_rate": 0.00019853893726000683, "loss": 1.6194, "step": 590 }, { "epoch": 0.45, "grad_norm": 0.1808969694192384, "learning_rate": 0.00019846285024478202, "loss": 1.7084, "step": 595 }, { "epoch": 0.45, "grad_norm": 0.1965951187170914, "learning_rate": 0.00019838484735307748, "loss": 1.706, "step": 600 }, { "epoch": 0.46, "grad_norm": 0.1555012346720015, "learning_rate": 0.0001983049301026627, "loss": 1.464, "step": 605 }, { "epoch": 0.46, "grad_norm": 0.2229027944987823, "learning_rate": 0.00019822310004855652, "loss": 1.673, "step": 610 }, { "epoch": 0.47, "grad_norm": 0.169635050611861, "learning_rate": 0.00019813935878299662, "loss": 1.6593, "step": 615 }, { "epoch": 0.47, "grad_norm": 0.16624303946845476, "learning_rate": 0.0001980537079354091, "loss": 1.7164, "step": 620 }, { "epoch": 0.47, "grad_norm": 0.1544208624543807, "learning_rate": 0.00019796614917237616, "loss": 1.5616, "step": 625 }, { "epoch": 0.48, "grad_norm": 0.1900272509930039, "learning_rate": 0.00019787668419760408, "loss": 1.6552, "step": 630 }, { "epoch": 0.48, "grad_norm": 0.20362594606792483, "learning_rate": 0.00019778531475188996, "loss": 1.7175, "step": 635 }, { "epoch": 0.49, "grad_norm": 0.15933464850430776, "learning_rate": 0.00019769204261308774, "loss": 1.6599, "step": 640 }, { "epoch": 0.49, "grad_norm": 0.15846354449923994, "learning_rate": 0.00019759686959607383, "loss": 1.7152, "step": 645 }, { "epoch": 0.49, "grad_norm": 0.1731064728813603, "learning_rate": 0.00019749979755271155, "loss": 1.8006, "step": 650 }, { "epoch": 0.5, "grad_norm": 0.2036118054344575, "learning_rate": 0.00019740082837181526, "loss": 1.6992, "step": 655 }, { "epoch": 0.5, "grad_norm": 0.20595935892977982, "learning_rate": 0.00019729996397911356, "loss": 1.7571, "step": 660 }, { "epoch": 0.5, "grad_norm": 0.1816167430276872, "learning_rate": 0.00019719720633721178, "loss": 1.8058, "step": 665 }, { "epoch": 0.51, "grad_norm": 0.2139611683255453, "learning_rate": 0.00019709255744555389, "loss": 1.8398, "step": 670 }, { "epoch": 0.51, "grad_norm": 0.18514013236898805, "learning_rate": 0.0001969860193403835, "loss": 1.7307, "step": 675 }, { "epoch": 0.52, "grad_norm": 0.17288244809213096, "learning_rate": 0.00019687759409470426, "loss": 1.7242, "step": 680 }, { "epoch": 0.52, "grad_norm": 0.15953349037713735, "learning_rate": 0.00019676728381823956, "loss": 1.6435, "step": 685 }, { "epoch": 0.52, "grad_norm": 0.20963390389942183, "learning_rate": 0.00019665509065739149, "loss": 1.6791, "step": 690 }, { "epoch": 0.53, "grad_norm": 0.19075148330166494, "learning_rate": 0.000196541016795199, "loss": 1.505, "step": 695 }, { "epoch": 0.53, "grad_norm": 0.22817672978454195, "learning_rate": 0.00019642506445129545, "loss": 1.8361, "step": 700 }, { "epoch": 0.53, "grad_norm": 0.1925013343867196, "learning_rate": 0.00019630723588186545, "loss": 1.7126, "step": 705 }, { "epoch": 0.54, "grad_norm": 0.16780528759294142, "learning_rate": 0.000196187533379601, "loss": 1.6649, "step": 710 }, { "epoch": 0.54, "grad_norm": 0.17707927803137202, "learning_rate": 0.00019606595927365675, "loss": 1.6551, "step": 715 }, { "epoch": 0.55, "grad_norm": 0.22525846033337887, "learning_rate": 0.00019594251592960479, "loss": 1.7401, "step": 720 }, { "epoch": 0.55, "grad_norm": 0.1953310514707257, "learning_rate": 0.0001958172057493886, "loss": 1.6944, "step": 725 }, { "epoch": 0.55, "grad_norm": 0.2085121645512001, "learning_rate": 0.0001956900311712763, "loss": 1.663, "step": 730 }, { "epoch": 0.56, "grad_norm": 0.17093646250615369, "learning_rate": 0.0001955609946698131, "loss": 1.772, "step": 735 }, { "epoch": 0.56, "grad_norm": 0.19564116222725914, "learning_rate": 0.00019543009875577346, "loss": 1.6328, "step": 740 }, { "epoch": 0.56, "grad_norm": 0.215195812549034, "learning_rate": 0.0001952973459761118, "loss": 1.6438, "step": 745 }, { "epoch": 0.57, "grad_norm": 0.19377558972597342, "learning_rate": 0.0001951627389139134, "loss": 1.7442, "step": 750 }, { "epoch": 0.57, "grad_norm": 0.1792011980095539, "learning_rate": 0.00019502628018834372, "loss": 1.7518, "step": 755 }, { "epoch": 0.58, "grad_norm": 0.18977603295326154, "learning_rate": 0.00019488797245459773, "loss": 1.688, "step": 760 }, { "epoch": 0.58, "grad_norm": 0.19341540153355985, "learning_rate": 0.00019474781840384816, "loss": 1.7562, "step": 765 }, { "epoch": 0.58, "grad_norm": 0.14738398424312027, "learning_rate": 0.00019460582076319302, "loss": 1.7244, "step": 770 }, { "epoch": 0.59, "grad_norm": 0.1496446026997031, "learning_rate": 0.00019446198229560276, "loss": 1.7083, "step": 775 }, { "epoch": 0.59, "grad_norm": 0.2151992641933425, "learning_rate": 0.00019431630579986632, "loss": 1.7078, "step": 780 }, { "epoch": 0.6, "grad_norm": 0.1972075447483379, "learning_rate": 0.00019416879411053673, "loss": 1.7665, "step": 785 }, { "epoch": 0.6, "grad_norm": 0.20871968848692934, "learning_rate": 0.00019401945009787594, "loss": 1.6636, "step": 790 }, { "epoch": 0.6, "grad_norm": 0.2047491094137733, "learning_rate": 0.0001938682766677991, "loss": 1.8061, "step": 795 }, { "epoch": 0.61, "grad_norm": 0.1622522396758859, "learning_rate": 0.00019371527676181777, "loss": 1.8645, "step": 800 }, { "epoch": 0.61, "grad_norm": 0.1714969472958251, "learning_rate": 0.00019356045335698296, "loss": 1.8266, "step": 805 }, { "epoch": 0.61, "grad_norm": 0.1971306915514917, "learning_rate": 0.00019340380946582695, "loss": 1.7205, "step": 810 }, { "epoch": 0.62, "grad_norm": 0.17020111323913545, "learning_rate": 0.00019324534813630487, "loss": 1.7339, "step": 815 }, { "epoch": 0.62, "grad_norm": 0.18250825908624654, "learning_rate": 0.00019308507245173527, "loss": 1.5188, "step": 820 }, { "epoch": 0.63, "grad_norm": 0.18593221945740382, "learning_rate": 0.0001929229855307402, "loss": 1.654, "step": 825 }, { "epoch": 0.63, "grad_norm": 0.22850385689556876, "learning_rate": 0.00019275909052718447, "loss": 1.7814, "step": 830 }, { "epoch": 0.63, "grad_norm": 0.19759950903326942, "learning_rate": 0.00019259339063011432, "loss": 1.744, "step": 835 }, { "epoch": 0.64, "grad_norm": 0.17215997030755548, "learning_rate": 0.00019242588906369536, "loss": 1.8283, "step": 840 }, { "epoch": 0.64, "grad_norm": 0.19136317284315416, "learning_rate": 0.00019225658908714983, "loss": 1.6163, "step": 845 }, { "epoch": 0.64, "grad_norm": 0.254426335434924, "learning_rate": 0.00019208549399469318, "loss": 1.7618, "step": 850 }, { "epoch": 0.65, "grad_norm": 0.21881435842952657, "learning_rate": 0.00019191260711547001, "loss": 1.7315, "step": 855 }, { "epoch": 0.65, "grad_norm": 0.20799528199612635, "learning_rate": 0.0001917379318134892, "loss": 1.7859, "step": 860 }, { "epoch": 0.66, "grad_norm": 0.17796834357588534, "learning_rate": 0.00019156147148755855, "loss": 1.7345, "step": 865 }, { "epoch": 0.66, "grad_norm": 0.1855849493493474, "learning_rate": 0.0001913832295712186, "loss": 1.6232, "step": 870 }, { "epoch": 0.66, "grad_norm": 0.20017349406812152, "learning_rate": 0.00019120320953267586, "loss": 1.7546, "step": 875 }, { "epoch": 0.67, "grad_norm": 0.2146332192452092, "learning_rate": 0.0001910214148747352, "loss": 1.6231, "step": 880 }, { "epoch": 0.67, "grad_norm": 0.15896122862532144, "learning_rate": 0.0001908378491347319, "loss": 1.5535, "step": 885 }, { "epoch": 0.67, "grad_norm": 0.17416174476856394, "learning_rate": 0.00019065251588446265, "loss": 1.6337, "step": 890 }, { "epoch": 0.68, "grad_norm": 0.23962933659259386, "learning_rate": 0.0001904654187301161, "loss": 1.8581, "step": 895 }, { "epoch": 0.68, "grad_norm": 0.17002626630746845, "learning_rate": 0.0001902765613122028, "loss": 1.6537, "step": 900 }, { "epoch": 0.69, "grad_norm": 0.23553588791103638, "learning_rate": 0.0001900859473054841, "loss": 1.7497, "step": 905 }, { "epoch": 0.69, "grad_norm": 0.17184692025828147, "learning_rate": 0.00018989358041890094, "loss": 1.6305, "step": 910 }, { "epoch": 0.69, "grad_norm": 0.19276600455036005, "learning_rate": 0.00018969946439550148, "loss": 1.6965, "step": 915 }, { "epoch": 0.7, "grad_norm": 0.2266174277702017, "learning_rate": 0.0001895036030123684, "loss": 1.7845, "step": 920 }, { "epoch": 0.7, "grad_norm": 0.15948175213103422, "learning_rate": 0.0001893060000805453, "loss": 1.582, "step": 925 }, { "epoch": 0.71, "grad_norm": 0.20415510379076665, "learning_rate": 0.00018910665944496264, "loss": 1.6576, "step": 930 }, { "epoch": 0.71, "grad_norm": 0.20826411615417578, "learning_rate": 0.00018890558498436282, "loss": 1.7243, "step": 935 }, { "epoch": 0.71, "grad_norm": 0.2535182644413305, "learning_rate": 0.00018870278061122484, "loss": 1.5795, "step": 940 }, { "epoch": 0.72, "grad_norm": 0.17063517897800512, "learning_rate": 0.00018849825027168803, "loss": 1.6361, "step": 945 }, { "epoch": 0.72, "grad_norm": 0.1898841623155248, "learning_rate": 0.00018829199794547535, "loss": 1.7526, "step": 950 }, { "epoch": 0.72, "grad_norm": 0.19033639448531828, "learning_rate": 0.00018808402764581596, "loss": 1.5943, "step": 955 }, { "epoch": 0.73, "grad_norm": 0.1647576077524525, "learning_rate": 0.0001878743434193671, "loss": 1.7575, "step": 960 }, { "epoch": 0.73, "grad_norm": 0.2070226518164384, "learning_rate": 0.00018766294934613535, "loss": 1.741, "step": 965 }, { "epoch": 0.74, "grad_norm": 0.21633633400820462, "learning_rate": 0.00018744984953939726, "loss": 1.6967, "step": 970 }, { "epoch": 0.74, "grad_norm": 0.2039504411965307, "learning_rate": 0.0001872350481456193, "loss": 1.6825, "step": 975 }, { "epoch": 0.74, "grad_norm": 0.19382809212719235, "learning_rate": 0.0001870185493443772, "loss": 1.7494, "step": 980 }, { "epoch": 0.75, "grad_norm": 0.17836311560595738, "learning_rate": 0.0001868003573482746, "loss": 1.6326, "step": 985 }, { "epoch": 0.75, "grad_norm": 0.18940985276826594, "learning_rate": 0.0001865804764028611, "loss": 1.6823, "step": 990 }, { "epoch": 0.75, "grad_norm": 0.15827883706638377, "learning_rate": 0.0001863589107865496, "loss": 1.8507, "step": 995 }, { "epoch": 0.76, "grad_norm": 0.2024112582787964, "learning_rate": 0.00018613566481053315, "loss": 1.6737, "step": 1000 }, { "epoch": 0.76, "grad_norm": 0.18631332115379975, "learning_rate": 0.00018591074281870099, "loss": 1.6391, "step": 1005 }, { "epoch": 0.77, "grad_norm": 0.2322177223268837, "learning_rate": 0.00018568414918755397, "loss": 1.7185, "step": 1010 }, { "epoch": 0.77, "grad_norm": 0.19585063603806546, "learning_rate": 0.00018545588832611956, "loss": 1.8829, "step": 1015 }, { "epoch": 0.77, "grad_norm": 0.19046268057109556, "learning_rate": 0.00018522596467586598, "loss": 1.6889, "step": 1020 }, { "epoch": 0.78, "grad_norm": 0.2319521660184869, "learning_rate": 0.00018499438271061568, "loss": 1.7148, "step": 1025 }, { "epoch": 0.78, "grad_norm": 0.18401426887501984, "learning_rate": 0.0001847611469364584, "loss": 1.6355, "step": 1030 }, { "epoch": 0.78, "grad_norm": 0.16467673844089234, "learning_rate": 0.00018452626189166345, "loss": 1.5748, "step": 1035 }, { "epoch": 0.79, "grad_norm": 0.21515271715044545, "learning_rate": 0.0001842897321465915, "loss": 1.7172, "step": 1040 }, { "epoch": 0.79, "grad_norm": 0.20010536585072475, "learning_rate": 0.0001840515623036055, "loss": 1.7331, "step": 1045 }, { "epoch": 0.8, "grad_norm": 0.15220183718369495, "learning_rate": 0.0001838117569969812, "loss": 1.7703, "step": 1050 }, { "epoch": 0.8, "grad_norm": 0.19249950248721495, "learning_rate": 0.00018357032089281702, "loss": 1.7356, "step": 1055 }, { "epoch": 0.8, "grad_norm": 0.15685889188495356, "learning_rate": 0.00018332725868894313, "loss": 1.5789, "step": 1060 }, { "epoch": 0.81, "grad_norm": 0.22123166856945198, "learning_rate": 0.00018308257511483018, "loss": 1.7449, "step": 1065 }, { "epoch": 0.81, "grad_norm": 0.21921162541787237, "learning_rate": 0.00018283627493149721, "loss": 1.592, "step": 1070 }, { "epoch": 0.82, "grad_norm": 0.15892072068340937, "learning_rate": 0.00018258836293141907, "loss": 1.6588, "step": 1075 }, { "epoch": 0.82, "grad_norm": 0.2129268440643301, "learning_rate": 0.000182338843938433, "loss": 1.6687, "step": 1080 }, { "epoch": 0.82, "grad_norm": 0.18558886049158316, "learning_rate": 0.000182087722807645, "loss": 1.6204, "step": 1085 }, { "epoch": 0.83, "grad_norm": 0.21759739469279235, "learning_rate": 0.00018183500442533514, "loss": 1.7012, "step": 1090 }, { "epoch": 0.83, "grad_norm": 0.16739812153050462, "learning_rate": 0.00018158069370886266, "loss": 1.7749, "step": 1095 }, { "epoch": 0.83, "grad_norm": 0.2120028175464506, "learning_rate": 0.0001813247956065702, "loss": 1.7076, "step": 1100 }, { "epoch": 0.84, "grad_norm": 0.21506301830058508, "learning_rate": 0.00018106731509768753, "loss": 1.6561, "step": 1105 }, { "epoch": 0.84, "grad_norm": 0.21374007008875692, "learning_rate": 0.00018080825719223468, "loss": 1.7721, "step": 1110 }, { "epoch": 0.85, "grad_norm": 0.21473556112453085, "learning_rate": 0.00018054762693092444, "loss": 1.5391, "step": 1115 }, { "epoch": 0.85, "grad_norm": 0.1928094864305794, "learning_rate": 0.00018028542938506426, "loss": 1.7297, "step": 1120 }, { "epoch": 0.85, "grad_norm": 0.22195616352181186, "learning_rate": 0.0001800216696564576, "loss": 1.6239, "step": 1125 }, { "epoch": 0.86, "grad_norm": 0.2493704349381919, "learning_rate": 0.00017975635287730473, "loss": 1.7736, "step": 1130 }, { "epoch": 0.86, "grad_norm": 0.1871166430212898, "learning_rate": 0.00017948948421010264, "loss": 1.67, "step": 1135 }, { "epoch": 0.86, "grad_norm": 0.16460126336549072, "learning_rate": 0.00017922106884754488, "loss": 1.7331, "step": 1140 }, { "epoch": 0.87, "grad_norm": 0.18707990225784327, "learning_rate": 0.0001789511120124203, "loss": 1.5608, "step": 1145 }, { "epoch": 0.87, "grad_norm": 0.21751770239029078, "learning_rate": 0.00017867961895751163, "loss": 1.721, "step": 1150 }, { "epoch": 0.88, "grad_norm": 0.1674742118307801, "learning_rate": 0.00017840659496549298, "loss": 1.7339, "step": 1155 }, { "epoch": 0.88, "grad_norm": 0.19173527103482793, "learning_rate": 0.00017813204534882738, "loss": 1.7348, "step": 1160 }, { "epoch": 0.88, "grad_norm": 0.18468049289167895, "learning_rate": 0.0001778559754496631, "loss": 1.6823, "step": 1165 }, { "epoch": 0.89, "grad_norm": 0.2069730744593729, "learning_rate": 0.00017757839063972997, "loss": 1.8253, "step": 1170 }, { "epoch": 0.89, "grad_norm": 0.2139312404074137, "learning_rate": 0.00017729929632023472, "loss": 1.7013, "step": 1175 }, { "epoch": 0.89, "grad_norm": 0.1764736094502213, "learning_rate": 0.00017701869792175593, "loss": 1.8235, "step": 1180 }, { "epoch": 0.9, "grad_norm": 0.21944309103277923, "learning_rate": 0.00017673660090413823, "loss": 1.8237, "step": 1185 }, { "epoch": 0.9, "grad_norm": 0.20268987883171422, "learning_rate": 0.00017645301075638634, "loss": 1.6992, "step": 1190 }, { "epoch": 0.91, "grad_norm": 0.19400968339090352, "learning_rate": 0.00017616793299655794, "loss": 1.8662, "step": 1195 }, { "epoch": 0.91, "grad_norm": 0.18489832863809344, "learning_rate": 0.00017588137317165657, "loss": 1.6986, "step": 1200 }, { "epoch": 0.91, "grad_norm": 0.17738333103257395, "learning_rate": 0.0001755933368575235, "loss": 1.6783, "step": 1205 }, { "epoch": 0.92, "grad_norm": 0.17926192606119037, "learning_rate": 0.0001753038296587294, "loss": 1.7627, "step": 1210 }, { "epoch": 0.92, "grad_norm": 0.20194075183870522, "learning_rate": 0.00017501285720846523, "loss": 1.7846, "step": 1215 }, { "epoch": 0.92, "grad_norm": 0.19331786071311133, "learning_rate": 0.0001747204251684325, "loss": 1.7143, "step": 1220 }, { "epoch": 0.93, "grad_norm": 0.23530188097310437, "learning_rate": 0.00017442653922873327, "loss": 1.7296, "step": 1225 }, { "epoch": 0.93, "grad_norm": 0.17594594764152405, "learning_rate": 0.0001741312051077594, "loss": 1.7335, "step": 1230 }, { "epoch": 0.94, "grad_norm": 0.20934249136020208, "learning_rate": 0.00017383442855208124, "loss": 1.6646, "step": 1235 }, { "epoch": 0.94, "grad_norm": 0.2111005617028846, "learning_rate": 0.00017353621533633583, "loss": 1.5756, "step": 1240 }, { "epoch": 0.94, "grad_norm": 0.21413727626671644, "learning_rate": 0.00017323657126311454, "loss": 1.4917, "step": 1245 }, { "epoch": 0.95, "grad_norm": 0.2391299536210697, "learning_rate": 0.0001729355021628502, "loss": 1.7283, "step": 1250 }, { "epoch": 0.95, "grad_norm": 0.19381232926045663, "learning_rate": 0.00017263301389370362, "loss": 1.7907, "step": 1255 }, { "epoch": 0.96, "grad_norm": 0.21223075585900172, "learning_rate": 0.0001723291123414495, "loss": 1.7412, "step": 1260 }, { "epoch": 0.96, "grad_norm": 0.18560634331207926, "learning_rate": 0.00017202380341936212, "loss": 1.7287, "step": 1265 }, { "epoch": 0.96, "grad_norm": 0.18941317978765862, "learning_rate": 0.00017171709306810012, "loss": 1.5956, "step": 1270 }, { "epoch": 0.97, "grad_norm": 0.17108900888850623, "learning_rate": 0.000171408987255591, "loss": 1.7789, "step": 1275 }, { "epoch": 0.97, "grad_norm": 0.19233373904164977, "learning_rate": 0.00017109949197691485, "loss": 1.7397, "step": 1280 }, { "epoch": 0.97, "grad_norm": 0.1697480170006848, "learning_rate": 0.00017078861325418797, "loss": 1.5765, "step": 1285 }, { "epoch": 0.98, "grad_norm": 0.17575403691888572, "learning_rate": 0.00017047635713644528, "loss": 1.8137, "step": 1290 }, { "epoch": 0.98, "grad_norm": 0.19700603487956356, "learning_rate": 0.00017016272969952304, "loss": 1.8248, "step": 1295 }, { "epoch": 0.99, "grad_norm": 0.25577968967800774, "learning_rate": 0.0001698477370459405, "loss": 1.5227, "step": 1300 }, { "epoch": 0.99, "grad_norm": 0.21182493743068362, "learning_rate": 0.00016953138530478092, "loss": 1.6463, "step": 1305 }, { "epoch": 0.99, "grad_norm": 0.24187008234174068, "learning_rate": 0.0001692136806315726, "loss": 1.677, "step": 1310 }, { "epoch": 1.0, "grad_norm": 0.23079613874981772, "learning_rate": 0.00016889462920816902, "loss": 1.6987, "step": 1315 }, { "epoch": 1.0, "grad_norm": 0.18959747421576906, "learning_rate": 0.00016857423724262849, "loss": 1.6143, "step": 1320 }, { "epoch": 1.0, "grad_norm": 0.19193767521915664, "learning_rate": 0.00016825251096909343, "loss": 1.6523, "step": 1325 }, { "epoch": 1.01, "grad_norm": 0.1851789336505185, "learning_rate": 0.00016792945664766907, "loss": 1.5728, "step": 1330 }, { "epoch": 1.01, "grad_norm": 0.14492627661875204, "learning_rate": 0.00016760508056430152, "loss": 1.5701, "step": 1335 }, { "epoch": 1.02, "grad_norm": 0.2700845196747031, "learning_rate": 0.0001672793890306556, "loss": 1.8245, "step": 1340 }, { "epoch": 1.02, "grad_norm": 0.1983440671335701, "learning_rate": 0.00016695238838399206, "loss": 1.7108, "step": 1345 }, { "epoch": 1.02, "grad_norm": 0.17701113866794518, "learning_rate": 0.0001666240849870441, "loss": 1.5517, "step": 1350 }, { "epoch": 1.03, "grad_norm": 0.16944238367848802, "learning_rate": 0.0001662944852278936, "loss": 1.7263, "step": 1355 }, { "epoch": 1.03, "grad_norm": 0.20201061964568917, "learning_rate": 0.00016596359551984704, "loss": 1.6212, "step": 1360 }, { "epoch": 1.03, "grad_norm": 0.16272112017898177, "learning_rate": 0.0001656314223013104, "loss": 1.6557, "step": 1365 }, { "epoch": 1.04, "grad_norm": 0.2050184080142653, "learning_rate": 0.00016529797203566405, "loss": 1.6203, "step": 1370 }, { "epoch": 1.04, "grad_norm": 0.18868029622446703, "learning_rate": 0.00016496325121113706, "loss": 1.5994, "step": 1375 }, { "epoch": 1.05, "grad_norm": 0.18530725289838731, "learning_rate": 0.00016462726634068075, "loss": 1.661, "step": 1380 }, { "epoch": 1.05, "grad_norm": 0.22254932266214475, "learning_rate": 0.00016429002396184215, "loss": 1.5779, "step": 1385 }, { "epoch": 1.05, "grad_norm": 0.35454879952816054, "learning_rate": 0.00016395153063663667, "loss": 1.4926, "step": 1390 }, { "epoch": 1.06, "grad_norm": 0.2083539962991777, "learning_rate": 0.00016361179295142046, "loss": 1.668, "step": 1395 }, { "epoch": 1.06, "grad_norm": 0.20105783428150303, "learning_rate": 0.00016327081751676227, "loss": 1.7475, "step": 1400 }, { "epoch": 1.07, "grad_norm": 0.19073307130103012, "learning_rate": 0.0001629286109673148, "loss": 1.6726, "step": 1405 }, { "epoch": 1.07, "grad_norm": 0.21132776602726958, "learning_rate": 0.00016258517996168564, "loss": 1.745, "step": 1410 }, { "epoch": 1.07, "grad_norm": 0.23336177448110548, "learning_rate": 0.0001622405311823076, "loss": 1.7185, "step": 1415 }, { "epoch": 1.08, "grad_norm": 0.19045792239686193, "learning_rate": 0.00016189467133530884, "loss": 1.6369, "step": 1420 }, { "epoch": 1.08, "grad_norm": 0.1470674402518224, "learning_rate": 0.0001615476071503823, "loss": 1.6593, "step": 1425 }, { "epoch": 1.08, "grad_norm": 0.1895764202504411, "learning_rate": 0.0001611993453806547, "loss": 1.5879, "step": 1430 }, { "epoch": 1.09, "grad_norm": 0.21781610564885606, "learning_rate": 0.0001608498928025553, "loss": 1.6377, "step": 1435 }, { "epoch": 1.09, "grad_norm": 0.21082770226036582, "learning_rate": 0.00016049925621568382, "loss": 1.5626, "step": 1440 }, { "epoch": 1.1, "grad_norm": 0.2288377931408156, "learning_rate": 0.00016014744244267833, "loss": 1.7531, "step": 1445 }, { "epoch": 1.1, "grad_norm": 0.1822265551052057, "learning_rate": 0.00015979445832908242, "loss": 1.691, "step": 1450 }, { "epoch": 1.1, "grad_norm": 0.23100268355259115, "learning_rate": 0.00015944031074321204, "loss": 1.7622, "step": 1455 }, { "epoch": 1.11, "grad_norm": 0.18624779842903288, "learning_rate": 0.00015908500657602174, "loss": 1.5919, "step": 1460 }, { "epoch": 1.11, "grad_norm": 0.20357926913176824, "learning_rate": 0.0001587285527409707, "loss": 1.6288, "step": 1465 }, { "epoch": 1.11, "grad_norm": 0.20919686630022472, "learning_rate": 0.00015837095617388827, "loss": 1.6705, "step": 1470 }, { "epoch": 1.12, "grad_norm": 0.1993582841062667, "learning_rate": 0.0001580122238328387, "loss": 1.6516, "step": 1475 }, { "epoch": 1.12, "grad_norm": 0.2547942602076731, "learning_rate": 0.00015765236269798627, "loss": 1.5036, "step": 1480 }, { "epoch": 1.13, "grad_norm": 0.1807424509361345, "learning_rate": 0.00015729137977145893, "loss": 1.6089, "step": 1485 }, { "epoch": 1.13, "grad_norm": 0.18264437292208377, "learning_rate": 0.0001569292820772124, "loss": 1.7353, "step": 1490 }, { "epoch": 1.13, "grad_norm": 0.21311009253554458, "learning_rate": 0.00015656607666089334, "loss": 1.6574, "step": 1495 }, { "epoch": 1.14, "grad_norm": 0.18453680363642788, "learning_rate": 0.0001562017705897024, "loss": 1.5736, "step": 1500 }, { "epoch": 1.14, "grad_norm": 0.23644760312940358, "learning_rate": 0.00015583637095225656, "loss": 1.7076, "step": 1505 }, { "epoch": 1.14, "grad_norm": 0.19899933139163767, "learning_rate": 0.00015546988485845125, "loss": 1.665, "step": 1510 }, { "epoch": 1.15, "grad_norm": 0.23202505382527974, "learning_rate": 0.0001551023194393221, "loss": 1.7191, "step": 1515 }, { "epoch": 1.15, "grad_norm": 0.21073033640879407, "learning_rate": 0.00015473368184690597, "loss": 1.6123, "step": 1520 }, { "epoch": 1.16, "grad_norm": 0.22019036120363472, "learning_rate": 0.00015436397925410201, "loss": 1.6909, "step": 1525 }, { "epoch": 1.16, "grad_norm": 0.20817813902248655, "learning_rate": 0.00015399321885453202, "loss": 1.7648, "step": 1530 }, { "epoch": 1.16, "grad_norm": 0.21714232280510767, "learning_rate": 0.00015362140786240035, "loss": 1.6718, "step": 1535 }, { "epoch": 1.17, "grad_norm": 0.20478633851375716, "learning_rate": 0.00015324855351235372, "loss": 1.7586, "step": 1540 }, { "epoch": 1.17, "grad_norm": 0.19046880552839732, "learning_rate": 0.00015287466305934037, "loss": 1.695, "step": 1545 }, { "epoch": 1.18, "grad_norm": 0.23309832393442634, "learning_rate": 0.0001524997437784689, "loss": 1.584, "step": 1550 }, { "epoch": 1.18, "grad_norm": 0.23887396172176847, "learning_rate": 0.00015212380296486652, "loss": 1.5742, "step": 1555 }, { "epoch": 1.18, "grad_norm": 0.18128179052277552, "learning_rate": 0.0001517468479335376, "loss": 1.6802, "step": 1560 }, { "epoch": 1.19, "grad_norm": 0.22086322507654135, "learning_rate": 0.00015136888601922072, "loss": 1.7222, "step": 1565 }, { "epoch": 1.19, "grad_norm": 0.18870219517815454, "learning_rate": 0.0001509899245762464, "loss": 1.5664, "step": 1570 }, { "epoch": 1.19, "grad_norm": 0.2276718864826248, "learning_rate": 0.00015060997097839386, "loss": 1.7565, "step": 1575 }, { "epoch": 1.2, "grad_norm": 0.20329327239158157, "learning_rate": 0.00015022903261874748, "loss": 1.6774, "step": 1580 }, { "epoch": 1.2, "grad_norm": 0.18898688137814482, "learning_rate": 0.00014984711690955297, "loss": 1.6518, "step": 1585 }, { "epoch": 1.21, "grad_norm": 0.22865474882055875, "learning_rate": 0.00014946423128207322, "loss": 1.7247, "step": 1590 }, { "epoch": 1.21, "grad_norm": 0.21027592834116465, "learning_rate": 0.00014908038318644373, "loss": 1.7849, "step": 1595 }, { "epoch": 1.21, "grad_norm": 0.20948671991840284, "learning_rate": 0.0001486955800915274, "loss": 1.5386, "step": 1600 }, { "epoch": 1.22, "grad_norm": 0.21227729763658884, "learning_rate": 0.0001483098294847695, "loss": 1.602, "step": 1605 }, { "epoch": 1.22, "grad_norm": 0.21630672435558576, "learning_rate": 0.00014792313887205182, "loss": 1.6772, "step": 1610 }, { "epoch": 1.22, "grad_norm": 0.21541507503873228, "learning_rate": 0.00014753551577754664, "loss": 1.6862, "step": 1615 }, { "epoch": 1.23, "grad_norm": 0.2480903001762983, "learning_rate": 0.0001471469677435704, "loss": 1.5916, "step": 1620 }, { "epoch": 1.23, "grad_norm": 0.20716645798924263, "learning_rate": 0.00014675750233043679, "loss": 1.7072, "step": 1625 }, { "epoch": 1.24, "grad_norm": 0.22397565488829696, "learning_rate": 0.00014636712711630978, "loss": 1.6036, "step": 1630 }, { "epoch": 1.24, "grad_norm": 0.19584834615434676, "learning_rate": 0.00014597584969705616, "loss": 1.6366, "step": 1635 }, { "epoch": 1.24, "grad_norm": 0.22273274810197669, "learning_rate": 0.00014558367768609766, "loss": 1.6545, "step": 1640 }, { "epoch": 1.25, "grad_norm": 0.30141032612570196, "learning_rate": 0.00014519061871426286, "loss": 1.6668, "step": 1645 }, { "epoch": 1.25, "grad_norm": 0.2508746414625482, "learning_rate": 0.0001447966804296387, "loss": 1.5583, "step": 1650 }, { "epoch": 1.25, "grad_norm": 0.2656543660091513, "learning_rate": 0.00014440187049742165, "loss": 1.6114, "step": 1655 }, { "epoch": 1.26, "grad_norm": 0.22762072721537044, "learning_rate": 0.00014400619659976863, "loss": 1.5218, "step": 1660 }, { "epoch": 1.26, "grad_norm": 0.21625802298436558, "learning_rate": 0.00014360966643564747, "loss": 1.6282, "step": 1665 }, { "epoch": 1.27, "grad_norm": 0.18758356388629857, "learning_rate": 0.00014321228772068702, "loss": 1.5724, "step": 1670 }, { "epoch": 1.27, "grad_norm": 0.22894089207752852, "learning_rate": 0.0001428140681870272, "loss": 1.5875, "step": 1675 }, { "epoch": 1.27, "grad_norm": 0.25952806547918694, "learning_rate": 0.0001424150155831685, "loss": 1.6728, "step": 1680 }, { "epoch": 1.28, "grad_norm": 0.3304544222948505, "learning_rate": 0.00014201513767382108, "loss": 1.6944, "step": 1685 }, { "epoch": 1.28, "grad_norm": 0.21745874371742022, "learning_rate": 0.00014161444223975383, "loss": 1.5649, "step": 1690 }, { "epoch": 1.29, "grad_norm": 0.18668861489627886, "learning_rate": 0.0001412129370776429, "loss": 1.6646, "step": 1695 }, { "epoch": 1.29, "grad_norm": 0.2514658628873574, "learning_rate": 0.00014081062999992005, "loss": 1.6427, "step": 1700 }, { "epoch": 1.29, "grad_norm": 0.23075565689636676, "learning_rate": 0.0001404075288346206, "loss": 1.7089, "step": 1705 }, { "epoch": 1.3, "grad_norm": 0.2005453142298327, "learning_rate": 0.00014000364142523103, "loss": 1.7236, "step": 1710 }, { "epoch": 1.3, "grad_norm": 0.21925735664261978, "learning_rate": 0.00013959897563053662, "loss": 1.7193, "step": 1715 }, { "epoch": 1.3, "grad_norm": 0.22755950679993744, "learning_rate": 0.00013919353932446822, "loss": 1.6178, "step": 1720 }, { "epoch": 1.31, "grad_norm": 0.24575725474371382, "learning_rate": 0.0001387873403959492, "loss": 1.6914, "step": 1725 }, { "epoch": 1.31, "grad_norm": 0.22868287217989744, "learning_rate": 0.00013838038674874193, "loss": 1.6021, "step": 1730 }, { "epoch": 1.32, "grad_norm": 0.21889496061933156, "learning_rate": 0.00013797268630129413, "loss": 1.8092, "step": 1735 }, { "epoch": 1.32, "grad_norm": 0.19238702480865116, "learning_rate": 0.0001375642469865844, "loss": 1.54, "step": 1740 }, { "epoch": 1.32, "grad_norm": 0.24437133183257548, "learning_rate": 0.00013715507675196836, "loss": 1.5477, "step": 1745 }, { "epoch": 1.33, "grad_norm": 0.21331661362588805, "learning_rate": 0.0001367451835590237, "loss": 1.6229, "step": 1750 }, { "epoch": 1.33, "grad_norm": 0.22934227073111574, "learning_rate": 0.00013633457538339514, "loss": 1.7056, "step": 1755 }, { "epoch": 1.33, "grad_norm": 0.21991726124527775, "learning_rate": 0.00013592326021463977, "loss": 1.7322, "step": 1760 }, { "epoch": 1.34, "grad_norm": 0.2279246851535844, "learning_rate": 0.00013551124605607097, "loss": 1.5663, "step": 1765 }, { "epoch": 1.34, "grad_norm": 0.21252716182463233, "learning_rate": 0.00013509854092460312, "loss": 1.6308, "step": 1770 }, { "epoch": 1.35, "grad_norm": 0.19276878334978295, "learning_rate": 0.0001346851528505954, "loss": 1.629, "step": 1775 }, { "epoch": 1.35, "grad_norm": 0.20349606898831232, "learning_rate": 0.00013427108987769566, "loss": 1.6323, "step": 1780 }, { "epoch": 1.35, "grad_norm": 0.280403908850998, "learning_rate": 0.00013385636006268368, "loss": 1.5647, "step": 1785 }, { "epoch": 1.36, "grad_norm": 0.204649437629767, "learning_rate": 0.00013344097147531469, "loss": 1.6706, "step": 1790 }, { "epoch": 1.36, "grad_norm": 0.2355526525352747, "learning_rate": 0.00013302493219816223, "loss": 1.6661, "step": 1795 }, { "epoch": 1.36, "grad_norm": 0.23955342033240548, "learning_rate": 0.00013260825032646083, "loss": 1.7684, "step": 1800 }, { "epoch": 1.37, "grad_norm": 0.1782918443154143, "learning_rate": 0.00013219093396794852, "loss": 1.7357, "step": 1805 }, { "epoch": 1.37, "grad_norm": 0.20676511108669285, "learning_rate": 0.00013177299124270911, "loss": 1.7935, "step": 1810 }, { "epoch": 1.38, "grad_norm": 0.24468072304122832, "learning_rate": 0.0001313544302830142, "loss": 1.6357, "step": 1815 }, { "epoch": 1.38, "grad_norm": 0.3442798924803141, "learning_rate": 0.00013093525923316482, "loss": 1.7283, "step": 1820 }, { "epoch": 1.38, "grad_norm": 0.18543047699982895, "learning_rate": 0.00013051548624933314, "loss": 1.6756, "step": 1825 }, { "epoch": 1.39, "grad_norm": 0.18961104598393633, "learning_rate": 0.00013009511949940358, "loss": 1.6258, "step": 1830 }, { "epoch": 1.39, "grad_norm": 0.23772840081980506, "learning_rate": 0.00012967416716281414, "loss": 1.6197, "step": 1835 }, { "epoch": 1.39, "grad_norm": 0.20599306112898513, "learning_rate": 0.00012925263743039693, "loss": 1.6155, "step": 1840 }, { "epoch": 1.4, "grad_norm": 0.17872981947947883, "learning_rate": 0.00012883053850421897, "loss": 1.817, "step": 1845 }, { "epoch": 1.4, "grad_norm": 0.21082979842365093, "learning_rate": 0.00012840787859742266, "loss": 1.7045, "step": 1850 }, { "epoch": 1.41, "grad_norm": 0.21065592453908275, "learning_rate": 0.00012798466593406583, "loss": 1.5825, "step": 1855 }, { "epoch": 1.41, "grad_norm": 0.21798103821826761, "learning_rate": 0.00012756090874896172, "loss": 1.7622, "step": 1860 }, { "epoch": 1.41, "grad_norm": 0.22916268453103483, "learning_rate": 0.00012713661528751888, "loss": 1.5324, "step": 1865 }, { "epoch": 1.42, "grad_norm": 0.2668875410933402, "learning_rate": 0.00012671179380558062, "loss": 1.647, "step": 1870 }, { "epoch": 1.42, "grad_norm": 0.19627830855058848, "learning_rate": 0.00012628645256926438, "loss": 1.5994, "step": 1875 }, { "epoch": 1.43, "grad_norm": 0.21241423084048555, "learning_rate": 0.0001258605998548009, "loss": 1.622, "step": 1880 }, { "epoch": 1.43, "grad_norm": 0.2546778643093178, "learning_rate": 0.0001254342439483733, "loss": 1.6916, "step": 1885 }, { "epoch": 1.43, "grad_norm": 0.20610950008732792, "learning_rate": 0.00012500739314595563, "loss": 1.7455, "step": 1890 }, { "epoch": 1.44, "grad_norm": 0.2219569529434739, "learning_rate": 0.00012458005575315147, "loss": 1.6683, "step": 1895 }, { "epoch": 1.44, "grad_norm": 0.20787095642170883, "learning_rate": 0.0001241522400850327, "loss": 1.6202, "step": 1900 }, { "epoch": 1.44, "grad_norm": 0.2275845179745845, "learning_rate": 0.0001237239544659771, "loss": 1.8088, "step": 1905 }, { "epoch": 1.45, "grad_norm": 0.24655110446766015, "learning_rate": 0.0001232952072295069, "loss": 1.5618, "step": 1910 }, { "epoch": 1.45, "grad_norm": 0.23084716022254811, "learning_rate": 0.0001228660067181263, "loss": 1.7204, "step": 1915 }, { "epoch": 1.46, "grad_norm": 0.2420965499906573, "learning_rate": 0.00012243636128315939, "loss": 1.5581, "step": 1920 }, { "epoch": 1.46, "grad_norm": 0.25054116126933823, "learning_rate": 0.0001220062792845873, "loss": 1.5808, "step": 1925 }, { "epoch": 1.46, "grad_norm": 0.24876893838844386, "learning_rate": 0.00012157576909088599, "loss": 1.6291, "step": 1930 }, { "epoch": 1.47, "grad_norm": 0.22724411732153027, "learning_rate": 0.00012114483907886308, "loss": 1.7218, "step": 1935 }, { "epoch": 1.47, "grad_norm": 0.23781633823944948, "learning_rate": 0.00012071349763349484, "loss": 1.6696, "step": 1940 }, { "epoch": 1.47, "grad_norm": 0.2611267676195103, "learning_rate": 0.00012028175314776344, "loss": 1.7099, "step": 1945 }, { "epoch": 1.48, "grad_norm": 0.25342034309056527, "learning_rate": 0.00011984961402249311, "loss": 1.6931, "step": 1950 }, { "epoch": 1.48, "grad_norm": 0.20391686876564638, "learning_rate": 0.00011941708866618697, "loss": 1.7043, "step": 1955 }, { "epoch": 1.49, "grad_norm": 0.2005457898894919, "learning_rate": 0.0001189841854948634, "loss": 1.5758, "step": 1960 }, { "epoch": 1.49, "grad_norm": 0.19157508121631642, "learning_rate": 0.00011855091293189234, "loss": 1.5831, "step": 1965 }, { "epoch": 1.49, "grad_norm": 0.23409302527114853, "learning_rate": 0.00011811727940783108, "loss": 1.6668, "step": 1970 }, { "epoch": 1.5, "grad_norm": 0.19820344277697435, "learning_rate": 0.00011768329336026062, "loss": 1.6894, "step": 1975 }, { "epoch": 1.5, "grad_norm": 0.23641920754497897, "learning_rate": 0.0001172489632336213, "loss": 1.8362, "step": 1980 }, { "epoch": 1.5, "grad_norm": 0.20503090615743924, "learning_rate": 0.00011681429747904842, "loss": 1.6885, "step": 1985 }, { "epoch": 1.51, "grad_norm": 0.18474233550647523, "learning_rate": 0.00011637930455420798, "loss": 1.7196, "step": 1990 }, { "epoch": 1.51, "grad_norm": 0.2775657036754379, "learning_rate": 0.00011594399292313192, "loss": 1.7362, "step": 1995 }, { "epoch": 1.52, "grad_norm": 0.23760102898739513, "learning_rate": 0.00011550837105605354, "loss": 1.5986, "step": 2000 }, { "epoch": 1.52, "grad_norm": 0.18850041877265183, "learning_rate": 0.00011507244742924274, "loss": 1.7116, "step": 2005 }, { "epoch": 1.52, "grad_norm": 0.2164959021230041, "learning_rate": 0.000114636230524841, "loss": 1.578, "step": 2010 }, { "epoch": 1.53, "grad_norm": 0.269300085641628, "learning_rate": 0.00011419972883069623, "loss": 1.5605, "step": 2015 }, { "epoch": 1.53, "grad_norm": 0.24787445167484887, "learning_rate": 0.00011376295084019792, "loss": 1.6663, "step": 2020 }, { "epoch": 1.54, "grad_norm": 0.21140623194389616, "learning_rate": 0.00011332590505211159, "loss": 1.658, "step": 2025 }, { "epoch": 1.54, "grad_norm": 0.25921900302870593, "learning_rate": 0.00011288859997041353, "loss": 1.6459, "step": 2030 }, { "epoch": 1.54, "grad_norm": 0.2608666502284525, "learning_rate": 0.00011245104410412537, "loss": 1.6928, "step": 2035 }, { "epoch": 1.55, "grad_norm": 0.22406449938146802, "learning_rate": 0.00011201324596714844, "loss": 1.4791, "step": 2040 }, { "epoch": 1.55, "grad_norm": 0.19647960391415928, "learning_rate": 0.00011157521407809815, "loss": 1.698, "step": 2045 }, { "epoch": 1.55, "grad_norm": 0.1897962583849219, "learning_rate": 0.00011113695696013824, "loss": 1.8167, "step": 2050 }, { "epoch": 1.56, "grad_norm": 0.20712759197533817, "learning_rate": 0.0001106984831408149, "loss": 1.7501, "step": 2055 }, { "epoch": 1.56, "grad_norm": 0.23079961827033185, "learning_rate": 0.00011025980115189086, "loss": 1.5934, "step": 2060 }, { "epoch": 1.57, "grad_norm": 0.22104873487185864, "learning_rate": 0.00010982091952917943, "loss": 1.6686, "step": 2065 }, { "epoch": 1.57, "grad_norm": 0.20639504694734737, "learning_rate": 0.00010938184681237833, "loss": 1.7136, "step": 2070 }, { "epoch": 1.57, "grad_norm": 0.2417721960073701, "learning_rate": 0.00010894259154490354, "loss": 1.6702, "step": 2075 }, { "epoch": 1.58, "grad_norm": 0.21810729625691397, "learning_rate": 0.00010850316227372312, "loss": 1.7477, "step": 2080 }, { "epoch": 1.58, "grad_norm": 0.23170201171415503, "learning_rate": 0.00010806356754919091, "loss": 1.6943, "step": 2085 }, { "epoch": 1.58, "grad_norm": 0.22093119739393355, "learning_rate": 0.00010762381592488002, "loss": 1.623, "step": 2090 }, { "epoch": 1.59, "grad_norm": 0.21034721922753088, "learning_rate": 0.00010718391595741657, "loss": 1.6084, "step": 2095 }, { "epoch": 1.59, "grad_norm": 0.22443726771939806, "learning_rate": 0.00010674387620631308, "loss": 1.5536, "step": 2100 }, { "epoch": 1.6, "grad_norm": 0.22568508558473213, "learning_rate": 0.00010630370523380202, "loss": 1.469, "step": 2105 }, { "epoch": 1.6, "grad_norm": 0.3332888137498032, "learning_rate": 0.00010586341160466904, "loss": 1.6488, "step": 2110 }, { "epoch": 1.6, "grad_norm": 0.2129808005413702, "learning_rate": 0.00010542300388608652, "loss": 1.6101, "step": 2115 }, { "epoch": 1.61, "grad_norm": 0.20553693555408575, "learning_rate": 0.00010498249064744679, "loss": 1.4872, "step": 2120 }, { "epoch": 1.61, "grad_norm": 0.2445112542992352, "learning_rate": 0.00010454188046019524, "loss": 1.7005, "step": 2125 }, { "epoch": 1.61, "grad_norm": 0.20844778510756687, "learning_rate": 0.00010410118189766387, "loss": 1.5589, "step": 2130 }, { "epoch": 1.62, "grad_norm": 0.2223212290874802, "learning_rate": 0.0001036604035349041, "loss": 1.6621, "step": 2135 }, { "epoch": 1.62, "grad_norm": 0.20479585313872112, "learning_rate": 0.00010321955394852018, "loss": 1.7061, "step": 2140 }, { "epoch": 1.63, "grad_norm": 0.17606184812861142, "learning_rate": 0.0001027786417165022, "loss": 1.5607, "step": 2145 }, { "epoch": 1.63, "grad_norm": 0.2676349610853098, "learning_rate": 0.0001023376754180592, "loss": 1.6232, "step": 2150 }, { "epoch": 1.63, "grad_norm": 0.2068560787418325, "learning_rate": 0.00010189666363345223, "loss": 1.5724, "step": 2155 }, { "epoch": 1.64, "grad_norm": 0.19641973239797275, "learning_rate": 0.00010145561494382742, "loss": 1.5305, "step": 2160 }, { "epoch": 1.64, "grad_norm": 0.2574797520893005, "learning_rate": 0.00010101453793104898, "loss": 1.6025, "step": 2165 }, { "epoch": 1.65, "grad_norm": 0.2827194584842853, "learning_rate": 0.00010057344117753222, "loss": 1.5882, "step": 2170 }, { "epoch": 1.65, "grad_norm": 0.19936180521947827, "learning_rate": 0.00010013233326607661, "loss": 1.5706, "step": 2175 }, { "epoch": 1.65, "grad_norm": 0.21819696462759022, "learning_rate": 9.969122277969865e-05, "loss": 1.6623, "step": 2180 }, { "epoch": 1.66, "grad_norm": 0.225417352707018, "learning_rate": 9.9250118301465e-05, "loss": 1.6255, "step": 2185 }, { "epoch": 1.66, "grad_norm": 0.3143651738447285, "learning_rate": 9.880902841432544e-05, "loss": 1.4905, "step": 2190 }, { "epoch": 1.66, "grad_norm": 0.23749234423783855, "learning_rate": 9.836796170094571e-05, "loss": 1.6156, "step": 2195 }, { "epoch": 1.67, "grad_norm": 0.23579593383210742, "learning_rate": 9.792692674354079e-05, "loss": 1.6963, "step": 2200 }, { "epoch": 1.67, "grad_norm": 0.2032329245708717, "learning_rate": 9.748593212370773e-05, "loss": 1.6733, "step": 2205 }, { "epoch": 1.68, "grad_norm": 0.20661047812325195, "learning_rate": 9.704498642225856e-05, "loss": 1.622, "step": 2210 }, { "epoch": 1.68, "grad_norm": 0.18970352315906064, "learning_rate": 9.660409821905363e-05, "loss": 1.7834, "step": 2215 }, { "epoch": 1.68, "grad_norm": 0.17832580771616308, "learning_rate": 9.616327609283445e-05, "loss": 1.6989, "step": 2220 }, { "epoch": 1.69, "grad_norm": 0.21859704299949706, "learning_rate": 9.572252862105673e-05, "loss": 1.7946, "step": 2225 }, { "epoch": 1.69, "grad_norm": 0.24897942412148671, "learning_rate": 9.528186437972368e-05, "loss": 1.564, "step": 2230 }, { "epoch": 1.69, "grad_norm": 0.20109922805508615, "learning_rate": 9.484129194321896e-05, "loss": 1.6594, "step": 2235 }, { "epoch": 1.7, "grad_norm": 0.19546463521855884, "learning_rate": 9.440081988413987e-05, "loss": 1.542, "step": 2240 }, { "epoch": 1.7, "grad_norm": 0.20254596218430737, "learning_rate": 9.396045677313067e-05, "loss": 1.8142, "step": 2245 }, { "epoch": 1.71, "grad_norm": 0.1936135057396683, "learning_rate": 9.352021117871574e-05, "loss": 1.5564, "step": 2250 }, { "epoch": 1.71, "grad_norm": 0.2096445430714542, "learning_rate": 9.308009166713263e-05, "loss": 1.6735, "step": 2255 }, { "epoch": 1.71, "grad_norm": 0.22672329152194862, "learning_rate": 9.264010680216583e-05, "loss": 1.6761, "step": 2260 }, { "epoch": 1.72, "grad_norm": 0.24482242735211057, "learning_rate": 9.220026514497983e-05, "loss": 1.5988, "step": 2265 }, { "epoch": 1.72, "grad_norm": 0.24736418279884478, "learning_rate": 9.176057525395252e-05, "loss": 1.5844, "step": 2270 }, { "epoch": 1.72, "grad_norm": 0.1987944867199659, "learning_rate": 9.132104568450879e-05, "loss": 1.6997, "step": 2275 }, { "epoch": 1.73, "grad_norm": 0.1850913674566201, "learning_rate": 9.088168498895408e-05, "loss": 1.5696, "step": 2280 }, { "epoch": 1.73, "grad_norm": 0.24393794217168674, "learning_rate": 9.044250171630778e-05, "loss": 1.7403, "step": 2285 }, { "epoch": 1.74, "grad_norm": 0.19475525279873163, "learning_rate": 9.000350441213708e-05, "loss": 1.5984, "step": 2290 }, { "epoch": 1.74, "grad_norm": 0.2218761532729913, "learning_rate": 8.956470161839072e-05, "loss": 1.6681, "step": 2295 }, { "epoch": 1.74, "grad_norm": 0.24957778768532196, "learning_rate": 8.912610187323248e-05, "loss": 1.6169, "step": 2300 }, { "epoch": 1.75, "grad_norm": 0.2510725868859042, "learning_rate": 8.868771371087539e-05, "loss": 1.639, "step": 2305 }, { "epoch": 1.75, "grad_norm": 0.19643293153400068, "learning_rate": 8.82495456614155e-05, "loss": 1.7237, "step": 2310 }, { "epoch": 1.76, "grad_norm": 0.26450396919742597, "learning_rate": 8.781160625066588e-05, "loss": 1.6528, "step": 2315 }, { "epoch": 1.76, "grad_norm": 0.22179001551390587, "learning_rate": 8.737390399999086e-05, "loss": 1.5533, "step": 2320 }, { "epoch": 1.76, "grad_norm": 0.2346687653947156, "learning_rate": 8.693644742614017e-05, "loss": 1.6104, "step": 2325 }, { "epoch": 1.77, "grad_norm": 0.25806483606045055, "learning_rate": 8.649924504108302e-05, "loss": 1.6052, "step": 2330 }, { "epoch": 1.77, "grad_norm": 0.1786075330646357, "learning_rate": 8.606230535184283e-05, "loss": 1.5603, "step": 2335 }, { "epoch": 1.77, "grad_norm": 0.254068816191818, "learning_rate": 8.562563686033145e-05, "loss": 1.7643, "step": 2340 }, { "epoch": 1.78, "grad_norm": 0.21344041020108453, "learning_rate": 8.518924806318378e-05, "loss": 1.6584, "step": 2345 }, { "epoch": 1.78, "grad_norm": 0.2082041629797306, "learning_rate": 8.47531474515925e-05, "loss": 1.7992, "step": 2350 }, { "epoch": 1.79, "grad_norm": 0.2645099180130053, "learning_rate": 8.431734351114284e-05, "loss": 1.6361, "step": 2355 }, { "epoch": 1.79, "grad_norm": 0.22698336003173047, "learning_rate": 8.388184472164736e-05, "loss": 1.646, "step": 2360 }, { "epoch": 1.79, "grad_norm": 0.24003288864061173, "learning_rate": 8.34466595569811e-05, "loss": 1.6379, "step": 2365 }, { "epoch": 1.8, "grad_norm": 0.19443085064409085, "learning_rate": 8.301179648491669e-05, "loss": 1.73, "step": 2370 }, { "epoch": 1.8, "grad_norm": 0.24311509067570025, "learning_rate": 8.257726396695933e-05, "loss": 1.6802, "step": 2375 }, { "epoch": 1.8, "grad_norm": 0.24648929428851593, "learning_rate": 8.214307045818254e-05, "loss": 1.7708, "step": 2380 }, { "epoch": 1.81, "grad_norm": 0.1940516179768531, "learning_rate": 8.17092244070634e-05, "loss": 1.5857, "step": 2385 }, { "epoch": 1.81, "grad_norm": 0.2361070277608161, "learning_rate": 8.127573425531814e-05, "loss": 1.6411, "step": 2390 }, { "epoch": 1.82, "grad_norm": 0.2835364928454071, "learning_rate": 8.084260843773799e-05, "loss": 1.7818, "step": 2395 }, { "epoch": 1.82, "grad_norm": 0.18047213778922655, "learning_rate": 8.040985538202505e-05, "loss": 1.587, "step": 2400 }, { "epoch": 1.82, "grad_norm": 0.21940093931140764, "learning_rate": 7.997748350862822e-05, "loss": 1.6795, "step": 2405 }, { "epoch": 1.83, "grad_norm": 0.20557324059132212, "learning_rate": 7.954550123057939e-05, "loss": 1.638, "step": 2410 }, { "epoch": 1.83, "grad_norm": 0.23522437885683956, "learning_rate": 7.911391695332988e-05, "loss": 1.6176, "step": 2415 }, { "epoch": 1.83, "grad_norm": 0.20227659422834685, "learning_rate": 7.868273907458661e-05, "loss": 1.5562, "step": 2420 }, { "epoch": 1.84, "grad_norm": 0.17957107180807144, "learning_rate": 7.825197598414895e-05, "loss": 1.6577, "step": 2425 }, { "epoch": 1.84, "grad_norm": 0.21134479099989728, "learning_rate": 7.782163606374536e-05, "loss": 1.5407, "step": 2430 }, { "epoch": 1.85, "grad_norm": 0.2190101821746382, "learning_rate": 7.739172768687028e-05, "loss": 1.6901, "step": 2435 }, { "epoch": 1.85, "grad_norm": 0.22909832831883262, "learning_rate": 7.696225921862126e-05, "loss": 1.6517, "step": 2440 }, { "epoch": 1.85, "grad_norm": 0.1922087104118847, "learning_rate": 7.653323901553625e-05, "loss": 1.5558, "step": 2445 }, { "epoch": 1.86, "grad_norm": 0.2535390902934386, "learning_rate": 7.610467542543073e-05, "loss": 1.7802, "step": 2450 }, { "epoch": 1.86, "grad_norm": 0.20264859592749507, "learning_rate": 7.567657678723565e-05, "loss": 1.6141, "step": 2455 }, { "epoch": 1.87, "grad_norm": 0.2534081482654566, "learning_rate": 7.52489514308349e-05, "loss": 1.6593, "step": 2460 }, { "epoch": 1.87, "grad_norm": 0.24401202904206418, "learning_rate": 7.482180767690334e-05, "loss": 1.5982, "step": 2465 }, { "epoch": 1.87, "grad_norm": 0.2805376490259695, "learning_rate": 7.439515383674485e-05, "loss": 1.7126, "step": 2470 }, { "epoch": 1.88, "grad_norm": 0.24585333566417664, "learning_rate": 7.396899821213072e-05, "loss": 1.5644, "step": 2475 }, { "epoch": 1.88, "grad_norm": 0.22491029483008115, "learning_rate": 7.354334909513791e-05, "loss": 1.6765, "step": 2480 }, { "epoch": 1.88, "grad_norm": 0.23458997274256846, "learning_rate": 7.311821476798789e-05, "loss": 1.6122, "step": 2485 }, { "epoch": 1.89, "grad_norm": 0.17595992796667512, "learning_rate": 7.269360350288547e-05, "loss": 1.8356, "step": 2490 }, { "epoch": 1.89, "grad_norm": 0.18759163970832302, "learning_rate": 7.226952356185765e-05, "loss": 1.4984, "step": 2495 }, { "epoch": 1.9, "grad_norm": 0.236927434671597, "learning_rate": 7.184598319659317e-05, "loss": 1.6798, "step": 2500 }, { "epoch": 1.9, "grad_norm": 0.26802038257147875, "learning_rate": 7.142299064828169e-05, "loss": 1.5844, "step": 2505 }, { "epoch": 1.9, "grad_norm": 0.1751974293734832, "learning_rate": 7.100055414745346e-05, "loss": 1.6365, "step": 2510 }, { "epoch": 1.91, "grad_norm": 0.23254005323825433, "learning_rate": 7.057868191381936e-05, "loss": 1.4657, "step": 2515 }, { "epoch": 1.91, "grad_norm": 0.264348812986722, "learning_rate": 7.015738215611079e-05, "loss": 1.7816, "step": 2520 }, { "epoch": 1.91, "grad_norm": 0.27530320883320614, "learning_rate": 6.973666307191996e-05, "loss": 1.6751, "step": 2525 }, { "epoch": 1.92, "grad_norm": 0.19339613251333393, "learning_rate": 6.931653284754042e-05, "loss": 1.7293, "step": 2530 }, { "epoch": 1.92, "grad_norm": 0.2151392309486146, "learning_rate": 6.889699965780787e-05, "loss": 1.7334, "step": 2535 }, { "epoch": 1.93, "grad_norm": 0.22448766537331677, "learning_rate": 6.847807166594083e-05, "loss": 1.6827, "step": 2540 }, { "epoch": 1.93, "grad_norm": 0.2286115948636003, "learning_rate": 6.805975702338208e-05, "loss": 1.6562, "step": 2545 }, { "epoch": 1.93, "grad_norm": 0.2118908130790939, "learning_rate": 6.764206386963991e-05, "loss": 1.6091, "step": 2550 }, { "epoch": 1.94, "grad_norm": 0.240925966059138, "learning_rate": 6.722500033212974e-05, "loss": 1.6314, "step": 2555 }, { "epoch": 1.94, "grad_norm": 0.2271694074825516, "learning_rate": 6.680857452601598e-05, "loss": 1.7589, "step": 2560 }, { "epoch": 1.94, "grad_norm": 0.2168118018671656, "learning_rate": 6.639279455405432e-05, "loss": 1.6201, "step": 2565 }, { "epoch": 1.95, "grad_norm": 0.21224810091098364, "learning_rate": 6.597766850643361e-05, "loss": 1.5842, "step": 2570 }, { "epoch": 1.95, "grad_norm": 0.19581859607212743, "learning_rate": 6.556320446061902e-05, "loss": 1.5586, "step": 2575 }, { "epoch": 1.96, "grad_norm": 0.20327112477714954, "learning_rate": 6.514941048119435e-05, "loss": 1.6303, "step": 2580 }, { "epoch": 1.96, "grad_norm": 0.22810086515914976, "learning_rate": 6.47362946197055e-05, "loss": 1.7332, "step": 2585 }, { "epoch": 1.96, "grad_norm": 0.22278333474431392, "learning_rate": 6.432386491450361e-05, "loss": 1.6293, "step": 2590 }, { "epoch": 1.97, "grad_norm": 0.23128655487134384, "learning_rate": 6.391212939058861e-05, "loss": 1.6937, "step": 2595 }, { "epoch": 1.97, "grad_norm": 0.24641830926598107, "learning_rate": 6.350109605945323e-05, "loss": 1.4982, "step": 2600 }, { "epoch": 1.97, "grad_norm": 0.24123146757419323, "learning_rate": 6.309077291892702e-05, "loss": 1.5107, "step": 2605 }, { "epoch": 1.98, "grad_norm": 0.24138969338364216, "learning_rate": 6.268116795302068e-05, "loss": 1.5448, "step": 2610 }, { "epoch": 1.98, "grad_norm": 0.2515434111696446, "learning_rate": 6.227228913177081e-05, "loss": 1.559, "step": 2615 }, { "epoch": 1.99, "grad_norm": 0.2554427971564699, "learning_rate": 6.186414441108487e-05, "loss": 1.6211, "step": 2620 }, { "epoch": 1.99, "grad_norm": 0.20773791558688393, "learning_rate": 6.14567417325861e-05, "loss": 1.6058, "step": 2625 }, { "epoch": 1.99, "grad_norm": 0.20109572317054908, "learning_rate": 6.105008902345935e-05, "loss": 1.5911, "step": 2630 }, { "epoch": 2.0, "grad_norm": 0.21186779196561445, "learning_rate": 6.064419419629662e-05, "loss": 1.6227, "step": 2635 }, { "epoch": 2.0, "grad_norm": 0.2150487580932417, "learning_rate": 6.023906514894313e-05, "loss": 1.5839, "step": 2640 }, { "epoch": 2.01, "grad_norm": 0.24636199955981808, "learning_rate": 5.983470976434369e-05, "loss": 1.5764, "step": 2645 }, { "epoch": 2.01, "grad_norm": 0.22093610448062864, "learning_rate": 5.943113591038928e-05, "loss": 1.7157, "step": 2650 }, { "epoch": 2.01, "grad_norm": 0.21359568862552614, "learning_rate": 5.902835143976393e-05, "loss": 1.6359, "step": 2655 }, { "epoch": 2.02, "grad_norm": 0.2219633405623727, "learning_rate": 5.862636418979198e-05, "loss": 1.6484, "step": 2660 }, { "epoch": 2.02, "grad_norm": 0.24148935530595134, "learning_rate": 5.822518198228565e-05, "loss": 1.52, "step": 2665 }, { "epoch": 2.02, "grad_norm": 0.22871052628894134, "learning_rate": 5.782481262339261e-05, "loss": 1.5583, "step": 2670 }, { "epoch": 2.03, "grad_norm": 0.18016152517949127, "learning_rate": 5.742526390344427e-05, "loss": 1.7094, "step": 2675 }, { "epoch": 2.03, "grad_norm": 0.27927714573640977, "learning_rate": 5.702654359680428e-05, "loss": 1.7229, "step": 2680 }, { "epoch": 2.04, "grad_norm": 0.20272089890919007, "learning_rate": 5.662865946171696e-05, "loss": 1.7436, "step": 2685 }, { "epoch": 2.04, "grad_norm": 0.25187946618078394, "learning_rate": 5.6231619240156694e-05, "loss": 1.5926, "step": 2690 }, { "epoch": 2.04, "grad_norm": 0.23619447456603418, "learning_rate": 5.5835430657676976e-05, "loss": 1.5177, "step": 2695 }, { "epoch": 2.05, "grad_norm": 0.23076862233533377, "learning_rate": 5.544010142326026e-05, "loss": 1.6432, "step": 2700 }, { "epoch": 2.05, "grad_norm": 0.2509266079111979, "learning_rate": 5.504563922916799e-05, "loss": 1.6125, "step": 2705 }, { "epoch": 2.05, "grad_norm": 0.26527998507107736, "learning_rate": 5.4652051750790825e-05, "loss": 1.5384, "step": 2710 }, { "epoch": 2.06, "grad_norm": 0.24254486560490685, "learning_rate": 5.425934664649921e-05, "loss": 1.6641, "step": 2715 }, { "epoch": 2.06, "grad_norm": 0.22497341374372068, "learning_rate": 5.3867531557494674e-05, "loss": 1.4442, "step": 2720 }, { "epoch": 2.07, "grad_norm": 0.22811203680708553, "learning_rate": 5.347661410766087e-05, "loss": 1.6313, "step": 2725 }, { "epoch": 2.07, "grad_norm": 0.2193211927138723, "learning_rate": 5.308660190341528e-05, "loss": 1.4835, "step": 2730 }, { "epoch": 2.07, "grad_norm": 0.23158894991713072, "learning_rate": 5.2697502533561226e-05, "loss": 1.5765, "step": 2735 }, { "epoch": 2.08, "grad_norm": 0.2160152191509828, "learning_rate": 5.230932356914032e-05, "loss": 1.6395, "step": 2740 }, { "epoch": 2.08, "grad_norm": 0.23138300560468752, "learning_rate": 5.1922072563284986e-05, "loss": 1.6645, "step": 2745 }, { "epoch": 2.08, "grad_norm": 0.27219186986752913, "learning_rate": 5.153575705107152e-05, "loss": 1.5842, "step": 2750 }, { "epoch": 2.09, "grad_norm": 0.24365055871265076, "learning_rate": 5.115038454937362e-05, "loss": 1.7234, "step": 2755 }, { "epoch": 2.09, "grad_norm": 0.22921672259925305, "learning_rate": 5.076596255671592e-05, "loss": 1.5756, "step": 2760 }, { "epoch": 2.1, "grad_norm": 0.2538431765730713, "learning_rate": 5.0382498553128265e-05, "loss": 1.6491, "step": 2765 }, { "epoch": 2.1, "grad_norm": 0.25913968900209966, "learning_rate": 5.000000000000002e-05, "loss": 1.5438, "step": 2770 }, { "epoch": 2.1, "grad_norm": 0.291257818004918, "learning_rate": 4.9618474339934916e-05, "loss": 1.5995, "step": 2775 }, { "epoch": 2.11, "grad_norm": 0.24432948267207238, "learning_rate": 4.9237928996606384e-05, "loss": 1.5999, "step": 2780 }, { "epoch": 2.11, "grad_norm": 0.26418330324646966, "learning_rate": 4.88583713746129e-05, "loss": 1.7175, "step": 2785 }, { "epoch": 2.12, "grad_norm": 0.2647804130194954, "learning_rate": 4.8479808859333964e-05, "loss": 1.5083, "step": 2790 }, { "epoch": 2.12, "grad_norm": 0.23990236642151055, "learning_rate": 4.810224881678652e-05, "loss": 1.5032, "step": 2795 }, { "epoch": 2.12, "grad_norm": 0.22406476212806528, "learning_rate": 4.772569859348156e-05, "loss": 1.6183, "step": 2800 }, { "epoch": 2.13, "grad_norm": 0.17599248862626268, "learning_rate": 4.735016551628095e-05, "loss": 1.694, "step": 2805 }, { "epoch": 2.13, "grad_norm": 0.27545889362059484, "learning_rate": 4.697565689225528e-05, "loss": 1.6074, "step": 2810 }, { "epoch": 2.13, "grad_norm": 0.27997532830437954, "learning_rate": 4.660218000854143e-05, "loss": 1.5062, "step": 2815 }, { "epoch": 2.14, "grad_norm": 0.2803170335965896, "learning_rate": 4.6229742132200746e-05, "loss": 1.6516, "step": 2820 }, { "epoch": 2.14, "grad_norm": 0.22582531196940026, "learning_rate": 4.585835051007774e-05, "loss": 1.6168, "step": 2825 }, { "epoch": 2.15, "grad_norm": 0.22856148303418752, "learning_rate": 4.548801236865912e-05, "loss": 1.5435, "step": 2830 }, { "epoch": 2.15, "grad_norm": 0.2764784030904549, "learning_rate": 4.511873491393304e-05, "loss": 1.6409, "step": 2835 }, { "epoch": 2.15, "grad_norm": 0.21257264261069672, "learning_rate": 4.475052533124893e-05, "loss": 1.5581, "step": 2840 }, { "epoch": 2.16, "grad_norm": 0.21196439275175047, "learning_rate": 4.438339078517785e-05, "loss": 1.5538, "step": 2845 }, { "epoch": 2.16, "grad_norm": 0.2832145647608719, "learning_rate": 4.401733841937279e-05, "loss": 1.724, "step": 2850 }, { "epoch": 2.16, "grad_norm": 0.27147849615384506, "learning_rate": 4.3652375356429974e-05, "loss": 1.5014, "step": 2855 }, { "epoch": 2.17, "grad_norm": 0.2610576760484019, "learning_rate": 4.328850869775001e-05, "loss": 1.6749, "step": 2860 }, { "epoch": 2.17, "grad_norm": 0.23914287887699434, "learning_rate": 4.292574552339981e-05, "loss": 1.5328, "step": 2865 }, { "epoch": 2.18, "grad_norm": 0.24065502762902322, "learning_rate": 4.256409289197495e-05, "loss": 1.5942, "step": 2870 }, { "epoch": 2.18, "grad_norm": 0.2083191016158885, "learning_rate": 4.2203557840462214e-05, "loss": 1.5539, "step": 2875 }, { "epoch": 2.18, "grad_norm": 0.20639182389301813, "learning_rate": 4.184414738410248e-05, "loss": 1.5646, "step": 2880 }, { "epoch": 2.19, "grad_norm": 0.23727403239283584, "learning_rate": 4.148586851625461e-05, "loss": 1.5353, "step": 2885 }, { "epoch": 2.19, "grad_norm": 0.24508287577637505, "learning_rate": 4.112872820825915e-05, "loss": 1.4418, "step": 2890 }, { "epoch": 2.19, "grad_norm": 0.2475936795575314, "learning_rate": 4.077273340930263e-05, "loss": 1.6643, "step": 2895 }, { "epoch": 2.2, "grad_norm": 0.2505899184192717, "learning_rate": 4.041789104628241e-05, "loss": 1.5577, "step": 2900 }, { "epoch": 2.2, "grad_norm": 0.24093576954008833, "learning_rate": 4.006420802367205e-05, "loss": 1.6784, "step": 2905 }, { "epoch": 2.21, "grad_norm": 0.2561236323684272, "learning_rate": 3.971169122338668e-05, "loss": 1.6165, "step": 2910 }, { "epoch": 2.21, "grad_norm": 0.24280603594696593, "learning_rate": 3.936034750464927e-05, "loss": 1.6695, "step": 2915 }, { "epoch": 2.21, "grad_norm": 0.2602730047803284, "learning_rate": 3.901018370385724e-05, "loss": 1.5697, "step": 2920 }, { "epoch": 2.22, "grad_norm": 0.21146640994821633, "learning_rate": 3.866120663444914e-05, "loss": 1.5399, "step": 2925 }, { "epoch": 2.22, "grad_norm": 0.24075711387924426, "learning_rate": 3.831342308677247e-05, "loss": 1.5597, "step": 2930 }, { "epoch": 2.23, "grad_norm": 0.24793331779495362, "learning_rate": 3.7966839827951196e-05, "loss": 1.6434, "step": 2935 }, { "epoch": 2.23, "grad_norm": 0.19558506394109187, "learning_rate": 3.762146360175427e-05, "loss": 1.6499, "step": 2940 }, { "epoch": 2.23, "grad_norm": 0.35587028915030966, "learning_rate": 3.727730112846444e-05, "loss": 1.5089, "step": 2945 }, { "epoch": 2.24, "grad_norm": 0.2570330063437446, "learning_rate": 3.693435910474732e-05, "loss": 1.6548, "step": 2950 }, { "epoch": 2.24, "grad_norm": 0.28077059284475103, "learning_rate": 3.659264420352122e-05, "loss": 1.6528, "step": 2955 }, { "epoch": 2.24, "grad_norm": 0.23035257395244374, "learning_rate": 3.6252163073827294e-05, "loss": 1.4482, "step": 2960 }, { "epoch": 2.25, "grad_norm": 0.2051186918638722, "learning_rate": 3.5912922340700206e-05, "loss": 1.5015, "step": 2965 }, { "epoch": 2.25, "grad_norm": 0.22455945185810877, "learning_rate": 3.557492860503893e-05, "loss": 1.5176, "step": 2970 }, { "epoch": 2.26, "grad_norm": 0.23453638209680727, "learning_rate": 3.5238188443478795e-05, "loss": 1.6343, "step": 2975 }, { "epoch": 2.26, "grad_norm": 0.24470156257503126, "learning_rate": 3.4902708408263066e-05, "loss": 1.7663, "step": 2980 }, { "epoch": 2.26, "grad_norm": 0.23135832322132918, "learning_rate": 3.45684950271158e-05, "loss": 1.5837, "step": 2985 }, { "epoch": 2.27, "grad_norm": 0.2608640064079802, "learning_rate": 3.423555480311457e-05, "loss": 1.6173, "step": 2990 }, { "epoch": 2.27, "grad_norm": 0.31078928098679404, "learning_rate": 3.3903894214564026e-05, "loss": 1.5177, "step": 2995 }, { "epoch": 2.27, "grad_norm": 0.26258430453244713, "learning_rate": 3.3573519714869914e-05, "loss": 1.6865, "step": 3000 }, { "epoch": 2.28, "grad_norm": 0.2733284038434726, "learning_rate": 3.324443773241349e-05, "loss": 1.3619, "step": 3005 }, { "epoch": 2.28, "grad_norm": 0.2369163548191094, "learning_rate": 3.291665467042618e-05, "loss": 1.6509, "step": 3010 }, { "epoch": 2.29, "grad_norm": 0.2664340527286697, "learning_rate": 3.25901769068654e-05, "loss": 1.6038, "step": 3015 }, { "epoch": 2.29, "grad_norm": 0.23398120063750877, "learning_rate": 3.2265010794290195e-05, "loss": 1.663, "step": 3020 }, { "epoch": 2.29, "grad_norm": 0.2781275708933271, "learning_rate": 3.1941162659737647e-05, "loss": 1.6429, "step": 3025 }, { "epoch": 2.3, "grad_norm": 0.2687866606825216, "learning_rate": 3.16186388045998e-05, "loss": 1.6853, "step": 3030 }, { "epoch": 2.3, "grad_norm": 0.23644485510225058, "learning_rate": 3.129744550450113e-05, "loss": 1.6027, "step": 3035 }, { "epoch": 2.3, "grad_norm": 0.24644290933624716, "learning_rate": 3.09775890091763e-05, "loss": 1.6018, "step": 3040 }, { "epoch": 2.31, "grad_norm": 0.2259139537131363, "learning_rate": 3.065907554234858e-05, "loss": 1.6607, "step": 3045 }, { "epoch": 2.31, "grad_norm": 0.24004959008038543, "learning_rate": 3.034191130160887e-05, "loss": 1.5377, "step": 3050 }, { "epoch": 2.32, "grad_norm": 0.2213008661812979, "learning_rate": 3.0026102458294924e-05, "loss": 1.5613, "step": 3055 }, { "epoch": 2.32, "grad_norm": 0.2079094581228579, "learning_rate": 2.9711655157371443e-05, "loss": 1.5085, "step": 3060 }, { "epoch": 2.32, "grad_norm": 0.2527748569210639, "learning_rate": 2.9398575517310355e-05, "loss": 1.5855, "step": 3065 }, { "epoch": 2.33, "grad_norm": 0.2141370968928817, "learning_rate": 2.9086869629971836e-05, "loss": 1.5732, "step": 3070 }, { "epoch": 2.33, "grad_norm": 0.24493685886391817, "learning_rate": 2.8776543560485857e-05, "loss": 1.6197, "step": 3075 }, { "epoch": 2.34, "grad_norm": 0.2316788505534105, "learning_rate": 2.8467603347133997e-05, "loss": 1.648, "step": 3080 }, { "epoch": 2.34, "grad_norm": 0.25146411778731, "learning_rate": 2.816005500123203e-05, "loss": 1.5525, "step": 3085 }, { "epoch": 2.34, "grad_norm": 0.22407696629199808, "learning_rate": 2.785390450701303e-05, "loss": 1.7218, "step": 3090 }, { "epoch": 2.35, "grad_norm": 0.27013300544460844, "learning_rate": 2.7549157821510885e-05, "loss": 1.5804, "step": 3095 }, { "epoch": 2.35, "grad_norm": 0.25388595748221704, "learning_rate": 2.7245820874444272e-05, "loss": 1.7398, "step": 3100 }, { "epoch": 2.35, "grad_norm": 0.19843759758285218, "learning_rate": 2.6943899568101405e-05, "loss": 1.6999, "step": 3105 }, { "epoch": 2.36, "grad_norm": 0.20783915655026464, "learning_rate": 2.6643399777225232e-05, "loss": 1.6114, "step": 3110 }, { "epoch": 2.36, "grad_norm": 0.2496800397125067, "learning_rate": 2.6344327348898958e-05, "loss": 1.5217, "step": 3115 }, { "epoch": 2.37, "grad_norm": 0.22235249882770752, "learning_rate": 2.6046688102432382e-05, "loss": 1.6871, "step": 3120 }, { "epoch": 2.37, "grad_norm": 0.2462186333352102, "learning_rate": 2.5750487829248726e-05, "loss": 1.7788, "step": 3125 }, { "epoch": 2.37, "grad_norm": 0.20018170839209692, "learning_rate": 2.545573229277175e-05, "loss": 1.6076, "step": 3130 }, { "epoch": 2.38, "grad_norm": 0.2704237119402894, "learning_rate": 2.5162427228313857e-05, "loss": 1.6456, "step": 3135 }, { "epoch": 2.38, "grad_norm": 0.2735737465777087, "learning_rate": 2.4870578342964245e-05, "loss": 1.6402, "step": 3140 }, { "epoch": 2.38, "grad_norm": 0.2188413596766906, "learning_rate": 2.458019131547803e-05, "loss": 1.5193, "step": 3145 }, { "epoch": 2.39, "grad_norm": 0.2821633184600081, "learning_rate": 2.429127179616575e-05, "loss": 1.6363, "step": 3150 }, { "epoch": 2.39, "grad_norm": 0.20714886526036308, "learning_rate": 2.4003825406783308e-05, "loss": 1.669, "step": 3155 }, { "epoch": 2.4, "grad_norm": 0.2661408497359453, "learning_rate": 2.3717857740422644e-05, "loss": 1.5488, "step": 3160 }, { "epoch": 2.4, "grad_norm": 0.2535527034852724, "learning_rate": 2.343337436140295e-05, "loss": 1.5851, "step": 3165 }, { "epoch": 2.4, "grad_norm": 0.2629746106882043, "learning_rate": 2.3150380805162418e-05, "loss": 1.5467, "step": 3170 }, { "epoch": 2.41, "grad_norm": 0.24098285831571226, "learning_rate": 2.2868882578150285e-05, "loss": 1.6417, "step": 3175 }, { "epoch": 2.41, "grad_norm": 0.28638431202213366, "learning_rate": 2.258888515772005e-05, "loss": 1.6915, "step": 3180 }, { "epoch": 2.41, "grad_norm": 0.319171053435643, "learning_rate": 2.2310393992022704e-05, "loss": 1.6324, "step": 3185 }, { "epoch": 2.42, "grad_norm": 0.2054749944090956, "learning_rate": 2.2033414499900685e-05, "loss": 1.5694, "step": 3190 }, { "epoch": 2.42, "grad_norm": 0.2515694982134836, "learning_rate": 2.1757952070782504e-05, "loss": 1.598, "step": 3195 }, { "epoch": 2.43, "grad_norm": 0.23267628383812705, "learning_rate": 2.148401206457793e-05, "loss": 1.4513, "step": 3200 }, { "epoch": 2.43, "grad_norm": 0.25390773868938254, "learning_rate": 2.121159981157359e-05, "loss": 1.5906, "step": 3205 }, { "epoch": 2.43, "grad_norm": 0.251154990702733, "learning_rate": 2.0940720612329258e-05, "loss": 1.4707, "step": 3210 }, { "epoch": 2.44, "grad_norm": 0.24909067323121328, "learning_rate": 2.067137973757489e-05, "loss": 1.6214, "step": 3215 }, { "epoch": 2.44, "grad_norm": 0.23515254331621996, "learning_rate": 2.0403582428107792e-05, "loss": 1.3762, "step": 3220 }, { "epoch": 2.45, "grad_norm": 0.24320094875542947, "learning_rate": 2.0137333894690912e-05, "loss": 1.4732, "step": 3225 }, { "epoch": 2.45, "grad_norm": 0.26976839590657536, "learning_rate": 1.987263931795126e-05, "loss": 1.5325, "step": 3230 }, { "epoch": 2.45, "grad_norm": 0.2480855244121356, "learning_rate": 1.9609503848279144e-05, "loss": 1.6336, "step": 3235 }, { "epoch": 2.46, "grad_norm": 0.23767732175608752, "learning_rate": 1.9347932605728093e-05, "loss": 1.564, "step": 3240 }, { "epoch": 2.46, "grad_norm": 0.2727265524309786, "learning_rate": 1.9087930679915023e-05, "loss": 1.6079, "step": 3245 }, { "epoch": 2.46, "grad_norm": 0.22286517973459688, "learning_rate": 1.882950312992131e-05, "loss": 1.4002, "step": 3250 }, { "epoch": 2.47, "grad_norm": 0.2456900771009275, "learning_rate": 1.8572654984194392e-05, "loss": 1.5994, "step": 3255 }, { "epoch": 2.47, "grad_norm": 0.2771873738066393, "learning_rate": 1.8317391240449876e-05, "loss": 1.6214, "step": 3260 }, { "epoch": 2.48, "grad_norm": 0.2770007292533942, "learning_rate": 1.8063716865574266e-05, "loss": 1.4663, "step": 3265 }, { "epoch": 2.48, "grad_norm": 0.24034756553369535, "learning_rate": 1.781163679552831e-05, "loss": 1.6507, "step": 3270 }, { "epoch": 2.48, "grad_norm": 0.2286386450562912, "learning_rate": 1.7561155935251094e-05, "loss": 1.5512, "step": 3275 }, { "epoch": 2.49, "grad_norm": 0.2594167587325395, "learning_rate": 1.7312279158564415e-05, "loss": 1.6027, "step": 3280 }, { "epoch": 2.49, "grad_norm": 0.2127951461073897, "learning_rate": 1.706501130807806e-05, "loss": 1.6896, "step": 3285 }, { "epoch": 2.49, "grad_norm": 0.2796245456905501, "learning_rate": 1.6819357195095597e-05, "loss": 1.6376, "step": 3290 }, { "epoch": 2.5, "grad_norm": 0.23110557133342613, "learning_rate": 1.657532159952062e-05, "loss": 1.5277, "step": 3295 }, { "epoch": 2.5, "grad_norm": 0.24542029689976314, "learning_rate": 1.6332909269763953e-05, "loss": 1.7143, "step": 3300 }, { "epoch": 2.51, "grad_norm": 0.23539162074782163, "learning_rate": 1.609212492265103e-05, "loss": 1.7028, "step": 3305 }, { "epoch": 2.51, "grad_norm": 0.2629785684260658, "learning_rate": 1.585297324333027e-05, "loss": 1.4392, "step": 3310 }, { "epoch": 2.51, "grad_norm": 0.2933973664128153, "learning_rate": 1.561545888518192e-05, "loss": 1.7234, "step": 3315 }, { "epoch": 2.52, "grad_norm": 0.23954470728817145, "learning_rate": 1.537958646972737e-05, "loss": 1.4944, "step": 3320 }, { "epoch": 2.52, "grad_norm": 0.23980954615598538, "learning_rate": 1.5145360586539336e-05, "loss": 1.5851, "step": 3325 }, { "epoch": 2.52, "grad_norm": 0.21087011175193957, "learning_rate": 1.4912785793152583e-05, "loss": 1.5208, "step": 3330 }, { "epoch": 2.53, "grad_norm": 0.23976449951280604, "learning_rate": 1.4681866614975227e-05, "loss": 1.5722, "step": 3335 }, { "epoch": 2.53, "grad_norm": 0.22800754377440097, "learning_rate": 1.4452607545200492e-05, "loss": 1.6206, "step": 3340 }, { "epoch": 2.54, "grad_norm": 0.21262175469660566, "learning_rate": 1.4225013044719615e-05, "loss": 1.5784, "step": 3345 }, { "epoch": 2.54, "grad_norm": 0.2436947408558131, "learning_rate": 1.3999087542034817e-05, "loss": 1.5594, "step": 3350 }, { "epoch": 2.54, "grad_norm": 0.23425763672194239, "learning_rate": 1.3774835433173172e-05, "loss": 1.6784, "step": 3355 }, { "epoch": 2.55, "grad_norm": 0.22447628853598572, "learning_rate": 1.3552261081601091e-05, "loss": 1.6606, "step": 3360 }, { "epoch": 2.55, "grad_norm": 0.2410908531230671, "learning_rate": 1.3331368818139445e-05, "loss": 1.5011, "step": 3365 }, { "epoch": 2.55, "grad_norm": 0.2107811996936363, "learning_rate": 1.3112162940879225e-05, "loss": 1.6211, "step": 3370 }, { "epoch": 2.56, "grad_norm": 0.23349707446690013, "learning_rate": 1.289464771509804e-05, "loss": 1.4912, "step": 3375 }, { "epoch": 2.56, "grad_norm": 0.23161951144663487, "learning_rate": 1.2678827373176894e-05, "loss": 1.5809, "step": 3380 }, { "epoch": 2.57, "grad_norm": 0.23879959809777346, "learning_rate": 1.2464706114518088e-05, "loss": 1.6276, "step": 3385 }, { "epoch": 2.57, "grad_norm": 0.2421829350347233, "learning_rate": 1.2252288105463405e-05, "loss": 1.6212, "step": 3390 }, { "epoch": 2.57, "grad_norm": 0.21883362123837063, "learning_rate": 1.2041577479212963e-05, "loss": 1.6288, "step": 3395 }, { "epoch": 2.58, "grad_norm": 0.22802087223126732, "learning_rate": 1.1832578335744882e-05, "loss": 1.6313, "step": 3400 }, { "epoch": 2.58, "grad_norm": 0.26121795799269726, "learning_rate": 1.1625294741735526e-05, "loss": 1.656, "step": 3405 }, { "epoch": 2.59, "grad_norm": 0.24409811650460989, "learning_rate": 1.1419730730480305e-05, "loss": 1.618, "step": 3410 }, { "epoch": 2.59, "grad_norm": 0.21817658760534914, "learning_rate": 1.1215890301815201e-05, "loss": 1.5273, "step": 3415 }, { "epoch": 2.59, "grad_norm": 0.21593460359416924, "learning_rate": 1.101377742203903e-05, "loss": 1.5447, "step": 3420 }, { "epoch": 2.6, "grad_norm": 0.2425820551445123, "learning_rate": 1.0813396023836142e-05, "loss": 1.5712, "step": 3425 }, { "epoch": 2.6, "grad_norm": 0.2625166861814797, "learning_rate": 1.0614750006200014e-05, "loss": 1.6605, "step": 3430 }, { "epoch": 2.6, "grad_norm": 0.2615766591081601, "learning_rate": 1.0417843234357282e-05, "loss": 1.5986, "step": 3435 }, { "epoch": 2.61, "grad_norm": 0.2581808450444552, "learning_rate": 1.022267953969257e-05, "loss": 1.641, "step": 3440 }, { "epoch": 2.61, "grad_norm": 0.2746391682993569, "learning_rate": 1.0029262719674015e-05, "loss": 1.6293, "step": 3445 }, { "epoch": 2.62, "grad_norm": 0.20322726411468045, "learning_rate": 9.837596537779237e-06, "loss": 1.5418, "step": 3450 }, { "epoch": 2.62, "grad_norm": 0.24234603845064367, "learning_rate": 9.647684723422213e-06, "loss": 1.6451, "step": 3455 }, { "epoch": 2.62, "grad_norm": 0.24585798612987492, "learning_rate": 9.459530971880681e-06, "loss": 1.5217, "step": 3460 }, { "epoch": 2.63, "grad_norm": 0.24090792198358563, "learning_rate": 9.27313894422428e-06, "loss": 1.7077, "step": 3465 }, { "epoch": 2.63, "grad_norm": 0.2794920103874086, "learning_rate": 9.088512267243143e-06, "loss": 1.7315, "step": 3470 }, { "epoch": 2.63, "grad_norm": 0.21561282445650495, "learning_rate": 8.905654533377583e-06, "loss": 1.6059, "step": 3475 }, { "epoch": 2.64, "grad_norm": 0.2348946654088957, "learning_rate": 8.724569300648034e-06, "loss": 1.7123, "step": 3480 }, { "epoch": 2.64, "grad_norm": 0.29915508239234007, "learning_rate": 8.545260092585805e-06, "loss": 1.6167, "step": 3485 }, { "epoch": 2.65, "grad_norm": 0.2609864197177778, "learning_rate": 8.367730398164574e-06, "loss": 1.6634, "step": 3490 }, { "epoch": 2.65, "grad_norm": 0.32383778611721475, "learning_rate": 8.19198367173255e-06, "loss": 1.631, "step": 3495 }, { "epoch": 2.65, "grad_norm": 0.24225506413140852, "learning_rate": 8.018023332945112e-06, "loss": 1.5466, "step": 3500 }, { "epoch": 2.66, "grad_norm": 0.2686708766662986, "learning_rate": 7.845852766698426e-06, "loss": 1.5889, "step": 3505 }, { "epoch": 2.66, "grad_norm": 0.24593185494412043, "learning_rate": 7.675475323063475e-06, "loss": 1.5796, "step": 3510 }, { "epoch": 2.66, "grad_norm": 0.27604151432217455, "learning_rate": 7.5068943172209025e-06, "loss": 1.6281, "step": 3515 }, { "epoch": 2.67, "grad_norm": 0.2211924967690316, "learning_rate": 7.340113029396567e-06, "loss": 1.5407, "step": 3520 }, { "epoch": 2.67, "grad_norm": 0.2549240432311639, "learning_rate": 7.175134704797592e-06, "loss": 1.6782, "step": 3525 }, { "epoch": 2.68, "grad_norm": 0.22194169097137223, "learning_rate": 7.011962553549345e-06, "loss": 1.639, "step": 3530 }, { "epoch": 2.68, "grad_norm": 0.24109057602354814, "learning_rate": 6.8505997506329024e-06, "loss": 1.6421, "step": 3535 }, { "epoch": 2.68, "grad_norm": 0.298662548692409, "learning_rate": 6.691049435823327e-06, "loss": 1.5672, "step": 3540 }, { "epoch": 2.69, "grad_norm": 0.21821362720901652, "learning_rate": 6.533314713628458e-06, "loss": 1.5832, "step": 3545 }, { "epoch": 2.69, "grad_norm": 0.28320776205122955, "learning_rate": 6.377398653228661e-06, "loss": 1.5686, "step": 3550 }, { "epoch": 2.7, "grad_norm": 0.2647885175395758, "learning_rate": 6.22330428841702e-06, "loss": 1.3694, "step": 3555 }, { "epoch": 2.7, "grad_norm": 0.23693055785085496, "learning_rate": 6.071034617540294e-06, "loss": 1.4096, "step": 3560 }, { "epoch": 2.7, "grad_norm": 0.2811316512655128, "learning_rate": 5.9205926034406e-06, "loss": 1.7223, "step": 3565 }, { "epoch": 2.71, "grad_norm": 0.25921259473226205, "learning_rate": 5.771981173397811e-06, "loss": 1.6491, "step": 3570 }, { "epoch": 2.71, "grad_norm": 0.2384614576434685, "learning_rate": 5.625203219072495e-06, "loss": 1.5796, "step": 3575 }, { "epoch": 2.71, "grad_norm": 0.24972301573225342, "learning_rate": 5.480261596449698e-06, "loss": 1.6484, "step": 3580 }, { "epoch": 2.72, "grad_norm": 0.2790877252142927, "learning_rate": 5.337159125783453e-06, "loss": 1.6747, "step": 3585 }, { "epoch": 2.72, "grad_norm": 0.26526860024829096, "learning_rate": 5.195898591541748e-06, "loss": 1.631, "step": 3590 }, { "epoch": 2.73, "grad_norm": 0.2359453480631305, "learning_rate": 5.056482742352486e-06, "loss": 1.5224, "step": 3595 }, { "epoch": 2.73, "grad_norm": 0.2749211055400865, "learning_rate": 4.9189142909498945e-06, "loss": 1.5348, "step": 3600 }, { "epoch": 2.73, "grad_norm": 0.20381886685736014, "learning_rate": 4.783195914121818e-06, "loss": 1.6092, "step": 3605 }, { "epoch": 2.74, "grad_norm": 0.27694673318367125, "learning_rate": 4.649330252657613e-06, "loss": 1.5524, "step": 3610 }, { "epoch": 2.74, "grad_norm": 0.24104039690274334, "learning_rate": 4.517319911296747e-06, "loss": 1.6131, "step": 3615 }, { "epoch": 2.74, "grad_norm": 0.21654379256196502, "learning_rate": 4.387167458678121e-06, "loss": 1.5537, "step": 3620 }, { "epoch": 2.75, "grad_norm": 0.22452456642122062, "learning_rate": 4.2588754272900985e-06, "loss": 1.5051, "step": 3625 }, { "epoch": 2.75, "grad_norm": 0.2338963199910069, "learning_rate": 4.132446313421246e-06, "loss": 1.6882, "step": 3630 }, { "epoch": 2.76, "grad_norm": 0.2584997684032959, "learning_rate": 4.00788257711171e-06, "loss": 1.5014, "step": 3635 }, { "epoch": 2.76, "grad_norm": 0.2908213514842905, "learning_rate": 3.885186642105376e-06, "loss": 1.6277, "step": 3640 }, { "epoch": 2.76, "grad_norm": 0.19233168126321265, "learning_rate": 3.7643608958027543e-06, "loss": 1.7565, "step": 3645 }, { "epoch": 2.77, "grad_norm": 0.270149572768491, "learning_rate": 3.6454076892144418e-06, "loss": 1.6004, "step": 3650 }, { "epoch": 2.77, "grad_norm": 0.2403966892086897, "learning_rate": 3.5283293369154036e-06, "loss": 1.5425, "step": 3655 }, { "epoch": 2.77, "grad_norm": 0.1959498671099479, "learning_rate": 3.4131281170000083e-06, "loss": 1.6043, "step": 3660 }, { "epoch": 2.78, "grad_norm": 0.2291068939021477, "learning_rate": 3.2998062710375864e-06, "loss": 1.6167, "step": 3665 }, { "epoch": 2.78, "grad_norm": 0.2669658026144572, "learning_rate": 3.188366004028931e-06, "loss": 1.6093, "step": 3670 }, { "epoch": 2.79, "grad_norm": 0.21167755912643296, "learning_rate": 3.0788094843632655e-06, "loss": 1.6288, "step": 3675 }, { "epoch": 2.79, "grad_norm": 0.27869409187190786, "learning_rate": 2.9711388437761445e-06, "loss": 1.5781, "step": 3680 }, { "epoch": 2.79, "grad_norm": 0.21975752059084885, "learning_rate": 2.8653561773079764e-06, "loss": 1.6193, "step": 3685 }, { "epoch": 2.8, "grad_norm": 0.22758480435581485, "learning_rate": 2.7614635432632097e-06, "loss": 1.7111, "step": 3690 }, { "epoch": 2.8, "grad_norm": 0.253380043181827, "learning_rate": 2.6594629631702783e-06, "loss": 1.6528, "step": 3695 }, { "epoch": 2.81, "grad_norm": 0.22066629732671186, "learning_rate": 2.5593564217423314e-06, "loss": 1.5717, "step": 3700 }, { "epoch": 2.81, "grad_norm": 0.20545309594834268, "learning_rate": 2.461145866838599e-06, "loss": 1.5816, "step": 3705 }, { "epoch": 2.81, "grad_norm": 0.2298245690861381, "learning_rate": 2.364833209426376e-06, "loss": 1.5273, "step": 3710 }, { "epoch": 2.82, "grad_norm": 0.28452051640315046, "learning_rate": 2.270420323544009e-06, "loss": 1.5568, "step": 3715 }, { "epoch": 2.82, "grad_norm": 0.20796391559506347, "learning_rate": 2.177909046264348e-06, "loss": 1.6991, "step": 3720 }, { "epoch": 2.82, "grad_norm": 0.26581893695586506, "learning_rate": 2.0873011776589957e-06, "loss": 1.517, "step": 3725 }, { "epoch": 2.83, "grad_norm": 0.22796087968697157, "learning_rate": 1.998598480763247e-06, "loss": 1.7992, "step": 3730 }, { "epoch": 2.83, "grad_norm": 0.2336977716987997, "learning_rate": 1.911802681541919e-06, "loss": 1.513, "step": 3735 }, { "epoch": 2.84, "grad_norm": 0.2497871821535283, "learning_rate": 1.8269154688556056e-06, "loss": 1.5704, "step": 3740 }, { "epoch": 2.84, "grad_norm": 0.23620748564286875, "learning_rate": 1.7439384944279213e-06, "loss": 1.4392, "step": 3745 }, { "epoch": 2.84, "grad_norm": 0.23378030531695476, "learning_rate": 1.6628733728133227e-06, "loss": 1.5813, "step": 3750 }, { "epoch": 2.85, "grad_norm": 0.2146995504847581, "learning_rate": 1.5837216813656908e-06, "loss": 1.5966, "step": 3755 }, { "epoch": 2.85, "grad_norm": 0.2382092413535891, "learning_rate": 1.506484960207677e-06, "loss": 1.553, "step": 3760 }, { "epoch": 2.85, "grad_norm": 0.19096405564266747, "learning_rate": 1.4311647122006721e-06, "loss": 1.5538, "step": 3765 }, { "epoch": 2.86, "grad_norm": 0.2877533644995907, "learning_rate": 1.3577624029155966e-06, "loss": 1.5703, "step": 3770 }, { "epoch": 2.86, "grad_norm": 0.25619739351479454, "learning_rate": 1.2862794606044337e-06, "loss": 1.4537, "step": 3775 }, { "epoch": 2.87, "grad_norm": 0.17985034388431237, "learning_rate": 1.216717276172341e-06, "loss": 1.7393, "step": 3780 }, { "epoch": 2.87, "grad_norm": 0.24865342548144834, "learning_rate": 1.1490772031506392e-06, "loss": 1.681, "step": 3785 }, { "epoch": 2.87, "grad_norm": 0.2682691239787855, "learning_rate": 1.0833605576705096e-06, "loss": 1.7253, "step": 3790 }, { "epoch": 2.88, "grad_norm": 0.2694273976328748, "learning_rate": 1.0195686184373166e-06, "loss": 1.5678, "step": 3795 }, { "epoch": 2.88, "grad_norm": 0.21203591608829347, "learning_rate": 9.577026267057476e-07, "loss": 1.615, "step": 3800 }, { "epoch": 2.88, "grad_norm": 0.24345143310387468, "learning_rate": 8.97763786255712e-07, "loss": 1.5338, "step": 3805 }, { "epoch": 2.89, "grad_norm": 0.2579598936883795, "learning_rate": 8.397532633688254e-07, "loss": 1.5515, "step": 3810 }, { "epoch": 2.89, "grad_norm": 0.2434697228699266, "learning_rate": 7.836721868058061e-07, "loss": 1.7675, "step": 3815 }, { "epoch": 2.9, "grad_norm": 0.2440247854883621, "learning_rate": 7.295216477844702e-07, "loss": 1.6179, "step": 3820 }, { "epoch": 2.9, "grad_norm": 0.25830362738889806, "learning_rate": 6.773026999584708e-07, "loss": 1.698, "step": 3825 }, { "epoch": 2.9, "grad_norm": 0.2622736838540384, "learning_rate": 6.270163593968703e-07, "loss": 1.6485, "step": 3830 }, { "epoch": 2.91, "grad_norm": 0.2065165799768009, "learning_rate": 5.786636045643112e-07, "loss": 1.6278, "step": 3835 }, { "epoch": 2.91, "grad_norm": 0.22985561856815895, "learning_rate": 5.322453763019653e-07, "loss": 1.5524, "step": 3840 }, { "epoch": 2.92, "grad_norm": 0.2727208719350262, "learning_rate": 4.877625778092809e-07, "loss": 1.6646, "step": 3845 }, { "epoch": 2.92, "grad_norm": 0.2664985065225481, "learning_rate": 4.4521607462640893e-07, "loss": 1.5143, "step": 3850 }, { "epoch": 2.92, "grad_norm": 0.24545453500067022, "learning_rate": 4.046066946172822e-07, "loss": 1.6567, "step": 3855 }, { "epoch": 2.93, "grad_norm": 0.24678453590974866, "learning_rate": 3.659352279535733e-07, "loss": 1.6106, "step": 3860 }, { "epoch": 2.93, "grad_norm": 0.28024299756070853, "learning_rate": 3.292024270993399e-07, "loss": 1.4444, "step": 3865 }, { "epoch": 2.93, "grad_norm": 0.21189400608157236, "learning_rate": 2.9440900679631457e-07, "loss": 1.5323, "step": 3870 }, { "epoch": 2.94, "grad_norm": 0.24352825534009542, "learning_rate": 2.615556440500377e-07, "loss": 1.6129, "step": 3875 }, { "epoch": 2.94, "grad_norm": 0.24710743453957604, "learning_rate": 2.306429781166908e-07, "loss": 1.6064, "step": 3880 }, { "epoch": 2.95, "grad_norm": 0.2939949777477219, "learning_rate": 2.016716104906391e-07, "loss": 1.5547, "step": 3885 }, { "epoch": 2.95, "grad_norm": 0.24855903276634353, "learning_rate": 1.7464210489273047e-07, "loss": 1.4292, "step": 3890 }, { "epoch": 2.95, "grad_norm": 0.29838466749808656, "learning_rate": 1.4955498725932604e-07, "loss": 1.6796, "step": 3895 }, { "epoch": 2.96, "grad_norm": 0.24063497135231618, "learning_rate": 1.2641074573209733e-07, "loss": 1.6524, "step": 3900 }, { "epoch": 2.96, "grad_norm": 0.266868894727991, "learning_rate": 1.0520983064847833e-07, "loss": 1.6033, "step": 3905 }, { "epoch": 2.96, "grad_norm": 0.2468474483223204, "learning_rate": 8.595265453292811e-08, "loss": 1.5643, "step": 3910 }, { "epoch": 2.97, "grad_norm": 0.20433798961816185, "learning_rate": 6.86395920889149e-08, "loss": 1.5261, "step": 3915 }, { "epoch": 2.97, "grad_norm": 0.24801643390934636, "learning_rate": 5.327098019159982e-08, "loss": 1.7088, "step": 3920 }, { "epoch": 2.98, "grad_norm": 0.25316391379721404, "learning_rate": 3.9847117881308685e-08, "loss": 1.6097, "step": 3925 }, { "epoch": 2.98, "grad_norm": 0.2589299223311166, "learning_rate": 2.8368266357681194e-08, "loss": 1.7897, "step": 3930 }, { "epoch": 2.98, "grad_norm": 0.2615270175678582, "learning_rate": 1.8834648974630497e-08, "loss": 1.7153, "step": 3935 }, { "epoch": 2.99, "grad_norm": 0.23820419945511057, "learning_rate": 1.12464512359578e-08, "loss": 1.489, "step": 3940 }, { "epoch": 2.99, "grad_norm": 0.24057944682864915, "learning_rate": 5.603820791755254e-09, "loss": 1.4223, "step": 3945 }, { "epoch": 2.99, "grad_norm": 0.28827145571809903, "learning_rate": 1.9068674355415815e-09, "loss": 1.6161, "step": 3950 }, { "epoch": 3.0, "grad_norm": 0.4006550408742116, "learning_rate": 1.5566310213044333e-10, "loss": 1.556, "step": 3955 }, { "epoch": 3.0, "step": 3957, "total_flos": 1.244366244937728e+16, "train_loss": 1.656136889695097, "train_runtime": 24205.6115, "train_samples_per_second": 0.654, "train_steps_per_second": 0.163 } ], "logging_steps": 5, "max_steps": 3957, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.244366244937728e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }