diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5574 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 3957, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.08993126249565843, + "learning_rate": 5.050505050505052e-07, + "loss": 1.9018, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.07331289037109069, + "learning_rate": 2.5252525252525253e-06, + "loss": 1.764, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.07765988729484478, + "learning_rate": 5.050505050505051e-06, + "loss": 1.6755, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.09328963425207142, + "learning_rate": 7.5757575757575764e-06, + "loss": 1.7942, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.08439300726187475, + "learning_rate": 1.0101010101010101e-05, + "loss": 1.9255, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.09969799609843567, + "learning_rate": 1.2626262626262628e-05, + "loss": 1.6785, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.08101062126150285, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.6021, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.09324937558599246, + "learning_rate": 1.7676767676767676e-05, + "loss": 1.8021, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 0.08809158633106223, + "learning_rate": 2.0202020202020203e-05, + "loss": 1.8128, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 0.12040981734567617, + "learning_rate": 2.272727272727273e-05, + "loss": 1.9518, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 0.10564260792758735, + "learning_rate": 2.5252525252525256e-05, + "loss": 1.844, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 0.11067902372614258, + "learning_rate": 2.777777777777778e-05, + "loss": 1.7309, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 0.17208150270588693, + "learning_rate": 3.0303030303030306e-05, + "loss": 1.83, + "step": 60 + }, + { + "epoch": 0.05, + "grad_norm": 0.17753444313487116, + "learning_rate": 3.282828282828283e-05, + "loss": 1.7143, + "step": 65 + }, + { + "epoch": 0.05, + "grad_norm": 0.11795959596262973, + "learning_rate": 3.535353535353535e-05, + "loss": 1.6863, + "step": 70 + }, + { + "epoch": 0.06, + "grad_norm": 0.1604849588266011, + "learning_rate": 3.787878787878788e-05, + "loss": 1.7895, + "step": 75 + }, + { + "epoch": 0.06, + "grad_norm": 0.14614002357667696, + "learning_rate": 4.0404040404040405e-05, + "loss": 1.7037, + "step": 80 + }, + { + "epoch": 0.06, + "grad_norm": 0.17612584653207902, + "learning_rate": 4.292929292929293e-05, + "loss": 1.7624, + "step": 85 + }, + { + "epoch": 0.07, + "grad_norm": 0.1725623148760258, + "learning_rate": 4.545454545454546e-05, + "loss": 1.7826, + "step": 90 + }, + { + "epoch": 0.07, + "grad_norm": 0.20023707797673052, + "learning_rate": 4.797979797979798e-05, + "loss": 1.7551, + "step": 95 + }, + { + "epoch": 0.08, + "grad_norm": 0.19340080776803978, + "learning_rate": 5.050505050505051e-05, + "loss": 1.8434, + "step": 100 + }, + { + "epoch": 0.08, + "grad_norm": 0.17540911026085398, + "learning_rate": 5.303030303030303e-05, + "loss": 1.8444, + "step": 105 + }, + { + "epoch": 0.08, + "grad_norm": 0.17607693067428137, + "learning_rate": 5.555555555555556e-05, + "loss": 1.7179, + "step": 110 + }, + { + "epoch": 0.09, + "grad_norm": 0.18623446289553894, + "learning_rate": 5.808080808080808e-05, + "loss": 1.8005, + "step": 115 + }, + { + "epoch": 0.09, + "grad_norm": 0.22653423501586226, + "learning_rate": 6.060606060606061e-05, + "loss": 1.8171, + "step": 120 + }, + { + "epoch": 0.09, + "grad_norm": 0.19917898944232487, + "learning_rate": 6.313131313131313e-05, + "loss": 1.7935, + "step": 125 + }, + { + "epoch": 0.1, + "grad_norm": 0.17977021019465064, + "learning_rate": 6.565656565656566e-05, + "loss": 1.847, + "step": 130 + }, + { + "epoch": 0.1, + "grad_norm": 0.21882213186080465, + "learning_rate": 6.818181818181818e-05, + "loss": 1.7249, + "step": 135 + }, + { + "epoch": 0.11, + "grad_norm": 0.19872378885041136, + "learning_rate": 7.07070707070707e-05, + "loss": 1.8317, + "step": 140 + }, + { + "epoch": 0.11, + "grad_norm": 0.18503126257452687, + "learning_rate": 7.323232323232324e-05, + "loss": 1.7036, + "step": 145 + }, + { + "epoch": 0.11, + "grad_norm": 0.19374257378242796, + "learning_rate": 7.575757575757576e-05, + "loss": 1.7408, + "step": 150 + }, + { + "epoch": 0.12, + "grad_norm": 0.20435751977610797, + "learning_rate": 7.828282828282829e-05, + "loss": 1.7453, + "step": 155 + }, + { + "epoch": 0.12, + "grad_norm": 0.18626365580812038, + "learning_rate": 8.080808080808081e-05, + "loss": 1.7538, + "step": 160 + }, + { + "epoch": 0.13, + "grad_norm": 0.22638414276196805, + "learning_rate": 8.333333333333334e-05, + "loss": 1.7755, + "step": 165 + }, + { + "epoch": 0.13, + "grad_norm": 0.19644895370384188, + "learning_rate": 8.585858585858586e-05, + "loss": 1.8446, + "step": 170 + }, + { + "epoch": 0.13, + "grad_norm": 0.19159413735540007, + "learning_rate": 8.83838383838384e-05, + "loss": 1.6323, + "step": 175 + }, + { + "epoch": 0.14, + "grad_norm": 0.17020103839194523, + "learning_rate": 9.090909090909092e-05, + "loss": 1.7957, + "step": 180 + }, + { + "epoch": 0.14, + "grad_norm": 0.19164694691999767, + "learning_rate": 9.343434343434344e-05, + "loss": 1.9204, + "step": 185 + }, + { + "epoch": 0.14, + "grad_norm": 0.19378174604020243, + "learning_rate": 9.595959595959596e-05, + "loss": 1.6792, + "step": 190 + }, + { + "epoch": 0.15, + "grad_norm": 0.24199163008867994, + "learning_rate": 9.848484848484849e-05, + "loss": 1.7818, + "step": 195 + }, + { + "epoch": 0.15, + "grad_norm": 0.1791702851062047, + "learning_rate": 0.00010101010101010102, + "loss": 1.6407, + "step": 200 + }, + { + "epoch": 0.16, + "grad_norm": 0.2127448005277486, + "learning_rate": 0.00010353535353535353, + "loss": 1.8173, + "step": 205 + }, + { + "epoch": 0.16, + "grad_norm": 0.18625979651987537, + "learning_rate": 0.00010606060606060606, + "loss": 1.7401, + "step": 210 + }, + { + "epoch": 0.16, + "grad_norm": 0.2602576963144457, + "learning_rate": 0.0001085858585858586, + "loss": 1.8104, + "step": 215 + }, + { + "epoch": 0.17, + "grad_norm": 0.19387518149584881, + "learning_rate": 0.00011111111111111112, + "loss": 1.8442, + "step": 220 + }, + { + "epoch": 0.17, + "grad_norm": 0.22413096563678928, + "learning_rate": 0.00011363636363636365, + "loss": 1.6172, + "step": 225 + }, + { + "epoch": 0.17, + "grad_norm": 0.21913536165908545, + "learning_rate": 0.00011616161616161616, + "loss": 1.6973, + "step": 230 + }, + { + "epoch": 0.18, + "grad_norm": 0.2083524734994055, + "learning_rate": 0.00011868686868686869, + "loss": 1.7593, + "step": 235 + }, + { + "epoch": 0.18, + "grad_norm": 0.22803196006710846, + "learning_rate": 0.00012121212121212122, + "loss": 1.731, + "step": 240 + }, + { + "epoch": 0.19, + "grad_norm": 0.16039461658504198, + "learning_rate": 0.00012373737373737374, + "loss": 1.5913, + "step": 245 + }, + { + "epoch": 0.19, + "grad_norm": 0.2185859764067758, + "learning_rate": 0.00012626262626262626, + "loss": 1.637, + "step": 250 + }, + { + "epoch": 0.19, + "grad_norm": 0.19449925000530618, + "learning_rate": 0.00012878787878787878, + "loss": 1.5634, + "step": 255 + }, + { + "epoch": 0.2, + "grad_norm": 0.18094727231062543, + "learning_rate": 0.00013131313131313133, + "loss": 1.6769, + "step": 260 + }, + { + "epoch": 0.2, + "grad_norm": 0.2362383395641708, + "learning_rate": 0.00013383838383838385, + "loss": 1.7723, + "step": 265 + }, + { + "epoch": 0.2, + "grad_norm": 0.1756303905738309, + "learning_rate": 0.00013636363636363637, + "loss": 1.7622, + "step": 270 + }, + { + "epoch": 0.21, + "grad_norm": 0.18784556886056825, + "learning_rate": 0.0001388888888888889, + "loss": 1.648, + "step": 275 + }, + { + "epoch": 0.21, + "grad_norm": 0.23195176017229427, + "learning_rate": 0.0001414141414141414, + "loss": 1.846, + "step": 280 + }, + { + "epoch": 0.22, + "grad_norm": 0.22241261139284105, + "learning_rate": 0.00014393939393939396, + "loss": 1.6282, + "step": 285 + }, + { + "epoch": 0.22, + "grad_norm": 0.1959378752266171, + "learning_rate": 0.00014646464646464648, + "loss": 1.7298, + "step": 290 + }, + { + "epoch": 0.22, + "grad_norm": 0.18110574265575713, + "learning_rate": 0.000148989898989899, + "loss": 1.6463, + "step": 295 + }, + { + "epoch": 0.23, + "grad_norm": 0.19727075597861782, + "learning_rate": 0.00015151515151515152, + "loss": 1.7925, + "step": 300 + }, + { + "epoch": 0.23, + "grad_norm": 0.20574697015902954, + "learning_rate": 0.00015404040404040406, + "loss": 1.6835, + "step": 305 + }, + { + "epoch": 0.24, + "grad_norm": 0.18178501884804188, + "learning_rate": 0.00015656565656565658, + "loss": 1.8534, + "step": 310 + }, + { + "epoch": 0.24, + "grad_norm": 0.20396286221213047, + "learning_rate": 0.0001590909090909091, + "loss": 1.9553, + "step": 315 + }, + { + "epoch": 0.24, + "grad_norm": 0.19731656072570272, + "learning_rate": 0.00016161616161616162, + "loss": 1.7907, + "step": 320 + }, + { + "epoch": 0.25, + "grad_norm": 0.15745281662564334, + "learning_rate": 0.00016414141414141414, + "loss": 1.7516, + "step": 325 + }, + { + "epoch": 0.25, + "grad_norm": 0.17389045576146547, + "learning_rate": 0.0001666666666666667, + "loss": 1.6362, + "step": 330 + }, + { + "epoch": 0.25, + "grad_norm": 0.2055099842458337, + "learning_rate": 0.00016919191919191918, + "loss": 1.711, + "step": 335 + }, + { + "epoch": 0.26, + "grad_norm": 0.16967943859036833, + "learning_rate": 0.00017171717171717173, + "loss": 1.7327, + "step": 340 + }, + { + "epoch": 0.26, + "grad_norm": 0.20493364192749108, + "learning_rate": 0.00017424242424242425, + "loss": 1.7575, + "step": 345 + }, + { + "epoch": 0.27, + "grad_norm": 0.22713873700263487, + "learning_rate": 0.0001767676767676768, + "loss": 1.6266, + "step": 350 + }, + { + "epoch": 0.27, + "grad_norm": 0.22661135493794904, + "learning_rate": 0.00017929292929292931, + "loss": 1.5738, + "step": 355 + }, + { + "epoch": 0.27, + "grad_norm": 0.2181059846275241, + "learning_rate": 0.00018181818181818183, + "loss": 1.6742, + "step": 360 + }, + { + "epoch": 0.28, + "grad_norm": 0.17088148508773793, + "learning_rate": 0.00018434343434343435, + "loss": 1.7483, + "step": 365 + }, + { + "epoch": 0.28, + "grad_norm": 0.2533679574468662, + "learning_rate": 0.00018686868686868687, + "loss": 1.8377, + "step": 370 + }, + { + "epoch": 0.28, + "grad_norm": 0.19728510430536142, + "learning_rate": 0.00018939393939393942, + "loss": 1.6412, + "step": 375 + }, + { + "epoch": 0.29, + "grad_norm": 0.1846496893995934, + "learning_rate": 0.00019191919191919191, + "loss": 1.6605, + "step": 380 + }, + { + "epoch": 0.29, + "grad_norm": 0.20677111282109845, + "learning_rate": 0.00019444444444444446, + "loss": 1.9151, + "step": 385 + }, + { + "epoch": 0.3, + "grad_norm": 0.1843158891748435, + "learning_rate": 0.00019696969696969698, + "loss": 1.6697, + "step": 390 + }, + { + "epoch": 0.3, + "grad_norm": 0.19889363775332344, + "learning_rate": 0.0001994949494949495, + "loss": 1.7293, + "step": 395 + }, + { + "epoch": 0.3, + "grad_norm": 0.18003384908917786, + "learning_rate": 0.00019999937734807612, + "loss": 1.8024, + "step": 400 + }, + { + "epoch": 0.31, + "grad_norm": 0.18260287569380637, + "learning_rate": 0.00019999684783792443, + "loss": 1.6779, + "step": 405 + }, + { + "epoch": 0.31, + "grad_norm": 0.21622290040357123, + "learning_rate": 0.00019999237260298072, + "loss": 1.6577, + "step": 410 + }, + { + "epoch": 0.31, + "grad_norm": 0.19728853094941184, + "learning_rate": 0.00019998595173032347, + "loss": 1.6211, + "step": 415 + }, + { + "epoch": 0.32, + "grad_norm": 0.16427481018358323, + "learning_rate": 0.00019997758534488915, + "loss": 1.6793, + "step": 420 + }, + { + "epoch": 0.32, + "grad_norm": 0.1464512551401983, + "learning_rate": 0.00019996727360946972, + "loss": 1.731, + "step": 425 + }, + { + "epoch": 0.33, + "grad_norm": 0.1895744669006413, + "learning_rate": 0.00019995501672470951, + "loss": 1.7024, + "step": 430 + }, + { + "epoch": 0.33, + "grad_norm": 0.13688692966034832, + "learning_rate": 0.00019994081492910124, + "loss": 1.8371, + "step": 435 + }, + { + "epoch": 0.33, + "grad_norm": 0.20101726127225358, + "learning_rate": 0.0001999246684989815, + "loss": 1.7473, + "step": 440 + }, + { + "epoch": 0.34, + "grad_norm": 0.20241522090213954, + "learning_rate": 0.00019990657774852534, + "loss": 1.7423, + "step": 445 + }, + { + "epoch": 0.34, + "grad_norm": 0.1767592377256186, + "learning_rate": 0.00019988654302974, + "loss": 1.8304, + "step": 450 + }, + { + "epoch": 0.34, + "grad_norm": 0.2373965969657545, + "learning_rate": 0.00019986456473245826, + "loss": 1.8509, + "step": 455 + }, + { + "epoch": 0.35, + "grad_norm": 0.21300866974991087, + "learning_rate": 0.00019984064328433084, + "loss": 1.7339, + "step": 460 + }, + { + "epoch": 0.35, + "grad_norm": 0.185425069119908, + "learning_rate": 0.00019981477915081793, + "loss": 1.7523, + "step": 465 + }, + { + "epoch": 0.36, + "grad_norm": 0.1955299660793198, + "learning_rate": 0.00019978697283518023, + "loss": 1.804, + "step": 470 + }, + { + "epoch": 0.36, + "grad_norm": 0.19829821726437152, + "learning_rate": 0.00019975722487846918, + "loss": 1.8378, + "step": 475 + }, + { + "epoch": 0.36, + "grad_norm": 0.13816451624075418, + "learning_rate": 0.0001997255358595164, + "loss": 1.791, + "step": 480 + }, + { + "epoch": 0.37, + "grad_norm": 0.17416550139224937, + "learning_rate": 0.00019969190639492244, + "loss": 1.6882, + "step": 485 + }, + { + "epoch": 0.37, + "grad_norm": 0.19361009276270708, + "learning_rate": 0.00019965633713904472, + "loss": 1.7448, + "step": 490 + }, + { + "epoch": 0.38, + "grad_norm": 0.20655351119978135, + "learning_rate": 0.00019961882878398492, + "loss": 1.7804, + "step": 495 + }, + { + "epoch": 0.38, + "grad_norm": 0.2104318907698028, + "learning_rate": 0.0001995793820595754, + "loss": 1.7399, + "step": 500 + }, + { + "epoch": 0.38, + "grad_norm": 0.1970506865196183, + "learning_rate": 0.00019953799773336507, + "loss": 1.662, + "step": 505 + }, + { + "epoch": 0.39, + "grad_norm": 0.12911497323739385, + "learning_rate": 0.00019949467661060433, + "loss": 1.6589, + "step": 510 + }, + { + "epoch": 0.39, + "grad_norm": 0.18727055645023982, + "learning_rate": 0.00019944941953422968, + "loss": 1.7437, + "step": 515 + }, + { + "epoch": 0.39, + "grad_norm": 0.21063285499774953, + "learning_rate": 0.000199402227384847, + "loss": 1.837, + "step": 520 + }, + { + "epoch": 0.4, + "grad_norm": 0.16895692207829008, + "learning_rate": 0.00019935310108071453, + "loss": 1.7406, + "step": 525 + }, + { + "epoch": 0.4, + "grad_norm": 0.2316031917603028, + "learning_rate": 0.00019930204157772515, + "loss": 1.8237, + "step": 530 + }, + { + "epoch": 0.41, + "grad_norm": 0.14077975973845075, + "learning_rate": 0.00019924904986938754, + "loss": 1.8804, + "step": 535 + }, + { + "epoch": 0.41, + "grad_norm": 0.25152961069767266, + "learning_rate": 0.000199194126986807, + "loss": 1.7984, + "step": 540 + }, + { + "epoch": 0.41, + "grad_norm": 0.18475665649785333, + "learning_rate": 0.00019913727399866545, + "loss": 1.7, + "step": 545 + }, + { + "epoch": 0.42, + "grad_norm": 0.15993162566307856, + "learning_rate": 0.00019907849201120033, + "loss": 1.8694, + "step": 550 + }, + { + "epoch": 0.42, + "grad_norm": 0.21887423989587396, + "learning_rate": 0.00019901778216818345, + "loss": 1.699, + "step": 555 + }, + { + "epoch": 0.42, + "grad_norm": 0.18385555657977046, + "learning_rate": 0.00019895514565089855, + "loss": 1.7936, + "step": 560 + }, + { + "epoch": 0.43, + "grad_norm": 0.15762946661816535, + "learning_rate": 0.00019889058367811822, + "loss": 1.6613, + "step": 565 + }, + { + "epoch": 0.43, + "grad_norm": 0.19204775302628793, + "learning_rate": 0.0001988240975060804, + "loss": 1.5856, + "step": 570 + }, + { + "epoch": 0.44, + "grad_norm": 0.1697199863146858, + "learning_rate": 0.00019875568842846382, + "loss": 1.672, + "step": 575 + }, + { + "epoch": 0.44, + "grad_norm": 0.1410887592852674, + "learning_rate": 0.0001986853577763628, + "loss": 1.6269, + "step": 580 + }, + { + "epoch": 0.44, + "grad_norm": 0.1783222763204088, + "learning_rate": 0.00019861310691826143, + "loss": 1.8029, + "step": 585 + }, + { + "epoch": 0.45, + "grad_norm": 0.20484278901882244, + "learning_rate": 0.00019853893726000683, + "loss": 1.6194, + "step": 590 + }, + { + "epoch": 0.45, + "grad_norm": 0.1808969694192384, + "learning_rate": 0.00019846285024478202, + "loss": 1.7084, + "step": 595 + }, + { + "epoch": 0.45, + "grad_norm": 0.1965951187170914, + "learning_rate": 0.00019838484735307748, + "loss": 1.706, + "step": 600 + }, + { + "epoch": 0.46, + "grad_norm": 0.1555012346720015, + "learning_rate": 0.0001983049301026627, + "loss": 1.464, + "step": 605 + }, + { + "epoch": 0.46, + "grad_norm": 0.2229027944987823, + "learning_rate": 0.00019822310004855652, + "loss": 1.673, + "step": 610 + }, + { + "epoch": 0.47, + "grad_norm": 0.169635050611861, + "learning_rate": 0.00019813935878299662, + "loss": 1.6593, + "step": 615 + }, + { + "epoch": 0.47, + "grad_norm": 0.16624303946845476, + "learning_rate": 0.0001980537079354091, + "loss": 1.7164, + "step": 620 + }, + { + "epoch": 0.47, + "grad_norm": 0.1544208624543807, + "learning_rate": 0.00019796614917237616, + "loss": 1.5616, + "step": 625 + }, + { + "epoch": 0.48, + "grad_norm": 0.1900272509930039, + "learning_rate": 0.00019787668419760408, + "loss": 1.6552, + "step": 630 + }, + { + "epoch": 0.48, + "grad_norm": 0.20362594606792483, + "learning_rate": 0.00019778531475188996, + "loss": 1.7175, + "step": 635 + }, + { + "epoch": 0.49, + "grad_norm": 0.15933464850430776, + "learning_rate": 0.00019769204261308774, + "loss": 1.6599, + "step": 640 + }, + { + "epoch": 0.49, + "grad_norm": 0.15846354449923994, + "learning_rate": 0.00019759686959607383, + "loss": 1.7152, + "step": 645 + }, + { + "epoch": 0.49, + "grad_norm": 0.1731064728813603, + "learning_rate": 0.00019749979755271155, + "loss": 1.8006, + "step": 650 + }, + { + "epoch": 0.5, + "grad_norm": 0.2036118054344575, + "learning_rate": 0.00019740082837181526, + "loss": 1.6992, + "step": 655 + }, + { + "epoch": 0.5, + "grad_norm": 0.20595935892977982, + "learning_rate": 0.00019729996397911356, + "loss": 1.7571, + "step": 660 + }, + { + "epoch": 0.5, + "grad_norm": 0.1816167430276872, + "learning_rate": 0.00019719720633721178, + "loss": 1.8058, + "step": 665 + }, + { + "epoch": 0.51, + "grad_norm": 0.2139611683255453, + "learning_rate": 0.00019709255744555389, + "loss": 1.8398, + "step": 670 + }, + { + "epoch": 0.51, + "grad_norm": 0.18514013236898805, + "learning_rate": 0.0001969860193403835, + "loss": 1.7307, + "step": 675 + }, + { + "epoch": 0.52, + "grad_norm": 0.17288244809213096, + "learning_rate": 0.00019687759409470426, + "loss": 1.7242, + "step": 680 + }, + { + "epoch": 0.52, + "grad_norm": 0.15953349037713735, + "learning_rate": 0.00019676728381823956, + "loss": 1.6435, + "step": 685 + }, + { + "epoch": 0.52, + "grad_norm": 0.20963390389942183, + "learning_rate": 0.00019665509065739149, + "loss": 1.6791, + "step": 690 + }, + { + "epoch": 0.53, + "grad_norm": 0.19075148330166494, + "learning_rate": 0.000196541016795199, + "loss": 1.505, + "step": 695 + }, + { + "epoch": 0.53, + "grad_norm": 0.22817672978454195, + "learning_rate": 0.00019642506445129545, + "loss": 1.8361, + "step": 700 + }, + { + "epoch": 0.53, + "grad_norm": 0.1925013343867196, + "learning_rate": 0.00019630723588186545, + "loss": 1.7126, + "step": 705 + }, + { + "epoch": 0.54, + "grad_norm": 0.16780528759294142, + "learning_rate": 0.000196187533379601, + "loss": 1.6649, + "step": 710 + }, + { + "epoch": 0.54, + "grad_norm": 0.17707927803137202, + "learning_rate": 0.00019606595927365675, + "loss": 1.6551, + "step": 715 + }, + { + "epoch": 0.55, + "grad_norm": 0.22525846033337887, + "learning_rate": 0.00019594251592960479, + "loss": 1.7401, + "step": 720 + }, + { + "epoch": 0.55, + "grad_norm": 0.1953310514707257, + "learning_rate": 0.0001958172057493886, + "loss": 1.6944, + "step": 725 + }, + { + "epoch": 0.55, + "grad_norm": 0.2085121645512001, + "learning_rate": 0.0001956900311712763, + "loss": 1.663, + "step": 730 + }, + { + "epoch": 0.56, + "grad_norm": 0.17093646250615369, + "learning_rate": 0.0001955609946698131, + "loss": 1.772, + "step": 735 + }, + { + "epoch": 0.56, + "grad_norm": 0.19564116222725914, + "learning_rate": 0.00019543009875577346, + "loss": 1.6328, + "step": 740 + }, + { + "epoch": 0.56, + "grad_norm": 0.215195812549034, + "learning_rate": 0.0001952973459761118, + "loss": 1.6438, + "step": 745 + }, + { + "epoch": 0.57, + "grad_norm": 0.19377558972597342, + "learning_rate": 0.0001951627389139134, + "loss": 1.7442, + "step": 750 + }, + { + "epoch": 0.57, + "grad_norm": 0.1792011980095539, + "learning_rate": 0.00019502628018834372, + "loss": 1.7518, + "step": 755 + }, + { + "epoch": 0.58, + "grad_norm": 0.18977603295326154, + "learning_rate": 0.00019488797245459773, + "loss": 1.688, + "step": 760 + }, + { + "epoch": 0.58, + "grad_norm": 0.19341540153355985, + "learning_rate": 0.00019474781840384816, + "loss": 1.7562, + "step": 765 + }, + { + "epoch": 0.58, + "grad_norm": 0.14738398424312027, + "learning_rate": 0.00019460582076319302, + "loss": 1.7244, + "step": 770 + }, + { + "epoch": 0.59, + "grad_norm": 0.1496446026997031, + "learning_rate": 0.00019446198229560276, + "loss": 1.7083, + "step": 775 + }, + { + "epoch": 0.59, + "grad_norm": 0.2151992641933425, + "learning_rate": 0.00019431630579986632, + "loss": 1.7078, + "step": 780 + }, + { + "epoch": 0.6, + "grad_norm": 0.1972075447483379, + "learning_rate": 0.00019416879411053673, + "loss": 1.7665, + "step": 785 + }, + { + "epoch": 0.6, + "grad_norm": 0.20871968848692934, + "learning_rate": 0.00019401945009787594, + "loss": 1.6636, + "step": 790 + }, + { + "epoch": 0.6, + "grad_norm": 0.2047491094137733, + "learning_rate": 0.0001938682766677991, + "loss": 1.8061, + "step": 795 + }, + { + "epoch": 0.61, + "grad_norm": 0.1622522396758859, + "learning_rate": 0.00019371527676181777, + "loss": 1.8645, + "step": 800 + }, + { + "epoch": 0.61, + "grad_norm": 0.1714969472958251, + "learning_rate": 0.00019356045335698296, + "loss": 1.8266, + "step": 805 + }, + { + "epoch": 0.61, + "grad_norm": 0.1971306915514917, + "learning_rate": 0.00019340380946582695, + "loss": 1.7205, + "step": 810 + }, + { + "epoch": 0.62, + "grad_norm": 0.17020111323913545, + "learning_rate": 0.00019324534813630487, + "loss": 1.7339, + "step": 815 + }, + { + "epoch": 0.62, + "grad_norm": 0.18250825908624654, + "learning_rate": 0.00019308507245173527, + "loss": 1.5188, + "step": 820 + }, + { + "epoch": 0.63, + "grad_norm": 0.18593221945740382, + "learning_rate": 0.0001929229855307402, + "loss": 1.654, + "step": 825 + }, + { + "epoch": 0.63, + "grad_norm": 0.22850385689556876, + "learning_rate": 0.00019275909052718447, + "loss": 1.7814, + "step": 830 + }, + { + "epoch": 0.63, + "grad_norm": 0.19759950903326942, + "learning_rate": 0.00019259339063011432, + "loss": 1.744, + "step": 835 + }, + { + "epoch": 0.64, + "grad_norm": 0.17215997030755548, + "learning_rate": 0.00019242588906369536, + "loss": 1.8283, + "step": 840 + }, + { + "epoch": 0.64, + "grad_norm": 0.19136317284315416, + "learning_rate": 0.00019225658908714983, + "loss": 1.6163, + "step": 845 + }, + { + "epoch": 0.64, + "grad_norm": 0.254426335434924, + "learning_rate": 0.00019208549399469318, + "loss": 1.7618, + "step": 850 + }, + { + "epoch": 0.65, + "grad_norm": 0.21881435842952657, + "learning_rate": 0.00019191260711547001, + "loss": 1.7315, + "step": 855 + }, + { + "epoch": 0.65, + "grad_norm": 0.20799528199612635, + "learning_rate": 0.0001917379318134892, + "loss": 1.7859, + "step": 860 + }, + { + "epoch": 0.66, + "grad_norm": 0.17796834357588534, + "learning_rate": 0.00019156147148755855, + "loss": 1.7345, + "step": 865 + }, + { + "epoch": 0.66, + "grad_norm": 0.1855849493493474, + "learning_rate": 0.0001913832295712186, + "loss": 1.6232, + "step": 870 + }, + { + "epoch": 0.66, + "grad_norm": 0.20017349406812152, + "learning_rate": 0.00019120320953267586, + "loss": 1.7546, + "step": 875 + }, + { + "epoch": 0.67, + "grad_norm": 0.2146332192452092, + "learning_rate": 0.0001910214148747352, + "loss": 1.6231, + "step": 880 + }, + { + "epoch": 0.67, + "grad_norm": 0.15896122862532144, + "learning_rate": 0.0001908378491347319, + "loss": 1.5535, + "step": 885 + }, + { + "epoch": 0.67, + "grad_norm": 0.17416174476856394, + "learning_rate": 0.00019065251588446265, + "loss": 1.6337, + "step": 890 + }, + { + "epoch": 0.68, + "grad_norm": 0.23962933659259386, + "learning_rate": 0.0001904654187301161, + "loss": 1.8581, + "step": 895 + }, + { + "epoch": 0.68, + "grad_norm": 0.17002626630746845, + "learning_rate": 0.0001902765613122028, + "loss": 1.6537, + "step": 900 + }, + { + "epoch": 0.69, + "grad_norm": 0.23553588791103638, + "learning_rate": 0.0001900859473054841, + "loss": 1.7497, + "step": 905 + }, + { + "epoch": 0.69, + "grad_norm": 0.17184692025828147, + "learning_rate": 0.00018989358041890094, + "loss": 1.6305, + "step": 910 + }, + { + "epoch": 0.69, + "grad_norm": 0.19276600455036005, + "learning_rate": 0.00018969946439550148, + "loss": 1.6965, + "step": 915 + }, + { + "epoch": 0.7, + "grad_norm": 0.2266174277702017, + "learning_rate": 0.0001895036030123684, + "loss": 1.7845, + "step": 920 + }, + { + "epoch": 0.7, + "grad_norm": 0.15948175213103422, + "learning_rate": 0.0001893060000805453, + "loss": 1.582, + "step": 925 + }, + { + "epoch": 0.71, + "grad_norm": 0.20415510379076665, + "learning_rate": 0.00018910665944496264, + "loss": 1.6576, + "step": 930 + }, + { + "epoch": 0.71, + "grad_norm": 0.20826411615417578, + "learning_rate": 0.00018890558498436282, + "loss": 1.7243, + "step": 935 + }, + { + "epoch": 0.71, + "grad_norm": 0.2535182644413305, + "learning_rate": 0.00018870278061122484, + "loss": 1.5795, + "step": 940 + }, + { + "epoch": 0.72, + "grad_norm": 0.17063517897800512, + "learning_rate": 0.00018849825027168803, + "loss": 1.6361, + "step": 945 + }, + { + "epoch": 0.72, + "grad_norm": 0.1898841623155248, + "learning_rate": 0.00018829199794547535, + "loss": 1.7526, + "step": 950 + }, + { + "epoch": 0.72, + "grad_norm": 0.19033639448531828, + "learning_rate": 0.00018808402764581596, + "loss": 1.5943, + "step": 955 + }, + { + "epoch": 0.73, + "grad_norm": 0.1647576077524525, + "learning_rate": 0.0001878743434193671, + "loss": 1.7575, + "step": 960 + }, + { + "epoch": 0.73, + "grad_norm": 0.2070226518164384, + "learning_rate": 0.00018766294934613535, + "loss": 1.741, + "step": 965 + }, + { + "epoch": 0.74, + "grad_norm": 0.21633633400820462, + "learning_rate": 0.00018744984953939726, + "loss": 1.6967, + "step": 970 + }, + { + "epoch": 0.74, + "grad_norm": 0.2039504411965307, + "learning_rate": 0.0001872350481456193, + "loss": 1.6825, + "step": 975 + }, + { + "epoch": 0.74, + "grad_norm": 0.19382809212719235, + "learning_rate": 0.0001870185493443772, + "loss": 1.7494, + "step": 980 + }, + { + "epoch": 0.75, + "grad_norm": 0.17836311560595738, + "learning_rate": 0.0001868003573482746, + "loss": 1.6326, + "step": 985 + }, + { + "epoch": 0.75, + "grad_norm": 0.18940985276826594, + "learning_rate": 0.0001865804764028611, + "loss": 1.6823, + "step": 990 + }, + { + "epoch": 0.75, + "grad_norm": 0.15827883706638377, + "learning_rate": 0.0001863589107865496, + "loss": 1.8507, + "step": 995 + }, + { + "epoch": 0.76, + "grad_norm": 0.2024112582787964, + "learning_rate": 0.00018613566481053315, + "loss": 1.6737, + "step": 1000 + }, + { + "epoch": 0.76, + "grad_norm": 0.18631332115379975, + "learning_rate": 0.00018591074281870099, + "loss": 1.6391, + "step": 1005 + }, + { + "epoch": 0.77, + "grad_norm": 0.2322177223268837, + "learning_rate": 0.00018568414918755397, + "loss": 1.7185, + "step": 1010 + }, + { + "epoch": 0.77, + "grad_norm": 0.19585063603806546, + "learning_rate": 0.00018545588832611956, + "loss": 1.8829, + "step": 1015 + }, + { + "epoch": 0.77, + "grad_norm": 0.19046268057109556, + "learning_rate": 0.00018522596467586598, + "loss": 1.6889, + "step": 1020 + }, + { + "epoch": 0.78, + "grad_norm": 0.2319521660184869, + "learning_rate": 0.00018499438271061568, + "loss": 1.7148, + "step": 1025 + }, + { + "epoch": 0.78, + "grad_norm": 0.18401426887501984, + "learning_rate": 0.0001847611469364584, + "loss": 1.6355, + "step": 1030 + }, + { + "epoch": 0.78, + "grad_norm": 0.16467673844089234, + "learning_rate": 0.00018452626189166345, + "loss": 1.5748, + "step": 1035 + }, + { + "epoch": 0.79, + "grad_norm": 0.21515271715044545, + "learning_rate": 0.0001842897321465915, + "loss": 1.7172, + "step": 1040 + }, + { + "epoch": 0.79, + "grad_norm": 0.20010536585072475, + "learning_rate": 0.0001840515623036055, + "loss": 1.7331, + "step": 1045 + }, + { + "epoch": 0.8, + "grad_norm": 0.15220183718369495, + "learning_rate": 0.0001838117569969812, + "loss": 1.7703, + "step": 1050 + }, + { + "epoch": 0.8, + "grad_norm": 0.19249950248721495, + "learning_rate": 0.00018357032089281702, + "loss": 1.7356, + "step": 1055 + }, + { + "epoch": 0.8, + "grad_norm": 0.15685889188495356, + "learning_rate": 0.00018332725868894313, + "loss": 1.5789, + "step": 1060 + }, + { + "epoch": 0.81, + "grad_norm": 0.22123166856945198, + "learning_rate": 0.00018308257511483018, + "loss": 1.7449, + "step": 1065 + }, + { + "epoch": 0.81, + "grad_norm": 0.21921162541787237, + "learning_rate": 0.00018283627493149721, + "loss": 1.592, + "step": 1070 + }, + { + "epoch": 0.82, + "grad_norm": 0.15892072068340937, + "learning_rate": 0.00018258836293141907, + "loss": 1.6588, + "step": 1075 + }, + { + "epoch": 0.82, + "grad_norm": 0.2129268440643301, + "learning_rate": 0.000182338843938433, + "loss": 1.6687, + "step": 1080 + }, + { + "epoch": 0.82, + "grad_norm": 0.18558886049158316, + "learning_rate": 0.000182087722807645, + "loss": 1.6204, + "step": 1085 + }, + { + "epoch": 0.83, + "grad_norm": 0.21759739469279235, + "learning_rate": 0.00018183500442533514, + "loss": 1.7012, + "step": 1090 + }, + { + "epoch": 0.83, + "grad_norm": 0.16739812153050462, + "learning_rate": 0.00018158069370886266, + "loss": 1.7749, + "step": 1095 + }, + { + "epoch": 0.83, + "grad_norm": 0.2120028175464506, + "learning_rate": 0.0001813247956065702, + "loss": 1.7076, + "step": 1100 + }, + { + "epoch": 0.84, + "grad_norm": 0.21506301830058508, + "learning_rate": 0.00018106731509768753, + "loss": 1.6561, + "step": 1105 + }, + { + "epoch": 0.84, + "grad_norm": 0.21374007008875692, + "learning_rate": 0.00018080825719223468, + "loss": 1.7721, + "step": 1110 + }, + { + "epoch": 0.85, + "grad_norm": 0.21473556112453085, + "learning_rate": 0.00018054762693092444, + "loss": 1.5391, + "step": 1115 + }, + { + "epoch": 0.85, + "grad_norm": 0.1928094864305794, + "learning_rate": 0.00018028542938506426, + "loss": 1.7297, + "step": 1120 + }, + { + "epoch": 0.85, + "grad_norm": 0.22195616352181186, + "learning_rate": 0.0001800216696564576, + "loss": 1.6239, + "step": 1125 + }, + { + "epoch": 0.86, + "grad_norm": 0.2493704349381919, + "learning_rate": 0.00017975635287730473, + "loss": 1.7736, + "step": 1130 + }, + { + "epoch": 0.86, + "grad_norm": 0.1871166430212898, + "learning_rate": 0.00017948948421010264, + "loss": 1.67, + "step": 1135 + }, + { + "epoch": 0.86, + "grad_norm": 0.16460126336549072, + "learning_rate": 0.00017922106884754488, + "loss": 1.7331, + "step": 1140 + }, + { + "epoch": 0.87, + "grad_norm": 0.18707990225784327, + "learning_rate": 0.0001789511120124203, + "loss": 1.5608, + "step": 1145 + }, + { + "epoch": 0.87, + "grad_norm": 0.21751770239029078, + "learning_rate": 0.00017867961895751163, + "loss": 1.721, + "step": 1150 + }, + { + "epoch": 0.88, + "grad_norm": 0.1674742118307801, + "learning_rate": 0.00017840659496549298, + "loss": 1.7339, + "step": 1155 + }, + { + "epoch": 0.88, + "grad_norm": 0.19173527103482793, + "learning_rate": 0.00017813204534882738, + "loss": 1.7348, + "step": 1160 + }, + { + "epoch": 0.88, + "grad_norm": 0.18468049289167895, + "learning_rate": 0.0001778559754496631, + "loss": 1.6823, + "step": 1165 + }, + { + "epoch": 0.89, + "grad_norm": 0.2069730744593729, + "learning_rate": 0.00017757839063972997, + "loss": 1.8253, + "step": 1170 + }, + { + "epoch": 0.89, + "grad_norm": 0.2139312404074137, + "learning_rate": 0.00017729929632023472, + "loss": 1.7013, + "step": 1175 + }, + { + "epoch": 0.89, + "grad_norm": 0.1764736094502213, + "learning_rate": 0.00017701869792175593, + "loss": 1.8235, + "step": 1180 + }, + { + "epoch": 0.9, + "grad_norm": 0.21944309103277923, + "learning_rate": 0.00017673660090413823, + "loss": 1.8237, + "step": 1185 + }, + { + "epoch": 0.9, + "grad_norm": 0.20268987883171422, + "learning_rate": 0.00017645301075638634, + "loss": 1.6992, + "step": 1190 + }, + { + "epoch": 0.91, + "grad_norm": 0.19400968339090352, + "learning_rate": 0.00017616793299655794, + "loss": 1.8662, + "step": 1195 + }, + { + "epoch": 0.91, + "grad_norm": 0.18489832863809344, + "learning_rate": 0.00017588137317165657, + "loss": 1.6986, + "step": 1200 + }, + { + "epoch": 0.91, + "grad_norm": 0.17738333103257395, + "learning_rate": 0.0001755933368575235, + "loss": 1.6783, + "step": 1205 + }, + { + "epoch": 0.92, + "grad_norm": 0.17926192606119037, + "learning_rate": 0.0001753038296587294, + "loss": 1.7627, + "step": 1210 + }, + { + "epoch": 0.92, + "grad_norm": 0.20194075183870522, + "learning_rate": 0.00017501285720846523, + "loss": 1.7846, + "step": 1215 + }, + { + "epoch": 0.92, + "grad_norm": 0.19331786071311133, + "learning_rate": 0.0001747204251684325, + "loss": 1.7143, + "step": 1220 + }, + { + "epoch": 0.93, + "grad_norm": 0.23530188097310437, + "learning_rate": 0.00017442653922873327, + "loss": 1.7296, + "step": 1225 + }, + { + "epoch": 0.93, + "grad_norm": 0.17594594764152405, + "learning_rate": 0.0001741312051077594, + "loss": 1.7335, + "step": 1230 + }, + { + "epoch": 0.94, + "grad_norm": 0.20934249136020208, + "learning_rate": 0.00017383442855208124, + "loss": 1.6646, + "step": 1235 + }, + { + "epoch": 0.94, + "grad_norm": 0.2111005617028846, + "learning_rate": 0.00017353621533633583, + "loss": 1.5756, + "step": 1240 + }, + { + "epoch": 0.94, + "grad_norm": 0.21413727626671644, + "learning_rate": 0.00017323657126311454, + "loss": 1.4917, + "step": 1245 + }, + { + "epoch": 0.95, + "grad_norm": 0.2391299536210697, + "learning_rate": 0.0001729355021628502, + "loss": 1.7283, + "step": 1250 + }, + { + "epoch": 0.95, + "grad_norm": 0.19381232926045663, + "learning_rate": 0.00017263301389370362, + "loss": 1.7907, + "step": 1255 + }, + { + "epoch": 0.96, + "grad_norm": 0.21223075585900172, + "learning_rate": 0.0001723291123414495, + "loss": 1.7412, + "step": 1260 + }, + { + "epoch": 0.96, + "grad_norm": 0.18560634331207926, + "learning_rate": 0.00017202380341936212, + "loss": 1.7287, + "step": 1265 + }, + { + "epoch": 0.96, + "grad_norm": 0.18941317978765862, + "learning_rate": 0.00017171709306810012, + "loss": 1.5956, + "step": 1270 + }, + { + "epoch": 0.97, + "grad_norm": 0.17108900888850623, + "learning_rate": 0.000171408987255591, + "loss": 1.7789, + "step": 1275 + }, + { + "epoch": 0.97, + "grad_norm": 0.19233373904164977, + "learning_rate": 0.00017109949197691485, + "loss": 1.7397, + "step": 1280 + }, + { + "epoch": 0.97, + "grad_norm": 0.1697480170006848, + "learning_rate": 0.00017078861325418797, + "loss": 1.5765, + "step": 1285 + }, + { + "epoch": 0.98, + "grad_norm": 0.17575403691888572, + "learning_rate": 0.00017047635713644528, + "loss": 1.8137, + "step": 1290 + }, + { + "epoch": 0.98, + "grad_norm": 0.19700603487956356, + "learning_rate": 0.00017016272969952304, + "loss": 1.8248, + "step": 1295 + }, + { + "epoch": 0.99, + "grad_norm": 0.25577968967800774, + "learning_rate": 0.0001698477370459405, + "loss": 1.5227, + "step": 1300 + }, + { + "epoch": 0.99, + "grad_norm": 0.21182493743068362, + "learning_rate": 0.00016953138530478092, + "loss": 1.6463, + "step": 1305 + }, + { + "epoch": 0.99, + "grad_norm": 0.24187008234174068, + "learning_rate": 0.0001692136806315726, + "loss": 1.677, + "step": 1310 + }, + { + "epoch": 1.0, + "grad_norm": 0.23079613874981772, + "learning_rate": 0.00016889462920816902, + "loss": 1.6987, + "step": 1315 + }, + { + "epoch": 1.0, + "grad_norm": 0.18959747421576906, + "learning_rate": 0.00016857423724262849, + "loss": 1.6143, + "step": 1320 + }, + { + "epoch": 1.0, + "grad_norm": 0.19193767521915664, + "learning_rate": 0.00016825251096909343, + "loss": 1.6523, + "step": 1325 + }, + { + "epoch": 1.01, + "grad_norm": 0.1851789336505185, + "learning_rate": 0.00016792945664766907, + "loss": 1.5728, + "step": 1330 + }, + { + "epoch": 1.01, + "grad_norm": 0.14492627661875204, + "learning_rate": 0.00016760508056430152, + "loss": 1.5701, + "step": 1335 + }, + { + "epoch": 1.02, + "grad_norm": 0.2700845196747031, + "learning_rate": 0.0001672793890306556, + "loss": 1.8245, + "step": 1340 + }, + { + "epoch": 1.02, + "grad_norm": 0.1983440671335701, + "learning_rate": 0.00016695238838399206, + "loss": 1.7108, + "step": 1345 + }, + { + "epoch": 1.02, + "grad_norm": 0.17701113866794518, + "learning_rate": 0.0001666240849870441, + "loss": 1.5517, + "step": 1350 + }, + { + "epoch": 1.03, + "grad_norm": 0.16944238367848802, + "learning_rate": 0.0001662944852278936, + "loss": 1.7263, + "step": 1355 + }, + { + "epoch": 1.03, + "grad_norm": 0.20201061964568917, + "learning_rate": 0.00016596359551984704, + "loss": 1.6212, + "step": 1360 + }, + { + "epoch": 1.03, + "grad_norm": 0.16272112017898177, + "learning_rate": 0.0001656314223013104, + "loss": 1.6557, + "step": 1365 + }, + { + "epoch": 1.04, + "grad_norm": 0.2050184080142653, + "learning_rate": 0.00016529797203566405, + "loss": 1.6203, + "step": 1370 + }, + { + "epoch": 1.04, + "grad_norm": 0.18868029622446703, + "learning_rate": 0.00016496325121113706, + "loss": 1.5994, + "step": 1375 + }, + { + "epoch": 1.05, + "grad_norm": 0.18530725289838731, + "learning_rate": 0.00016462726634068075, + "loss": 1.661, + "step": 1380 + }, + { + "epoch": 1.05, + "grad_norm": 0.22254932266214475, + "learning_rate": 0.00016429002396184215, + "loss": 1.5779, + "step": 1385 + }, + { + "epoch": 1.05, + "grad_norm": 0.35454879952816054, + "learning_rate": 0.00016395153063663667, + "loss": 1.4926, + "step": 1390 + }, + { + "epoch": 1.06, + "grad_norm": 0.2083539962991777, + "learning_rate": 0.00016361179295142046, + "loss": 1.668, + "step": 1395 + }, + { + "epoch": 1.06, + "grad_norm": 0.20105783428150303, + "learning_rate": 0.00016327081751676227, + "loss": 1.7475, + "step": 1400 + }, + { + "epoch": 1.07, + "grad_norm": 0.19073307130103012, + "learning_rate": 0.0001629286109673148, + "loss": 1.6726, + "step": 1405 + }, + { + "epoch": 1.07, + "grad_norm": 0.21132776602726958, + "learning_rate": 0.00016258517996168564, + "loss": 1.745, + "step": 1410 + }, + { + "epoch": 1.07, + "grad_norm": 0.23336177448110548, + "learning_rate": 0.0001622405311823076, + "loss": 1.7185, + "step": 1415 + }, + { + "epoch": 1.08, + "grad_norm": 0.19045792239686193, + "learning_rate": 0.00016189467133530884, + "loss": 1.6369, + "step": 1420 + }, + { + "epoch": 1.08, + "grad_norm": 0.1470674402518224, + "learning_rate": 0.0001615476071503823, + "loss": 1.6593, + "step": 1425 + }, + { + "epoch": 1.08, + "grad_norm": 0.1895764202504411, + "learning_rate": 0.0001611993453806547, + "loss": 1.5879, + "step": 1430 + }, + { + "epoch": 1.09, + "grad_norm": 0.21781610564885606, + "learning_rate": 0.0001608498928025553, + "loss": 1.6377, + "step": 1435 + }, + { + "epoch": 1.09, + "grad_norm": 0.21082770226036582, + "learning_rate": 0.00016049925621568382, + "loss": 1.5626, + "step": 1440 + }, + { + "epoch": 1.1, + "grad_norm": 0.2288377931408156, + "learning_rate": 0.00016014744244267833, + "loss": 1.7531, + "step": 1445 + }, + { + "epoch": 1.1, + "grad_norm": 0.1822265551052057, + "learning_rate": 0.00015979445832908242, + "loss": 1.691, + "step": 1450 + }, + { + "epoch": 1.1, + "grad_norm": 0.23100268355259115, + "learning_rate": 0.00015944031074321204, + "loss": 1.7622, + "step": 1455 + }, + { + "epoch": 1.11, + "grad_norm": 0.18624779842903288, + "learning_rate": 0.00015908500657602174, + "loss": 1.5919, + "step": 1460 + }, + { + "epoch": 1.11, + "grad_norm": 0.20357926913176824, + "learning_rate": 0.0001587285527409707, + "loss": 1.6288, + "step": 1465 + }, + { + "epoch": 1.11, + "grad_norm": 0.20919686630022472, + "learning_rate": 0.00015837095617388827, + "loss": 1.6705, + "step": 1470 + }, + { + "epoch": 1.12, + "grad_norm": 0.1993582841062667, + "learning_rate": 0.0001580122238328387, + "loss": 1.6516, + "step": 1475 + }, + { + "epoch": 1.12, + "grad_norm": 0.2547942602076731, + "learning_rate": 0.00015765236269798627, + "loss": 1.5036, + "step": 1480 + }, + { + "epoch": 1.13, + "grad_norm": 0.1807424509361345, + "learning_rate": 0.00015729137977145893, + "loss": 1.6089, + "step": 1485 + }, + { + "epoch": 1.13, + "grad_norm": 0.18264437292208377, + "learning_rate": 0.0001569292820772124, + "loss": 1.7353, + "step": 1490 + }, + { + "epoch": 1.13, + "grad_norm": 0.21311009253554458, + "learning_rate": 0.00015656607666089334, + "loss": 1.6574, + "step": 1495 + }, + { + "epoch": 1.14, + "grad_norm": 0.18453680363642788, + "learning_rate": 0.0001562017705897024, + "loss": 1.5736, + "step": 1500 + }, + { + "epoch": 1.14, + "grad_norm": 0.23644760312940358, + "learning_rate": 0.00015583637095225656, + "loss": 1.7076, + "step": 1505 + }, + { + "epoch": 1.14, + "grad_norm": 0.19899933139163767, + "learning_rate": 0.00015546988485845125, + "loss": 1.665, + "step": 1510 + }, + { + "epoch": 1.15, + "grad_norm": 0.23202505382527974, + "learning_rate": 0.0001551023194393221, + "loss": 1.7191, + "step": 1515 + }, + { + "epoch": 1.15, + "grad_norm": 0.21073033640879407, + "learning_rate": 0.00015473368184690597, + "loss": 1.6123, + "step": 1520 + }, + { + "epoch": 1.16, + "grad_norm": 0.22019036120363472, + "learning_rate": 0.00015436397925410201, + "loss": 1.6909, + "step": 1525 + }, + { + "epoch": 1.16, + "grad_norm": 0.20817813902248655, + "learning_rate": 0.00015399321885453202, + "loss": 1.7648, + "step": 1530 + }, + { + "epoch": 1.16, + "grad_norm": 0.21714232280510767, + "learning_rate": 0.00015362140786240035, + "loss": 1.6718, + "step": 1535 + }, + { + "epoch": 1.17, + "grad_norm": 0.20478633851375716, + "learning_rate": 0.00015324855351235372, + "loss": 1.7586, + "step": 1540 + }, + { + "epoch": 1.17, + "grad_norm": 0.19046880552839732, + "learning_rate": 0.00015287466305934037, + "loss": 1.695, + "step": 1545 + }, + { + "epoch": 1.18, + "grad_norm": 0.23309832393442634, + "learning_rate": 0.0001524997437784689, + "loss": 1.584, + "step": 1550 + }, + { + "epoch": 1.18, + "grad_norm": 0.23887396172176847, + "learning_rate": 0.00015212380296486652, + "loss": 1.5742, + "step": 1555 + }, + { + "epoch": 1.18, + "grad_norm": 0.18128179052277552, + "learning_rate": 0.0001517468479335376, + "loss": 1.6802, + "step": 1560 + }, + { + "epoch": 1.19, + "grad_norm": 0.22086322507654135, + "learning_rate": 0.00015136888601922072, + "loss": 1.7222, + "step": 1565 + }, + { + "epoch": 1.19, + "grad_norm": 0.18870219517815454, + "learning_rate": 0.0001509899245762464, + "loss": 1.5664, + "step": 1570 + }, + { + "epoch": 1.19, + "grad_norm": 0.2276718864826248, + "learning_rate": 0.00015060997097839386, + "loss": 1.7565, + "step": 1575 + }, + { + "epoch": 1.2, + "grad_norm": 0.20329327239158157, + "learning_rate": 0.00015022903261874748, + "loss": 1.6774, + "step": 1580 + }, + { + "epoch": 1.2, + "grad_norm": 0.18898688137814482, + "learning_rate": 0.00014984711690955297, + "loss": 1.6518, + "step": 1585 + }, + { + "epoch": 1.21, + "grad_norm": 0.22865474882055875, + "learning_rate": 0.00014946423128207322, + "loss": 1.7247, + "step": 1590 + }, + { + "epoch": 1.21, + "grad_norm": 0.21027592834116465, + "learning_rate": 0.00014908038318644373, + "loss": 1.7849, + "step": 1595 + }, + { + "epoch": 1.21, + "grad_norm": 0.20948671991840284, + "learning_rate": 0.0001486955800915274, + "loss": 1.5386, + "step": 1600 + }, + { + "epoch": 1.22, + "grad_norm": 0.21227729763658884, + "learning_rate": 0.0001483098294847695, + "loss": 1.602, + "step": 1605 + }, + { + "epoch": 1.22, + "grad_norm": 0.21630672435558576, + "learning_rate": 0.00014792313887205182, + "loss": 1.6772, + "step": 1610 + }, + { + "epoch": 1.22, + "grad_norm": 0.21541507503873228, + "learning_rate": 0.00014753551577754664, + "loss": 1.6862, + "step": 1615 + }, + { + "epoch": 1.23, + "grad_norm": 0.2480903001762983, + "learning_rate": 0.0001471469677435704, + "loss": 1.5916, + "step": 1620 + }, + { + "epoch": 1.23, + "grad_norm": 0.20716645798924263, + "learning_rate": 0.00014675750233043679, + "loss": 1.7072, + "step": 1625 + }, + { + "epoch": 1.24, + "grad_norm": 0.22397565488829696, + "learning_rate": 0.00014636712711630978, + "loss": 1.6036, + "step": 1630 + }, + { + "epoch": 1.24, + "grad_norm": 0.19584834615434676, + "learning_rate": 0.00014597584969705616, + "loss": 1.6366, + "step": 1635 + }, + { + "epoch": 1.24, + "grad_norm": 0.22273274810197669, + "learning_rate": 0.00014558367768609766, + "loss": 1.6545, + "step": 1640 + }, + { + "epoch": 1.25, + "grad_norm": 0.30141032612570196, + "learning_rate": 0.00014519061871426286, + "loss": 1.6668, + "step": 1645 + }, + { + "epoch": 1.25, + "grad_norm": 0.2508746414625482, + "learning_rate": 0.0001447966804296387, + "loss": 1.5583, + "step": 1650 + }, + { + "epoch": 1.25, + "grad_norm": 0.2656543660091513, + "learning_rate": 0.00014440187049742165, + "loss": 1.6114, + "step": 1655 + }, + { + "epoch": 1.26, + "grad_norm": 0.22762072721537044, + "learning_rate": 0.00014400619659976863, + "loss": 1.5218, + "step": 1660 + }, + { + "epoch": 1.26, + "grad_norm": 0.21625802298436558, + "learning_rate": 0.00014360966643564747, + "loss": 1.6282, + "step": 1665 + }, + { + "epoch": 1.27, + "grad_norm": 0.18758356388629857, + "learning_rate": 0.00014321228772068702, + "loss": 1.5724, + "step": 1670 + }, + { + "epoch": 1.27, + "grad_norm": 0.22894089207752852, + "learning_rate": 0.0001428140681870272, + "loss": 1.5875, + "step": 1675 + }, + { + "epoch": 1.27, + "grad_norm": 0.25952806547918694, + "learning_rate": 0.0001424150155831685, + "loss": 1.6728, + "step": 1680 + }, + { + "epoch": 1.28, + "grad_norm": 0.3304544222948505, + "learning_rate": 0.00014201513767382108, + "loss": 1.6944, + "step": 1685 + }, + { + "epoch": 1.28, + "grad_norm": 0.21745874371742022, + "learning_rate": 0.00014161444223975383, + "loss": 1.5649, + "step": 1690 + }, + { + "epoch": 1.29, + "grad_norm": 0.18668861489627886, + "learning_rate": 0.0001412129370776429, + "loss": 1.6646, + "step": 1695 + }, + { + "epoch": 1.29, + "grad_norm": 0.2514658628873574, + "learning_rate": 0.00014081062999992005, + "loss": 1.6427, + "step": 1700 + }, + { + "epoch": 1.29, + "grad_norm": 0.23075565689636676, + "learning_rate": 0.0001404075288346206, + "loss": 1.7089, + "step": 1705 + }, + { + "epoch": 1.3, + "grad_norm": 0.2005453142298327, + "learning_rate": 0.00014000364142523103, + "loss": 1.7236, + "step": 1710 + }, + { + "epoch": 1.3, + "grad_norm": 0.21925735664261978, + "learning_rate": 0.00013959897563053662, + "loss": 1.7193, + "step": 1715 + }, + { + "epoch": 1.3, + "grad_norm": 0.22755950679993744, + "learning_rate": 0.00013919353932446822, + "loss": 1.6178, + "step": 1720 + }, + { + "epoch": 1.31, + "grad_norm": 0.24575725474371382, + "learning_rate": 0.0001387873403959492, + "loss": 1.6914, + "step": 1725 + }, + { + "epoch": 1.31, + "grad_norm": 0.22868287217989744, + "learning_rate": 0.00013838038674874193, + "loss": 1.6021, + "step": 1730 + }, + { + "epoch": 1.32, + "grad_norm": 0.21889496061933156, + "learning_rate": 0.00013797268630129413, + "loss": 1.8092, + "step": 1735 + }, + { + "epoch": 1.32, + "grad_norm": 0.19238702480865116, + "learning_rate": 0.0001375642469865844, + "loss": 1.54, + "step": 1740 + }, + { + "epoch": 1.32, + "grad_norm": 0.24437133183257548, + "learning_rate": 0.00013715507675196836, + "loss": 1.5477, + "step": 1745 + }, + { + "epoch": 1.33, + "grad_norm": 0.21331661362588805, + "learning_rate": 0.0001367451835590237, + "loss": 1.6229, + "step": 1750 + }, + { + "epoch": 1.33, + "grad_norm": 0.22934227073111574, + "learning_rate": 0.00013633457538339514, + "loss": 1.7056, + "step": 1755 + }, + { + "epoch": 1.33, + "grad_norm": 0.21991726124527775, + "learning_rate": 0.00013592326021463977, + "loss": 1.7322, + "step": 1760 + }, + { + "epoch": 1.34, + "grad_norm": 0.2279246851535844, + "learning_rate": 0.00013551124605607097, + "loss": 1.5663, + "step": 1765 + }, + { + "epoch": 1.34, + "grad_norm": 0.21252716182463233, + "learning_rate": 0.00013509854092460312, + "loss": 1.6308, + "step": 1770 + }, + { + "epoch": 1.35, + "grad_norm": 0.19276878334978295, + "learning_rate": 0.0001346851528505954, + "loss": 1.629, + "step": 1775 + }, + { + "epoch": 1.35, + "grad_norm": 0.20349606898831232, + "learning_rate": 0.00013427108987769566, + "loss": 1.6323, + "step": 1780 + }, + { + "epoch": 1.35, + "grad_norm": 0.280403908850998, + "learning_rate": 0.00013385636006268368, + "loss": 1.5647, + "step": 1785 + }, + { + "epoch": 1.36, + "grad_norm": 0.204649437629767, + "learning_rate": 0.00013344097147531469, + "loss": 1.6706, + "step": 1790 + }, + { + "epoch": 1.36, + "grad_norm": 0.2355526525352747, + "learning_rate": 0.00013302493219816223, + "loss": 1.6661, + "step": 1795 + }, + { + "epoch": 1.36, + "grad_norm": 0.23955342033240548, + "learning_rate": 0.00013260825032646083, + "loss": 1.7684, + "step": 1800 + }, + { + "epoch": 1.37, + "grad_norm": 0.1782918443154143, + "learning_rate": 0.00013219093396794852, + "loss": 1.7357, + "step": 1805 + }, + { + "epoch": 1.37, + "grad_norm": 0.20676511108669285, + "learning_rate": 0.00013177299124270911, + "loss": 1.7935, + "step": 1810 + }, + { + "epoch": 1.38, + "grad_norm": 0.24468072304122832, + "learning_rate": 0.0001313544302830142, + "loss": 1.6357, + "step": 1815 + }, + { + "epoch": 1.38, + "grad_norm": 0.3442798924803141, + "learning_rate": 0.00013093525923316482, + "loss": 1.7283, + "step": 1820 + }, + { + "epoch": 1.38, + "grad_norm": 0.18543047699982895, + "learning_rate": 0.00013051548624933314, + "loss": 1.6756, + "step": 1825 + }, + { + "epoch": 1.39, + "grad_norm": 0.18961104598393633, + "learning_rate": 0.00013009511949940358, + "loss": 1.6258, + "step": 1830 + }, + { + "epoch": 1.39, + "grad_norm": 0.23772840081980506, + "learning_rate": 0.00012967416716281414, + "loss": 1.6197, + "step": 1835 + }, + { + "epoch": 1.39, + "grad_norm": 0.20599306112898513, + "learning_rate": 0.00012925263743039693, + "loss": 1.6155, + "step": 1840 + }, + { + "epoch": 1.4, + "grad_norm": 0.17872981947947883, + "learning_rate": 0.00012883053850421897, + "loss": 1.817, + "step": 1845 + }, + { + "epoch": 1.4, + "grad_norm": 0.21082979842365093, + "learning_rate": 0.00012840787859742266, + "loss": 1.7045, + "step": 1850 + }, + { + "epoch": 1.41, + "grad_norm": 0.21065592453908275, + "learning_rate": 0.00012798466593406583, + "loss": 1.5825, + "step": 1855 + }, + { + "epoch": 1.41, + "grad_norm": 0.21798103821826761, + "learning_rate": 0.00012756090874896172, + "loss": 1.7622, + "step": 1860 + }, + { + "epoch": 1.41, + "grad_norm": 0.22916268453103483, + "learning_rate": 0.00012713661528751888, + "loss": 1.5324, + "step": 1865 + }, + { + "epoch": 1.42, + "grad_norm": 0.2668875410933402, + "learning_rate": 0.00012671179380558062, + "loss": 1.647, + "step": 1870 + }, + { + "epoch": 1.42, + "grad_norm": 0.19627830855058848, + "learning_rate": 0.00012628645256926438, + "loss": 1.5994, + "step": 1875 + }, + { + "epoch": 1.43, + "grad_norm": 0.21241423084048555, + "learning_rate": 0.0001258605998548009, + "loss": 1.622, + "step": 1880 + }, + { + "epoch": 1.43, + "grad_norm": 0.2546778643093178, + "learning_rate": 0.0001254342439483733, + "loss": 1.6916, + "step": 1885 + }, + { + "epoch": 1.43, + "grad_norm": 0.20610950008732792, + "learning_rate": 0.00012500739314595563, + "loss": 1.7455, + "step": 1890 + }, + { + "epoch": 1.44, + "grad_norm": 0.2219569529434739, + "learning_rate": 0.00012458005575315147, + "loss": 1.6683, + "step": 1895 + }, + { + "epoch": 1.44, + "grad_norm": 0.20787095642170883, + "learning_rate": 0.0001241522400850327, + "loss": 1.6202, + "step": 1900 + }, + { + "epoch": 1.44, + "grad_norm": 0.2275845179745845, + "learning_rate": 0.0001237239544659771, + "loss": 1.8088, + "step": 1905 + }, + { + "epoch": 1.45, + "grad_norm": 0.24655110446766015, + "learning_rate": 0.0001232952072295069, + "loss": 1.5618, + "step": 1910 + }, + { + "epoch": 1.45, + "grad_norm": 0.23084716022254811, + "learning_rate": 0.0001228660067181263, + "loss": 1.7204, + "step": 1915 + }, + { + "epoch": 1.46, + "grad_norm": 0.2420965499906573, + "learning_rate": 0.00012243636128315939, + "loss": 1.5581, + "step": 1920 + }, + { + "epoch": 1.46, + "grad_norm": 0.25054116126933823, + "learning_rate": 0.0001220062792845873, + "loss": 1.5808, + "step": 1925 + }, + { + "epoch": 1.46, + "grad_norm": 0.24876893838844386, + "learning_rate": 0.00012157576909088599, + "loss": 1.6291, + "step": 1930 + }, + { + "epoch": 1.47, + "grad_norm": 0.22724411732153027, + "learning_rate": 0.00012114483907886308, + "loss": 1.7218, + "step": 1935 + }, + { + "epoch": 1.47, + "grad_norm": 0.23781633823944948, + "learning_rate": 0.00012071349763349484, + "loss": 1.6696, + "step": 1940 + }, + { + "epoch": 1.47, + "grad_norm": 0.2611267676195103, + "learning_rate": 0.00012028175314776344, + "loss": 1.7099, + "step": 1945 + }, + { + "epoch": 1.48, + "grad_norm": 0.25342034309056527, + "learning_rate": 0.00011984961402249311, + "loss": 1.6931, + "step": 1950 + }, + { + "epoch": 1.48, + "grad_norm": 0.20391686876564638, + "learning_rate": 0.00011941708866618697, + "loss": 1.7043, + "step": 1955 + }, + { + "epoch": 1.49, + "grad_norm": 0.2005457898894919, + "learning_rate": 0.0001189841854948634, + "loss": 1.5758, + "step": 1960 + }, + { + "epoch": 1.49, + "grad_norm": 0.19157508121631642, + "learning_rate": 0.00011855091293189234, + "loss": 1.5831, + "step": 1965 + }, + { + "epoch": 1.49, + "grad_norm": 0.23409302527114853, + "learning_rate": 0.00011811727940783108, + "loss": 1.6668, + "step": 1970 + }, + { + "epoch": 1.5, + "grad_norm": 0.19820344277697435, + "learning_rate": 0.00011768329336026062, + "loss": 1.6894, + "step": 1975 + }, + { + "epoch": 1.5, + "grad_norm": 0.23641920754497897, + "learning_rate": 0.0001172489632336213, + "loss": 1.8362, + "step": 1980 + }, + { + "epoch": 1.5, + "grad_norm": 0.20503090615743924, + "learning_rate": 0.00011681429747904842, + "loss": 1.6885, + "step": 1985 + }, + { + "epoch": 1.51, + "grad_norm": 0.18474233550647523, + "learning_rate": 0.00011637930455420798, + "loss": 1.7196, + "step": 1990 + }, + { + "epoch": 1.51, + "grad_norm": 0.2775657036754379, + "learning_rate": 0.00011594399292313192, + "loss": 1.7362, + "step": 1995 + }, + { + "epoch": 1.52, + "grad_norm": 0.23760102898739513, + "learning_rate": 0.00011550837105605354, + "loss": 1.5986, + "step": 2000 + }, + { + "epoch": 1.52, + "grad_norm": 0.18850041877265183, + "learning_rate": 0.00011507244742924274, + "loss": 1.7116, + "step": 2005 + }, + { + "epoch": 1.52, + "grad_norm": 0.2164959021230041, + "learning_rate": 0.000114636230524841, + "loss": 1.578, + "step": 2010 + }, + { + "epoch": 1.53, + "grad_norm": 0.269300085641628, + "learning_rate": 0.00011419972883069623, + "loss": 1.5605, + "step": 2015 + }, + { + "epoch": 1.53, + "grad_norm": 0.24787445167484887, + "learning_rate": 0.00011376295084019792, + "loss": 1.6663, + "step": 2020 + }, + { + "epoch": 1.54, + "grad_norm": 0.21140623194389616, + "learning_rate": 0.00011332590505211159, + "loss": 1.658, + "step": 2025 + }, + { + "epoch": 1.54, + "grad_norm": 0.25921900302870593, + "learning_rate": 0.00011288859997041353, + "loss": 1.6459, + "step": 2030 + }, + { + "epoch": 1.54, + "grad_norm": 0.2608666502284525, + "learning_rate": 0.00011245104410412537, + "loss": 1.6928, + "step": 2035 + }, + { + "epoch": 1.55, + "grad_norm": 0.22406449938146802, + "learning_rate": 0.00011201324596714844, + "loss": 1.4791, + "step": 2040 + }, + { + "epoch": 1.55, + "grad_norm": 0.19647960391415928, + "learning_rate": 0.00011157521407809815, + "loss": 1.698, + "step": 2045 + }, + { + "epoch": 1.55, + "grad_norm": 0.1897962583849219, + "learning_rate": 0.00011113695696013824, + "loss": 1.8167, + "step": 2050 + }, + { + "epoch": 1.56, + "grad_norm": 0.20712759197533817, + "learning_rate": 0.0001106984831408149, + "loss": 1.7501, + "step": 2055 + }, + { + "epoch": 1.56, + "grad_norm": 0.23079961827033185, + "learning_rate": 0.00011025980115189086, + "loss": 1.5934, + "step": 2060 + }, + { + "epoch": 1.57, + "grad_norm": 0.22104873487185864, + "learning_rate": 0.00010982091952917943, + "loss": 1.6686, + "step": 2065 + }, + { + "epoch": 1.57, + "grad_norm": 0.20639504694734737, + "learning_rate": 0.00010938184681237833, + "loss": 1.7136, + "step": 2070 + }, + { + "epoch": 1.57, + "grad_norm": 0.2417721960073701, + "learning_rate": 0.00010894259154490354, + "loss": 1.6702, + "step": 2075 + }, + { + "epoch": 1.58, + "grad_norm": 0.21810729625691397, + "learning_rate": 0.00010850316227372312, + "loss": 1.7477, + "step": 2080 + }, + { + "epoch": 1.58, + "grad_norm": 0.23170201171415503, + "learning_rate": 0.00010806356754919091, + "loss": 1.6943, + "step": 2085 + }, + { + "epoch": 1.58, + "grad_norm": 0.22093119739393355, + "learning_rate": 0.00010762381592488002, + "loss": 1.623, + "step": 2090 + }, + { + "epoch": 1.59, + "grad_norm": 0.21034721922753088, + "learning_rate": 0.00010718391595741657, + "loss": 1.6084, + "step": 2095 + }, + { + "epoch": 1.59, + "grad_norm": 0.22443726771939806, + "learning_rate": 0.00010674387620631308, + "loss": 1.5536, + "step": 2100 + }, + { + "epoch": 1.6, + "grad_norm": 0.22568508558473213, + "learning_rate": 0.00010630370523380202, + "loss": 1.469, + "step": 2105 + }, + { + "epoch": 1.6, + "grad_norm": 0.3332888137498032, + "learning_rate": 0.00010586341160466904, + "loss": 1.6488, + "step": 2110 + }, + { + "epoch": 1.6, + "grad_norm": 0.2129808005413702, + "learning_rate": 0.00010542300388608652, + "loss": 1.6101, + "step": 2115 + }, + { + "epoch": 1.61, + "grad_norm": 0.20553693555408575, + "learning_rate": 0.00010498249064744679, + "loss": 1.4872, + "step": 2120 + }, + { + "epoch": 1.61, + "grad_norm": 0.2445112542992352, + "learning_rate": 0.00010454188046019524, + "loss": 1.7005, + "step": 2125 + }, + { + "epoch": 1.61, + "grad_norm": 0.20844778510756687, + "learning_rate": 0.00010410118189766387, + "loss": 1.5589, + "step": 2130 + }, + { + "epoch": 1.62, + "grad_norm": 0.2223212290874802, + "learning_rate": 0.0001036604035349041, + "loss": 1.6621, + "step": 2135 + }, + { + "epoch": 1.62, + "grad_norm": 0.20479585313872112, + "learning_rate": 0.00010321955394852018, + "loss": 1.7061, + "step": 2140 + }, + { + "epoch": 1.63, + "grad_norm": 0.17606184812861142, + "learning_rate": 0.0001027786417165022, + "loss": 1.5607, + "step": 2145 + }, + { + "epoch": 1.63, + "grad_norm": 0.2676349610853098, + "learning_rate": 0.0001023376754180592, + "loss": 1.6232, + "step": 2150 + }, + { + "epoch": 1.63, + "grad_norm": 0.2068560787418325, + "learning_rate": 0.00010189666363345223, + "loss": 1.5724, + "step": 2155 + }, + { + "epoch": 1.64, + "grad_norm": 0.19641973239797275, + "learning_rate": 0.00010145561494382742, + "loss": 1.5305, + "step": 2160 + }, + { + "epoch": 1.64, + "grad_norm": 0.2574797520893005, + "learning_rate": 0.00010101453793104898, + "loss": 1.6025, + "step": 2165 + }, + { + "epoch": 1.65, + "grad_norm": 0.2827194584842853, + "learning_rate": 0.00010057344117753222, + "loss": 1.5882, + "step": 2170 + }, + { + "epoch": 1.65, + "grad_norm": 0.19936180521947827, + "learning_rate": 0.00010013233326607661, + "loss": 1.5706, + "step": 2175 + }, + { + "epoch": 1.65, + "grad_norm": 0.21819696462759022, + "learning_rate": 9.969122277969865e-05, + "loss": 1.6623, + "step": 2180 + }, + { + "epoch": 1.66, + "grad_norm": 0.225417352707018, + "learning_rate": 9.9250118301465e-05, + "loss": 1.6255, + "step": 2185 + }, + { + "epoch": 1.66, + "grad_norm": 0.3143651738447285, + "learning_rate": 9.880902841432544e-05, + "loss": 1.4905, + "step": 2190 + }, + { + "epoch": 1.66, + "grad_norm": 0.23749234423783855, + "learning_rate": 9.836796170094571e-05, + "loss": 1.6156, + "step": 2195 + }, + { + "epoch": 1.67, + "grad_norm": 0.23579593383210742, + "learning_rate": 9.792692674354079e-05, + "loss": 1.6963, + "step": 2200 + }, + { + "epoch": 1.67, + "grad_norm": 0.2032329245708717, + "learning_rate": 9.748593212370773e-05, + "loss": 1.6733, + "step": 2205 + }, + { + "epoch": 1.68, + "grad_norm": 0.20661047812325195, + "learning_rate": 9.704498642225856e-05, + "loss": 1.622, + "step": 2210 + }, + { + "epoch": 1.68, + "grad_norm": 0.18970352315906064, + "learning_rate": 9.660409821905363e-05, + "loss": 1.7834, + "step": 2215 + }, + { + "epoch": 1.68, + "grad_norm": 0.17832580771616308, + "learning_rate": 9.616327609283445e-05, + "loss": 1.6989, + "step": 2220 + }, + { + "epoch": 1.69, + "grad_norm": 0.21859704299949706, + "learning_rate": 9.572252862105673e-05, + "loss": 1.7946, + "step": 2225 + }, + { + "epoch": 1.69, + "grad_norm": 0.24897942412148671, + "learning_rate": 9.528186437972368e-05, + "loss": 1.564, + "step": 2230 + }, + { + "epoch": 1.69, + "grad_norm": 0.20109922805508615, + "learning_rate": 9.484129194321896e-05, + "loss": 1.6594, + "step": 2235 + }, + { + "epoch": 1.7, + "grad_norm": 0.19546463521855884, + "learning_rate": 9.440081988413987e-05, + "loss": 1.542, + "step": 2240 + }, + { + "epoch": 1.7, + "grad_norm": 0.20254596218430737, + "learning_rate": 9.396045677313067e-05, + "loss": 1.8142, + "step": 2245 + }, + { + "epoch": 1.71, + "grad_norm": 0.1936135057396683, + "learning_rate": 9.352021117871574e-05, + "loss": 1.5564, + "step": 2250 + }, + { + "epoch": 1.71, + "grad_norm": 0.2096445430714542, + "learning_rate": 9.308009166713263e-05, + "loss": 1.6735, + "step": 2255 + }, + { + "epoch": 1.71, + "grad_norm": 0.22672329152194862, + "learning_rate": 9.264010680216583e-05, + "loss": 1.6761, + "step": 2260 + }, + { + "epoch": 1.72, + "grad_norm": 0.24482242735211057, + "learning_rate": 9.220026514497983e-05, + "loss": 1.5988, + "step": 2265 + }, + { + "epoch": 1.72, + "grad_norm": 0.24736418279884478, + "learning_rate": 9.176057525395252e-05, + "loss": 1.5844, + "step": 2270 + }, + { + "epoch": 1.72, + "grad_norm": 0.1987944867199659, + "learning_rate": 9.132104568450879e-05, + "loss": 1.6997, + "step": 2275 + }, + { + "epoch": 1.73, + "grad_norm": 0.1850913674566201, + "learning_rate": 9.088168498895408e-05, + "loss": 1.5696, + "step": 2280 + }, + { + "epoch": 1.73, + "grad_norm": 0.24393794217168674, + "learning_rate": 9.044250171630778e-05, + "loss": 1.7403, + "step": 2285 + }, + { + "epoch": 1.74, + "grad_norm": 0.19475525279873163, + "learning_rate": 9.000350441213708e-05, + "loss": 1.5984, + "step": 2290 + }, + { + "epoch": 1.74, + "grad_norm": 0.2218761532729913, + "learning_rate": 8.956470161839072e-05, + "loss": 1.6681, + "step": 2295 + }, + { + "epoch": 1.74, + "grad_norm": 0.24957778768532196, + "learning_rate": 8.912610187323248e-05, + "loss": 1.6169, + "step": 2300 + }, + { + "epoch": 1.75, + "grad_norm": 0.2510725868859042, + "learning_rate": 8.868771371087539e-05, + "loss": 1.639, + "step": 2305 + }, + { + "epoch": 1.75, + "grad_norm": 0.19643293153400068, + "learning_rate": 8.82495456614155e-05, + "loss": 1.7237, + "step": 2310 + }, + { + "epoch": 1.76, + "grad_norm": 0.26450396919742597, + "learning_rate": 8.781160625066588e-05, + "loss": 1.6528, + "step": 2315 + }, + { + "epoch": 1.76, + "grad_norm": 0.22179001551390587, + "learning_rate": 8.737390399999086e-05, + "loss": 1.5533, + "step": 2320 + }, + { + "epoch": 1.76, + "grad_norm": 0.2346687653947156, + "learning_rate": 8.693644742614017e-05, + "loss": 1.6104, + "step": 2325 + }, + { + "epoch": 1.77, + "grad_norm": 0.25806483606045055, + "learning_rate": 8.649924504108302e-05, + "loss": 1.6052, + "step": 2330 + }, + { + "epoch": 1.77, + "grad_norm": 0.1786075330646357, + "learning_rate": 8.606230535184283e-05, + "loss": 1.5603, + "step": 2335 + }, + { + "epoch": 1.77, + "grad_norm": 0.254068816191818, + "learning_rate": 8.562563686033145e-05, + "loss": 1.7643, + "step": 2340 + }, + { + "epoch": 1.78, + "grad_norm": 0.21344041020108453, + "learning_rate": 8.518924806318378e-05, + "loss": 1.6584, + "step": 2345 + }, + { + "epoch": 1.78, + "grad_norm": 0.2082041629797306, + "learning_rate": 8.47531474515925e-05, + "loss": 1.7992, + "step": 2350 + }, + { + "epoch": 1.79, + "grad_norm": 0.2645099180130053, + "learning_rate": 8.431734351114284e-05, + "loss": 1.6361, + "step": 2355 + }, + { + "epoch": 1.79, + "grad_norm": 0.22698336003173047, + "learning_rate": 8.388184472164736e-05, + "loss": 1.646, + "step": 2360 + }, + { + "epoch": 1.79, + "grad_norm": 0.24003288864061173, + "learning_rate": 8.34466595569811e-05, + "loss": 1.6379, + "step": 2365 + }, + { + "epoch": 1.8, + "grad_norm": 0.19443085064409085, + "learning_rate": 8.301179648491669e-05, + "loss": 1.73, + "step": 2370 + }, + { + "epoch": 1.8, + "grad_norm": 0.24311509067570025, + "learning_rate": 8.257726396695933e-05, + "loss": 1.6802, + "step": 2375 + }, + { + "epoch": 1.8, + "grad_norm": 0.24648929428851593, + "learning_rate": 8.214307045818254e-05, + "loss": 1.7708, + "step": 2380 + }, + { + "epoch": 1.81, + "grad_norm": 0.1940516179768531, + "learning_rate": 8.17092244070634e-05, + "loss": 1.5857, + "step": 2385 + }, + { + "epoch": 1.81, + "grad_norm": 0.2361070277608161, + "learning_rate": 8.127573425531814e-05, + "loss": 1.6411, + "step": 2390 + }, + { + "epoch": 1.82, + "grad_norm": 0.2835364928454071, + "learning_rate": 8.084260843773799e-05, + "loss": 1.7818, + "step": 2395 + }, + { + "epoch": 1.82, + "grad_norm": 0.18047213778922655, + "learning_rate": 8.040985538202505e-05, + "loss": 1.587, + "step": 2400 + }, + { + "epoch": 1.82, + "grad_norm": 0.21940093931140764, + "learning_rate": 7.997748350862822e-05, + "loss": 1.6795, + "step": 2405 + }, + { + "epoch": 1.83, + "grad_norm": 0.20557324059132212, + "learning_rate": 7.954550123057939e-05, + "loss": 1.638, + "step": 2410 + }, + { + "epoch": 1.83, + "grad_norm": 0.23522437885683956, + "learning_rate": 7.911391695332988e-05, + "loss": 1.6176, + "step": 2415 + }, + { + "epoch": 1.83, + "grad_norm": 0.20227659422834685, + "learning_rate": 7.868273907458661e-05, + "loss": 1.5562, + "step": 2420 + }, + { + "epoch": 1.84, + "grad_norm": 0.17957107180807144, + "learning_rate": 7.825197598414895e-05, + "loss": 1.6577, + "step": 2425 + }, + { + "epoch": 1.84, + "grad_norm": 0.21134479099989728, + "learning_rate": 7.782163606374536e-05, + "loss": 1.5407, + "step": 2430 + }, + { + "epoch": 1.85, + "grad_norm": 0.2190101821746382, + "learning_rate": 7.739172768687028e-05, + "loss": 1.6901, + "step": 2435 + }, + { + "epoch": 1.85, + "grad_norm": 0.22909832831883262, + "learning_rate": 7.696225921862126e-05, + "loss": 1.6517, + "step": 2440 + }, + { + "epoch": 1.85, + "grad_norm": 0.1922087104118847, + "learning_rate": 7.653323901553625e-05, + "loss": 1.5558, + "step": 2445 + }, + { + "epoch": 1.86, + "grad_norm": 0.2535390902934386, + "learning_rate": 7.610467542543073e-05, + "loss": 1.7802, + "step": 2450 + }, + { + "epoch": 1.86, + "grad_norm": 0.20264859592749507, + "learning_rate": 7.567657678723565e-05, + "loss": 1.6141, + "step": 2455 + }, + { + "epoch": 1.87, + "grad_norm": 0.2534081482654566, + "learning_rate": 7.52489514308349e-05, + "loss": 1.6593, + "step": 2460 + }, + { + "epoch": 1.87, + "grad_norm": 0.24401202904206418, + "learning_rate": 7.482180767690334e-05, + "loss": 1.5982, + "step": 2465 + }, + { + "epoch": 1.87, + "grad_norm": 0.2805376490259695, + "learning_rate": 7.439515383674485e-05, + "loss": 1.7126, + "step": 2470 + }, + { + "epoch": 1.88, + "grad_norm": 0.24585333566417664, + "learning_rate": 7.396899821213072e-05, + "loss": 1.5644, + "step": 2475 + }, + { + "epoch": 1.88, + "grad_norm": 0.22491029483008115, + "learning_rate": 7.354334909513791e-05, + "loss": 1.6765, + "step": 2480 + }, + { + "epoch": 1.88, + "grad_norm": 0.23458997274256846, + "learning_rate": 7.311821476798789e-05, + "loss": 1.6122, + "step": 2485 + }, + { + "epoch": 1.89, + "grad_norm": 0.17595992796667512, + "learning_rate": 7.269360350288547e-05, + "loss": 1.8356, + "step": 2490 + }, + { + "epoch": 1.89, + "grad_norm": 0.18759163970832302, + "learning_rate": 7.226952356185765e-05, + "loss": 1.4984, + "step": 2495 + }, + { + "epoch": 1.9, + "grad_norm": 0.236927434671597, + "learning_rate": 7.184598319659317e-05, + "loss": 1.6798, + "step": 2500 + }, + { + "epoch": 1.9, + "grad_norm": 0.26802038257147875, + "learning_rate": 7.142299064828169e-05, + "loss": 1.5844, + "step": 2505 + }, + { + "epoch": 1.9, + "grad_norm": 0.1751974293734832, + "learning_rate": 7.100055414745346e-05, + "loss": 1.6365, + "step": 2510 + }, + { + "epoch": 1.91, + "grad_norm": 0.23254005323825433, + "learning_rate": 7.057868191381936e-05, + "loss": 1.4657, + "step": 2515 + }, + { + "epoch": 1.91, + "grad_norm": 0.264348812986722, + "learning_rate": 7.015738215611079e-05, + "loss": 1.7816, + "step": 2520 + }, + { + "epoch": 1.91, + "grad_norm": 0.27530320883320614, + "learning_rate": 6.973666307191996e-05, + "loss": 1.6751, + "step": 2525 + }, + { + "epoch": 1.92, + "grad_norm": 0.19339613251333393, + "learning_rate": 6.931653284754042e-05, + "loss": 1.7293, + "step": 2530 + }, + { + "epoch": 1.92, + "grad_norm": 0.2151392309486146, + "learning_rate": 6.889699965780787e-05, + "loss": 1.7334, + "step": 2535 + }, + { + "epoch": 1.93, + "grad_norm": 0.22448766537331677, + "learning_rate": 6.847807166594083e-05, + "loss": 1.6827, + "step": 2540 + }, + { + "epoch": 1.93, + "grad_norm": 0.2286115948636003, + "learning_rate": 6.805975702338208e-05, + "loss": 1.6562, + "step": 2545 + }, + { + "epoch": 1.93, + "grad_norm": 0.2118908130790939, + "learning_rate": 6.764206386963991e-05, + "loss": 1.6091, + "step": 2550 + }, + { + "epoch": 1.94, + "grad_norm": 0.240925966059138, + "learning_rate": 6.722500033212974e-05, + "loss": 1.6314, + "step": 2555 + }, + { + "epoch": 1.94, + "grad_norm": 0.2271694074825516, + "learning_rate": 6.680857452601598e-05, + "loss": 1.7589, + "step": 2560 + }, + { + "epoch": 1.94, + "grad_norm": 0.2168118018671656, + "learning_rate": 6.639279455405432e-05, + "loss": 1.6201, + "step": 2565 + }, + { + "epoch": 1.95, + "grad_norm": 0.21224810091098364, + "learning_rate": 6.597766850643361e-05, + "loss": 1.5842, + "step": 2570 + }, + { + "epoch": 1.95, + "grad_norm": 0.19581859607212743, + "learning_rate": 6.556320446061902e-05, + "loss": 1.5586, + "step": 2575 + }, + { + "epoch": 1.96, + "grad_norm": 0.20327112477714954, + "learning_rate": 6.514941048119435e-05, + "loss": 1.6303, + "step": 2580 + }, + { + "epoch": 1.96, + "grad_norm": 0.22810086515914976, + "learning_rate": 6.47362946197055e-05, + "loss": 1.7332, + "step": 2585 + }, + { + "epoch": 1.96, + "grad_norm": 0.22278333474431392, + "learning_rate": 6.432386491450361e-05, + "loss": 1.6293, + "step": 2590 + }, + { + "epoch": 1.97, + "grad_norm": 0.23128655487134384, + "learning_rate": 6.391212939058861e-05, + "loss": 1.6937, + "step": 2595 + }, + { + "epoch": 1.97, + "grad_norm": 0.24641830926598107, + "learning_rate": 6.350109605945323e-05, + "loss": 1.4982, + "step": 2600 + }, + { + "epoch": 1.97, + "grad_norm": 0.24123146757419323, + "learning_rate": 6.309077291892702e-05, + "loss": 1.5107, + "step": 2605 + }, + { + "epoch": 1.98, + "grad_norm": 0.24138969338364216, + "learning_rate": 6.268116795302068e-05, + "loss": 1.5448, + "step": 2610 + }, + { + "epoch": 1.98, + "grad_norm": 0.2515434111696446, + "learning_rate": 6.227228913177081e-05, + "loss": 1.559, + "step": 2615 + }, + { + "epoch": 1.99, + "grad_norm": 0.2554427971564699, + "learning_rate": 6.186414441108487e-05, + "loss": 1.6211, + "step": 2620 + }, + { + "epoch": 1.99, + "grad_norm": 0.20773791558688393, + "learning_rate": 6.14567417325861e-05, + "loss": 1.6058, + "step": 2625 + }, + { + "epoch": 1.99, + "grad_norm": 0.20109572317054908, + "learning_rate": 6.105008902345935e-05, + "loss": 1.5911, + "step": 2630 + }, + { + "epoch": 2.0, + "grad_norm": 0.21186779196561445, + "learning_rate": 6.064419419629662e-05, + "loss": 1.6227, + "step": 2635 + }, + { + "epoch": 2.0, + "grad_norm": 0.2150487580932417, + "learning_rate": 6.023906514894313e-05, + "loss": 1.5839, + "step": 2640 + }, + { + "epoch": 2.01, + "grad_norm": 0.24636199955981808, + "learning_rate": 5.983470976434369e-05, + "loss": 1.5764, + "step": 2645 + }, + { + "epoch": 2.01, + "grad_norm": 0.22093610448062864, + "learning_rate": 5.943113591038928e-05, + "loss": 1.7157, + "step": 2650 + }, + { + "epoch": 2.01, + "grad_norm": 0.21359568862552614, + "learning_rate": 5.902835143976393e-05, + "loss": 1.6359, + "step": 2655 + }, + { + "epoch": 2.02, + "grad_norm": 0.2219633405623727, + "learning_rate": 5.862636418979198e-05, + "loss": 1.6484, + "step": 2660 + }, + { + "epoch": 2.02, + "grad_norm": 0.24148935530595134, + "learning_rate": 5.822518198228565e-05, + "loss": 1.52, + "step": 2665 + }, + { + "epoch": 2.02, + "grad_norm": 0.22871052628894134, + "learning_rate": 5.782481262339261e-05, + "loss": 1.5583, + "step": 2670 + }, + { + "epoch": 2.03, + "grad_norm": 0.18016152517949127, + "learning_rate": 5.742526390344427e-05, + "loss": 1.7094, + "step": 2675 + }, + { + "epoch": 2.03, + "grad_norm": 0.27927714573640977, + "learning_rate": 5.702654359680428e-05, + "loss": 1.7229, + "step": 2680 + }, + { + "epoch": 2.04, + "grad_norm": 0.20272089890919007, + "learning_rate": 5.662865946171696e-05, + "loss": 1.7436, + "step": 2685 + }, + { + "epoch": 2.04, + "grad_norm": 0.25187946618078394, + "learning_rate": 5.6231619240156694e-05, + "loss": 1.5926, + "step": 2690 + }, + { + "epoch": 2.04, + "grad_norm": 0.23619447456603418, + "learning_rate": 5.5835430657676976e-05, + "loss": 1.5177, + "step": 2695 + }, + { + "epoch": 2.05, + "grad_norm": 0.23076862233533377, + "learning_rate": 5.544010142326026e-05, + "loss": 1.6432, + "step": 2700 + }, + { + "epoch": 2.05, + "grad_norm": 0.2509266079111979, + "learning_rate": 5.504563922916799e-05, + "loss": 1.6125, + "step": 2705 + }, + { + "epoch": 2.05, + "grad_norm": 0.26527998507107736, + "learning_rate": 5.4652051750790825e-05, + "loss": 1.5384, + "step": 2710 + }, + { + "epoch": 2.06, + "grad_norm": 0.24254486560490685, + "learning_rate": 5.425934664649921e-05, + "loss": 1.6641, + "step": 2715 + }, + { + "epoch": 2.06, + "grad_norm": 0.22497341374372068, + "learning_rate": 5.3867531557494674e-05, + "loss": 1.4442, + "step": 2720 + }, + { + "epoch": 2.07, + "grad_norm": 0.22811203680708553, + "learning_rate": 5.347661410766087e-05, + "loss": 1.6313, + "step": 2725 + }, + { + "epoch": 2.07, + "grad_norm": 0.2193211927138723, + "learning_rate": 5.308660190341528e-05, + "loss": 1.4835, + "step": 2730 + }, + { + "epoch": 2.07, + "grad_norm": 0.23158894991713072, + "learning_rate": 5.2697502533561226e-05, + "loss": 1.5765, + "step": 2735 + }, + { + "epoch": 2.08, + "grad_norm": 0.2160152191509828, + "learning_rate": 5.230932356914032e-05, + "loss": 1.6395, + "step": 2740 + }, + { + "epoch": 2.08, + "grad_norm": 0.23138300560468752, + "learning_rate": 5.1922072563284986e-05, + "loss": 1.6645, + "step": 2745 + }, + { + "epoch": 2.08, + "grad_norm": 0.27219186986752913, + "learning_rate": 5.153575705107152e-05, + "loss": 1.5842, + "step": 2750 + }, + { + "epoch": 2.09, + "grad_norm": 0.24365055871265076, + "learning_rate": 5.115038454937362e-05, + "loss": 1.7234, + "step": 2755 + }, + { + "epoch": 2.09, + "grad_norm": 0.22921672259925305, + "learning_rate": 5.076596255671592e-05, + "loss": 1.5756, + "step": 2760 + }, + { + "epoch": 2.1, + "grad_norm": 0.2538431765730713, + "learning_rate": 5.0382498553128265e-05, + "loss": 1.6491, + "step": 2765 + }, + { + "epoch": 2.1, + "grad_norm": 0.25913968900209966, + "learning_rate": 5.000000000000002e-05, + "loss": 1.5438, + "step": 2770 + }, + { + "epoch": 2.1, + "grad_norm": 0.291257818004918, + "learning_rate": 4.9618474339934916e-05, + "loss": 1.5995, + "step": 2775 + }, + { + "epoch": 2.11, + "grad_norm": 0.24432948267207238, + "learning_rate": 4.9237928996606384e-05, + "loss": 1.5999, + "step": 2780 + }, + { + "epoch": 2.11, + "grad_norm": 0.26418330324646966, + "learning_rate": 4.88583713746129e-05, + "loss": 1.7175, + "step": 2785 + }, + { + "epoch": 2.12, + "grad_norm": 0.2647804130194954, + "learning_rate": 4.8479808859333964e-05, + "loss": 1.5083, + "step": 2790 + }, + { + "epoch": 2.12, + "grad_norm": 0.23990236642151055, + "learning_rate": 4.810224881678652e-05, + "loss": 1.5032, + "step": 2795 + }, + { + "epoch": 2.12, + "grad_norm": 0.22406476212806528, + "learning_rate": 4.772569859348156e-05, + "loss": 1.6183, + "step": 2800 + }, + { + "epoch": 2.13, + "grad_norm": 0.17599248862626268, + "learning_rate": 4.735016551628095e-05, + "loss": 1.694, + "step": 2805 + }, + { + "epoch": 2.13, + "grad_norm": 0.27545889362059484, + "learning_rate": 4.697565689225528e-05, + "loss": 1.6074, + "step": 2810 + }, + { + "epoch": 2.13, + "grad_norm": 0.27997532830437954, + "learning_rate": 4.660218000854143e-05, + "loss": 1.5062, + "step": 2815 + }, + { + "epoch": 2.14, + "grad_norm": 0.2803170335965896, + "learning_rate": 4.6229742132200746e-05, + "loss": 1.6516, + "step": 2820 + }, + { + "epoch": 2.14, + "grad_norm": 0.22582531196940026, + "learning_rate": 4.585835051007774e-05, + "loss": 1.6168, + "step": 2825 + }, + { + "epoch": 2.15, + "grad_norm": 0.22856148303418752, + "learning_rate": 4.548801236865912e-05, + "loss": 1.5435, + "step": 2830 + }, + { + "epoch": 2.15, + "grad_norm": 0.2764784030904549, + "learning_rate": 4.511873491393304e-05, + "loss": 1.6409, + "step": 2835 + }, + { + "epoch": 2.15, + "grad_norm": 0.21257264261069672, + "learning_rate": 4.475052533124893e-05, + "loss": 1.5581, + "step": 2840 + }, + { + "epoch": 2.16, + "grad_norm": 0.21196439275175047, + "learning_rate": 4.438339078517785e-05, + "loss": 1.5538, + "step": 2845 + }, + { + "epoch": 2.16, + "grad_norm": 0.2832145647608719, + "learning_rate": 4.401733841937279e-05, + "loss": 1.724, + "step": 2850 + }, + { + "epoch": 2.16, + "grad_norm": 0.27147849615384506, + "learning_rate": 4.3652375356429974e-05, + "loss": 1.5014, + "step": 2855 + }, + { + "epoch": 2.17, + "grad_norm": 0.2610576760484019, + "learning_rate": 4.328850869775001e-05, + "loss": 1.6749, + "step": 2860 + }, + { + "epoch": 2.17, + "grad_norm": 0.23914287887699434, + "learning_rate": 4.292574552339981e-05, + "loss": 1.5328, + "step": 2865 + }, + { + "epoch": 2.18, + "grad_norm": 0.24065502762902322, + "learning_rate": 4.256409289197495e-05, + "loss": 1.5942, + "step": 2870 + }, + { + "epoch": 2.18, + "grad_norm": 0.2083191016158885, + "learning_rate": 4.2203557840462214e-05, + "loss": 1.5539, + "step": 2875 + }, + { + "epoch": 2.18, + "grad_norm": 0.20639182389301813, + "learning_rate": 4.184414738410248e-05, + "loss": 1.5646, + "step": 2880 + }, + { + "epoch": 2.19, + "grad_norm": 0.23727403239283584, + "learning_rate": 4.148586851625461e-05, + "loss": 1.5353, + "step": 2885 + }, + { + "epoch": 2.19, + "grad_norm": 0.24508287577637505, + "learning_rate": 4.112872820825915e-05, + "loss": 1.4418, + "step": 2890 + }, + { + "epoch": 2.19, + "grad_norm": 0.2475936795575314, + "learning_rate": 4.077273340930263e-05, + "loss": 1.6643, + "step": 2895 + }, + { + "epoch": 2.2, + "grad_norm": 0.2505899184192717, + "learning_rate": 4.041789104628241e-05, + "loss": 1.5577, + "step": 2900 + }, + { + "epoch": 2.2, + "grad_norm": 0.24093576954008833, + "learning_rate": 4.006420802367205e-05, + "loss": 1.6784, + "step": 2905 + }, + { + "epoch": 2.21, + "grad_norm": 0.2561236323684272, + "learning_rate": 3.971169122338668e-05, + "loss": 1.6165, + "step": 2910 + }, + { + "epoch": 2.21, + "grad_norm": 0.24280603594696593, + "learning_rate": 3.936034750464927e-05, + "loss": 1.6695, + "step": 2915 + }, + { + "epoch": 2.21, + "grad_norm": 0.2602730047803284, + "learning_rate": 3.901018370385724e-05, + "loss": 1.5697, + "step": 2920 + }, + { + "epoch": 2.22, + "grad_norm": 0.21146640994821633, + "learning_rate": 3.866120663444914e-05, + "loss": 1.5399, + "step": 2925 + }, + { + "epoch": 2.22, + "grad_norm": 0.24075711387924426, + "learning_rate": 3.831342308677247e-05, + "loss": 1.5597, + "step": 2930 + }, + { + "epoch": 2.23, + "grad_norm": 0.24793331779495362, + "learning_rate": 3.7966839827951196e-05, + "loss": 1.6434, + "step": 2935 + }, + { + "epoch": 2.23, + "grad_norm": 0.19558506394109187, + "learning_rate": 3.762146360175427e-05, + "loss": 1.6499, + "step": 2940 + }, + { + "epoch": 2.23, + "grad_norm": 0.35587028915030966, + "learning_rate": 3.727730112846444e-05, + "loss": 1.5089, + "step": 2945 + }, + { + "epoch": 2.24, + "grad_norm": 0.2570330063437446, + "learning_rate": 3.693435910474732e-05, + "loss": 1.6548, + "step": 2950 + }, + { + "epoch": 2.24, + "grad_norm": 0.28077059284475103, + "learning_rate": 3.659264420352122e-05, + "loss": 1.6528, + "step": 2955 + }, + { + "epoch": 2.24, + "grad_norm": 0.23035257395244374, + "learning_rate": 3.6252163073827294e-05, + "loss": 1.4482, + "step": 2960 + }, + { + "epoch": 2.25, + "grad_norm": 0.2051186918638722, + "learning_rate": 3.5912922340700206e-05, + "loss": 1.5015, + "step": 2965 + }, + { + "epoch": 2.25, + "grad_norm": 0.22455945185810877, + "learning_rate": 3.557492860503893e-05, + "loss": 1.5176, + "step": 2970 + }, + { + "epoch": 2.26, + "grad_norm": 0.23453638209680727, + "learning_rate": 3.5238188443478795e-05, + "loss": 1.6343, + "step": 2975 + }, + { + "epoch": 2.26, + "grad_norm": 0.24470156257503126, + "learning_rate": 3.4902708408263066e-05, + "loss": 1.7663, + "step": 2980 + }, + { + "epoch": 2.26, + "grad_norm": 0.23135832322132918, + "learning_rate": 3.45684950271158e-05, + "loss": 1.5837, + "step": 2985 + }, + { + "epoch": 2.27, + "grad_norm": 0.2608640064079802, + "learning_rate": 3.423555480311457e-05, + "loss": 1.6173, + "step": 2990 + }, + { + "epoch": 2.27, + "grad_norm": 0.31078928098679404, + "learning_rate": 3.3903894214564026e-05, + "loss": 1.5177, + "step": 2995 + }, + { + "epoch": 2.27, + "grad_norm": 0.26258430453244713, + "learning_rate": 3.3573519714869914e-05, + "loss": 1.6865, + "step": 3000 + }, + { + "epoch": 2.28, + "grad_norm": 0.2733284038434726, + "learning_rate": 3.324443773241349e-05, + "loss": 1.3619, + "step": 3005 + }, + { + "epoch": 2.28, + "grad_norm": 0.2369163548191094, + "learning_rate": 3.291665467042618e-05, + "loss": 1.6509, + "step": 3010 + }, + { + "epoch": 2.29, + "grad_norm": 0.2664340527286697, + "learning_rate": 3.25901769068654e-05, + "loss": 1.6038, + "step": 3015 + }, + { + "epoch": 2.29, + "grad_norm": 0.23398120063750877, + "learning_rate": 3.2265010794290195e-05, + "loss": 1.663, + "step": 3020 + }, + { + "epoch": 2.29, + "grad_norm": 0.2781275708933271, + "learning_rate": 3.1941162659737647e-05, + "loss": 1.6429, + "step": 3025 + }, + { + "epoch": 2.3, + "grad_norm": 0.2687866606825216, + "learning_rate": 3.16186388045998e-05, + "loss": 1.6853, + "step": 3030 + }, + { + "epoch": 2.3, + "grad_norm": 0.23644485510225058, + "learning_rate": 3.129744550450113e-05, + "loss": 1.6027, + "step": 3035 + }, + { + "epoch": 2.3, + "grad_norm": 0.24644290933624716, + "learning_rate": 3.09775890091763e-05, + "loss": 1.6018, + "step": 3040 + }, + { + "epoch": 2.31, + "grad_norm": 0.2259139537131363, + "learning_rate": 3.065907554234858e-05, + "loss": 1.6607, + "step": 3045 + }, + { + "epoch": 2.31, + "grad_norm": 0.24004959008038543, + "learning_rate": 3.034191130160887e-05, + "loss": 1.5377, + "step": 3050 + }, + { + "epoch": 2.32, + "grad_norm": 0.2213008661812979, + "learning_rate": 3.0026102458294924e-05, + "loss": 1.5613, + "step": 3055 + }, + { + "epoch": 2.32, + "grad_norm": 0.2079094581228579, + "learning_rate": 2.9711655157371443e-05, + "loss": 1.5085, + "step": 3060 + }, + { + "epoch": 2.32, + "grad_norm": 0.2527748569210639, + "learning_rate": 2.9398575517310355e-05, + "loss": 1.5855, + "step": 3065 + }, + { + "epoch": 2.33, + "grad_norm": 0.2141370968928817, + "learning_rate": 2.9086869629971836e-05, + "loss": 1.5732, + "step": 3070 + }, + { + "epoch": 2.33, + "grad_norm": 0.24493685886391817, + "learning_rate": 2.8776543560485857e-05, + "loss": 1.6197, + "step": 3075 + }, + { + "epoch": 2.34, + "grad_norm": 0.2316788505534105, + "learning_rate": 2.8467603347133997e-05, + "loss": 1.648, + "step": 3080 + }, + { + "epoch": 2.34, + "grad_norm": 0.25146411778731, + "learning_rate": 2.816005500123203e-05, + "loss": 1.5525, + "step": 3085 + }, + { + "epoch": 2.34, + "grad_norm": 0.22407696629199808, + "learning_rate": 2.785390450701303e-05, + "loss": 1.7218, + "step": 3090 + }, + { + "epoch": 2.35, + "grad_norm": 0.27013300544460844, + "learning_rate": 2.7549157821510885e-05, + "loss": 1.5804, + "step": 3095 + }, + { + "epoch": 2.35, + "grad_norm": 0.25388595748221704, + "learning_rate": 2.7245820874444272e-05, + "loss": 1.7398, + "step": 3100 + }, + { + "epoch": 2.35, + "grad_norm": 0.19843759758285218, + "learning_rate": 2.6943899568101405e-05, + "loss": 1.6999, + "step": 3105 + }, + { + "epoch": 2.36, + "grad_norm": 0.20783915655026464, + "learning_rate": 2.6643399777225232e-05, + "loss": 1.6114, + "step": 3110 + }, + { + "epoch": 2.36, + "grad_norm": 0.2496800397125067, + "learning_rate": 2.6344327348898958e-05, + "loss": 1.5217, + "step": 3115 + }, + { + "epoch": 2.37, + "grad_norm": 0.22235249882770752, + "learning_rate": 2.6046688102432382e-05, + "loss": 1.6871, + "step": 3120 + }, + { + "epoch": 2.37, + "grad_norm": 0.2462186333352102, + "learning_rate": 2.5750487829248726e-05, + "loss": 1.7788, + "step": 3125 + }, + { + "epoch": 2.37, + "grad_norm": 0.20018170839209692, + "learning_rate": 2.545573229277175e-05, + "loss": 1.6076, + "step": 3130 + }, + { + "epoch": 2.38, + "grad_norm": 0.2704237119402894, + "learning_rate": 2.5162427228313857e-05, + "loss": 1.6456, + "step": 3135 + }, + { + "epoch": 2.38, + "grad_norm": 0.2735737465777087, + "learning_rate": 2.4870578342964245e-05, + "loss": 1.6402, + "step": 3140 + }, + { + "epoch": 2.38, + "grad_norm": 0.2188413596766906, + "learning_rate": 2.458019131547803e-05, + "loss": 1.5193, + "step": 3145 + }, + { + "epoch": 2.39, + "grad_norm": 0.2821633184600081, + "learning_rate": 2.429127179616575e-05, + "loss": 1.6363, + "step": 3150 + }, + { + "epoch": 2.39, + "grad_norm": 0.20714886526036308, + "learning_rate": 2.4003825406783308e-05, + "loss": 1.669, + "step": 3155 + }, + { + "epoch": 2.4, + "grad_norm": 0.2661408497359453, + "learning_rate": 2.3717857740422644e-05, + "loss": 1.5488, + "step": 3160 + }, + { + "epoch": 2.4, + "grad_norm": 0.2535527034852724, + "learning_rate": 2.343337436140295e-05, + "loss": 1.5851, + "step": 3165 + }, + { + "epoch": 2.4, + "grad_norm": 0.2629746106882043, + "learning_rate": 2.3150380805162418e-05, + "loss": 1.5467, + "step": 3170 + }, + { + "epoch": 2.41, + "grad_norm": 0.24098285831571226, + "learning_rate": 2.2868882578150285e-05, + "loss": 1.6417, + "step": 3175 + }, + { + "epoch": 2.41, + "grad_norm": 0.28638431202213366, + "learning_rate": 2.258888515772005e-05, + "loss": 1.6915, + "step": 3180 + }, + { + "epoch": 2.41, + "grad_norm": 0.319171053435643, + "learning_rate": 2.2310393992022704e-05, + "loss": 1.6324, + "step": 3185 + }, + { + "epoch": 2.42, + "grad_norm": 0.2054749944090956, + "learning_rate": 2.2033414499900685e-05, + "loss": 1.5694, + "step": 3190 + }, + { + "epoch": 2.42, + "grad_norm": 0.2515694982134836, + "learning_rate": 2.1757952070782504e-05, + "loss": 1.598, + "step": 3195 + }, + { + "epoch": 2.43, + "grad_norm": 0.23267628383812705, + "learning_rate": 2.148401206457793e-05, + "loss": 1.4513, + "step": 3200 + }, + { + "epoch": 2.43, + "grad_norm": 0.25390773868938254, + "learning_rate": 2.121159981157359e-05, + "loss": 1.5906, + "step": 3205 + }, + { + "epoch": 2.43, + "grad_norm": 0.251154990702733, + "learning_rate": 2.0940720612329258e-05, + "loss": 1.4707, + "step": 3210 + }, + { + "epoch": 2.44, + "grad_norm": 0.24909067323121328, + "learning_rate": 2.067137973757489e-05, + "loss": 1.6214, + "step": 3215 + }, + { + "epoch": 2.44, + "grad_norm": 0.23515254331621996, + "learning_rate": 2.0403582428107792e-05, + "loss": 1.3762, + "step": 3220 + }, + { + "epoch": 2.45, + "grad_norm": 0.24320094875542947, + "learning_rate": 2.0137333894690912e-05, + "loss": 1.4732, + "step": 3225 + }, + { + "epoch": 2.45, + "grad_norm": 0.26976839590657536, + "learning_rate": 1.987263931795126e-05, + "loss": 1.5325, + "step": 3230 + }, + { + "epoch": 2.45, + "grad_norm": 0.2480855244121356, + "learning_rate": 1.9609503848279144e-05, + "loss": 1.6336, + "step": 3235 + }, + { + "epoch": 2.46, + "grad_norm": 0.23767732175608752, + "learning_rate": 1.9347932605728093e-05, + "loss": 1.564, + "step": 3240 + }, + { + "epoch": 2.46, + "grad_norm": 0.2727265524309786, + "learning_rate": 1.9087930679915023e-05, + "loss": 1.6079, + "step": 3245 + }, + { + "epoch": 2.46, + "grad_norm": 0.22286517973459688, + "learning_rate": 1.882950312992131e-05, + "loss": 1.4002, + "step": 3250 + }, + { + "epoch": 2.47, + "grad_norm": 0.2456900771009275, + "learning_rate": 1.8572654984194392e-05, + "loss": 1.5994, + "step": 3255 + }, + { + "epoch": 2.47, + "grad_norm": 0.2771873738066393, + "learning_rate": 1.8317391240449876e-05, + "loss": 1.6214, + "step": 3260 + }, + { + "epoch": 2.48, + "grad_norm": 0.2770007292533942, + "learning_rate": 1.8063716865574266e-05, + "loss": 1.4663, + "step": 3265 + }, + { + "epoch": 2.48, + "grad_norm": 0.24034756553369535, + "learning_rate": 1.781163679552831e-05, + "loss": 1.6507, + "step": 3270 + }, + { + "epoch": 2.48, + "grad_norm": 0.2286386450562912, + "learning_rate": 1.7561155935251094e-05, + "loss": 1.5512, + "step": 3275 + }, + { + "epoch": 2.49, + "grad_norm": 0.2594167587325395, + "learning_rate": 1.7312279158564415e-05, + "loss": 1.6027, + "step": 3280 + }, + { + "epoch": 2.49, + "grad_norm": 0.2127951461073897, + "learning_rate": 1.706501130807806e-05, + "loss": 1.6896, + "step": 3285 + }, + { + "epoch": 2.49, + "grad_norm": 0.2796245456905501, + "learning_rate": 1.6819357195095597e-05, + "loss": 1.6376, + "step": 3290 + }, + { + "epoch": 2.5, + "grad_norm": 0.23110557133342613, + "learning_rate": 1.657532159952062e-05, + "loss": 1.5277, + "step": 3295 + }, + { + "epoch": 2.5, + "grad_norm": 0.24542029689976314, + "learning_rate": 1.6332909269763953e-05, + "loss": 1.7143, + "step": 3300 + }, + { + "epoch": 2.51, + "grad_norm": 0.23539162074782163, + "learning_rate": 1.609212492265103e-05, + "loss": 1.7028, + "step": 3305 + }, + { + "epoch": 2.51, + "grad_norm": 0.2629785684260658, + "learning_rate": 1.585297324333027e-05, + "loss": 1.4392, + "step": 3310 + }, + { + "epoch": 2.51, + "grad_norm": 0.2933973664128153, + "learning_rate": 1.561545888518192e-05, + "loss": 1.7234, + "step": 3315 + }, + { + "epoch": 2.52, + "grad_norm": 0.23954470728817145, + "learning_rate": 1.537958646972737e-05, + "loss": 1.4944, + "step": 3320 + }, + { + "epoch": 2.52, + "grad_norm": 0.23980954615598538, + "learning_rate": 1.5145360586539336e-05, + "loss": 1.5851, + "step": 3325 + }, + { + "epoch": 2.52, + "grad_norm": 0.21087011175193957, + "learning_rate": 1.4912785793152583e-05, + "loss": 1.5208, + "step": 3330 + }, + { + "epoch": 2.53, + "grad_norm": 0.23976449951280604, + "learning_rate": 1.4681866614975227e-05, + "loss": 1.5722, + "step": 3335 + }, + { + "epoch": 2.53, + "grad_norm": 0.22800754377440097, + "learning_rate": 1.4452607545200492e-05, + "loss": 1.6206, + "step": 3340 + }, + { + "epoch": 2.54, + "grad_norm": 0.21262175469660566, + "learning_rate": 1.4225013044719615e-05, + "loss": 1.5784, + "step": 3345 + }, + { + "epoch": 2.54, + "grad_norm": 0.2436947408558131, + "learning_rate": 1.3999087542034817e-05, + "loss": 1.5594, + "step": 3350 + }, + { + "epoch": 2.54, + "grad_norm": 0.23425763672194239, + "learning_rate": 1.3774835433173172e-05, + "loss": 1.6784, + "step": 3355 + }, + { + "epoch": 2.55, + "grad_norm": 0.22447628853598572, + "learning_rate": 1.3552261081601091e-05, + "loss": 1.6606, + "step": 3360 + }, + { + "epoch": 2.55, + "grad_norm": 0.2410908531230671, + "learning_rate": 1.3331368818139445e-05, + "loss": 1.5011, + "step": 3365 + }, + { + "epoch": 2.55, + "grad_norm": 0.2107811996936363, + "learning_rate": 1.3112162940879225e-05, + "loss": 1.6211, + "step": 3370 + }, + { + "epoch": 2.56, + "grad_norm": 0.23349707446690013, + "learning_rate": 1.289464771509804e-05, + "loss": 1.4912, + "step": 3375 + }, + { + "epoch": 2.56, + "grad_norm": 0.23161951144663487, + "learning_rate": 1.2678827373176894e-05, + "loss": 1.5809, + "step": 3380 + }, + { + "epoch": 2.57, + "grad_norm": 0.23879959809777346, + "learning_rate": 1.2464706114518088e-05, + "loss": 1.6276, + "step": 3385 + }, + { + "epoch": 2.57, + "grad_norm": 0.2421829350347233, + "learning_rate": 1.2252288105463405e-05, + "loss": 1.6212, + "step": 3390 + }, + { + "epoch": 2.57, + "grad_norm": 0.21883362123837063, + "learning_rate": 1.2041577479212963e-05, + "loss": 1.6288, + "step": 3395 + }, + { + "epoch": 2.58, + "grad_norm": 0.22802087223126732, + "learning_rate": 1.1832578335744882e-05, + "loss": 1.6313, + "step": 3400 + }, + { + "epoch": 2.58, + "grad_norm": 0.26121795799269726, + "learning_rate": 1.1625294741735526e-05, + "loss": 1.656, + "step": 3405 + }, + { + "epoch": 2.59, + "grad_norm": 0.24409811650460989, + "learning_rate": 1.1419730730480305e-05, + "loss": 1.618, + "step": 3410 + }, + { + "epoch": 2.59, + "grad_norm": 0.21817658760534914, + "learning_rate": 1.1215890301815201e-05, + "loss": 1.5273, + "step": 3415 + }, + { + "epoch": 2.59, + "grad_norm": 0.21593460359416924, + "learning_rate": 1.101377742203903e-05, + "loss": 1.5447, + "step": 3420 + }, + { + "epoch": 2.6, + "grad_norm": 0.2425820551445123, + "learning_rate": 1.0813396023836142e-05, + "loss": 1.5712, + "step": 3425 + }, + { + "epoch": 2.6, + "grad_norm": 0.2625166861814797, + "learning_rate": 1.0614750006200014e-05, + "loss": 1.6605, + "step": 3430 + }, + { + "epoch": 2.6, + "grad_norm": 0.2615766591081601, + "learning_rate": 1.0417843234357282e-05, + "loss": 1.5986, + "step": 3435 + }, + { + "epoch": 2.61, + "grad_norm": 0.2581808450444552, + "learning_rate": 1.022267953969257e-05, + "loss": 1.641, + "step": 3440 + }, + { + "epoch": 2.61, + "grad_norm": 0.2746391682993569, + "learning_rate": 1.0029262719674015e-05, + "loss": 1.6293, + "step": 3445 + }, + { + "epoch": 2.62, + "grad_norm": 0.20322726411468045, + "learning_rate": 9.837596537779237e-06, + "loss": 1.5418, + "step": 3450 + }, + { + "epoch": 2.62, + "grad_norm": 0.24234603845064367, + "learning_rate": 9.647684723422213e-06, + "loss": 1.6451, + "step": 3455 + }, + { + "epoch": 2.62, + "grad_norm": 0.24585798612987492, + "learning_rate": 9.459530971880681e-06, + "loss": 1.5217, + "step": 3460 + }, + { + "epoch": 2.63, + "grad_norm": 0.24090792198358563, + "learning_rate": 9.27313894422428e-06, + "loss": 1.7077, + "step": 3465 + }, + { + "epoch": 2.63, + "grad_norm": 0.2794920103874086, + "learning_rate": 9.088512267243143e-06, + "loss": 1.7315, + "step": 3470 + }, + { + "epoch": 2.63, + "grad_norm": 0.21561282445650495, + "learning_rate": 8.905654533377583e-06, + "loss": 1.6059, + "step": 3475 + }, + { + "epoch": 2.64, + "grad_norm": 0.2348946654088957, + "learning_rate": 8.724569300648034e-06, + "loss": 1.7123, + "step": 3480 + }, + { + "epoch": 2.64, + "grad_norm": 0.29915508239234007, + "learning_rate": 8.545260092585805e-06, + "loss": 1.6167, + "step": 3485 + }, + { + "epoch": 2.65, + "grad_norm": 0.2609864197177778, + "learning_rate": 8.367730398164574e-06, + "loss": 1.6634, + "step": 3490 + }, + { + "epoch": 2.65, + "grad_norm": 0.32383778611721475, + "learning_rate": 8.19198367173255e-06, + "loss": 1.631, + "step": 3495 + }, + { + "epoch": 2.65, + "grad_norm": 0.24225506413140852, + "learning_rate": 8.018023332945112e-06, + "loss": 1.5466, + "step": 3500 + }, + { + "epoch": 2.66, + "grad_norm": 0.2686708766662986, + "learning_rate": 7.845852766698426e-06, + "loss": 1.5889, + "step": 3505 + }, + { + "epoch": 2.66, + "grad_norm": 0.24593185494412043, + "learning_rate": 7.675475323063475e-06, + "loss": 1.5796, + "step": 3510 + }, + { + "epoch": 2.66, + "grad_norm": 0.27604151432217455, + "learning_rate": 7.5068943172209025e-06, + "loss": 1.6281, + "step": 3515 + }, + { + "epoch": 2.67, + "grad_norm": 0.2211924967690316, + "learning_rate": 7.340113029396567e-06, + "loss": 1.5407, + "step": 3520 + }, + { + "epoch": 2.67, + "grad_norm": 0.2549240432311639, + "learning_rate": 7.175134704797592e-06, + "loss": 1.6782, + "step": 3525 + }, + { + "epoch": 2.68, + "grad_norm": 0.22194169097137223, + "learning_rate": 7.011962553549345e-06, + "loss": 1.639, + "step": 3530 + }, + { + "epoch": 2.68, + "grad_norm": 0.24109057602354814, + "learning_rate": 6.8505997506329024e-06, + "loss": 1.6421, + "step": 3535 + }, + { + "epoch": 2.68, + "grad_norm": 0.298662548692409, + "learning_rate": 6.691049435823327e-06, + "loss": 1.5672, + "step": 3540 + }, + { + "epoch": 2.69, + "grad_norm": 0.21821362720901652, + "learning_rate": 6.533314713628458e-06, + "loss": 1.5832, + "step": 3545 + }, + { + "epoch": 2.69, + "grad_norm": 0.28320776205122955, + "learning_rate": 6.377398653228661e-06, + "loss": 1.5686, + "step": 3550 + }, + { + "epoch": 2.7, + "grad_norm": 0.2647885175395758, + "learning_rate": 6.22330428841702e-06, + "loss": 1.3694, + "step": 3555 + }, + { + "epoch": 2.7, + "grad_norm": 0.23693055785085496, + "learning_rate": 6.071034617540294e-06, + "loss": 1.4096, + "step": 3560 + }, + { + "epoch": 2.7, + "grad_norm": 0.2811316512655128, + "learning_rate": 5.9205926034406e-06, + "loss": 1.7223, + "step": 3565 + }, + { + "epoch": 2.71, + "grad_norm": 0.25921259473226205, + "learning_rate": 5.771981173397811e-06, + "loss": 1.6491, + "step": 3570 + }, + { + "epoch": 2.71, + "grad_norm": 0.2384614576434685, + "learning_rate": 5.625203219072495e-06, + "loss": 1.5796, + "step": 3575 + }, + { + "epoch": 2.71, + "grad_norm": 0.24972301573225342, + "learning_rate": 5.480261596449698e-06, + "loss": 1.6484, + "step": 3580 + }, + { + "epoch": 2.72, + "grad_norm": 0.2790877252142927, + "learning_rate": 5.337159125783453e-06, + "loss": 1.6747, + "step": 3585 + }, + { + "epoch": 2.72, + "grad_norm": 0.26526860024829096, + "learning_rate": 5.195898591541748e-06, + "loss": 1.631, + "step": 3590 + }, + { + "epoch": 2.73, + "grad_norm": 0.2359453480631305, + "learning_rate": 5.056482742352486e-06, + "loss": 1.5224, + "step": 3595 + }, + { + "epoch": 2.73, + "grad_norm": 0.2749211055400865, + "learning_rate": 4.9189142909498945e-06, + "loss": 1.5348, + "step": 3600 + }, + { + "epoch": 2.73, + "grad_norm": 0.20381886685736014, + "learning_rate": 4.783195914121818e-06, + "loss": 1.6092, + "step": 3605 + }, + { + "epoch": 2.74, + "grad_norm": 0.27694673318367125, + "learning_rate": 4.649330252657613e-06, + "loss": 1.5524, + "step": 3610 + }, + { + "epoch": 2.74, + "grad_norm": 0.24104039690274334, + "learning_rate": 4.517319911296747e-06, + "loss": 1.6131, + "step": 3615 + }, + { + "epoch": 2.74, + "grad_norm": 0.21654379256196502, + "learning_rate": 4.387167458678121e-06, + "loss": 1.5537, + "step": 3620 + }, + { + "epoch": 2.75, + "grad_norm": 0.22452456642122062, + "learning_rate": 4.2588754272900985e-06, + "loss": 1.5051, + "step": 3625 + }, + { + "epoch": 2.75, + "grad_norm": 0.2338963199910069, + "learning_rate": 4.132446313421246e-06, + "loss": 1.6882, + "step": 3630 + }, + { + "epoch": 2.76, + "grad_norm": 0.2584997684032959, + "learning_rate": 4.00788257711171e-06, + "loss": 1.5014, + "step": 3635 + }, + { + "epoch": 2.76, + "grad_norm": 0.2908213514842905, + "learning_rate": 3.885186642105376e-06, + "loss": 1.6277, + "step": 3640 + }, + { + "epoch": 2.76, + "grad_norm": 0.19233168126321265, + "learning_rate": 3.7643608958027543e-06, + "loss": 1.7565, + "step": 3645 + }, + { + "epoch": 2.77, + "grad_norm": 0.270149572768491, + "learning_rate": 3.6454076892144418e-06, + "loss": 1.6004, + "step": 3650 + }, + { + "epoch": 2.77, + "grad_norm": 0.2403966892086897, + "learning_rate": 3.5283293369154036e-06, + "loss": 1.5425, + "step": 3655 + }, + { + "epoch": 2.77, + "grad_norm": 0.1959498671099479, + "learning_rate": 3.4131281170000083e-06, + "loss": 1.6043, + "step": 3660 + }, + { + "epoch": 2.78, + "grad_norm": 0.2291068939021477, + "learning_rate": 3.2998062710375864e-06, + "loss": 1.6167, + "step": 3665 + }, + { + "epoch": 2.78, + "grad_norm": 0.2669658026144572, + "learning_rate": 3.188366004028931e-06, + "loss": 1.6093, + "step": 3670 + }, + { + "epoch": 2.79, + "grad_norm": 0.21167755912643296, + "learning_rate": 3.0788094843632655e-06, + "loss": 1.6288, + "step": 3675 + }, + { + "epoch": 2.79, + "grad_norm": 0.27869409187190786, + "learning_rate": 2.9711388437761445e-06, + "loss": 1.5781, + "step": 3680 + }, + { + "epoch": 2.79, + "grad_norm": 0.21975752059084885, + "learning_rate": 2.8653561773079764e-06, + "loss": 1.6193, + "step": 3685 + }, + { + "epoch": 2.8, + "grad_norm": 0.22758480435581485, + "learning_rate": 2.7614635432632097e-06, + "loss": 1.7111, + "step": 3690 + }, + { + "epoch": 2.8, + "grad_norm": 0.253380043181827, + "learning_rate": 2.6594629631702783e-06, + "loss": 1.6528, + "step": 3695 + }, + { + "epoch": 2.81, + "grad_norm": 0.22066629732671186, + "learning_rate": 2.5593564217423314e-06, + "loss": 1.5717, + "step": 3700 + }, + { + "epoch": 2.81, + "grad_norm": 0.20545309594834268, + "learning_rate": 2.461145866838599e-06, + "loss": 1.5816, + "step": 3705 + }, + { + "epoch": 2.81, + "grad_norm": 0.2298245690861381, + "learning_rate": 2.364833209426376e-06, + "loss": 1.5273, + "step": 3710 + }, + { + "epoch": 2.82, + "grad_norm": 0.28452051640315046, + "learning_rate": 2.270420323544009e-06, + "loss": 1.5568, + "step": 3715 + }, + { + "epoch": 2.82, + "grad_norm": 0.20796391559506347, + "learning_rate": 2.177909046264348e-06, + "loss": 1.6991, + "step": 3720 + }, + { + "epoch": 2.82, + "grad_norm": 0.26581893695586506, + "learning_rate": 2.0873011776589957e-06, + "loss": 1.517, + "step": 3725 + }, + { + "epoch": 2.83, + "grad_norm": 0.22796087968697157, + "learning_rate": 1.998598480763247e-06, + "loss": 1.7992, + "step": 3730 + }, + { + "epoch": 2.83, + "grad_norm": 0.2336977716987997, + "learning_rate": 1.911802681541919e-06, + "loss": 1.513, + "step": 3735 + }, + { + "epoch": 2.84, + "grad_norm": 0.2497871821535283, + "learning_rate": 1.8269154688556056e-06, + "loss": 1.5704, + "step": 3740 + }, + { + "epoch": 2.84, + "grad_norm": 0.23620748564286875, + "learning_rate": 1.7439384944279213e-06, + "loss": 1.4392, + "step": 3745 + }, + { + "epoch": 2.84, + "grad_norm": 0.23378030531695476, + "learning_rate": 1.6628733728133227e-06, + "loss": 1.5813, + "step": 3750 + }, + { + "epoch": 2.85, + "grad_norm": 0.2146995504847581, + "learning_rate": 1.5837216813656908e-06, + "loss": 1.5966, + "step": 3755 + }, + { + "epoch": 2.85, + "grad_norm": 0.2382092413535891, + "learning_rate": 1.506484960207677e-06, + "loss": 1.553, + "step": 3760 + }, + { + "epoch": 2.85, + "grad_norm": 0.19096405564266747, + "learning_rate": 1.4311647122006721e-06, + "loss": 1.5538, + "step": 3765 + }, + { + "epoch": 2.86, + "grad_norm": 0.2877533644995907, + "learning_rate": 1.3577624029155966e-06, + "loss": 1.5703, + "step": 3770 + }, + { + "epoch": 2.86, + "grad_norm": 0.25619739351479454, + "learning_rate": 1.2862794606044337e-06, + "loss": 1.4537, + "step": 3775 + }, + { + "epoch": 2.87, + "grad_norm": 0.17985034388431237, + "learning_rate": 1.216717276172341e-06, + "loss": 1.7393, + "step": 3780 + }, + { + "epoch": 2.87, + "grad_norm": 0.24865342548144834, + "learning_rate": 1.1490772031506392e-06, + "loss": 1.681, + "step": 3785 + }, + { + "epoch": 2.87, + "grad_norm": 0.2682691239787855, + "learning_rate": 1.0833605576705096e-06, + "loss": 1.7253, + "step": 3790 + }, + { + "epoch": 2.88, + "grad_norm": 0.2694273976328748, + "learning_rate": 1.0195686184373166e-06, + "loss": 1.5678, + "step": 3795 + }, + { + "epoch": 2.88, + "grad_norm": 0.21203591608829347, + "learning_rate": 9.577026267057476e-07, + "loss": 1.615, + "step": 3800 + }, + { + "epoch": 2.88, + "grad_norm": 0.24345143310387468, + "learning_rate": 8.97763786255712e-07, + "loss": 1.5338, + "step": 3805 + }, + { + "epoch": 2.89, + "grad_norm": 0.2579598936883795, + "learning_rate": 8.397532633688254e-07, + "loss": 1.5515, + "step": 3810 + }, + { + "epoch": 2.89, + "grad_norm": 0.2434697228699266, + "learning_rate": 7.836721868058061e-07, + "loss": 1.7675, + "step": 3815 + }, + { + "epoch": 2.9, + "grad_norm": 0.2440247854883621, + "learning_rate": 7.295216477844702e-07, + "loss": 1.6179, + "step": 3820 + }, + { + "epoch": 2.9, + "grad_norm": 0.25830362738889806, + "learning_rate": 6.773026999584708e-07, + "loss": 1.698, + "step": 3825 + }, + { + "epoch": 2.9, + "grad_norm": 0.2622736838540384, + "learning_rate": 6.270163593968703e-07, + "loss": 1.6485, + "step": 3830 + }, + { + "epoch": 2.91, + "grad_norm": 0.2065165799768009, + "learning_rate": 5.786636045643112e-07, + "loss": 1.6278, + "step": 3835 + }, + { + "epoch": 2.91, + "grad_norm": 0.22985561856815895, + "learning_rate": 5.322453763019653e-07, + "loss": 1.5524, + "step": 3840 + }, + { + "epoch": 2.92, + "grad_norm": 0.2727208719350262, + "learning_rate": 4.877625778092809e-07, + "loss": 1.6646, + "step": 3845 + }, + { + "epoch": 2.92, + "grad_norm": 0.2664985065225481, + "learning_rate": 4.4521607462640893e-07, + "loss": 1.5143, + "step": 3850 + }, + { + "epoch": 2.92, + "grad_norm": 0.24545453500067022, + "learning_rate": 4.046066946172822e-07, + "loss": 1.6567, + "step": 3855 + }, + { + "epoch": 2.93, + "grad_norm": 0.24678453590974866, + "learning_rate": 3.659352279535733e-07, + "loss": 1.6106, + "step": 3860 + }, + { + "epoch": 2.93, + "grad_norm": 0.28024299756070853, + "learning_rate": 3.292024270993399e-07, + "loss": 1.4444, + "step": 3865 + }, + { + "epoch": 2.93, + "grad_norm": 0.21189400608157236, + "learning_rate": 2.9440900679631457e-07, + "loss": 1.5323, + "step": 3870 + }, + { + "epoch": 2.94, + "grad_norm": 0.24352825534009542, + "learning_rate": 2.615556440500377e-07, + "loss": 1.6129, + "step": 3875 + }, + { + "epoch": 2.94, + "grad_norm": 0.24710743453957604, + "learning_rate": 2.306429781166908e-07, + "loss": 1.6064, + "step": 3880 + }, + { + "epoch": 2.95, + "grad_norm": 0.2939949777477219, + "learning_rate": 2.016716104906391e-07, + "loss": 1.5547, + "step": 3885 + }, + { + "epoch": 2.95, + "grad_norm": 0.24855903276634353, + "learning_rate": 1.7464210489273047e-07, + "loss": 1.4292, + "step": 3890 + }, + { + "epoch": 2.95, + "grad_norm": 0.29838466749808656, + "learning_rate": 1.4955498725932604e-07, + "loss": 1.6796, + "step": 3895 + }, + { + "epoch": 2.96, + "grad_norm": 0.24063497135231618, + "learning_rate": 1.2641074573209733e-07, + "loss": 1.6524, + "step": 3900 + }, + { + "epoch": 2.96, + "grad_norm": 0.266868894727991, + "learning_rate": 1.0520983064847833e-07, + "loss": 1.6033, + "step": 3905 + }, + { + "epoch": 2.96, + "grad_norm": 0.2468474483223204, + "learning_rate": 8.595265453292811e-08, + "loss": 1.5643, + "step": 3910 + }, + { + "epoch": 2.97, + "grad_norm": 0.20433798961816185, + "learning_rate": 6.86395920889149e-08, + "loss": 1.5261, + "step": 3915 + }, + { + "epoch": 2.97, + "grad_norm": 0.24801643390934636, + "learning_rate": 5.327098019159982e-08, + "loss": 1.7088, + "step": 3920 + }, + { + "epoch": 2.98, + "grad_norm": 0.25316391379721404, + "learning_rate": 3.9847117881308685e-08, + "loss": 1.6097, + "step": 3925 + }, + { + "epoch": 2.98, + "grad_norm": 0.2589299223311166, + "learning_rate": 2.8368266357681194e-08, + "loss": 1.7897, + "step": 3930 + }, + { + "epoch": 2.98, + "grad_norm": 0.2615270175678582, + "learning_rate": 1.8834648974630497e-08, + "loss": 1.7153, + "step": 3935 + }, + { + "epoch": 2.99, + "grad_norm": 0.23820419945511057, + "learning_rate": 1.12464512359578e-08, + "loss": 1.489, + "step": 3940 + }, + { + "epoch": 2.99, + "grad_norm": 0.24057944682864915, + "learning_rate": 5.603820791755254e-09, + "loss": 1.4223, + "step": 3945 + }, + { + "epoch": 2.99, + "grad_norm": 0.28827145571809903, + "learning_rate": 1.9068674355415815e-09, + "loss": 1.6161, + "step": 3950 + }, + { + "epoch": 3.0, + "grad_norm": 0.4006550408742116, + "learning_rate": 1.5566310213044333e-10, + "loss": 1.556, + "step": 3955 + }, + { + "epoch": 3.0, + "step": 3957, + "total_flos": 1.244366244937728e+16, + "train_loss": 1.656136889695097, + "train_runtime": 24205.6115, + "train_samples_per_second": 0.654, + "train_steps_per_second": 0.163 + } + ], + "logging_steps": 5, + "max_steps": 3957, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.244366244937728e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}