{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9965217391304346, "eval_steps": 500, "global_step": 574, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0034782608695652175, "grad_norm": 0.1545655359230834, "learning_rate": 3.448275862068966e-06, "loss": 0.1804, "step": 1 }, { "epoch": 0.006956521739130435, "grad_norm": 0.15798307731453395, "learning_rate": 6.896551724137932e-06, "loss": 0.1443, "step": 2 }, { "epoch": 0.010434782608695653, "grad_norm": 0.14372383095748037, "learning_rate": 1.0344827586206897e-05, "loss": 0.1414, "step": 3 }, { "epoch": 0.01391304347826087, "grad_norm": 0.2420744995778043, "learning_rate": 1.3793103448275863e-05, "loss": 0.1926, "step": 4 }, { "epoch": 0.017391304347826087, "grad_norm": 0.1463366912249852, "learning_rate": 1.7241379310344828e-05, "loss": 0.1598, "step": 5 }, { "epoch": 0.020869565217391306, "grad_norm": 0.2742107559459329, "learning_rate": 2.0689655172413793e-05, "loss": 0.2451, "step": 6 }, { "epoch": 0.02434782608695652, "grad_norm": 0.1545956455873269, "learning_rate": 2.413793103448276e-05, "loss": 0.1467, "step": 7 }, { "epoch": 0.02782608695652174, "grad_norm": 0.11833712816221738, "learning_rate": 2.7586206896551727e-05, "loss": 0.1146, "step": 8 }, { "epoch": 0.03130434782608696, "grad_norm": 0.1636683202816951, "learning_rate": 3.103448275862069e-05, "loss": 0.1511, "step": 9 }, { "epoch": 0.034782608695652174, "grad_norm": 0.12096851431359755, "learning_rate": 3.4482758620689657e-05, "loss": 0.1392, "step": 10 }, { "epoch": 0.03826086956521739, "grad_norm": 0.20113450226273455, "learning_rate": 3.793103448275862e-05, "loss": 0.1829, "step": 11 }, { "epoch": 0.04173913043478261, "grad_norm": 0.1724183342324261, "learning_rate": 4.1379310344827587e-05, "loss": 0.1393, "step": 12 }, { "epoch": 0.04521739130434783, "grad_norm": 0.16317141755627293, "learning_rate": 4.482758620689655e-05, "loss": 0.1569, "step": 13 }, { "epoch": 0.04869565217391304, "grad_norm": 0.20158026184467487, "learning_rate": 4.827586206896552e-05, "loss": 0.1719, "step": 14 }, { "epoch": 0.05217391304347826, "grad_norm": 0.19268086804807166, "learning_rate": 5.172413793103449e-05, "loss": 0.1783, "step": 15 }, { "epoch": 0.05565217391304348, "grad_norm": 0.15367624679456215, "learning_rate": 5.517241379310345e-05, "loss": 0.1457, "step": 16 }, { "epoch": 0.059130434782608696, "grad_norm": 0.16131163703415627, "learning_rate": 5.862068965517241e-05, "loss": 0.1741, "step": 17 }, { "epoch": 0.06260869565217392, "grad_norm": 0.1513439967052575, "learning_rate": 6.206896551724138e-05, "loss": 0.1567, "step": 18 }, { "epoch": 0.06608695652173913, "grad_norm": 0.11397034244477378, "learning_rate": 6.551724137931034e-05, "loss": 0.1448, "step": 19 }, { "epoch": 0.06956521739130435, "grad_norm": 0.18890710907597627, "learning_rate": 6.896551724137931e-05, "loss": 0.1576, "step": 20 }, { "epoch": 0.07304347826086957, "grad_norm": 0.17148715059837027, "learning_rate": 7.241379310344828e-05, "loss": 0.1531, "step": 21 }, { "epoch": 0.07652173913043478, "grad_norm": 0.15845773761518642, "learning_rate": 7.586206896551724e-05, "loss": 0.1795, "step": 22 }, { "epoch": 0.08, "grad_norm": 0.16432205778499775, "learning_rate": 7.931034482758621e-05, "loss": 0.1455, "step": 23 }, { "epoch": 0.08347826086956522, "grad_norm": 0.18507516537799124, "learning_rate": 8.275862068965517e-05, "loss": 0.1792, "step": 24 }, { "epoch": 0.08695652173913043, "grad_norm": 0.1489906198108428, "learning_rate": 8.620689655172413e-05, "loss": 0.1575, "step": 25 }, { "epoch": 0.09043478260869565, "grad_norm": 0.19257597111889158, "learning_rate": 8.96551724137931e-05, "loss": 0.1977, "step": 26 }, { "epoch": 0.09391304347826086, "grad_norm": 0.15869513580726594, "learning_rate": 9.310344827586207e-05, "loss": 0.1491, "step": 27 }, { "epoch": 0.09739130434782609, "grad_norm": 0.23763138206897608, "learning_rate": 9.655172413793105e-05, "loss": 0.2305, "step": 28 }, { "epoch": 0.10086956521739131, "grad_norm": 0.19313130092481448, "learning_rate": 0.0001, "loss": 0.1991, "step": 29 }, { "epoch": 0.10434782608695652, "grad_norm": 0.15957163254805692, "learning_rate": 0.00010344827586206898, "loss": 0.1494, "step": 30 }, { "epoch": 0.10782608695652174, "grad_norm": 0.15175494387195537, "learning_rate": 0.00010689655172413792, "loss": 0.1539, "step": 31 }, { "epoch": 0.11130434782608696, "grad_norm": 0.1577067484050021, "learning_rate": 0.0001103448275862069, "loss": 0.1481, "step": 32 }, { "epoch": 0.11478260869565217, "grad_norm": 0.09295501667856695, "learning_rate": 0.00011379310344827588, "loss": 0.1018, "step": 33 }, { "epoch": 0.11826086956521739, "grad_norm": 0.13149067291539926, "learning_rate": 0.00011724137931034482, "loss": 0.1176, "step": 34 }, { "epoch": 0.12173913043478261, "grad_norm": 0.15815867098069847, "learning_rate": 0.0001206896551724138, "loss": 0.1315, "step": 35 }, { "epoch": 0.12521739130434784, "grad_norm": 0.1228801998135233, "learning_rate": 0.00012413793103448277, "loss": 0.1226, "step": 36 }, { "epoch": 0.12869565217391304, "grad_norm": 0.14615808183921733, "learning_rate": 0.00012758620689655174, "loss": 0.1351, "step": 37 }, { "epoch": 0.13217391304347825, "grad_norm": 0.13959696283916806, "learning_rate": 0.00013103448275862068, "loss": 0.1265, "step": 38 }, { "epoch": 0.1356521739130435, "grad_norm": 0.1674438071444559, "learning_rate": 0.00013448275862068965, "loss": 0.1763, "step": 39 }, { "epoch": 0.1391304347826087, "grad_norm": 0.14248711889055726, "learning_rate": 0.00013793103448275863, "loss": 0.1273, "step": 40 }, { "epoch": 0.1426086956521739, "grad_norm": 0.12483278168498144, "learning_rate": 0.0001413793103448276, "loss": 0.1158, "step": 41 }, { "epoch": 0.14608695652173914, "grad_norm": 0.12252417486446492, "learning_rate": 0.00014482758620689657, "loss": 0.0978, "step": 42 }, { "epoch": 0.14956521739130435, "grad_norm": 0.1379518468653693, "learning_rate": 0.00014827586206896554, "loss": 0.1265, "step": 43 }, { "epoch": 0.15304347826086956, "grad_norm": 0.1523565561366162, "learning_rate": 0.00015172413793103449, "loss": 0.1823, "step": 44 }, { "epoch": 0.1565217391304348, "grad_norm": 0.1801898533175253, "learning_rate": 0.00015517241379310346, "loss": 0.1999, "step": 45 }, { "epoch": 0.16, "grad_norm": 0.13012748020707876, "learning_rate": 0.00015862068965517243, "loss": 0.1409, "step": 46 }, { "epoch": 0.1634782608695652, "grad_norm": 0.1413893808116691, "learning_rate": 0.00016206896551724137, "loss": 0.1262, "step": 47 }, { "epoch": 0.16695652173913045, "grad_norm": 0.16233434268275468, "learning_rate": 0.00016551724137931035, "loss": 0.1467, "step": 48 }, { "epoch": 0.17043478260869566, "grad_norm": 0.15079503853002107, "learning_rate": 0.00016896551724137932, "loss": 0.1058, "step": 49 }, { "epoch": 0.17391304347826086, "grad_norm": 0.15412291289995766, "learning_rate": 0.00017241379310344826, "loss": 0.168, "step": 50 }, { "epoch": 0.17739130434782607, "grad_norm": 0.1722020517750421, "learning_rate": 0.00017586206896551723, "loss": 0.1183, "step": 51 }, { "epoch": 0.1808695652173913, "grad_norm": 0.10905711916480021, "learning_rate": 0.0001793103448275862, "loss": 0.1093, "step": 52 }, { "epoch": 0.18434782608695652, "grad_norm": 0.16963364557672264, "learning_rate": 0.00018275862068965518, "loss": 0.1557, "step": 53 }, { "epoch": 0.18782608695652173, "grad_norm": 0.15154120729033607, "learning_rate": 0.00018620689655172415, "loss": 0.1594, "step": 54 }, { "epoch": 0.19130434782608696, "grad_norm": 0.13757866713331232, "learning_rate": 0.00018965517241379312, "loss": 0.1407, "step": 55 }, { "epoch": 0.19478260869565217, "grad_norm": 0.08797746875562075, "learning_rate": 0.0001931034482758621, "loss": 0.0941, "step": 56 }, { "epoch": 0.19826086956521738, "grad_norm": 0.18086221573643768, "learning_rate": 0.00019655172413793104, "loss": 0.1781, "step": 57 }, { "epoch": 0.20173913043478262, "grad_norm": 0.17700454857957337, "learning_rate": 0.0002, "loss": 0.1879, "step": 58 }, { "epoch": 0.20521739130434782, "grad_norm": 0.1558083475840659, "learning_rate": 0.00019999814660065618, "loss": 0.1831, "step": 59 }, { "epoch": 0.20869565217391303, "grad_norm": 0.1032213761254349, "learning_rate": 0.00019999258647132646, "loss": 0.1188, "step": 60 }, { "epoch": 0.21217391304347827, "grad_norm": 0.14893393244118194, "learning_rate": 0.00019998331981811366, "loss": 0.1554, "step": 61 }, { "epoch": 0.21565217391304348, "grad_norm": 0.14353596472572114, "learning_rate": 0.00019997034698451395, "loss": 0.1807, "step": 62 }, { "epoch": 0.21913043478260869, "grad_norm": 0.1051492618618541, "learning_rate": 0.00019995366845140415, "loss": 0.1278, "step": 63 }, { "epoch": 0.22260869565217392, "grad_norm": 0.15519178380797527, "learning_rate": 0.00019993328483702393, "loss": 0.1718, "step": 64 }, { "epoch": 0.22608695652173913, "grad_norm": 0.16979535445201727, "learning_rate": 0.00019990919689695286, "loss": 0.1759, "step": 65 }, { "epoch": 0.22956521739130434, "grad_norm": 0.19955078650794816, "learning_rate": 0.0001998814055240823, "loss": 0.1659, "step": 66 }, { "epoch": 0.23304347826086957, "grad_norm": 0.21069141049146595, "learning_rate": 0.00019984991174858257, "loss": 0.1591, "step": 67 }, { "epoch": 0.23652173913043478, "grad_norm": 0.10858740428706376, "learning_rate": 0.00019981471673786452, "loss": 0.1143, "step": 68 }, { "epoch": 0.24, "grad_norm": 0.12877038648097636, "learning_rate": 0.00019977582179653633, "loss": 0.113, "step": 69 }, { "epoch": 0.24347826086956523, "grad_norm": 0.15092333453545853, "learning_rate": 0.00019973322836635518, "loss": 0.183, "step": 70 }, { "epoch": 0.24695652173913044, "grad_norm": 0.12997966260226232, "learning_rate": 0.00019968693802617374, "loss": 0.144, "step": 71 }, { "epoch": 0.25043478260869567, "grad_norm": 0.12761141406209162, "learning_rate": 0.00019963695249188183, "loss": 0.1292, "step": 72 }, { "epoch": 0.2539130434782609, "grad_norm": 0.16597376098252953, "learning_rate": 0.00019958327361634248, "loss": 0.1645, "step": 73 }, { "epoch": 0.2573913043478261, "grad_norm": 0.10098015772720864, "learning_rate": 0.00019952590338932356, "loss": 0.1067, "step": 74 }, { "epoch": 0.2608695652173913, "grad_norm": 0.15925018221087978, "learning_rate": 0.00019946484393742399, "loss": 0.1554, "step": 75 }, { "epoch": 0.2643478260869565, "grad_norm": 0.1532302933171606, "learning_rate": 0.0001994000975239946, "loss": 0.1817, "step": 76 }, { "epoch": 0.2678260869565217, "grad_norm": 0.15154786378403498, "learning_rate": 0.00019933166654905466, "loss": 0.1467, "step": 77 }, { "epoch": 0.271304347826087, "grad_norm": 0.15690138906152937, "learning_rate": 0.00019925955354920265, "loss": 0.1373, "step": 78 }, { "epoch": 0.2747826086956522, "grad_norm": 0.1859438689490505, "learning_rate": 0.0001991837611975223, "loss": 0.1932, "step": 79 }, { "epoch": 0.2782608695652174, "grad_norm": 0.14861843675913228, "learning_rate": 0.00019910429230348347, "loss": 0.1675, "step": 80 }, { "epoch": 0.2817391304347826, "grad_norm": 0.14218774514095903, "learning_rate": 0.00019902114981283812, "loss": 0.1283, "step": 81 }, { "epoch": 0.2852173913043478, "grad_norm": 0.15988803314683084, "learning_rate": 0.00019893433680751103, "loss": 0.1336, "step": 82 }, { "epoch": 0.288695652173913, "grad_norm": 0.15975061567872123, "learning_rate": 0.0001988438565054855, "loss": 0.1676, "step": 83 }, { "epoch": 0.2921739130434783, "grad_norm": 0.0903484060539206, "learning_rate": 0.00019874971226068415, "loss": 0.0909, "step": 84 }, { "epoch": 0.2956521739130435, "grad_norm": 0.12570120193815287, "learning_rate": 0.00019865190756284467, "loss": 0.1333, "step": 85 }, { "epoch": 0.2991304347826087, "grad_norm": 0.12595056424947598, "learning_rate": 0.0001985504460373903, "loss": 0.1092, "step": 86 }, { "epoch": 0.3026086956521739, "grad_norm": 0.13479356357232541, "learning_rate": 0.0001984453314452955, "loss": 0.1478, "step": 87 }, { "epoch": 0.3060869565217391, "grad_norm": 0.13307683198992498, "learning_rate": 0.00019833656768294662, "loss": 0.146, "step": 88 }, { "epoch": 0.3095652173913043, "grad_norm": 0.14686125301552883, "learning_rate": 0.0001982241587819974, "loss": 0.1285, "step": 89 }, { "epoch": 0.3130434782608696, "grad_norm": 0.12720833595582368, "learning_rate": 0.00019810810890921943, "loss": 0.1437, "step": 90 }, { "epoch": 0.3165217391304348, "grad_norm": 0.13968930311918126, "learning_rate": 0.00019798842236634797, "loss": 0.1291, "step": 91 }, { "epoch": 0.32, "grad_norm": 0.16133982393912974, "learning_rate": 0.00019786510358992213, "loss": 0.2008, "step": 92 }, { "epoch": 0.3234782608695652, "grad_norm": 0.1266301495042648, "learning_rate": 0.00019773815715112074, "loss": 0.1372, "step": 93 }, { "epoch": 0.3269565217391304, "grad_norm": 0.12427333520991247, "learning_rate": 0.00019760758775559274, "loss": 0.1432, "step": 94 }, { "epoch": 0.33043478260869563, "grad_norm": 0.13028439018276217, "learning_rate": 0.0001974734002432827, "loss": 0.1354, "step": 95 }, { "epoch": 0.3339130434782609, "grad_norm": 0.13268075146491365, "learning_rate": 0.00019733559958825167, "loss": 0.1189, "step": 96 }, { "epoch": 0.3373913043478261, "grad_norm": 0.2048660606818272, "learning_rate": 0.00019719419089849247, "loss": 0.1566, "step": 97 }, { "epoch": 0.3408695652173913, "grad_norm": 0.11124284248033606, "learning_rate": 0.00019704917941574051, "loss": 0.1299, "step": 98 }, { "epoch": 0.3443478260869565, "grad_norm": 0.1415128364022893, "learning_rate": 0.00019690057051527965, "loss": 0.1396, "step": 99 }, { "epoch": 0.34782608695652173, "grad_norm": 0.10665529705089029, "learning_rate": 0.00019674836970574254, "loss": 0.1314, "step": 100 }, { "epoch": 0.35130434782608694, "grad_norm": 0.14169554362167064, "learning_rate": 0.00019659258262890683, "loss": 0.1281, "step": 101 }, { "epoch": 0.35478260869565215, "grad_norm": 0.16648182361835823, "learning_rate": 0.00019643321505948585, "loss": 0.1511, "step": 102 }, { "epoch": 0.3582608695652174, "grad_norm": 0.15512935363008726, "learning_rate": 0.00019627027290491458, "loss": 0.1362, "step": 103 }, { "epoch": 0.3617391304347826, "grad_norm": 0.14829391492240007, "learning_rate": 0.00019610376220513068, "loss": 0.16, "step": 104 }, { "epoch": 0.3652173913043478, "grad_norm": 0.1721382097621375, "learning_rate": 0.00019593368913235052, "loss": 0.1927, "step": 105 }, { "epoch": 0.36869565217391304, "grad_norm": 0.1073039991014123, "learning_rate": 0.0001957600599908406, "loss": 0.1077, "step": 106 }, { "epoch": 0.37217391304347824, "grad_norm": 0.1765959958499992, "learning_rate": 0.00019558288121668363, "loss": 0.1679, "step": 107 }, { "epoch": 0.37565217391304345, "grad_norm": 0.13247232361226763, "learning_rate": 0.00019540215937754007, "loss": 0.1201, "step": 108 }, { "epoch": 0.3791304347826087, "grad_norm": 0.13402863250728775, "learning_rate": 0.0001952179011724047, "loss": 0.1331, "step": 109 }, { "epoch": 0.3826086956521739, "grad_norm": 0.15379139900705738, "learning_rate": 0.00019503011343135825, "loss": 0.1507, "step": 110 }, { "epoch": 0.38608695652173913, "grad_norm": 0.12569941197730944, "learning_rate": 0.00019483880311531424, "loss": 0.1245, "step": 111 }, { "epoch": 0.38956521739130434, "grad_norm": 0.13176534371798201, "learning_rate": 0.00019464397731576094, "loss": 0.1346, "step": 112 }, { "epoch": 0.39304347826086955, "grad_norm": 0.1308496741778078, "learning_rate": 0.00019444564325449853, "loss": 0.1528, "step": 113 }, { "epoch": 0.39652173913043476, "grad_norm": 0.11662685828907265, "learning_rate": 0.00019424380828337144, "loss": 0.1042, "step": 114 }, { "epoch": 0.4, "grad_norm": 0.15311025163121064, "learning_rate": 0.0001940384798839957, "loss": 0.124, "step": 115 }, { "epoch": 0.40347826086956523, "grad_norm": 0.14271720010282954, "learning_rate": 0.00019382966566748168, "loss": 0.1385, "step": 116 }, { "epoch": 0.40695652173913044, "grad_norm": 0.21076081706460564, "learning_rate": 0.00019361737337415206, "loss": 0.2177, "step": 117 }, { "epoch": 0.41043478260869565, "grad_norm": 0.1326954013355056, "learning_rate": 0.0001934016108732548, "loss": 0.1491, "step": 118 }, { "epoch": 0.41391304347826086, "grad_norm": 0.10972822431140547, "learning_rate": 0.00019318238616267141, "loss": 0.1135, "step": 119 }, { "epoch": 0.41739130434782606, "grad_norm": 0.11664553001228962, "learning_rate": 0.00019295970736862064, "loss": 0.1335, "step": 120 }, { "epoch": 0.42086956521739133, "grad_norm": 0.12037673410124465, "learning_rate": 0.00019273358274535704, "loss": 0.0989, "step": 121 }, { "epoch": 0.42434782608695654, "grad_norm": 0.13278062849114713, "learning_rate": 0.00019250402067486522, "loss": 0.1328, "step": 122 }, { "epoch": 0.42782608695652175, "grad_norm": 0.13381559738712595, "learning_rate": 0.00019227102966654896, "loss": 0.1296, "step": 123 }, { "epoch": 0.43130434782608695, "grad_norm": 0.1646662488521753, "learning_rate": 0.00019203461835691594, "loss": 0.1581, "step": 124 }, { "epoch": 0.43478260869565216, "grad_norm": 0.15934887298251812, "learning_rate": 0.00019179479550925747, "loss": 0.1627, "step": 125 }, { "epoch": 0.43826086956521737, "grad_norm": 0.1410826901549644, "learning_rate": 0.00019155157001332374, "loss": 0.1789, "step": 126 }, { "epoch": 0.44173913043478263, "grad_norm": 0.16699816673214457, "learning_rate": 0.0001913049508849942, "loss": 0.1608, "step": 127 }, { "epoch": 0.44521739130434784, "grad_norm": 0.11736817608666682, "learning_rate": 0.00019105494726594344, "loss": 0.1387, "step": 128 }, { "epoch": 0.44869565217391305, "grad_norm": 0.13490354839004873, "learning_rate": 0.00019080156842330242, "loss": 0.1355, "step": 129 }, { "epoch": 0.45217391304347826, "grad_norm": 0.166052611822799, "learning_rate": 0.00019054482374931467, "loss": 0.1628, "step": 130 }, { "epoch": 0.45565217391304347, "grad_norm": 0.10962794054522577, "learning_rate": 0.00019028472276098844, "loss": 0.1109, "step": 131 }, { "epoch": 0.4591304347826087, "grad_norm": 0.10757925577294936, "learning_rate": 0.00019002127509974376, "loss": 0.1124, "step": 132 }, { "epoch": 0.46260869565217394, "grad_norm": 0.14061789137211347, "learning_rate": 0.00018975449053105505, "loss": 0.1445, "step": 133 }, { "epoch": 0.46608695652173915, "grad_norm": 0.1096963245848753, "learning_rate": 0.00018948437894408918, "loss": 0.1265, "step": 134 }, { "epoch": 0.46956521739130436, "grad_norm": 0.12314690150275322, "learning_rate": 0.00018921095035133898, "loss": 0.1202, "step": 135 }, { "epoch": 0.47304347826086957, "grad_norm": 0.1779920573282376, "learning_rate": 0.0001889342148882519, "loss": 0.1997, "step": 136 }, { "epoch": 0.4765217391304348, "grad_norm": 0.13319522745287313, "learning_rate": 0.00018865418281285444, "loss": 0.1402, "step": 137 }, { "epoch": 0.48, "grad_norm": 0.12083080356885761, "learning_rate": 0.00018837086450537193, "loss": 0.1238, "step": 138 }, { "epoch": 0.4834782608695652, "grad_norm": 0.1582932839712108, "learning_rate": 0.00018808427046784366, "loss": 0.1499, "step": 139 }, { "epoch": 0.48695652173913045, "grad_norm": 0.14876994205070418, "learning_rate": 0.00018779441132373362, "loss": 0.1557, "step": 140 }, { "epoch": 0.49043478260869566, "grad_norm": 0.17699025587530975, "learning_rate": 0.0001875012978175368, "loss": 0.1967, "step": 141 }, { "epoch": 0.49391304347826087, "grad_norm": 0.14037478538346934, "learning_rate": 0.00018720494081438078, "loss": 0.1596, "step": 142 }, { "epoch": 0.4973913043478261, "grad_norm": 0.11128336848068965, "learning_rate": 0.00018690535129962306, "loss": 0.1013, "step": 143 }, { "epoch": 0.5008695652173913, "grad_norm": 0.15354451724868373, "learning_rate": 0.00018660254037844388, "loss": 0.1812, "step": 144 }, { "epoch": 0.5043478260869565, "grad_norm": 0.17621002427736646, "learning_rate": 0.00018629651927543447, "loss": 0.22, "step": 145 }, { "epoch": 0.5078260869565218, "grad_norm": 0.11412894846283952, "learning_rate": 0.000185987299334181, "loss": 0.1277, "step": 146 }, { "epoch": 0.5113043478260869, "grad_norm": 0.10330685267150483, "learning_rate": 0.0001856748920168443, "loss": 0.1149, "step": 147 }, { "epoch": 0.5147826086956522, "grad_norm": 0.16038774046228474, "learning_rate": 0.00018535930890373466, "loss": 0.1614, "step": 148 }, { "epoch": 0.5182608695652174, "grad_norm": 0.12341631086149, "learning_rate": 0.00018504056169288275, "loss": 0.1243, "step": 149 }, { "epoch": 0.5217391304347826, "grad_norm": 0.14222035267405325, "learning_rate": 0.00018471866219960602, "loss": 0.1591, "step": 150 }, { "epoch": 0.5252173913043479, "grad_norm": 0.15381954436682013, "learning_rate": 0.0001843936223560707, "loss": 0.1411, "step": 151 }, { "epoch": 0.528695652173913, "grad_norm": 0.16749949682456056, "learning_rate": 0.0001840654542108494, "loss": 0.173, "step": 152 }, { "epoch": 0.5321739130434783, "grad_norm": 0.16138212597769477, "learning_rate": 0.0001837341699284746, "loss": 0.1378, "step": 153 }, { "epoch": 0.5356521739130434, "grad_norm": 0.11820972909841256, "learning_rate": 0.0001833997817889878, "loss": 0.1415, "step": 154 }, { "epoch": 0.5391304347826087, "grad_norm": 0.1732254350869074, "learning_rate": 0.00018306230218748413, "loss": 0.1565, "step": 155 }, { "epoch": 0.542608695652174, "grad_norm": 0.12134029048709205, "learning_rate": 0.000182721743633653, "loss": 0.1354, "step": 156 }, { "epoch": 0.5460869565217391, "grad_norm": 0.15757519533817987, "learning_rate": 0.00018237811875131444, "loss": 0.1783, "step": 157 }, { "epoch": 0.5495652173913044, "grad_norm": 0.1389328342147638, "learning_rate": 0.0001820314402779511, "loss": 0.1373, "step": 158 }, { "epoch": 0.5530434782608695, "grad_norm": 0.13113073991864377, "learning_rate": 0.00018168172106423607, "loss": 0.1272, "step": 159 }, { "epoch": 0.5565217391304348, "grad_norm": 0.14093537485863689, "learning_rate": 0.00018132897407355657, "loss": 0.1364, "step": 160 }, { "epoch": 0.56, "grad_norm": 0.1407116914405213, "learning_rate": 0.00018097321238153338, "loss": 0.1329, "step": 161 }, { "epoch": 0.5634782608695652, "grad_norm": 0.14535376492750982, "learning_rate": 0.00018061444917553629, "loss": 0.1692, "step": 162 }, { "epoch": 0.5669565217391305, "grad_norm": 0.14031883322639, "learning_rate": 0.00018025269775419507, "loss": 0.1356, "step": 163 }, { "epoch": 0.5704347826086956, "grad_norm": 0.1551541472991319, "learning_rate": 0.00017988797152690671, "loss": 0.148, "step": 164 }, { "epoch": 0.5739130434782609, "grad_norm": 0.16740550198996068, "learning_rate": 0.00017952028401333817, "loss": 0.1643, "step": 165 }, { "epoch": 0.577391304347826, "grad_norm": 0.11979937989365573, "learning_rate": 0.00017914964884292544, "loss": 0.1282, "step": 166 }, { "epoch": 0.5808695652173913, "grad_norm": 0.11342656946095574, "learning_rate": 0.00017877607975436805, "loss": 0.1192, "step": 167 }, { "epoch": 0.5843478260869566, "grad_norm": 0.12812233079916055, "learning_rate": 0.00017839959059512016, "loss": 0.1513, "step": 168 }, { "epoch": 0.5878260869565217, "grad_norm": 0.12442713946144991, "learning_rate": 0.00017802019532087694, "loss": 0.1456, "step": 169 }, { "epoch": 0.591304347826087, "grad_norm": 0.13585627394105457, "learning_rate": 0.00017763790799505747, "loss": 0.155, "step": 170 }, { "epoch": 0.5947826086956521, "grad_norm": 0.10995274239294903, "learning_rate": 0.00017725274278828325, "loss": 0.1008, "step": 171 }, { "epoch": 0.5982608695652174, "grad_norm": 0.13574783390341455, "learning_rate": 0.0001768647139778532, "loss": 0.1766, "step": 172 }, { "epoch": 0.6017391304347826, "grad_norm": 0.12560446559496083, "learning_rate": 0.00017647383594721416, "loss": 0.1378, "step": 173 }, { "epoch": 0.6052173913043478, "grad_norm": 0.24726328454376442, "learning_rate": 0.0001760801231854278, "loss": 0.2, "step": 174 }, { "epoch": 0.6086956521739131, "grad_norm": 0.1300492912908485, "learning_rate": 0.00017568359028663364, "loss": 0.1353, "step": 175 }, { "epoch": 0.6121739130434782, "grad_norm": 0.12024702168048951, "learning_rate": 0.00017528425194950794, "loss": 0.1346, "step": 176 }, { "epoch": 0.6156521739130435, "grad_norm": 0.13400618019089086, "learning_rate": 0.000174882122976719, "loss": 0.147, "step": 177 }, { "epoch": 0.6191304347826087, "grad_norm": 0.10665251622268654, "learning_rate": 0.0001744772182743782, "loss": 0.1269, "step": 178 }, { "epoch": 0.6226086956521739, "grad_norm": 0.12190300959390951, "learning_rate": 0.00017406955285148782, "loss": 0.1263, "step": 179 }, { "epoch": 0.6260869565217392, "grad_norm": 0.08623960123094311, "learning_rate": 0.0001736591418193844, "loss": 0.1075, "step": 180 }, { "epoch": 0.6295652173913043, "grad_norm": 0.15899695178173323, "learning_rate": 0.00017324600039117863, "loss": 0.1335, "step": 181 }, { "epoch": 0.6330434782608696, "grad_norm": 0.12405567103892874, "learning_rate": 0.00017283014388119159, "loss": 0.1261, "step": 182 }, { "epoch": 0.6365217391304347, "grad_norm": 0.12227415658908525, "learning_rate": 0.000172411587704387, "loss": 0.1394, "step": 183 }, { "epoch": 0.64, "grad_norm": 0.10299259784769293, "learning_rate": 0.0001719903473757996, "loss": 0.1179, "step": 184 }, { "epoch": 0.6434782608695652, "grad_norm": 0.18072288336432377, "learning_rate": 0.00017156643850996047, "loss": 0.1678, "step": 185 }, { "epoch": 0.6469565217391304, "grad_norm": 0.13931470098249313, "learning_rate": 0.0001711398768203178, "loss": 0.1468, "step": 186 }, { "epoch": 0.6504347826086957, "grad_norm": 0.142891653601056, "learning_rate": 0.00017071067811865476, "loss": 0.1699, "step": 187 }, { "epoch": 0.6539130434782608, "grad_norm": 0.1543203031358245, "learning_rate": 0.00017027885831450318, "loss": 0.163, "step": 188 }, { "epoch": 0.6573913043478261, "grad_norm": 0.08881257657108957, "learning_rate": 0.0001698444334145539, "loss": 0.0956, "step": 189 }, { "epoch": 0.6608695652173913, "grad_norm": 0.1437015724786564, "learning_rate": 0.0001694074195220634, "loss": 0.1531, "step": 190 }, { "epoch": 0.6643478260869565, "grad_norm": 0.15239548568770145, "learning_rate": 0.0001689678328362569, "loss": 0.1583, "step": 191 }, { "epoch": 0.6678260869565218, "grad_norm": 0.12999990256807817, "learning_rate": 0.00016852568965172791, "loss": 0.1241, "step": 192 }, { "epoch": 0.671304347826087, "grad_norm": 0.16058602233359284, "learning_rate": 0.00016808100635783423, "loss": 0.1901, "step": 193 }, { "epoch": 0.6747826086956522, "grad_norm": 0.09752013699351626, "learning_rate": 0.00016763379943809028, "loss": 0.1104, "step": 194 }, { "epoch": 0.6782608695652174, "grad_norm": 0.1171558354901818, "learning_rate": 0.00016718408546955636, "loss": 0.1393, "step": 195 }, { "epoch": 0.6817391304347826, "grad_norm": 0.12541030208785753, "learning_rate": 0.00016673188112222394, "loss": 0.1339, "step": 196 }, { "epoch": 0.6852173913043478, "grad_norm": 0.16378504667963803, "learning_rate": 0.00016627720315839784, "loss": 0.1896, "step": 197 }, { "epoch": 0.688695652173913, "grad_norm": 0.1254436356043883, "learning_rate": 0.0001658200684320748, "loss": 0.155, "step": 198 }, { "epoch": 0.6921739130434783, "grad_norm": 0.10926424609512125, "learning_rate": 0.00016536049388831894, "loss": 0.1333, "step": 199 }, { "epoch": 0.6956521739130435, "grad_norm": 0.12166335086653808, "learning_rate": 0.00016489849656263337, "loss": 0.1307, "step": 200 }, { "epoch": 0.6991304347826087, "grad_norm": 0.09726778569787221, "learning_rate": 0.00016443409358032887, "loss": 0.1093, "step": 201 }, { "epoch": 0.7026086956521739, "grad_norm": 0.18623972301385774, "learning_rate": 0.00016396730215588915, "loss": 0.1329, "step": 202 }, { "epoch": 0.7060869565217391, "grad_norm": 0.1036420764487769, "learning_rate": 0.00016349813959233255, "loss": 0.1066, "step": 203 }, { "epoch": 0.7095652173913043, "grad_norm": 0.15859483282291995, "learning_rate": 0.00016302662328057088, "loss": 0.1236, "step": 204 }, { "epoch": 0.7130434782608696, "grad_norm": 0.1352010399451213, "learning_rate": 0.00016255277069876454, "loss": 0.1556, "step": 205 }, { "epoch": 0.7165217391304348, "grad_norm": 0.0847816136200446, "learning_rate": 0.00016207659941167485, "loss": 0.1033, "step": 206 }, { "epoch": 0.72, "grad_norm": 0.13868944339810388, "learning_rate": 0.00016159812707001282, "loss": 0.1583, "step": 207 }, { "epoch": 0.7234782608695652, "grad_norm": 0.11403894766591344, "learning_rate": 0.00016111737140978494, "loss": 0.1193, "step": 208 }, { "epoch": 0.7269565217391304, "grad_norm": 0.11921529189670015, "learning_rate": 0.00016063435025163569, "loss": 0.1272, "step": 209 }, { "epoch": 0.7304347826086957, "grad_norm": 0.16113792796352755, "learning_rate": 0.00016014908150018703, "loss": 0.1972, "step": 210 }, { "epoch": 0.7339130434782609, "grad_norm": 0.12349845734675136, "learning_rate": 0.00015966158314337472, "loss": 0.1462, "step": 211 }, { "epoch": 0.7373913043478261, "grad_norm": 0.1502644739489071, "learning_rate": 0.00015917187325178138, "loss": 0.1626, "step": 212 }, { "epoch": 0.7408695652173913, "grad_norm": 0.14447398546355603, "learning_rate": 0.00015867996997796685, "loss": 0.1653, "step": 213 }, { "epoch": 0.7443478260869565, "grad_norm": 0.13747896173823398, "learning_rate": 0.0001581858915557953, "loss": 0.1436, "step": 214 }, { "epoch": 0.7478260869565218, "grad_norm": 0.14978167508747187, "learning_rate": 0.00015768965629975914, "loss": 0.146, "step": 215 }, { "epoch": 0.7513043478260869, "grad_norm": 0.10530370902507546, "learning_rate": 0.0001571912826043003, "loss": 0.1067, "step": 216 }, { "epoch": 0.7547826086956522, "grad_norm": 0.15065236331393017, "learning_rate": 0.00015669078894312848, "loss": 0.1278, "step": 217 }, { "epoch": 0.7582608695652174, "grad_norm": 0.13038147931466645, "learning_rate": 0.00015618819386853606, "loss": 0.1363, "step": 218 }, { "epoch": 0.7617391304347826, "grad_norm": 0.12241560985671367, "learning_rate": 0.0001556835160107107, "loss": 0.1381, "step": 219 }, { "epoch": 0.7652173913043478, "grad_norm": 0.1032079433563102, "learning_rate": 0.0001551767740770446, "loss": 0.1329, "step": 220 }, { "epoch": 0.768695652173913, "grad_norm": 0.10420850780658172, "learning_rate": 0.00015466798685144113, "loss": 0.108, "step": 221 }, { "epoch": 0.7721739130434783, "grad_norm": 0.12440213702363168, "learning_rate": 0.00015415717319361847, "loss": 0.1378, "step": 222 }, { "epoch": 0.7756521739130435, "grad_norm": 0.1441063665454779, "learning_rate": 0.00015364435203841058, "loss": 0.1546, "step": 223 }, { "epoch": 0.7791304347826087, "grad_norm": 0.10283016985275265, "learning_rate": 0.00015312954239506533, "loss": 0.1398, "step": 224 }, { "epoch": 0.782608695652174, "grad_norm": 0.11879627421875508, "learning_rate": 0.0001526127633465398, "loss": 0.1394, "step": 225 }, { "epoch": 0.7860869565217391, "grad_norm": 0.1340444040194527, "learning_rate": 0.00015209403404879303, "loss": 0.1371, "step": 226 }, { "epoch": 0.7895652173913044, "grad_norm": 0.15078724481486633, "learning_rate": 0.00015157337373007578, "loss": 0.1626, "step": 227 }, { "epoch": 0.7930434782608695, "grad_norm": 0.14991040307874806, "learning_rate": 0.0001510508016902179, "loss": 0.1563, "step": 228 }, { "epoch": 0.7965217391304348, "grad_norm": 0.11713195212511589, "learning_rate": 0.00015052633729991294, "loss": 0.1372, "step": 229 }, { "epoch": 0.8, "grad_norm": 0.10665559275288661, "learning_rate": 0.00015000000000000001, "loss": 0.1174, "step": 230 }, { "epoch": 0.8034782608695652, "grad_norm": 0.15701030356110557, "learning_rate": 0.00014947180930074326, "loss": 0.1575, "step": 231 }, { "epoch": 0.8069565217391305, "grad_norm": 0.11847918443040721, "learning_rate": 0.00014894178478110857, "loss": 0.1203, "step": 232 }, { "epoch": 0.8104347826086956, "grad_norm": 0.1285162400608025, "learning_rate": 0.0001484099460880379, "loss": 0.133, "step": 233 }, { "epoch": 0.8139130434782609, "grad_norm": 0.1512166257756219, "learning_rate": 0.00014787631293572092, "loss": 0.1584, "step": 234 }, { "epoch": 0.8173913043478261, "grad_norm": 0.1584657384276377, "learning_rate": 0.00014734090510486433, "loss": 0.176, "step": 235 }, { "epoch": 0.8208695652173913, "grad_norm": 0.10354148249587801, "learning_rate": 0.0001468037424419586, "loss": 0.1288, "step": 236 }, { "epoch": 0.8243478260869566, "grad_norm": 0.11214117311491091, "learning_rate": 0.0001462648448585423, "loss": 0.1221, "step": 237 }, { "epoch": 0.8278260869565217, "grad_norm": 0.14772445459512365, "learning_rate": 0.00014572423233046386, "loss": 0.1329, "step": 238 }, { "epoch": 0.831304347826087, "grad_norm": 0.14615479240284515, "learning_rate": 0.0001451819248971415, "loss": 0.1643, "step": 239 }, { "epoch": 0.8347826086956521, "grad_norm": 0.12753795686628652, "learning_rate": 0.00014463794266081993, "loss": 0.1557, "step": 240 }, { "epoch": 0.8382608695652174, "grad_norm": 0.13887522594093168, "learning_rate": 0.00014409230578582566, "loss": 0.1639, "step": 241 }, { "epoch": 0.8417391304347827, "grad_norm": 0.16912324583465613, "learning_rate": 0.00014354503449781912, "loss": 0.1688, "step": 242 }, { "epoch": 0.8452173913043478, "grad_norm": 0.09449246440948272, "learning_rate": 0.0001429961490830453, "loss": 0.0993, "step": 243 }, { "epoch": 0.8486956521739131, "grad_norm": 0.10550648117339549, "learning_rate": 0.00014244566988758152, "loss": 0.1356, "step": 244 }, { "epoch": 0.8521739130434782, "grad_norm": 0.10969662638776663, "learning_rate": 0.00014189361731658338, "loss": 0.1239, "step": 245 }, { "epoch": 0.8556521739130435, "grad_norm": 0.14808204518572862, "learning_rate": 0.00014134001183352832, "loss": 0.1579, "step": 246 }, { "epoch": 0.8591304347826086, "grad_norm": 0.13859857433183218, "learning_rate": 0.00014078487395945713, "loss": 0.1747, "step": 247 }, { "epoch": 0.8626086956521739, "grad_norm": 0.13502318508676295, "learning_rate": 0.00014022822427221324, "loss": 0.1558, "step": 248 }, { "epoch": 0.8660869565217392, "grad_norm": 0.11993193249652914, "learning_rate": 0.00013967008340567998, "loss": 0.1318, "step": 249 }, { "epoch": 0.8695652173913043, "grad_norm": 0.14432862128479182, "learning_rate": 0.0001391104720490156, "loss": 0.1718, "step": 250 }, { "epoch": 0.8730434782608696, "grad_norm": 0.10960589296514184, "learning_rate": 0.0001385494109458866, "loss": 0.1216, "step": 251 }, { "epoch": 0.8765217391304347, "grad_norm": 0.1444495982064661, "learning_rate": 0.00013798692089369855, "loss": 0.1511, "step": 252 }, { "epoch": 0.88, "grad_norm": 0.14195714442676055, "learning_rate": 0.00013742302274282533, "loss": 0.164, "step": 253 }, { "epoch": 0.8834782608695653, "grad_norm": 0.15939971031248268, "learning_rate": 0.00013685773739583617, "loss": 0.1589, "step": 254 }, { "epoch": 0.8869565217391304, "grad_norm": 0.10567415705517683, "learning_rate": 0.00013629108580672094, "loss": 0.1006, "step": 255 }, { "epoch": 0.8904347826086957, "grad_norm": 0.12878257656430525, "learning_rate": 0.0001357230889801133, "loss": 0.1267, "step": 256 }, { "epoch": 0.8939130434782608, "grad_norm": 0.11395046485825466, "learning_rate": 0.0001351537679705121, "loss": 0.134, "step": 257 }, { "epoch": 0.8973913043478261, "grad_norm": 0.13632342342499126, "learning_rate": 0.00013458314388150114, "loss": 0.1598, "step": 258 }, { "epoch": 0.9008695652173913, "grad_norm": 0.16308025278021065, "learning_rate": 0.00013401123786496664, "loss": 0.2041, "step": 259 }, { "epoch": 0.9043478260869565, "grad_norm": 0.10241355755764081, "learning_rate": 0.00013343807112031327, "loss": 0.1081, "step": 260 }, { "epoch": 0.9078260869565218, "grad_norm": 0.1310395387251736, "learning_rate": 0.00013286366489367846, "loss": 0.158, "step": 261 }, { "epoch": 0.9113043478260869, "grad_norm": 0.13100096116141785, "learning_rate": 0.00013228804047714463, "loss": 0.1607, "step": 262 }, { "epoch": 0.9147826086956522, "grad_norm": 0.11969415969012737, "learning_rate": 0.00013171121920795014, "loss": 0.1308, "step": 263 }, { "epoch": 0.9182608695652174, "grad_norm": 0.1295097570140744, "learning_rate": 0.00013113322246769817, "loss": 0.1502, "step": 264 }, { "epoch": 0.9217391304347826, "grad_norm": 0.11814028103328439, "learning_rate": 0.00013055407168156437, "loss": 0.1241, "step": 265 }, { "epoch": 0.9252173913043479, "grad_norm": 0.11218111509954955, "learning_rate": 0.00012997378831750242, "loss": 0.1381, "step": 266 }, { "epoch": 0.928695652173913, "grad_norm": 0.12021997514568723, "learning_rate": 0.00012939239388544852, "loss": 0.1395, "step": 267 }, { "epoch": 0.9321739130434783, "grad_norm": 0.12114779793419364, "learning_rate": 0.00012880990993652377, "loss": 0.117, "step": 268 }, { "epoch": 0.9356521739130435, "grad_norm": 0.1690185626815269, "learning_rate": 0.00012822635806223557, "loss": 0.2055, "step": 269 }, { "epoch": 0.9391304347826087, "grad_norm": 0.10540099318141671, "learning_rate": 0.00012764175989367718, "loss": 0.1292, "step": 270 }, { "epoch": 0.9426086956521739, "grad_norm": 0.1123676795677547, "learning_rate": 0.00012705613710072575, "loss": 0.1401, "step": 271 }, { "epoch": 0.9460869565217391, "grad_norm": 0.12163076229024251, "learning_rate": 0.00012646951139123934, "loss": 0.1393, "step": 272 }, { "epoch": 0.9495652173913044, "grad_norm": 0.10635388207764115, "learning_rate": 0.00012588190451025207, "loss": 0.1192, "step": 273 }, { "epoch": 0.9530434782608695, "grad_norm": 0.1324746367162532, "learning_rate": 0.00012529333823916807, "loss": 0.1674, "step": 274 }, { "epoch": 0.9565217391304348, "grad_norm": 0.12690900530317173, "learning_rate": 0.00012470383439495416, "loss": 0.164, "step": 275 }, { "epoch": 0.96, "grad_norm": 0.12178811089584775, "learning_rate": 0.0001241134148293311, "loss": 0.1472, "step": 276 }, { "epoch": 0.9634782608695652, "grad_norm": 0.09558226725121408, "learning_rate": 0.0001235221014279636, "loss": 0.1107, "step": 277 }, { "epoch": 0.9669565217391304, "grad_norm": 0.11947361537383715, "learning_rate": 0.00012292991610964903, "loss": 0.1454, "step": 278 }, { "epoch": 0.9704347826086956, "grad_norm": 0.09245448807939725, "learning_rate": 0.000122336880825505, "loss": 0.1063, "step": 279 }, { "epoch": 0.9739130434782609, "grad_norm": 0.12313564570662155, "learning_rate": 0.00012174301755815571, "loss": 0.1482, "step": 280 }, { "epoch": 0.9773913043478261, "grad_norm": 0.14222809451041388, "learning_rate": 0.00012114834832091691, "loss": 0.1905, "step": 281 }, { "epoch": 0.9808695652173913, "grad_norm": 0.10079732072591296, "learning_rate": 0.00012055289515698007, "loss": 0.1114, "step": 282 }, { "epoch": 0.9843478260869565, "grad_norm": 0.0893949612581931, "learning_rate": 0.00011995668013859529, "loss": 0.1057, "step": 283 }, { "epoch": 0.9878260869565217, "grad_norm": 0.0986410641315097, "learning_rate": 0.00011935972536625302, "loss": 0.111, "step": 284 }, { "epoch": 0.991304347826087, "grad_norm": 0.10054024829355615, "learning_rate": 0.00011876205296786493, "loss": 0.0972, "step": 285 }, { "epoch": 0.9947826086956522, "grad_norm": 0.12467802363495945, "learning_rate": 0.00011816368509794364, "loss": 0.147, "step": 286 }, { "epoch": 0.9982608695652174, "grad_norm": 0.08424816142149656, "learning_rate": 0.00011756464393678153, "loss": 0.103, "step": 287 }, { "epoch": 0.9982608695652174, "eval_loss": 0.1444740742444992, "eval_runtime": 52.3252, "eval_samples_per_second": 4.568, "eval_steps_per_second": 0.573, "step": 287 }, { "epoch": 1.0017391304347827, "grad_norm": 0.11878547881930412, "learning_rate": 0.00011696495168962847, "loss": 0.1385, "step": 288 }, { "epoch": 1.0052173913043478, "grad_norm": 0.09391887138015648, "learning_rate": 0.00011636463058586881, "loss": 0.0826, "step": 289 }, { "epoch": 1.008695652173913, "grad_norm": 0.1221171087699073, "learning_rate": 0.00011576370287819736, "loss": 0.1305, "step": 290 }, { "epoch": 1.0121739130434784, "grad_norm": 0.08852002687146088, "learning_rate": 0.0001151621908417945, "loss": 0.0893, "step": 291 }, { "epoch": 1.0156521739130435, "grad_norm": 0.11159916956566551, "learning_rate": 0.00011456011677350051, "loss": 0.1112, "step": 292 }, { "epoch": 1.0191304347826087, "grad_norm": 0.10003818148322566, "learning_rate": 0.000113957502990989, "loss": 0.091, "step": 293 }, { "epoch": 1.0226086956521738, "grad_norm": 0.16412668815167833, "learning_rate": 0.0001133543718319398, "loss": 0.0684, "step": 294 }, { "epoch": 1.0260869565217392, "grad_norm": 0.12591860799015855, "learning_rate": 0.0001127507456532108, "loss": 0.1155, "step": 295 }, { "epoch": 1.0295652173913044, "grad_norm": 0.09691052326677896, "learning_rate": 0.00011214664683000927, "loss": 0.0655, "step": 296 }, { "epoch": 1.0330434782608695, "grad_norm": 0.11401647857375072, "learning_rate": 0.00011154209775506241, "loss": 0.0819, "step": 297 }, { "epoch": 1.0365217391304349, "grad_norm": 0.12069848422212905, "learning_rate": 0.00011093712083778746, "loss": 0.0827, "step": 298 }, { "epoch": 1.04, "grad_norm": 0.11216573920077354, "learning_rate": 0.00011033173850346082, "loss": 0.0754, "step": 299 }, { "epoch": 1.0434782608695652, "grad_norm": 0.14906810717855873, "learning_rate": 0.0001097259731923869, "loss": 0.0888, "step": 300 }, { "epoch": 1.0469565217391303, "grad_norm": 0.17640102936065463, "learning_rate": 0.00010911984735906635, "loss": 0.0987, "step": 301 }, { "epoch": 1.0504347826086957, "grad_norm": 0.10731016230700624, "learning_rate": 0.00010851338347136357, "loss": 0.0654, "step": 302 }, { "epoch": 1.0539130434782609, "grad_norm": 0.13955232812110846, "learning_rate": 0.000107906604009674, "loss": 0.0766, "step": 303 }, { "epoch": 1.057391304347826, "grad_norm": 0.13869916502517549, "learning_rate": 0.00010729953146609076, "loss": 0.0905, "step": 304 }, { "epoch": 1.0608695652173914, "grad_norm": 0.16180614723177286, "learning_rate": 0.00010669218834357091, "loss": 0.1025, "step": 305 }, { "epoch": 1.0643478260869565, "grad_norm": 0.09389888673848854, "learning_rate": 0.00010608459715510139, "loss": 0.0613, "step": 306 }, { "epoch": 1.0678260869565217, "grad_norm": 0.11083339472481404, "learning_rate": 0.00010547678042286436, "loss": 0.0705, "step": 307 }, { "epoch": 1.0713043478260869, "grad_norm": 0.15345557779758465, "learning_rate": 0.00010486876067740252, "loss": 0.0878, "step": 308 }, { "epoch": 1.0747826086956522, "grad_norm": 0.12649607806775048, "learning_rate": 0.00010426056045678376, "loss": 0.0879, "step": 309 }, { "epoch": 1.0782608695652174, "grad_norm": 0.14680466140336335, "learning_rate": 0.0001036522023057659, "loss": 0.0958, "step": 310 }, { "epoch": 1.0817391304347825, "grad_norm": 0.11612953696390602, "learning_rate": 0.0001030437087749609, "loss": 0.0736, "step": 311 }, { "epoch": 1.085217391304348, "grad_norm": 0.11879942840457153, "learning_rate": 0.00010243510241999899, "loss": 0.0723, "step": 312 }, { "epoch": 1.088695652173913, "grad_norm": 0.13060110667263794, "learning_rate": 0.0001018264058006925, "loss": 0.0935, "step": 313 }, { "epoch": 1.0921739130434782, "grad_norm": 0.14907408553806142, "learning_rate": 0.00010121764148019976, "loss": 0.1067, "step": 314 }, { "epoch": 1.0956521739130434, "grad_norm": 0.09945695753413593, "learning_rate": 0.00010060883202418862, "loss": 0.0717, "step": 315 }, { "epoch": 1.0991304347826087, "grad_norm": 0.14172732221333895, "learning_rate": 0.0001, "loss": 0.0965, "step": 316 }, { "epoch": 1.102608695652174, "grad_norm": 0.1308399790176956, "learning_rate": 9.93911679758114e-05, "loss": 0.1035, "step": 317 }, { "epoch": 1.106086956521739, "grad_norm": 0.11697945837103665, "learning_rate": 9.878235851980025e-05, "loss": 0.0904, "step": 318 }, { "epoch": 1.1095652173913044, "grad_norm": 0.12653991847887303, "learning_rate": 9.817359419930751e-05, "loss": 0.0856, "step": 319 }, { "epoch": 1.1130434782608696, "grad_norm": 0.1217289403364997, "learning_rate": 9.756489758000105e-05, "loss": 0.0868, "step": 320 }, { "epoch": 1.1165217391304347, "grad_norm": 0.11310356101526439, "learning_rate": 9.69562912250391e-05, "loss": 0.0866, "step": 321 }, { "epoch": 1.12, "grad_norm": 0.10719359269477195, "learning_rate": 9.63477976942341e-05, "loss": 0.0716, "step": 322 }, { "epoch": 1.1234782608695653, "grad_norm": 0.1512816323423573, "learning_rate": 9.573943954321626e-05, "loss": 0.104, "step": 323 }, { "epoch": 1.1269565217391304, "grad_norm": 0.09749679838740939, "learning_rate": 9.513123932259751e-05, "loss": 0.0767, "step": 324 }, { "epoch": 1.1304347826086956, "grad_norm": 0.12636925131896773, "learning_rate": 9.452321957713564e-05, "loss": 0.0874, "step": 325 }, { "epoch": 1.133913043478261, "grad_norm": 0.08724868085956655, "learning_rate": 9.391540284489862e-05, "loss": 0.0675, "step": 326 }, { "epoch": 1.137391304347826, "grad_norm": 0.09917562166921519, "learning_rate": 9.330781165642907e-05, "loss": 0.0835, "step": 327 }, { "epoch": 1.1408695652173912, "grad_norm": 0.11005238071063954, "learning_rate": 9.270046853390925e-05, "loss": 0.0926, "step": 328 }, { "epoch": 1.1443478260869564, "grad_norm": 0.13592915315342272, "learning_rate": 9.209339599032601e-05, "loss": 0.0921, "step": 329 }, { "epoch": 1.1478260869565218, "grad_norm": 0.09959026553962852, "learning_rate": 9.148661652863642e-05, "loss": 0.0669, "step": 330 }, { "epoch": 1.151304347826087, "grad_norm": 0.12926733392574546, "learning_rate": 9.088015264093365e-05, "loss": 0.0882, "step": 331 }, { "epoch": 1.154782608695652, "grad_norm": 0.12554624045521445, "learning_rate": 9.027402680761309e-05, "loss": 0.0988, "step": 332 }, { "epoch": 1.1582608695652175, "grad_norm": 0.1672440454873292, "learning_rate": 8.966826149653923e-05, "loss": 0.1213, "step": 333 }, { "epoch": 1.1617391304347826, "grad_norm": 0.11985957465820539, "learning_rate": 8.906287916221259e-05, "loss": 0.0868, "step": 334 }, { "epoch": 1.1652173913043478, "grad_norm": 0.1272151243776101, "learning_rate": 8.845790224493763e-05, "loss": 0.0936, "step": 335 }, { "epoch": 1.1686956521739131, "grad_norm": 0.1328045736153317, "learning_rate": 8.785335316999078e-05, "loss": 0.1051, "step": 336 }, { "epoch": 1.1721739130434783, "grad_norm": 0.09448312790900673, "learning_rate": 8.724925434678923e-05, "loss": 0.0735, "step": 337 }, { "epoch": 1.1756521739130434, "grad_norm": 0.13775516158820159, "learning_rate": 8.664562816806022e-05, "loss": 0.0826, "step": 338 }, { "epoch": 1.1791304347826088, "grad_norm": 0.095050504784669, "learning_rate": 8.604249700901101e-05, "loss": 0.0606, "step": 339 }, { "epoch": 1.182608695652174, "grad_norm": 0.10883208791380891, "learning_rate": 8.543988322649954e-05, "loss": 0.0776, "step": 340 }, { "epoch": 1.1860869565217391, "grad_norm": 0.1432959854298642, "learning_rate": 8.483780915820553e-05, "loss": 0.105, "step": 341 }, { "epoch": 1.1895652173913043, "grad_norm": 0.1934560716364753, "learning_rate": 8.423629712180265e-05, "loss": 0.1167, "step": 342 }, { "epoch": 1.1930434782608696, "grad_norm": 0.14737287305329302, "learning_rate": 8.363536941413121e-05, "loss": 0.0952, "step": 343 }, { "epoch": 1.1965217391304348, "grad_norm": 0.1535547643880873, "learning_rate": 8.303504831037154e-05, "loss": 0.1146, "step": 344 }, { "epoch": 1.2, "grad_norm": 0.15481576726903015, "learning_rate": 8.243535606321848e-05, "loss": 0.1088, "step": 345 }, { "epoch": 1.203478260869565, "grad_norm": 0.1589929120048658, "learning_rate": 8.183631490205637e-05, "loss": 0.1288, "step": 346 }, { "epoch": 1.2069565217391305, "grad_norm": 0.12926833828040588, "learning_rate": 8.12379470321351e-05, "loss": 0.0779, "step": 347 }, { "epoch": 1.2104347826086956, "grad_norm": 0.10432967192535712, "learning_rate": 8.064027463374702e-05, "loss": 0.0733, "step": 348 }, { "epoch": 1.2139130434782608, "grad_norm": 0.1423904166119135, "learning_rate": 8.004331986140474e-05, "loss": 0.097, "step": 349 }, { "epoch": 1.2173913043478262, "grad_norm": 0.16415634432026194, "learning_rate": 7.944710484301995e-05, "loss": 0.1044, "step": 350 }, { "epoch": 1.2208695652173913, "grad_norm": 0.14367056293640723, "learning_rate": 7.88516516790831e-05, "loss": 0.108, "step": 351 }, { "epoch": 1.2243478260869565, "grad_norm": 0.09627642646890802, "learning_rate": 7.825698244184431e-05, "loss": 0.0716, "step": 352 }, { "epoch": 1.2278260869565218, "grad_norm": 0.12349504031653168, "learning_rate": 7.766311917449501e-05, "loss": 0.0846, "step": 353 }, { "epoch": 1.231304347826087, "grad_norm": 0.11917707968673376, "learning_rate": 7.707008389035101e-05, "loss": 0.0893, "step": 354 }, { "epoch": 1.2347826086956522, "grad_norm": 0.14958731827081473, "learning_rate": 7.647789857203645e-05, "loss": 0.1005, "step": 355 }, { "epoch": 1.2382608695652173, "grad_norm": 0.09807418540274827, "learning_rate": 7.588658517066892e-05, "loss": 0.0777, "step": 356 }, { "epoch": 1.2417391304347827, "grad_norm": 0.13031128610452009, "learning_rate": 7.529616560504585e-05, "loss": 0.0877, "step": 357 }, { "epoch": 1.2452173913043478, "grad_norm": 0.15458552977098033, "learning_rate": 7.470666176083192e-05, "loss": 0.1006, "step": 358 }, { "epoch": 1.248695652173913, "grad_norm": 0.10086297540969145, "learning_rate": 7.411809548974792e-05, "loss": 0.0771, "step": 359 }, { "epoch": 1.2521739130434781, "grad_norm": 0.10503599360725659, "learning_rate": 7.353048860876064e-05, "loss": 0.0699, "step": 360 }, { "epoch": 1.2556521739130435, "grad_norm": 0.11445411107296893, "learning_rate": 7.294386289927425e-05, "loss": 0.0878, "step": 361 }, { "epoch": 1.2591304347826087, "grad_norm": 0.09163778675554561, "learning_rate": 7.235824010632283e-05, "loss": 0.0774, "step": 362 }, { "epoch": 1.2626086956521738, "grad_norm": 0.12753545759992949, "learning_rate": 7.177364193776441e-05, "loss": 0.0891, "step": 363 }, { "epoch": 1.2660869565217392, "grad_norm": 0.10783034916975004, "learning_rate": 7.119009006347625e-05, "loss": 0.0727, "step": 364 }, { "epoch": 1.2695652173913043, "grad_norm": 0.12242485363979573, "learning_rate": 7.060760611455152e-05, "loss": 0.0628, "step": 365 }, { "epoch": 1.2730434782608695, "grad_norm": 0.0974356463850898, "learning_rate": 7.002621168249759e-05, "loss": 0.0791, "step": 366 }, { "epoch": 1.2765217391304349, "grad_norm": 0.11983018538507342, "learning_rate": 6.944592831843566e-05, "loss": 0.067, "step": 367 }, { "epoch": 1.28, "grad_norm": 0.1364747598273945, "learning_rate": 6.886677753230184e-05, "loss": 0.0905, "step": 368 }, { "epoch": 1.2834782608695652, "grad_norm": 0.13965549240604952, "learning_rate": 6.82887807920499e-05, "loss": 0.0965, "step": 369 }, { "epoch": 1.2869565217391306, "grad_norm": 0.1361838338173524, "learning_rate": 6.77119595228554e-05, "loss": 0.0884, "step": 370 }, { "epoch": 1.2904347826086957, "grad_norm": 0.1554086553741736, "learning_rate": 6.713633510632157e-05, "loss": 0.1058, "step": 371 }, { "epoch": 1.2939130434782609, "grad_norm": 0.13154153458769796, "learning_rate": 6.656192887968675e-05, "loss": 0.1069, "step": 372 }, { "epoch": 1.297391304347826, "grad_norm": 0.12317336873376321, "learning_rate": 6.598876213503339e-05, "loss": 0.0855, "step": 373 }, { "epoch": 1.3008695652173912, "grad_norm": 0.12111523304638382, "learning_rate": 6.541685611849887e-05, "loss": 0.0796, "step": 374 }, { "epoch": 1.3043478260869565, "grad_norm": 0.11822393281008113, "learning_rate": 6.484623202948789e-05, "loss": 0.0678, "step": 375 }, { "epoch": 1.3078260869565217, "grad_norm": 0.14902345594338023, "learning_rate": 6.427691101988673e-05, "loss": 0.095, "step": 376 }, { "epoch": 1.3113043478260868, "grad_norm": 0.1804018948634972, "learning_rate": 6.370891419327907e-05, "loss": 0.1282, "step": 377 }, { "epoch": 1.3147826086956522, "grad_norm": 0.11547994985396455, "learning_rate": 6.314226260416382e-05, "loss": 0.0794, "step": 378 }, { "epoch": 1.3182608695652174, "grad_norm": 0.13442398839445116, "learning_rate": 6.257697725717468e-05, "loss": 0.0828, "step": 379 }, { "epoch": 1.3217391304347825, "grad_norm": 0.16157920308299395, "learning_rate": 6.201307910630146e-05, "loss": 0.0862, "step": 380 }, { "epoch": 1.325217391304348, "grad_norm": 0.09483163105782791, "learning_rate": 6.145058905411343e-05, "loss": 0.0602, "step": 381 }, { "epoch": 1.328695652173913, "grad_norm": 0.1326696358587778, "learning_rate": 6.0889527950984416e-05, "loss": 0.081, "step": 382 }, { "epoch": 1.3321739130434782, "grad_norm": 0.09578653192083227, "learning_rate": 6.0329916594320054e-05, "loss": 0.0632, "step": 383 }, { "epoch": 1.3356521739130436, "grad_norm": 0.1445496359915367, "learning_rate": 5.977177572778678e-05, "loss": 0.1043, "step": 384 }, { "epoch": 1.3391304347826087, "grad_norm": 0.11696872605657838, "learning_rate": 5.921512604054289e-05, "loss": 0.075, "step": 385 }, { "epoch": 1.342608695652174, "grad_norm": 0.10474941138685831, "learning_rate": 5.865998816647171e-05, "loss": 0.0808, "step": 386 }, { "epoch": 1.3460869565217393, "grad_norm": 0.12195030923899196, "learning_rate": 5.8106382683416635e-05, "loss": 0.0906, "step": 387 }, { "epoch": 1.3495652173913044, "grad_norm": 0.1247261310171403, "learning_rate": 5.755433011241851e-05, "loss": 0.0799, "step": 388 }, { "epoch": 1.3530434782608696, "grad_norm": 0.12001527150963033, "learning_rate": 5.7003850916954705e-05, "loss": 0.0737, "step": 389 }, { "epoch": 1.3565217391304347, "grad_norm": 0.12921970865724472, "learning_rate": 5.645496550218089e-05, "loss": 0.0802, "step": 390 }, { "epoch": 1.3599999999999999, "grad_norm": 0.14148810186262428, "learning_rate": 5.5907694214174344e-05, "loss": 0.0998, "step": 391 }, { "epoch": 1.3634782608695653, "grad_norm": 0.1822115264684952, "learning_rate": 5.536205733918007e-05, "loss": 0.1139, "step": 392 }, { "epoch": 1.3669565217391304, "grad_norm": 0.11275316954836014, "learning_rate": 5.4818075102858526e-05, "loss": 0.0839, "step": 393 }, { "epoch": 1.3704347826086956, "grad_norm": 0.1049274592340904, "learning_rate": 5.4275767669536146e-05, "loss": 0.078, "step": 394 }, { "epoch": 1.373913043478261, "grad_norm": 0.1275403647919897, "learning_rate": 5.373515514145772e-05, "loss": 0.0882, "step": 395 }, { "epoch": 1.377391304347826, "grad_norm": 0.1414442736987841, "learning_rate": 5.3196257558041386e-05, "loss": 0.0905, "step": 396 }, { "epoch": 1.3808695652173912, "grad_norm": 0.1647573834843455, "learning_rate": 5.265909489513567e-05, "loss": 0.0868, "step": 397 }, { "epoch": 1.3843478260869566, "grad_norm": 0.14978728162298646, "learning_rate": 5.212368706427912e-05, "loss": 0.0967, "step": 398 }, { "epoch": 1.3878260869565218, "grad_norm": 0.13582863247078658, "learning_rate": 5.159005391196213e-05, "loss": 0.0888, "step": 399 }, { "epoch": 1.391304347826087, "grad_norm": 0.11281045642311609, "learning_rate": 5.105821521889147e-05, "loss": 0.0899, "step": 400 }, { "epoch": 1.3947826086956523, "grad_norm": 0.1525391794429011, "learning_rate": 5.052819069925676e-05, "loss": 0.1121, "step": 401 }, { "epoch": 1.3982608695652174, "grad_norm": 0.10553540876961562, "learning_rate": 5.000000000000002e-05, "loss": 0.0667, "step": 402 }, { "epoch": 1.4017391304347826, "grad_norm": 0.14272542918507544, "learning_rate": 4.947366270008707e-05, "loss": 0.1049, "step": 403 }, { "epoch": 1.4052173913043478, "grad_norm": 0.11523131534313182, "learning_rate": 4.894919830978212e-05, "loss": 0.083, "step": 404 }, { "epoch": 1.4086956521739131, "grad_norm": 0.11250758245733375, "learning_rate": 4.8426626269924266e-05, "loss": 0.0822, "step": 405 }, { "epoch": 1.4121739130434783, "grad_norm": 0.13451779717959741, "learning_rate": 4.790596595120699e-05, "loss": 0.0967, "step": 406 }, { "epoch": 1.4156521739130434, "grad_norm": 0.17014026695649226, "learning_rate": 4.738723665346021e-05, "loss": 0.0952, "step": 407 }, { "epoch": 1.4191304347826086, "grad_norm": 0.11335400231382785, "learning_rate": 4.687045760493468e-05, "loss": 0.0765, "step": 408 }, { "epoch": 1.422608695652174, "grad_norm": 0.13153029025610707, "learning_rate": 4.635564796158945e-05, "loss": 0.0942, "step": 409 }, { "epoch": 1.4260869565217391, "grad_norm": 0.14072727769903307, "learning_rate": 4.5842826806381544e-05, "loss": 0.1033, "step": 410 }, { "epoch": 1.4295652173913043, "grad_norm": 0.19021079673592267, "learning_rate": 4.533201314855891e-05, "loss": 0.0908, "step": 411 }, { "epoch": 1.4330434782608696, "grad_norm": 0.1282315437032552, "learning_rate": 4.48232259229554e-05, "loss": 0.0923, "step": 412 }, { "epoch": 1.4365217391304348, "grad_norm": 0.10482566251391306, "learning_rate": 4.431648398928933e-05, "loss": 0.0769, "step": 413 }, { "epoch": 1.44, "grad_norm": 0.0989285401022153, "learning_rate": 4.381180613146395e-05, "loss": 0.0627, "step": 414 }, { "epoch": 1.4434782608695653, "grad_norm": 0.15004726013623923, "learning_rate": 4.3309211056871546e-05, "loss": 0.107, "step": 415 }, { "epoch": 1.4469565217391305, "grad_norm": 0.10917064763259954, "learning_rate": 4.280871739569972e-05, "loss": 0.0723, "step": 416 }, { "epoch": 1.4504347826086956, "grad_norm": 0.14217337210991582, "learning_rate": 4.231034370024088e-05, "loss": 0.0876, "step": 417 }, { "epoch": 1.453913043478261, "grad_norm": 0.12259499737310682, "learning_rate": 4.181410844420474e-05, "loss": 0.072, "step": 418 }, { "epoch": 1.4573913043478262, "grad_norm": 0.1383064965783125, "learning_rate": 4.132003002203314e-05, "loss": 0.1001, "step": 419 }, { "epoch": 1.4608695652173913, "grad_norm": 0.15628614353703477, "learning_rate": 4.0828126748218654e-05, "loss": 0.1024, "step": 420 }, { "epoch": 1.4643478260869565, "grad_norm": 0.15540806197515133, "learning_rate": 4.0338416856625294e-05, "loss": 0.1064, "step": 421 }, { "epoch": 1.4678260869565216, "grad_norm": 0.12867401972303838, "learning_rate": 3.985091849981297e-05, "loss": 0.0814, "step": 422 }, { "epoch": 1.471304347826087, "grad_norm": 0.10461015345788115, "learning_rate": 3.936564974836431e-05, "loss": 0.0551, "step": 423 }, { "epoch": 1.4747826086956521, "grad_norm": 0.17422707198524348, "learning_rate": 3.8882628590215074e-05, "loss": 0.1068, "step": 424 }, { "epoch": 1.4782608695652173, "grad_norm": 0.11823762504382565, "learning_rate": 3.840187292998717e-05, "loss": 0.0847, "step": 425 }, { "epoch": 1.4817391304347827, "grad_norm": 0.14190454091036495, "learning_rate": 3.7923400588325155e-05, "loss": 0.0985, "step": 426 }, { "epoch": 1.4852173913043478, "grad_norm": 0.1487917306625744, "learning_rate": 3.7447229301235445e-05, "loss": 0.0972, "step": 427 }, { "epoch": 1.488695652173913, "grad_norm": 0.11307811508469943, "learning_rate": 3.697337671942913e-05, "loss": 0.0769, "step": 428 }, { "epoch": 1.4921739130434784, "grad_norm": 0.12456291954504964, "learning_rate": 3.6501860407667465e-05, "loss": 0.0757, "step": 429 }, { "epoch": 1.4956521739130435, "grad_norm": 0.14812964550659216, "learning_rate": 3.60326978441109e-05, "loss": 0.1029, "step": 430 }, { "epoch": 1.4991304347826087, "grad_norm": 0.1681784734853534, "learning_rate": 3.556590641967115e-05, "loss": 0.1252, "step": 431 }, { "epoch": 1.502608695652174, "grad_norm": 0.14613030602008723, "learning_rate": 3.510150343736668e-05, "loss": 0.0912, "step": 432 }, { "epoch": 1.5060869565217392, "grad_norm": 0.15179818766879094, "learning_rate": 3.463950611168111e-05, "loss": 0.0858, "step": 433 }, { "epoch": 1.5095652173913043, "grad_norm": 0.12461414121764455, "learning_rate": 3.4179931567925216e-05, "loss": 0.0824, "step": 434 }, { "epoch": 1.5130434782608697, "grad_norm": 0.11765068168074926, "learning_rate": 3.372279684160221e-05, "loss": 0.0862, "step": 435 }, { "epoch": 1.5165217391304346, "grad_norm": 0.14280556708472175, "learning_rate": 3.3268118877776066e-05, "loss": 0.0954, "step": 436 }, { "epoch": 1.52, "grad_norm": 0.11285620318100742, "learning_rate": 3.281591453044366e-05, "loss": 0.0735, "step": 437 }, { "epoch": 1.5234782608695652, "grad_norm": 0.10694921241597416, "learning_rate": 3.236620056190972e-05, "loss": 0.069, "step": 438 }, { "epoch": 1.5269565217391303, "grad_norm": 0.12484188708941266, "learning_rate": 3.191899364216581e-05, "loss": 0.083, "step": 439 }, { "epoch": 1.5304347826086957, "grad_norm": 0.15429288005492145, "learning_rate": 3.147431034827208e-05, "loss": 0.1033, "step": 440 }, { "epoch": 1.5339130434782609, "grad_norm": 0.1253058317602747, "learning_rate": 3.103216716374312e-05, "loss": 0.0751, "step": 441 }, { "epoch": 1.537391304347826, "grad_norm": 0.11203979862187523, "learning_rate": 3.059258047793661e-05, "loss": 0.0804, "step": 442 }, { "epoch": 1.5408695652173914, "grad_norm": 0.13184136276253297, "learning_rate": 3.0155566585446117e-05, "loss": 0.0892, "step": 443 }, { "epoch": 1.5443478260869565, "grad_norm": 0.10496670695439927, "learning_rate": 2.9721141685496823e-05, "loss": 0.08, "step": 444 }, { "epoch": 1.5478260869565217, "grad_norm": 0.11136343180704414, "learning_rate": 2.9289321881345254e-05, "loss": 0.0764, "step": 445 }, { "epoch": 1.551304347826087, "grad_norm": 0.14576709922104164, "learning_rate": 2.8860123179682242e-05, "loss": 0.1061, "step": 446 }, { "epoch": 1.5547826086956522, "grad_norm": 0.09499364976886815, "learning_rate": 2.8433561490039573e-05, "loss": 0.0745, "step": 447 }, { "epoch": 1.5582608695652174, "grad_norm": 0.12469651410155881, "learning_rate": 2.800965262420043e-05, "loss": 0.086, "step": 448 }, { "epoch": 1.5617391304347827, "grad_norm": 0.0950193427692519, "learning_rate": 2.7588412295613043e-05, "loss": 0.0548, "step": 449 }, { "epoch": 1.5652173913043477, "grad_norm": 0.1436085195291988, "learning_rate": 2.716985611880841e-05, "loss": 0.0923, "step": 450 }, { "epoch": 1.568695652173913, "grad_norm": 0.1220012073528301, "learning_rate": 2.675399960882138e-05, "loss": 0.0835, "step": 451 }, { "epoch": 1.5721739130434784, "grad_norm": 0.14250023280956398, "learning_rate": 2.6340858180615646e-05, "loss": 0.0817, "step": 452 }, { "epoch": 1.5756521739130434, "grad_norm": 0.14016261789642684, "learning_rate": 2.593044714851218e-05, "loss": 0.1009, "step": 453 }, { "epoch": 1.5791304347826087, "grad_norm": 0.1519687009324273, "learning_rate": 2.5522781725621813e-05, "loss": 0.0936, "step": 454 }, { "epoch": 1.5826086956521739, "grad_norm": 0.10018240850657148, "learning_rate": 2.511787702328102e-05, "loss": 0.0695, "step": 455 }, { "epoch": 1.586086956521739, "grad_norm": 0.15832897678113741, "learning_rate": 2.471574805049206e-05, "loss": 0.103, "step": 456 }, { "epoch": 1.5895652173913044, "grad_norm": 0.09635042116603919, "learning_rate": 2.4316409713366352e-05, "loss": 0.0713, "step": 457 }, { "epoch": 1.5930434782608696, "grad_norm": 0.16551038949811617, "learning_rate": 2.3919876814572194e-05, "loss": 0.1165, "step": 458 }, { "epoch": 1.5965217391304347, "grad_norm": 0.1591761285439053, "learning_rate": 2.352616405278586e-05, "loss": 0.1065, "step": 459 }, { "epoch": 1.6, "grad_norm": 0.1257794232379624, "learning_rate": 2.3135286022146785e-05, "loss": 0.0878, "step": 460 }, { "epoch": 1.6034782608695652, "grad_norm": 0.13064370809940834, "learning_rate": 2.2747257211716757e-05, "loss": 0.0878, "step": 461 }, { "epoch": 1.6069565217391304, "grad_norm": 0.1373673611302553, "learning_rate": 2.236209200494258e-05, "loss": 0.08, "step": 462 }, { "epoch": 1.6104347826086958, "grad_norm": 0.15683223957755238, "learning_rate": 2.1979804679123106e-05, "loss": 0.097, "step": 463 }, { "epoch": 1.613913043478261, "grad_norm": 0.11215372603755155, "learning_rate": 2.1600409404879874e-05, "loss": 0.0759, "step": 464 }, { "epoch": 1.617391304347826, "grad_norm": 0.12472859826284394, "learning_rate": 2.122392024563199e-05, "loss": 0.0798, "step": 465 }, { "epoch": 1.6208695652173915, "grad_norm": 0.14167323311602448, "learning_rate": 2.0850351157074598e-05, "loss": 0.1025, "step": 466 }, { "epoch": 1.6243478260869564, "grad_norm": 0.13106838058233283, "learning_rate": 2.047971598666184e-05, "loss": 0.0966, "step": 467 }, { "epoch": 1.6278260869565218, "grad_norm": 0.12245656492036927, "learning_rate": 2.011202847309329e-05, "loss": 0.0858, "step": 468 }, { "epoch": 1.631304347826087, "grad_norm": 0.15076412437271922, "learning_rate": 1.9747302245804945e-05, "loss": 0.0988, "step": 469 }, { "epoch": 1.634782608695652, "grad_norm": 0.1890224571658569, "learning_rate": 1.9385550824463727e-05, "loss": 0.141, "step": 470 }, { "epoch": 1.6382608695652174, "grad_norm": 0.12643818292640252, "learning_rate": 1.9026787618466646e-05, "loss": 0.0821, "step": 471 }, { "epoch": 1.6417391304347826, "grad_norm": 0.11974342973177961, "learning_rate": 1.8671025926443465e-05, "loss": 0.0852, "step": 472 }, { "epoch": 1.6452173913043477, "grad_norm": 0.11053773314022491, "learning_rate": 1.8318278935763955e-05, "loss": 0.0693, "step": 473 }, { "epoch": 1.6486956521739131, "grad_norm": 0.12718860708539992, "learning_rate": 1.7968559722048906e-05, "loss": 0.0759, "step": 474 }, { "epoch": 1.6521739130434783, "grad_norm": 0.11472304774066805, "learning_rate": 1.762188124868557e-05, "loss": 0.0822, "step": 475 }, { "epoch": 1.6556521739130434, "grad_norm": 0.1586172339858714, "learning_rate": 1.7278256366347035e-05, "loss": 0.1156, "step": 476 }, { "epoch": 1.6591304347826088, "grad_norm": 0.16408772559550205, "learning_rate": 1.6937697812515894e-05, "loss": 0.0918, "step": 477 }, { "epoch": 1.662608695652174, "grad_norm": 0.12800527362364758, "learning_rate": 1.660021821101222e-05, "loss": 0.0789, "step": 478 }, { "epoch": 1.666086956521739, "grad_norm": 0.15521778399290198, "learning_rate": 1.626583007152539e-05, "loss": 0.0987, "step": 479 }, { "epoch": 1.6695652173913045, "grad_norm": 0.14944005207844402, "learning_rate": 1.5934545789150623e-05, "loss": 0.1133, "step": 480 }, { "epoch": 1.6730434782608694, "grad_norm": 0.12173810785220801, "learning_rate": 1.5606377643929304e-05, "loss": 0.0794, "step": 481 }, { "epoch": 1.6765217391304348, "grad_norm": 0.12290655885053603, "learning_rate": 1.5281337800393968e-05, "loss": 0.0717, "step": 482 }, { "epoch": 1.6800000000000002, "grad_norm": 0.13763340851307898, "learning_rate": 1.4959438307117247e-05, "loss": 0.095, "step": 483 }, { "epoch": 1.683478260869565, "grad_norm": 0.10678789082393463, "learning_rate": 1.4640691096265358e-05, "loss": 0.0838, "step": 484 }, { "epoch": 1.6869565217391305, "grad_norm": 0.12694424997511286, "learning_rate": 1.4325107983155694e-05, "loss": 0.0884, "step": 485 }, { "epoch": 1.6904347826086956, "grad_norm": 0.13805939087384794, "learning_rate": 1.401270066581899e-05, "loss": 0.0884, "step": 486 }, { "epoch": 1.6939130434782608, "grad_norm": 0.1116542985760522, "learning_rate": 1.3703480724565577e-05, "loss": 0.0819, "step": 487 }, { "epoch": 1.6973913043478261, "grad_norm": 0.130701148914566, "learning_rate": 1.339745962155613e-05, "loss": 0.0942, "step": 488 }, { "epoch": 1.7008695652173913, "grad_norm": 0.12303229923584438, "learning_rate": 1.3094648700376954e-05, "loss": 0.0968, "step": 489 }, { "epoch": 1.7043478260869565, "grad_norm": 0.10050903994662669, "learning_rate": 1.2795059185619229e-05, "loss": 0.064, "step": 490 }, { "epoch": 1.7078260869565218, "grad_norm": 0.13529518412698788, "learning_rate": 1.249870218246323e-05, "loss": 0.0891, "step": 491 }, { "epoch": 1.711304347826087, "grad_norm": 0.11568064512791533, "learning_rate": 1.2205588676266388e-05, "loss": 0.0841, "step": 492 }, { "epoch": 1.7147826086956521, "grad_norm": 0.11324213029173631, "learning_rate": 1.1915729532156372e-05, "loss": 0.0693, "step": 493 }, { "epoch": 1.7182608695652175, "grad_norm": 0.12078490458473878, "learning_rate": 1.1629135494628096e-05, "loss": 0.0809, "step": 494 }, { "epoch": 1.7217391304347827, "grad_norm": 0.15619885447728415, "learning_rate": 1.134581718714558e-05, "loss": 0.0982, "step": 495 }, { "epoch": 1.7252173913043478, "grad_norm": 0.13958396553029748, "learning_rate": 1.1065785111748117e-05, "loss": 0.1006, "step": 496 }, { "epoch": 1.7286956521739132, "grad_norm": 0.11936287781907709, "learning_rate": 1.0789049648661043e-05, "loss": 0.0778, "step": 497 }, { "epoch": 1.7321739130434781, "grad_norm": 0.13994107260501892, "learning_rate": 1.0515621055910817e-05, "loss": 0.0994, "step": 498 }, { "epoch": 1.7356521739130435, "grad_norm": 0.10069177741815626, "learning_rate": 1.0245509468944992e-05, "loss": 0.0798, "step": 499 }, { "epoch": 1.7391304347826086, "grad_norm": 0.1520239032704441, "learning_rate": 9.978724900256265e-06, "loss": 0.0936, "step": 500 }, { "epoch": 1.7426086956521738, "grad_norm": 0.12537489299552443, "learning_rate": 9.715277239011578e-06, "loss": 0.0759, "step": 501 }, { "epoch": 1.7460869565217392, "grad_norm": 0.16914167358101417, "learning_rate": 9.455176250685338e-06, "loss": 0.1159, "step": 502 }, { "epoch": 1.7495652173913043, "grad_norm": 0.12340433382499669, "learning_rate": 9.198431576697608e-06, "loss": 0.0809, "step": 503 }, { "epoch": 1.7530434782608695, "grad_norm": 0.16038700994407892, "learning_rate": 8.945052734056581e-06, "loss": 0.0927, "step": 504 }, { "epoch": 1.7565217391304349, "grad_norm": 0.18736397280927972, "learning_rate": 8.695049115005837e-06, "loss": 0.1138, "step": 505 }, { "epoch": 1.76, "grad_norm": 0.11455094890434803, "learning_rate": 8.448429986676298e-06, "loss": 0.0876, "step": 506 }, { "epoch": 1.7634782608695652, "grad_norm": 0.13381829396413253, "learning_rate": 8.205204490742536e-06, "loss": 0.0932, "step": 507 }, { "epoch": 1.7669565217391305, "grad_norm": 0.10231732967595585, "learning_rate": 7.96538164308407e-06, "loss": 0.0702, "step": 508 }, { "epoch": 1.7704347826086957, "grad_norm": 0.0947188798552471, "learning_rate": 7.728970333451035e-06, "loss": 0.0706, "step": 509 }, { "epoch": 1.7739130434782608, "grad_norm": 0.09733737409054823, "learning_rate": 7.4959793251348055e-06, "loss": 0.0644, "step": 510 }, { "epoch": 1.7773913043478262, "grad_norm": 0.11169634637379897, "learning_rate": 7.2664172546429655e-06, "loss": 0.0709, "step": 511 }, { "epoch": 1.7808695652173911, "grad_norm": 0.12974806998277916, "learning_rate": 7.040292631379386e-06, "loss": 0.0856, "step": 512 }, { "epoch": 1.7843478260869565, "grad_norm": 0.13011819014873824, "learning_rate": 6.817613837328573e-06, "loss": 0.0924, "step": 513 }, { "epoch": 1.787826086956522, "grad_norm": 0.1508887480796253, "learning_rate": 6.598389126745208e-06, "loss": 0.1101, "step": 514 }, { "epoch": 1.7913043478260868, "grad_norm": 0.1528558553271661, "learning_rate": 6.382626625847921e-06, "loss": 0.1014, "step": 515 }, { "epoch": 1.7947826086956522, "grad_norm": 0.13295695013628608, "learning_rate": 6.170334332518324e-06, "loss": 0.0866, "step": 516 }, { "epoch": 1.7982608695652174, "grad_norm": 0.16036744040311404, "learning_rate": 5.961520116004327e-06, "loss": 0.1076, "step": 517 }, { "epoch": 1.8017391304347825, "grad_norm": 0.11717096876409042, "learning_rate": 5.756191716628556e-06, "loss": 0.0688, "step": 518 }, { "epoch": 1.8052173913043479, "grad_norm": 0.11484830279438352, "learning_rate": 5.554356745501454e-06, "loss": 0.0694, "step": 519 }, { "epoch": 1.808695652173913, "grad_norm": 0.17176181086966022, "learning_rate": 5.3560226842390596e-06, "loss": 0.1032, "step": 520 }, { "epoch": 1.8121739130434782, "grad_norm": 0.11739088349195866, "learning_rate": 5.1611968846857815e-06, "loss": 0.0732, "step": 521 }, { "epoch": 1.8156521739130436, "grad_norm": 0.13709017479262753, "learning_rate": 4.969886568641757e-06, "loss": 0.0918, "step": 522 }, { "epoch": 1.8191304347826087, "grad_norm": 0.1280476174629274, "learning_rate": 4.7820988275953045e-06, "loss": 0.0938, "step": 523 }, { "epoch": 1.8226086956521739, "grad_norm": 0.11201422652339658, "learning_rate": 4.597840622459937e-06, "loss": 0.0814, "step": 524 }, { "epoch": 1.8260869565217392, "grad_norm": 0.09871056879272744, "learning_rate": 4.417118783316388e-06, "loss": 0.072, "step": 525 }, { "epoch": 1.8295652173913044, "grad_norm": 0.10542472286239411, "learning_rate": 4.2399400091594154e-06, "loss": 0.068, "step": 526 }, { "epoch": 1.8330434782608696, "grad_norm": 0.14017893040374907, "learning_rate": 4.066310867649481e-06, "loss": 0.1032, "step": 527 }, { "epoch": 1.836521739130435, "grad_norm": 0.11855048113345314, "learning_rate": 3.896237794869339e-06, "loss": 0.0783, "step": 528 }, { "epoch": 1.8399999999999999, "grad_norm": 0.1244102175680237, "learning_rate": 3.729727095085422e-06, "loss": 0.0922, "step": 529 }, { "epoch": 1.8434782608695652, "grad_norm": 0.12180644294551433, "learning_rate": 3.566784940514145e-06, "loss": 0.0807, "step": 530 }, { "epoch": 1.8469565217391304, "grad_norm": 0.09761026100653182, "learning_rate": 3.40741737109318e-06, "loss": 0.0641, "step": 531 }, { "epoch": 1.8504347826086955, "grad_norm": 0.09710029722289329, "learning_rate": 3.2516302942574793e-06, "loss": 0.067, "step": 532 }, { "epoch": 1.853913043478261, "grad_norm": 0.10724535703528021, "learning_rate": 3.0994294847203733e-06, "loss": 0.0743, "step": 533 }, { "epoch": 1.857391304347826, "grad_norm": 0.13083100814230067, "learning_rate": 2.9508205842594728e-06, "loss": 0.0754, "step": 534 }, { "epoch": 1.8608695652173912, "grad_norm": 0.12672158607204304, "learning_rate": 2.8058091015075394e-06, "loss": 0.078, "step": 535 }, { "epoch": 1.8643478260869566, "grad_norm": 0.17103224377006737, "learning_rate": 2.6644004117483356e-06, "loss": 0.0922, "step": 536 }, { "epoch": 1.8678260869565217, "grad_norm": 0.134150142101436, "learning_rate": 2.526599756717285e-06, "loss": 0.1002, "step": 537 }, { "epoch": 1.871304347826087, "grad_norm": 0.129521169878982, "learning_rate": 2.392412244407294e-06, "loss": 0.0836, "step": 538 }, { "epoch": 1.8747826086956523, "grad_norm": 0.10885289790789841, "learning_rate": 2.26184284887927e-06, "loss": 0.0774, "step": 539 }, { "epoch": 1.8782608695652174, "grad_norm": 0.10488094490283079, "learning_rate": 2.134896410077891e-06, "loss": 0.0789, "step": 540 }, { "epoch": 1.8817391304347826, "grad_norm": 0.11889491296378912, "learning_rate": 2.011577633652062e-06, "loss": 0.0782, "step": 541 }, { "epoch": 1.885217391304348, "grad_norm": 0.12096235669049085, "learning_rate": 1.8918910907805732e-06, "loss": 0.0881, "step": 542 }, { "epoch": 1.8886956521739129, "grad_norm": 0.1106479394276716, "learning_rate": 1.7758412180026273e-06, "loss": 0.0802, "step": 543 }, { "epoch": 1.8921739130434783, "grad_norm": 0.12821924742613686, "learning_rate": 1.6634323170533928e-06, "loss": 0.0911, "step": 544 }, { "epoch": 1.8956521739130436, "grad_norm": 0.15604807612172736, "learning_rate": 1.5546685547045192e-06, "loss": 0.1, "step": 545 }, { "epoch": 1.8991304347826086, "grad_norm": 0.1478681396223387, "learning_rate": 1.4495539626097288e-06, "loss": 0.0804, "step": 546 }, { "epoch": 1.902608695652174, "grad_norm": 0.13421748048136942, "learning_rate": 1.348092437155346e-06, "loss": 0.089, "step": 547 }, { "epoch": 1.906086956521739, "grad_norm": 0.11687932254739727, "learning_rate": 1.2502877393158586e-06, "loss": 0.0871, "step": 548 }, { "epoch": 1.9095652173913042, "grad_norm": 0.15643926713744022, "learning_rate": 1.1561434945145277e-06, "loss": 0.104, "step": 549 }, { "epoch": 1.9130434782608696, "grad_norm": 0.10696169647909613, "learning_rate": 1.0656631924889749e-06, "loss": 0.0716, "step": 550 }, { "epoch": 1.9165217391304348, "grad_norm": 0.14019705935951768, "learning_rate": 9.788501871618728e-07, "loss": 0.0898, "step": 551 }, { "epoch": 1.92, "grad_norm": 0.15767772433554056, "learning_rate": 8.957076965165235e-07, "loss": 0.1015, "step": 552 }, { "epoch": 1.9234782608695653, "grad_norm": 0.12202925229447881, "learning_rate": 8.162388024777201e-07, "loss": 0.0889, "step": 553 }, { "epoch": 1.9269565217391305, "grad_norm": 0.14213284579860058, "learning_rate": 7.404464507973608e-07, "loss": 0.1061, "step": 554 }, { "epoch": 1.9304347826086956, "grad_norm": 0.11946138428666646, "learning_rate": 6.683334509453465e-07, "loss": 0.0756, "step": 555 }, { "epoch": 1.933913043478261, "grad_norm": 0.1776730484619494, "learning_rate": 5.999024760054095e-07, "loss": 0.1156, "step": 556 }, { "epoch": 1.9373913043478261, "grad_norm": 0.15552558119011417, "learning_rate": 5.351560625760254e-07, "loss": 0.1111, "step": 557 }, { "epoch": 1.9408695652173913, "grad_norm": 0.1269110866764246, "learning_rate": 4.7409661067642217e-07, "loss": 0.0929, "step": 558 }, { "epoch": 1.9443478260869567, "grad_norm": 0.10309350272790443, "learning_rate": 4.167263836575286e-07, "loss": 0.0547, "step": 559 }, { "epoch": 1.9478260869565216, "grad_norm": 0.12377918248036159, "learning_rate": 3.630475081181861e-07, "loss": 0.0808, "step": 560 }, { "epoch": 1.951304347826087, "grad_norm": 0.12729430798666608, "learning_rate": 3.1306197382624526e-07, "loss": 0.077, "step": 561 }, { "epoch": 1.9547826086956521, "grad_norm": 0.11766868772742071, "learning_rate": 2.667716336448356e-07, "loss": 0.0871, "step": 562 }, { "epoch": 1.9582608695652173, "grad_norm": 0.12138412723458143, "learning_rate": 2.2417820346367635e-07, "loss": 0.0983, "step": 563 }, { "epoch": 1.9617391304347827, "grad_norm": 0.12163696179721654, "learning_rate": 1.8528326213548274e-07, "loss": 0.0855, "step": 564 }, { "epoch": 1.9652173913043478, "grad_norm": 0.1569270166290431, "learning_rate": 1.50088251417424e-07, "loss": 0.1015, "step": 565 }, { "epoch": 1.968695652173913, "grad_norm": 0.12730784199491677, "learning_rate": 1.1859447591769934e-07, "loss": 0.0878, "step": 566 }, { "epoch": 1.9721739130434783, "grad_norm": 0.12648022636737355, "learning_rate": 9.080310304716567e-08, "loss": 0.0842, "step": 567 }, { "epoch": 1.9756521739130435, "grad_norm": 0.11283992913356376, "learning_rate": 6.671516297606095e-08, "loss": 0.0834, "step": 568 }, { "epoch": 1.9791304347826086, "grad_norm": 0.10119868305303333, "learning_rate": 4.6331548595845984e-08, "loss": 0.0667, "step": 569 }, { "epoch": 1.982608695652174, "grad_norm": 0.1227080883131745, "learning_rate": 2.965301548606414e-08, "loss": 0.0873, "step": 570 }, { "epoch": 1.9860869565217392, "grad_norm": 0.158380237566967, "learning_rate": 1.6680181886352676e-08, "loss": 0.1049, "step": 571 }, { "epoch": 1.9895652173913043, "grad_norm": 0.17246726825049064, "learning_rate": 7.413528673549941e-09, "loss": 0.0969, "step": 572 }, { "epoch": 1.9930434782608697, "grad_norm": 0.15178078485673158, "learning_rate": 1.8533993438318852e-09, "loss": 0.0884, "step": 573 }, { "epoch": 1.9965217391304346, "grad_norm": 0.1411963796704214, "learning_rate": 0.0, "loss": 0.0874, "step": 574 }, { "epoch": 1.9965217391304346, "eval_loss": 0.14970487356185913, "eval_runtime": 49.8439, "eval_samples_per_second": 4.795, "eval_steps_per_second": 0.602, "step": 574 }, { "epoch": 1.9965217391304346, "step": 574, "total_flos": 465841769250816.0, "train_loss": 0.11642231966144947, "train_runtime": 5186.3709, "train_samples_per_second": 1.772, "train_steps_per_second": 0.111 } ], "logging_steps": 1, "max_steps": 574, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 465841769250816.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }